version 1.0

cran · Feb 9, 2024 · b92c22c · b92c22c
commit b92c22c
Show file tree

Hide file tree

Showing 40 changed files with 2,125 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,17 @@
+Package: VLMCX
+Type: Package
+Title: Variable Length Markov Chain with Exogenous Covariates
+Version: 1.0
+Date: 2024-02-01
+Imports: graphics, nnet, berryFunctions, stats, utils
+Authors@R: c(person("Adriano Zanin Zambom", "Developer", role = c("aut", "cre","cph"), email = "adriano.zambom@gmail.com"), person("Seonjin Kim", "Developer", role = "aut"), person("Nancy Lopes Garcia", "Developer", role = "aut") )
+Description: Models categorical time series through a Markov Chain when a) covariates are predictors for transitioning into the next state/symbol and b) when the dependence in the past states has variable length. The probability of transitioning to the next state in the Markov Chain is defined by a multinomial regression whose parameters depend on the past states of the chain and, moreover, the number of states in the past needed to predict the next state also depends on the observed states themselves. See Zambom, Kim, and Garcia (2022) <doi:10.1111/jtsa.12615>.
+License: GPL (>= 2)
+NeedsCompilation: no
+Packaged: 2024-02-07 07:14:39 UTC; adrianozambom
+Author: Adriano Zanin Zambom Developer [aut, cre, cph],
+  Seonjin Kim Developer [aut],
+  Nancy Lopes Garcia Developer [aut]
+Maintainer: Adriano Zanin Zambom Developer <adriano.zambom@gmail.com>
+Repository: CRAN
+Date/Publication: 2024-02-08 21:10:07 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,39 @@
+ce371c91a15cff1a483ab501eda17100 *DESCRIPTION
+bc5271f5892328c6f087e62ee26ae2d8 *NAMESPACE
+fe3793a96dd4e7233334aa4bc4715105 *R/AIC.VLMCX.R
+85c9eb537b286d70e0379b23d63797a3 *R/BIC.VLMCX.R
+19b915cc4bf729206df37263de63cfd5 *R/LogLik.VLMCX.R
+45dd9fc728cf2ff9f063de2127aad313 *R/VLMCX.R
+b9f0fdb8c69b565712ed354655f36491 *R/VLMCX_depth.R
+17c137fcd6e79289eee4b0c9297435bb *R/VLMCX_sorttree.R
+e66b8dd17cfb8884d7e057874e287b83 *R/add.children.R
+02252c9a05b31b62cf17f30d84f156ef *R/backtrack.beta.R
+2eefce3349bef7f7c917169f3f13bf61 *R/coef.VLMCX.R
+8a80405240dfc7be645cf241ebb9aed6 *R/context.algorithm.R
+ebdb8f7fe3dc2939a5418dd59083cff3 *R/count.param.R
+4ecbd3fef1d0aa7bc690ce02356b1c97 *R/draw.VLMCX.R
+16c8c9d9336a64a6c52ae0876a4e91ef *R/draw_node.R
+68e577cd657584ac770c138c17b14eed *R/estimate.R
+48ab00f60426b942482469e88c58e475 *R/find.underlying.context.in.tree.R
+5dc24e4e390bdea0fefc11dbbe4f9f06 *R/get.alphabet.from.tree.R
+8c6dd5f6ad7d50426758ae5133163240 *R/maximum.context.R
+d6a3e01733beffa0e95a9b9d673bd5d8 *R/predict.VLMCX.R
+54e4667d0708d97facd74770d6a418f3 *R/print.VLMCX.R
+bec409f3d1c0d91a9b2e44585749bf0d *R/remove.last.beta.R
+58f482d192c0080f13f1bb3125d309fa *R/simulate.R
+7d715790025ea5afaaebacf9a7454775 *R/update.node.R
+75d0ee772ee3f412fceff936f1958562 *R/update.node.from.tree.R
+9163a8fe12dcc96f4519e6251f2241c2 *R/vecIn.R
+0ada4f8b6e171b389128e83e38a3e604 *R/which.children.had.betas.cut.R
+c155ae58f8465af80569ff3f3d4e0cf4 *R/write.node.R
+b24d663575fd6ed3f37b08da1073f7a5 *man/AIC.Rd
+bb096039031b048844ccea4310627e21 *man/BIC.Rd
+a1fde268b909a0519969f81f07eba5af *man/LogLik.Rd
+02897eace442cb0eda98239af0266a2f *man/VLMCX.Rd
+a21a1a8329ab8f6efcec1e614e5292c6 *man/coef.Rd
+8ed09fb1756bad7f8cbc75a652924981 *man/context.algorithm.Rd
+beb56ce8f56ee7efb9a64168ff79f95b *man/draw.Rd
+ba2f67f7805663694a40ffd47ac64cab *man/estimate.Rd
+ed064759044c16784240c97678bccd0d *man/maximum.context.Rd
+03abad3ed125ba668da74d3181b0e626 *man/predict.Rd
+c1456412cf8a86da1ca1c99259b8ef41 *man/simulate.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,28 @@
+export(maximum.context)
+export(context.algorithm)
+export(AIC)
+export(BIC)
+export(estimate)
+export(coef)
+export(draw)
+export(LogLik)
+export(VLMCX)
+export(predict)
+export(simulate)
+S3method(maximum.context, default)
+S3method(context.algorithm, default)
+S3method(AIC, VLMCX)
+S3method(BIC, VLMCX)
+S3method(estimate, default)
+S3method(coef, VLMCX)
+S3method(draw, VLMCX)
+S3method(LogLik, VLMCX)
+S3method(VLMCX, default)
+S3method(simulate, default)
+S3method(predict, VLMCX)
+S3method(print, VLMCX)
+importFrom("berryFunctions", "circle")
+importFrom("graphics", "lines", "text", "par")
+importFrom("nnet", "multinom")
+importFrom("stats", "pchisq", "rnorm")
+importFrom("utils", "combn")
diff --git a/R/AIC.VLMCX.R b/R/AIC.VLMCX.R
@@ -0,0 +1,14 @@
+
+
+AIC <- function(fit)UseMethod("AIC")
+
+## computes the AIC of the model for the data in a VLMCX object using the count.param() and LogLik.VLMCX() functions
+AIC.VLMCX <- function(fit)
+{
+    k = count.param(fit$tree)
+
+    result = -2*LogLik.VLMCX(fit) + 2*k
+
+  return(result)
+}
+
diff --git a/R/BIC.VLMCX.R b/R/BIC.VLMCX.R
@@ -0,0 +1,14 @@
+
+
+BIC <- function(fit)UseMethod("BIC")
+
+## computes the BIC of the model for the data in a VLMCX object using the count.param() and LogLik.VLMCX() functions
+
+BIC.VLMCX <- function(fit)
+{       
+    k = count.param(fit$tree)
+
+    result = -2*LogLik.VLMCX(fit) + log(length(fit$y))*k
+
+  return(result)
+}
diff --git a/R/LogLik.VLMCX.R b/R/LogLik.VLMCX.R
@@ -0,0 +1,83 @@
+LogLik <- function(fit)UseMethod("LogLik")
+
+#######################################################################################################################
+# computes the log-likelihood of the data based on the tree structure and estimated parameters in the fitted VLMCX model
+#######################################################################################################################
+LogLik.VLMCX <- function(fit)
+{
+  ## estimate the parameters based on the tree structure
+  fit$tree = estimate(fit$tree, fit$y, fit$X)
+  alphabet = sort(unique(fit$y))
+
+
+  y = fit$y
+  X = fit$X
+  d = ifelse(is.null(dim(X)), 1, dim(X)[2]) ## number of covariates
+
+  logLikel = log(1/length(alphabet)) ## for the first data point - since there is no past
+  for (i in 2:length(fit$y))
+  {
+    node = find.underlying.context.in.tree(fit$y[1:(i-1)], fit$tree)  
+
+    if (is.null(node)) ## context does not exist in tree: set equal probabilities
+      prob = rep(1/length(alphabet), length(alphabet))
+    else
+    if (is.null(node$alpha)) ## there are no estimated parameters: it is not a leaf, not final node (nor partially final node)
+        prob = rep(1/length(alphabet), length(alphabet))
+    else ## there is alpha and maybe beta
+    {
+      if (is.null(node$beta)) ## check if beta was dropped
+      {
+        alpha_u = 0
+        prob = rep(0,(length(alphabet)-1))
+        for (ind_alphabet in 1:(length(alphabet)-1))
+        {
+          alpha_u = node$alpha[ind_alphabet]
+          prob[ind_alphabet] = exp(as.numeric(alpha_u))  ## probabilities are computed only with alpha
+        }
+
+      }
+      else ## beta exists
+      {
+        alpha_u = 0
+        beta_v = 0
+        prob = rep(0,(length(alphabet)-1))
+        for (ind_alphabet in 1:(length(alphabet)-1))
+        {
+          alpha_u = node$alpha[ind_alphabet]
+          if (d ==1) ## there is only one covariate through time: X is a vector
+          {
+            beta_v = as.numeric(node$beta[,1,ind_alphabet])
+            prob[ind_alphabet] = exp(as.numeric(alpha_u + beta_v%*%X[(i-1):(i-length(beta_v))]))					
+          }
+          else ## there is a vector with several covariates observed through time: X is a matrix
+          {
+            dimension = length(node$beta[,1,ind_alphabet]) ## number of steps into the past
+            beta_v = 0
+            for (ind in 1:dimension)
+              beta_v = beta_v + as.numeric(node$beta[ind,,ind_alphabet]%*%X[(i-ind),])
+            prob[ind_alphabet] = exp(as.numeric(alpha_u + beta_v))					## multinomial regression
+          }
+        }
+      }
+      prob = c(1,prob)/(1 + sum(prob))
+    }
+    prob_this_observation = prob[which(alphabet == y[i])] ## get the proba corresponding to the observed data point
+
+    if (is.nan(prob_this_observation)) ## this happens when exp(alpha+betaX) is too large then we have Inf/(1+Inf): should be 1
+      prob_this_observation = 1
+    if (prob_this_observation != 0) 
+      logLikel = logLikel + log(prob_this_observation)
+
+  }
+  return(logLikel)
+}
+
+
+
+
+
+
+
+
+
diff --git a/R/VLMCX.R b/R/VLMCX.R
@@ -0,0 +1,26 @@
+VLMCX <- function(y, X, alpha.level = 0.05, max.depth = 5, n.min = 5, trace = FALSE)UseMethod("VLMCX")
+
+###############################################################################################################
+###############################################################################################################
+## Main function
+## Starts obtaining the maximum context tree and calls the context algorithm to sequentially prune
+## as in Zambom et al 2022
+###############################################################################################################
+###############################################################################################################
+VLMCX.default <- function(y, X, alpha.level = 0.05, max.depth = 5, n.min = 5, trace = FALSE)
+{
+   max_context = maximum.context(y, X, max.depth = max.depth, n.min = n.min)
+   r = max_context$tree.depth
+
+   ## main call to the pruning algorithm
+   result = context.algorithm(max_context, max_context$tree, alpha.level = alpha.level, max.depth = max.depth, n.min = n.min, trace = trace)
+
+   if (trace == TRUE)
+     draw(result, title = "Estimated Context Tree Structure", print.coef = trace)
+
+   result$tree = estimate(result$tree, y, X)
+
+   return(result)
+}
+
+
diff --git a/R/VLMCX_depth.R b/R/VLMCX_depth.R
@@ -0,0 +1,19 @@
+VLMCX_depth <- function(VLMCXtree)UseMethod("VLMCX_depth")
+
+
+## recursive function that returns the maximum depth of the tree
+VLMCX_depth.default <- function(VLMCXtree)
+{
+
+  if (length(VLMCXtree$child) == 0)
+      result = 0
+  else
+  {
+    depth.children = 0
+    for (ind in 1:length(VLMCXtree$child))
+      depth.children[ind] = VLMCX_depth(VLMCXtree$child[[ind]])
+    result = 1 + max(depth.children)
+
+  }
+  return(result)
+}
diff --git a/R/VLMCX_sorttree.R b/R/VLMCX_sorttree.R
@@ -0,0 +1,30 @@
+VLMCX_sorttree <- function(VLMCXtree)UseMethod("VLMCX_sorttree")
+
+##### recursive function that
+##### sorts the children of each node to be in alphabetical order (corresponding to the order() function)
+VLMCX_sorttree.default <- function(VLMCXtree)
+{
+  node = VLMCXtree
+
+  n.children = length(node$child)
+  if (n.children <= 1)
+    ordered_tree = node
+  else
+  {
+    ordered_tree = node
+
+    children.names = NULL
+    for (ind in 1:n.children)
+      children.names[ind] = node$child[[ind]]$context[length(node$child[[ind]]$context)]
+
+    children.names.order = order(children.names) ## get the children names in alphabetical order
+
+    for (ind in 1:n.children)
+    {
+      ordered_tree$child[[ind]] = node$child[[children.names.order[ind]]] ## assign the ordered children names
+      ordered_tree$child[[ind]] = VLMCX_sorttree(ordered_tree$child[[ind]]) ## recursive call for the child nodes
+    }
+  }
+
+  return(ordered_tree)
+}
diff --git a/R/add.children.R b/R/add.children.R
@@ -0,0 +1,53 @@
+add.children <- function(node, y, d, max.depth = 5, n.min = 5)UseMethod("add.children")
+
+#######################################################################
+## add.children is a recursive function that adds nodes (children) down 
+## the rooted tree as long as max.depth has not been reached and 
+## there is a minimum number of observations of the context in the data
+## - called by "maximum.context()"
+#######################################################################
+add.children.default <- function(node, y, d, max.depth = 5, n.min = 5)
+{
+
+    alphabet = sort(unique(y))
+
+    this.node = node
+
+    index.child = 0
+    for (index in 1:length(alphabet))
+    {
+      if (length(this.node$context) == 1)
+      {
+        if (this.node$context == "x") ## when we are at the root we can possibly add nodes that correspond to the alphabet
+          possible.child = alphabet[index]
+        else
+          possible.child = c(this.node$context, alphabet[index]) ## the child is context wu, where w is the parent and u is the alphabet
+      }
+      else
+        possible.child = c(this.node$context, alphabet[index])
+
+      if (length(possible.child) <= max.depth) ## do not add if max.depth has been reached
+      {
+        n.occur = vecIn(y,rev(possible.child))
+        if ((length(n.occur)!=0))
+          if (length(n.occur) >= n.min*(1+d*length(rev(possible.child))*(length(alphabet)-1))) ## check if there is a minimum number of observations of the context in the data
+          {
+            index.child = index.child + 1
+
+            new_node = NULL
+            new_node$context = possible.child
+            new_node$alpha = rep(0,length(alphabet)-1)
+            new_node$beta = array(0,c(length(new_node$context), d, length(alphabet)-1)) ## steps, d, alphabet
+            new_node$child = list()
+
+            ### RECURSION: call the same function to add the child node (new_node) to the current one
+            new_node = add.children(new_node, y, d = d, max.depth = max.depth, n.min = n.min)
+
+            this.node$child[[index.child]] = new_node
+          }			
+      }
+    }
+
+    return(this.node)
+
+}
diff --git a/R/backtrack.beta.R b/R/backtrack.beta.R
@@ -0,0 +1,49 @@
+
+
+backtrack.beta <- function(result_fit, node, alpha.level = 0.05, trace = TRUE)UseMethod("backtrack.beta")
+
+  ##########################################################################################################
+  ##########################################################################################################
+  ##########################################################################################################
+  ## find the children whose betas were cut - only return those without grandchildren (nodes with grandchild cant be prunned)
+
+backtrack.beta.default <- function(result_fit, node, alpha.level = 0.05, trace = FALSE)
+{
+  X = result_fit$X
+  y = result_fit$y
+  alphabet = sort(unique(y))
+  d = ifelse(is.null(dim(X)), 1, dim(X)[2])
+
+  need.to.backtrack.beta = TRUE
+    while ((need.to.backtrack.beta == TRUE) && (!is.null(node$beta)))
+    {
+      result_fit_H0 = result_fit
+
+      if (dim(node$beta)[1] == 1)
+        node$beta = NULL
+      else
+        node$beta = array(0,c(dim(node$beta)[1]-1, d, length(alphabet)-1)) ## steps, d, alphabet
+      result_fit_H0$tree = update.node(result_fit_H0$tree, node)
+      result_fit_H0$tree = estimate(result_fit_H0$tree, y, X)
+      p.value = 1-pchisq(-2*(LogLik(result_fit_H0) - LogLik(result_fit)), d*(length(alphabet)-1))
+      if (p.value > alpha.level)
+      {
+        if (trace == TRUE)
+          if (is.null(node$beta))
+            cat("\n pruning beta parameters sequentially for node ", node$context, ": now beta depth 0")
+        else
+          cat("\n pruning beta parameters sequentially for node ", node$context, ": now beta depth ", dim(node$beta)[1])
+        result_fit = result_fit_H0
+      }
+      else
+        need.to.backtrack.beta = FALSE
+    }
+    return(result_fit)
+}
+
+
+
+
+
+
+