version 1.0.0

cran · Feb 27, 2024 · baca314 · baca314
commit baca314
Show file tree

Hide file tree

Showing 10 changed files with 468 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,22 @@
+Package: clusEvol
+Type: Package
+Title: A Procedure for Cluster Evolution Analytics
+Version: 1.0.0
+Date: 2024-02-19
+Authors@R: c(person("Víctor", "Morales-Oñate",role=c("aut","cre"),email="victor.morales@uv.cl",comment = c(ORCID = "0000-0003-1922-6571")),person("Bolívar", "Morales-Oñate", role = "aut",email = "bmoralesonate@gmail.com",comment = c(ORCID = "0000-0003-4980-8759")))
+Maintainer: Víctor Morales-Oñate <victor.morales@uv.cl>
+Description: Cluster Evolution Analytics allows us to use exploratory what if questions in the sense that the present information of an object is plugged-in a dataset in a previous time frame so that we can explore its evolution (and of its neighbors) to the present. See the URL for the papers associated with this package, as for instance, Morales-Oñate and Morales-Oñate (2024) <https://mpra.ub.uni-muenchen.de/120220>.
+Depends: R (>= 4.1.0)
+License: GPL (>= 3)
+Encoding: UTF-8
+Imports: ggplot2,plotly,cluster,fpc,viridis,clusterSim,dplyr
+Repository: CRAN
+URL: https://github.com/vmoprojs/clusEvol
+BugReports: https://github.com/vmoprojs/clusEvol/issues
+LazyData: true
+NeedsCompilation: no
+Packaged: 2024-02-23 23:27:37 UTC; victormorales
+Author: Víctor Morales-Oñate [aut, cre]
+    (<https://orcid.org/0000-0003-1922-6571>),
+  Bolívar Morales-Oñate [aut] (<https://orcid.org/0000-0003-4980-8759>)
+Date/Publication: 2024-02-26 18:50:02 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,9 @@
+60665db9e6325e3ac65fd43b1871179e *DESCRIPTION
+0550f6ae158da25b920fe71c0ebd0a44 *NAMESPACE
+f5a6d11e305f15d8dff3dccdcb4e9162 *R/clusEvol.R
+9af57303507736d5e81d7777f16d16c6 *data/actpas.RData
+899ff227b3d63fec5e613739eda8d3cb *data/pwt1001.RData
+d21ee51e8ce6673757f54d9eded0e4b5 *man/actpas.Rd
+84305660e4106539a8ca77f30b84caa4 *man/clusEvol.Rd
+325ad04944ce4e4c5b4a9f8cdd74f8ce *man/plot.clusEvol.Rd
+c71d3326e0e4041c3ec6ec53bd4a79e4 *man/pwt1001.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,10 @@
+export(clusEvol)
+importFrom("stats", "formula","terms","dist")
+importFrom("utils", "stack")
+importFrom("fpc", "cluster.stats")
+importFrom("cluster", "pam")
+importFrom("viridis", "scale_fill_viridis")
+importFrom("ggplot2", "ggplot", "aes", "stat_ecdf", "geom_boxplot","geom_point","geom_line","aes_string","geom_tile")
+importFrom("plotly","ggplotly")
+S3method(print,clusEvol)
+S3method(plot,clusEvol)
diff --git a/R/clusEvol.R b/R/clusEvol.R
@@ -0,0 +1,279 @@
+####################################################
+### File name: clusEvol.r
+####################################################
+
+
+clusEvol <- function (x=NULL,objects=NULL, time = NULL,target.vars = NULL,
+                      time.base=NULL,sel.obj=NULL,init = NULL,logscale = FALSE,
+                      ng = NULL,clm = "pam",scale=TRUE,clstats = FALSE,...){
+
+  # x: dataframe
+  # objects: variable name of objects
+  # time: variable name of time 
+  # labels: objects labels (same lenght as objects)
+  # target.vars: selected variables for CEA
+  # time.base: selected year for CEA
+  # sel.obj: selected object for CEA
+  # init: initiation year
+  # logscale: TRUE if data should be logscaled
+  # ng: number of desired clusters
+  # clm: (pam,kmeans,choose)
+  # ... parameters of the clm chosen
+
+  # revisar paquete que recomienda el clustering del TASK VIEW
+
+  datos <- x[,c(objects,time,target.vars)]
+
+  # ST: if init is missing, take minimum year
+  if(is.null(init)) {
+    init <- min(datos[,time])
+  }else{
+    datos <- datos[which(datos[time]>=init),]
+  }
+  # END: if init is missing, take minimum year
+
+  # yrs.base <- input$yrs1
+  summary(datos)
+
+
+  nas <- apply(datos,2,function(x) sum(is.na(x)))
+  if(any(nas>0)){
+    warning("Selected variables have missing values, use a balaced panel data,NA's will be replaced with mean values")
+
+    if(length(target.vars)>1)
+    {
+      mm <- colMeans(datos[,target.vars],na.rm = TRUE)
+    }else{
+      mm <- mean(datos[,target.vars],na.rm = TRUE)
+    }
+
+    for(tv in 1:length(target.vars))
+    {
+      anas <- is.na(datos[,target.vars[tv]])
+      datos[which(anas),target.vars[tv]] <- mm[tv]
+    }
+
+  }
+
+  yrs <- unique(datos[,time])
+
+  sol.yrs <- NULL #cheks if sel.obj is in everytime period
+  for(t in 1:length(yrs))
+  {
+    fil <- datos[which(datos[,time]==yrs[t]),]
+    sol.yrs <- c(sol.yrs,sel.obj %in% fil[,objects])
+  }
+  if(!all(sol.yrs))
+  {
+    # warning("Times with no selected object: \n")
+    # print(yrs[which(sol.yrs)])
+    stop("Selected object must be included in all analyzed time periods")
+  }
+
+  # ST: dataframe for time.base:
+  refdatos <- datos[datos[,time]==time.base,c(objects,time,target.vars)]
+  if(logscale & length(target.vars)>1){
+    refdatos[,target.vars] <- apply(refdatos[,target.vars],2,log)
+  }
+  if(logscale & length(target.vars)==1){
+    refdatos[,target.vars] <- log(refdatos[,target.vars])
+  }
+  # END: dataframe for time.base
+
+  # ST: ECbase dataframe for sel.obj
+  ECbase <- refdatos[refdatos[,objects]==sel.obj,c(objects,time,target.vars)]
+  # END: ECbase dataframe for sel.obj
+
+  results <- list()
+  contador <- 1
+
+  Ksol = NULL
+  nbelongTot <- NULL
+  kmodelSol <- NULL
+  clusterStats <- NULL
+  for (j in  1:length(yrs))
+  {
+
+    refdatos <- datos[which(datos[,time]==yrs[j]),c(objects,time,target.vars)]
+
+    if(logscale & length(target.vars)>1){
+      refdatos[,target.vars] <- apply(refdatos[,target.vars],2,log)
+    }
+    if(logscale & length(target.vars)==1){
+      refdatos[,target.vars] <- log(refdatos[,target.vars])
+    }
+
+
+    refdatos[which(refdatos[,objects]==sel.obj),] <- ECbase #replace ECU values
+    # Include data imputation here if needed
+
+    temp1 <-  as.matrix(refdatos[,target.vars],ncol = length(target.vars))
+    rownames(temp1) <- refdatos[,objects]
+
+
+    # Clustering:
+    if(clm =="pam")
+    {
+      if(scale)
+      {
+        clm.args <- list(x=scale(temp1),k=ng,...)
+      }else{
+        clm.args <- list(x=temp1,k=ng,...)
+      }
+      kmodel <- do.call(cluster::pam,args = clm.args)
+      kmodelSol[[j]] <- kmodel
+
+      if(clstats)
+      {
+        clusterStats[[j]] <- fpc::cluster.stats(kmodel$diss,kmodel$clustering)
+      }
+
+      clase <- kmodel$clustering
+    }
+
+    if(clm =="kmeans")
+    {
+      if(scale)
+      {
+        clm.args <- list(x=scale(temp1),centers=ng,...)
+      }else{
+        clm.args <- list(x=temp1,centers=ng,...)
+      }
+
+      kmodel <- do.call(stats::kmeans,args = clm.args)
+      kmodelSol[[j]] <- kmodel
+
+      if(clstats)
+      {
+        if(scale)
+        {
+          clusterStats[[j]] <- fpc::cluster.stats(stats::dist(temp1),kmodel$cluster)
+        }else{
+          clusterStats[[j]] <- fpc::cluster.stats((temp1),kmodel$cluster)
+        }
+      }
+
+
+      clase <- kmodel$cluster
+    }
+    if(clm == "choose")
+    {
+      clm.args <- list(x=temp1,...)
+      kmodel <- do.call(clusterSim::cluster.Sim,args = clm.args)
+      kmodelSol[[j]] <- kmodel
+      clase <- kmodel$optClustering
+    }
+
+
+    auxK = cbind(clase,yrs[j])
+    Ksol = rbind(Ksol,auxK )
+
+    # ST: Find sel.obj's group and data
+    gbase <- list()
+    for(i in 1:ng)
+    {
+      gbase[[i]] <- as.vector(refdatos[,1][clase==i])
+    }
+
+    nbelong <- grep(sel.obj,gbase)# number of group which sel.obj belongs to
+    nbelongTot <- c(nbelongTot,nbelong)
+    g.fin <- refdatos[match(gbase[[nbelong]],refdatos[,objects]),]# Data in sel.obj group
+    g.fin$time = yrs[j]
+    results[[contador]] <- g.fin
+    contador <- contador+1
+  }
+
+  Ksol = data.frame(  objects = rownames(Ksol), cluster = Ksol[,1],time =  Ksol[,2])
+  # datos: input data
+  # target.vars: selected variables
+  # results: data frame of negighbours of sel.obj
+  # ECk: cluster that sel.obj belongs to
+  # sumdat: summary statistics of "datos"
+  # kmodelSol: output of clustering algorithm in each iteration
+  # kmodelSol: cluster statistics in each clustering
+  # sl: cluster evolution in time
+
+  sl <- dplyr::bind_rows(results, .id = "time")
+
+  sumdat <- summary(datos[,target.vars])
+
+  clusEvol <- list(datos = datos,target.vars=target.vars,
+                    results = results,ECk = nbelong,ECkTot = nbelongTot,
+                    Clus = Ksol,sumdat=sumdat,kmodelSol=kmodelSol,
+                    clusterStats=clusterStats,sl = sl,sel=c(objects,time),sel.obj=sel.obj)
+  structure(c(clusEvol, call = call), class = c("clusEvol"))
+}
+
+print.clusEvol <- function(x, digits = max(3, getOption("digits") - 3), ...)
+{
+  if(!inherits(x,"clusEvol")) stop("Enter an object obtained from the function clusEvol\n")
+
+  tn <- unique(x$Clus$time)
+  sel.obj <- x$sel.obj
+  # number of elements in sel.obj cluster:
+  nc <- sapply(x$results,nrow)-1
+  names(nc) <- tn
+  # Cluster that sel.obj belongs to:
+  # xs <- data.frame(time = unique(x$Clus$time),clus = x$ECkTot)
+  xs <- x$ECkTot
+  names(xs) <- tn
+  # Clusters in time:
+  tt <- table(x$Clus$time,x$Clus$cluster) 
+
+  cat('\n##################################################################')
+  cat('\nclusEvol: Cluster Evolution Analytics\n')
+  cat('\n\nNumber of neighbours ', sel.obj,'is a group member: ', '\n')
+  print(nc)
+  cat('\n\nCluster that ', sel.obj, 'belongs to: ','\n')
+  print(xs)
+  options(digits = digits)
+  cat('\nClusters in time:\n')
+  print.default(tt, digits = digits, print.gap = 2,
+                quote = FALSE)
+}
+
+
+plot.clusEvol <- function(x,target,type = "heat",plotly=FALSE,...)
+{
+  if(!inherits(x,"clusEvol")) stop("Enter an object obtained from the function clusEvol\n")
+  oldop <- options()
+  on.exit(options(oldop))
+  sl <- x$sl
+  time <- x$sel[2]
+  objs <- x$sel[1]
+  if(type=="heat" | type == "all")
+  {
+    p1 <-ggplot(sl,aes_string(time,objs,fill =target ))+
+      geom_tile(color= "white") + 
+      scale_fill_viridis(option ="C")
+
+    if(plotly)
+    {
+      suppressWarnings(print(plotly::ggplotly(p1)))
+    }else{print(p1)}
+  }
+
+  if(type=="line" | type == "all")
+  {
+    p2 <- ggplot(sl, aes_string(x=time, y=target, group=objs)) +
+      geom_line(aes_string(color=objs))+
+      geom_point(aes_string(color=objs))
+    if(plotly)
+    {
+      print(plotly::ggplotly(p2))
+    }else{print(p2)}
+  }
+  if(type=="boxplot" | type == "all")
+  {
+    p3 <- ggplot(sl, aes_string(x=time, y=target, color=objs)) +
+      geom_boxplot()
+    if(plotly)
+    {
+      print(plotly::ggplotly(p3))
+    }else{print(p3)}
+  }
+
+}
+
+
+
diff --git a/data/actpas.RData b/data/actpas.RData
diff --git a/data/pwt1001.RData b/data/pwt1001.RData
diff --git a/man/actpas.Rd b/man/actpas.Rd
@@ -0,0 +1,13 @@
+\name{actpas}
+\docType{data}
+\alias{actpas}
+\title{Assets and liabilities operations}
+\description{
+  Ecuador's amount of Assets and Liabilities Operations of the National Financial System: https://contenido.bce.fin.ec/home1/economia/tasas/IndiceSFN.htm
+}
+\usage{actpas}
+\format{A dataframe containing 358 observations and 25 columns.}
+\references{
+  Morales-Oñate, V., and Morales-Oñate, B. (2024). \emph{Cluster Evolution Analytics}. \url{https://mpra.ub.uni-muenchen.de/120220/}
+}
+\keyword{datasets}