<a href="https://colab.research.google.com/github/cseveriano/evolving_clustering/blob/master/notebooks/Evolving_Clustering_Experiment_2_Static_Dataset_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount Results Directory

In [0]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so that result files outlive the session.
drive.mount("/content/gdrive")

# Output directory for every results CSV written by this notebook.
# The trailing slash matters: file names are appended to it by plain concatenation.
path = "/content/gdrive/My Drive/Evolving_Results/"

## Install Libraries

In [0]:
#@title
!apt-get update
!apt-get install r-base
!pip install rpy2
!apt-get install libmagick++-dev
#!apt-get install r-cran-rjava

import os       #importing os to set environment variable
def install_java():
  """Install a headless OpenJDK 8 and expose it through environment variables.

  Runs `apt-get` quietly (stdout is discarded), then sets `JAVA_HOME` and
  `LD_LIBRARY_PATH` in `os.environ` so that processes started afterwards can
  locate the JVM, and finally prints the installed Java version as a check.

  Note: `LD_LIBRARY_PATH` is overwritten, not extended — any value it had
  before this call is lost.
  """
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  os.environ["LD_LIBRARY_PATH"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server"
  !java -version       #check java version

# Java must be installed before R is (re)configured for it and before rJava is built.
install_java()

# Have R re-detect the Java installation set up above.
!R CMD javareconf

#!apt-get install r-cran-rjava
#!apt-get install libgdal-dev libproj-dev

# Install the R packages used by the %%R cells below ("stream" and "streamMOA"
# are attached later; the others are presumably their dependencies — TODO confirm).
!R -e 'install.packages(c("magick",  "animation", "stream", "rJava", "streamMOA"))'

## Load R Packages

In [0]:
# Load the rpy2 IPython extension, which provides the %%R cell magic used by the
# R cells below. Not necessary if it has already been loaded in this session.
%load_ext rpy2.ipython

In [0]:
%%R
# Load the JVM shared library explicitly before attaching the packages —
# presumably so the Java-backed streamMOA can find it; TODO confirm.
dyn.load("/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/libjvm.so")
# Data-stream clustering framework and its MOA-based algorithms.
library("stream")
library("streamMOA")

## Read Data Stream
     




In [0]:
%%R
# Download the benchmark data set, shuffle its rows, wrap it in an in-memory
# stream and return every point (columns x, y plus the class label) as a
# data frame. Takes no arguments; each call re-downloads and re-shuffles.
experiment <- function() {
  dataset <- read.csv("https://query.data.world/s/zry3yzt4ruwrlonziw2pfkegsdugw2",
                      header = TRUE, stringsAsFactors = FALSE)
  n_points <- nrow(dataset)

  # Random permutation of the rows so every call yields a different ordering.
  shuffled <- dataset[sample(n_points), ]

  labels <- shuffled[, "class"]
  memory_stream <- DSD_Memory(shuffled[, c("x", "y")], class = labels, k = max(labels))

  # The value of the last expression is the function's return value.
  get_points(memory_stream, n = n_points, class = TRUE)
}

## Run Benchmark Models

### Benchmark methods:
* DenStream
* CluStream
* StreamKM++

### Benchmark metrics:
* cRand



In [0]:
# Experiment parameters (imported into the R benchmark cell via `%%R -i ...`)
nclusters = 8     # number of clusters requested from StreamKM (numClusters)
metric = "cRand"  # name of the evaluation measure for the benchmark methods
trials = 30       # number of repetitions of each experiment

In [0]:
%%R -i metric -i trials -i path -i nclusters

# Benchmark the baseline stream-clustering algorithms.
# For each trial every algorithm is trained on a freshly shuffled copy of the
# data set and scored with the measure named by `metric`; one row per trial is
# collected in `trials_df` and the table is written to
# <path>results_DS4_benchmark.csv at the end.
# Inputs imported from Python: metric, trials, path, nclusters.

alg_names <- c("DenStream", "Clustream", "StreamKM")
trials_df <- data.frame(matrix(ncol = length(alg_names), nrow = 0))
colnames(trials_df) <- alg_names

# The raw data never changes, so download it once instead of once per
# algorithm per trial; only the shuffling is repeated below.
benchmark_data <- read.csv("https://query.data.world/s/zry3yzt4ruwrlonziw2pfkegsdugw2", header=TRUE, stringsAsFactors=FALSE)
nsamples <- nrow(benchmark_data)

# seq_len() yields an empty sequence when trials is 0 (1:trials would give 1, 0).
for (i in seq_len(trials)){
  # Fresh, untrained models for every trial. The list names must match alg_names.
  algorithms <- list("DenStream" = DSC_DenStream(epsilon=0.1, mu=19, beta=0.4),
                     "Clustream" = DSC_CluStream(m = 10, horizon = 100, t = 1, k=NULL),
                     "StreamKM" = DSC_StreamKM(sizeCoreset = 100, numClusters = nclusters)
  )
  writeLines(sprintf("Trial: %d", i))

  evaluation <- sapply(algorithms, FUN = function(alg) {
    # Each algorithm sees its own random ordering of the points.
    df <- benchmark_data[sample(nsamples),]
    stream <- DSD_Memory(df[,c("x", "y")], class=df[,"class"], k=max(df[,"class"]))

    # Train on the whole stream, rewind it, then evaluate on the same points.
    update(alg, stream, n=nsamples)
    reset_stream(stream)
    evaluate(alg, stream, measure = metric, n = nsamples, type = "macro", assign = "macro")
  })

  # as.data.frame() on the sapply result yields a single column named after the
  # variable ('evaluation'); its values become the next row of the results table.
  trials_df[nrow(trials_df) + 1,] <- as.data.frame(evaluation)[,'evaluation']
}

write.csv(trials_df, paste0(path,"results_DS4_benchmark.csv"))

## Run Evolving Clustering

* Convert to X, y format
* Run prequential routine
* Plot results

In [0]:
# Install (or upgrade to) the latest evolving_clustering package straight from
# its GitHub repository; it provides the `evolving` module imported below.
!pip install -U git+https://github.com/cseveriano/evolving_clustering

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from evolving import EvolvingClustering, load_dataset, Metrics, Benchmarks, util
from sklearn.metrics import adjusted_rand_score
import time
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Turn on rpy2's pandas conversion layer for R <-> pandas data frames.
pandas2ri.activate()
# Shorthand handle to the embedded R session; R-side objects such as the
# `experiment` function defined in an earlier %%R cell are reached through it.
r = robjects.r

In [0]:
# Run `trials` independent evaluations of the evolving clustering model.
# Each trial pulls a freshly shuffled copy of the data set from the R-side
# `experiment()` function, fits the model, and scores the predicted labels
# against the true classes with the adjusted Rand index. The scores collected
# so far are written to <path>results_DS4_evolving.csv after every trial, so a
# crashed or interrupted run still leaves partial results on disk.

# Scores are accumulated in a plain list and the frame is rebuilt from it:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
score_records = []
evol_trials_df = pd.DataFrame(columns=["microTEDAclus"])

for i in np.arange(trials):

  time_string = time.strftime("%m/%d/%Y, %H:%M:%S", time.localtime())
  print("Trial: ",i ," at ",time_string)

  # With pandas2ri.activate() the R call may already hand back a pandas
  # DataFrame; only fall back to the explicit converter when it does not.
  raw_stream = r.experiment()
  if isinstance(raw_stream, pd.DataFrame):
    stream_df = raw_stream
  else:
    stream_df = pandas2ri.ri2py_dataframe(raw_stream)

  X = stream_df[['x', 'y']].values
  y = stream_df['class'].values
  evol_model = EvolvingClustering.EvolvingClustering(variance_limit=0.0008, debug=False)

  evol_model.fit(X)
  y_hat = evol_model.predict(X)

  # Adjusted Rand index: a similarity score (higher is better), not an error.
  score = adjusted_rand_score(y, y_hat)
  score_records.append({'microTEDAclus': score})

  evol_trials_df = pd.DataFrame(score_records, columns=["microTEDAclus"])
  evol_trials_df.to_csv(path+'results_DS4_evolving.csv', index=False)