<a href="https://colab.research.google.com/github/cseveriano/evolving_clustering/blob/master/notebooks/Evolving_Clustering_Experiment_6_RBF_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## RBF Generator with Events

## Mount Results Directory

In [0]:
import os
from google.colab import drive

# Mount Google Drive under /content/gdrive so the results directory is reachable.
drive.mount("/content/gdrive")

# Results directory (trailing slash required: later cells build filenames by
# plain string concatenation, e.g. path + 'results_6_evolving.csv').
path = "/content/gdrive/My Drive/Evolving_Results/"

# Create the directory up front; otherwise the first CSV write of the first
# trial fails after the (long) benchmark run has already completed.
os.makedirs(path, exist_ok=True)

## Install Libraries

In [0]:
#@title
!apt-get update
!apt-get install r-base
!pip install rpy2
!apt-get install libmagick++-dev
#!apt-get install r-cran-rjava

import os       #importing os to set environment variable
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  os.environ["LD_LIBRARY_PATH"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server"
  !java -version       #check java version

install_java()

!R CMD javareconf

#!apt-get install r-cran-rjava
#!apt-get install libgdal-dev libproj-dev

!R -e 'install.packages(c("magick",  "animation", "stream", "rJava", "streamMOA"))'

## Load R Packages

In [0]:
# Load the rpy2 IPython extension, which enables the %%R cell magic used by the
# R cells in this notebook. Not necessary if it has already been loaded.
%load_ext rpy2.ipython

In [0]:
%%R
# Load the JVM shared library from the OpenJDK 8 install before attaching the
# packages below.
# NOTE(review): presumably required so the Java-backed package (streamMOA via
# rJava) can find libjvm.so inside this already-running process - confirm.
dyn.load("/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/libjvm.so")
# Attach the data-stream packages; presumably these provide the DSD_* / DSC_*
# constructors and evaluate_cluster() called in later cells.
library("stream")
library("streamMOA")

## Generate Concept Drift Data Stream
     




In [0]:
# Experiment parameters
# Number of clusters. Not referenced by the visible cells: the R code
# hard-codes 3 (k = 3 in the generator, numClusters = 3 for StreamKM).
nclusters = 3
# Number of points drawn from the generator for each trial.
nsamples = 10000
# Number of leading points used to train each model before evaluation starts.
train_size = 100
# Evaluation window length (passed as `horizon` to evaluate_cluster and as the
# window size to the prequential evaluation).
window_size = 100
# Name of the measure requested from evaluate_cluster for the benchmarks.
metric = "cRand"
# Number of independent trials; each trial generates a new stream.
trials = 30

In [0]:
%%R -i nsamples

#' Draw one labelled sample from a random-RBF stream with events.
#'
#' Configures a `DSD_RandomRBFGeneratorEvents` source with 3 clusters in
#' 2 dimensions, no noise, an event frequency of 1000 and both merge/split and
#' delete/create events switched on. Both seeds are left as `NULL`, so no fixed
#' seed is requested.
#'
#' @param nsamples Number of points to draw from the generator.
#' @return The result of `get_points()` with `class = TRUE`; callers in this
#'   notebook read its `X1`, `X2` and `class` columns.
experiment <- function(nsamples) {
  rbf_stream <- DSD_RandomRBFGeneratorEvents(
    k = 3,
    d = 2,
    numClusterRange = 3L,
    kernelRadius = 0.07,
    kernelRadiusRange = 0,
    densityRange = 0,
    speed = 100L,
    speedRange = 0L,
    noiseLevel = 0,
    noiseInCluster = FALSE,
    eventFrequency = 1000L,
    eventMergeSplitOption = TRUE,
    eventDeleteCreate = TRUE,
    modelSeed = NULL,
    instanceSeed = NULL
  )

  get_points(rbf_stream, n = nsamples, class = TRUE)
}

## Run Benchmark Models

### Benchmark methods:
* DenStream
* CluStream
* StreamKM++

### Benchmark metrics:
* cRand



In [0]:
%%R
#' Evaluate the benchmark stream-clustering algorithms on one trial.
#'
#' Runs DenStream, CluStream and StreamKM over the same data, each on its own
#' `DSD_Memory` replay of `eval_data`, and writes one CSV per trial with one
#' column per algorithm and one row per evaluation window.
#'
#' @param trial Trial index; printed and used (zero-padded to two digits) in
#'   the output filename `results_6_<trial>_benchmark.csv`.
#' @param nsamples Total number of points in `eval_data` to consume.
#' @param train_size Number of leading points fed to `update()` before
#'   evaluation starts.
#' @param window_size Evaluation horizon passed to `evaluate_cluster()`.
#' @param metric Name of the measure to compute and to extract from the
#'   evaluation result (e.g. "cRand").
#' @param eval_data Data frame with columns `X1`, `X2` and `class`.
#' @param path Output directory prefix; concatenated as-is with the filename,
#'   so it must end with a path separator.
#' @return The value of `write.csv()`; called for its side effect.
run_benchmarks <- function(trial, nsamples, train_size, window_size, metric, eval_data, path){
  algorithms <- list(
    "DenStream" = DSC_DenStream(epsilon = 0.01, mu = 4, beta = 0.2),
    "Clustream" = DSC_CluStream(m = 10, horizon = 1000, t = 1, k = NULL),
    "StreamKM" = DSC_StreamKM(sizeCoreset = 100, numClusters = 3, length = nsamples)
  )
  writeLines(sprintf("Trial: %d", trial))

  evaluation <- lapply(algorithms, FUN = function(alg) {
    # A new in-memory stream per algorithm, so each one starts from the first
    # point of eval_data.
    stream <- DSD_Memory(eval_data[, c("X1", "X2")],
                         class = eval_data[, "class"],
                         k = max(eval_data[, "class"]))

    update(alg, stream, n = train_size)
    evaluate_cluster(alg, stream, horizon = window_size, n = nsamples - train_size,
                     measure = metric, type = "macro", assign = "macro")
  })

  # Build the table from a list of per-algorithm score vectors. Unlike
  # sapply(), this keeps one column per algorithm even when there is only a
  # single evaluation window (sapply() would collapse to a plain vector and
  # the CSV would come out transposed).
  errors <- as.data.frame(lapply(evaluation, FUN = function(res) res[, metric]))
  write.csv(errors, paste0(path, sprintf("results_6_%02d_benchmark.csv", trial)))
}

## Run Evolving Clustering

* Convert to X,y format
* run prequential routine
* plot results

In [0]:
# Install (or upgrade to) the evolving_clustering package directly from its
# GitHub repository; it provides the `evolving` module imported below.
!pip install -U git+https://github.com/cseveriano/evolving_clustering

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from evolving import EvolvingClustering, load_dataset, Metrics, Benchmarks, util
from sklearn.metrics import adjusted_rand_score
import time
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Switch on rpy2's pandas conversion.
# NOTE(review): presumably this is what allows the pandas DataFrame to be
# passed straight to r.run_benchmarks(...) in the trial loop - confirm against
# the installed rpy2 version.
pandas2ri.activate()
# Short handle to R; functions defined in the %%R cells are called through it
# as r.experiment(...) and r.run_benchmarks(...).
r = robjects.r

In [0]:
# Per-window scores of the evolving model, one column per trial ("Trial-<i>").
evol_trials_df = pd.DataFrame()

for i in np.arange(trials):
  named_tuple = time.localtime() # get struct_time
  time_string = time.strftime("%m/%d/%Y, %H:%M:%S", named_tuple)
  print("Trial: ",i," at ",time_string)

  # Draw a fresh labelled stream for this trial from the R generator.
  r_stream = r.experiment(nsamples)

  # Convert the R result to a pandas DataFrame. `ri2py_dataframe` only exists
  # in rpy2 2.x; the notebook installs rpy2 unpinned, so fall back gracefully
  # on newer versions instead of failing with AttributeError.
  legacy_convert = getattr(pandas2ri, "ri2py_dataframe", None)
  if legacy_convert is not None:
    stream_df = legacy_convert(r_stream)
  elif isinstance(r_stream, pd.DataFrame):
    # Already converted by the active pandas conversion.
    stream_df = r_stream
  else:
    stream_df = pandas2ri.rpy2py(r_stream)

  print("Running benchmarks")
  # Writes results_6_<trial>_benchmark.csv under `path` on the R side.
  r.run_benchmarks(i, nsamples, train_size, window_size, metric, stream_df, path)

  # Convert to X, y format for the Python model.
  X = stream_df[['X1', 'X2']].values
  y = stream_df['class'].values
  evol_model = EvolvingClustering.EvolvingClustering(macro_cluster_update=1,  variance_limit=0.001, debug=False)
  accum_error, error_list = Benchmarks.prequential_evaluation(evol_model, X, y, adjusted_rand_score, train_size, window_size)
  evol_trials_df["Trial-"+str(i)] = error_list
  # Rewrite the CSV after every trial so partial results survive an interrupted run.
  evol_trials_df.to_csv(path+'results_6_evolving.csv')
  # NOTE(review): labelled "Error", but the metric passed above is
  # adjusted_rand_score - presumably this is a mean score, not an error.
  print("Error: ", np.mean(error_list))