## Data Processing Comparison

Comparison of three data-stream clustering algorithms:
* DenStream
* CluStream
* microTEDAclus

In [0]:
# Output folder on the mounted Google Drive; the result CSVs written at the end
# of the notebook are saved under this prefix.
path = "/content/gdrive/My Drive/Evolving_Results/major_review/Final/"
from google.colab import drive
# Mount Google Drive at /content/gdrive so that `path` points inside the mount.
drive.mount("/content/gdrive")

### Import Python Libraries

In [0]:
# Install the evolving_clustering package from its `prune-cluster` branch, plus numba.
# NOTE(review): a later cell reinstalls the same package from the default branch
# with -U, which presumably replaces this install — confirm which branch the
# experiment is meant to use.
!pip install -U git+https://github.com/cseveriano/evolving_clustering@prune-cluster
!pip install numba

In [0]:
import importlib

# Reload the module only when it has already been imported in this kernel
# session (e.g. after reinstalling the package above). On a fresh kernel the
# name is not defined yet - it is imported in a later cell - so an
# unconditional reload would raise NameError and stop "Run all".
if "EvolvingClustering" in globals():
    importlib.reload(EvolvingClustering)

In [0]:
# Install evolving_clustering from the repository's default branch, plus numba.
# NOTE(review): an earlier cell installs the `prune-cluster` branch of the same
# package; with -U this install presumably supersedes it — confirm.
!pip install -U git+https://github.com/cseveriano/evolving_clustering
!pip install numba

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import adjusted_rand_score
from sklearn import datasets
from sklearn import preprocessing
import time

from benchmarks.denstream.DenStream import DenStream
from benchmarks.clustream.CluStream import CluStream
from evolving import EvolvingClustering
from evolving import Benchmarks

## Load Stream Dataset

### Load Stream Gaussian

In [0]:
#@title
# System setup for the R side of the experiment: install R, rpy2 and the
# libmagick++ development package through apt/pip.
!apt-get update
!apt-get install r-base
!pip install rpy2
!apt-get install libmagick++-dev
#!apt-get install r-cran-rjava

import os       #importing os to set environment variable
def install_java():
  # Installs a headless OpenJDK 8, points JAVA_HOME and LD_LIBRARY_PATH at it
  # for this process, then prints the Java version as a sanity check.
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  os.environ["LD_LIBRARY_PATH"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64:/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server"
  !java -version       #check java version

install_java()

# Re-run R's Java configuration now that the JDK is installed.
!R CMD javareconf

#!apt-get install r-cran-rjava
#!apt-get install libgdal-dev libproj-dev

# Install the R packages; `stream` and `streamMOA` are attached in a later %%R cell.
!R -e 'install.packages(c("magick",  "animation", "stream", "rJava", "streamMOA"))'

In [0]:
# Load the rpy2 IPython extension; the %%R cells below depend on it.
%load_ext rpy2.ipython

In [0]:
%%R
# Load the JVM shared library explicitly before attaching the packages.
# NOTE(review): presumably required so the Java-backed streamMOA package can
# find libjvm.so — confirm.
dyn.load("/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/libjvm.so")
library("stream")
library("streamMOA")

In [0]:
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Enable rpy2's pandas conversion support.
pandas2ri.activate()
# Shorthand handle to the embedded R session; used below to call the R
# generator function from Python.
r = robjects.r

In [0]:
%%R

# Draw a labelled sample from a DSD_Gaussians data stream.
#
# dimension: forwarded as `d` to DSD_Gaussians.
# nsamples:  number of points requested from the stream via get_points.
#
# Returns the value of get_points(..., class = TRUE); the Python side reads a
# `class` column from it as the ground-truth label.
# The generator is always created with k = 3.
gaussian_data_generator <- function(dimension, nsamples){
  stream <- DSD_Gaussians(k=3, d=dimension)
  return (get_points(stream, n = nsamples, class = TRUE))
}

In [0]:
# --- Experiment configuration ------------------------------------------------
exp_name = "stream_gaussian"   # prefix of the result file names
dims = 50                      # dimensionality passed to the R generator
nsamples = 4000                # number of points drawn from the stream
train_size = 100               # passed to the prequential evaluation as train size
window_size = 100              # evaluation window length

# --- Generate the stream in R and bring it over as a pandas DataFrame --------
stream_df = pandas2ri.ri2py_dataframe(
    r.gaussian_data_generator(dims, nsamples)
)

# Every column except the last is treated as a feature; labels are read from
# the `class` column.
X_columns = stream_df.columns[:-1]
y = stream_df['class'].values

# Standardise the features, then rescale them with a MinMaxScaler fitted on
# the standardised data.
minmaxscaler = preprocessing.MinMaxScaler()
X = minmaxscaler.fit_transform(
    preprocessing.scale(stream_df[X_columns].values)
)

### Parameter Tuning

In [0]:
# Window length used by the parameter-tuning runs; set equal to the evaluation window.
tuning_size = window_size
def fit_predict(method, data, labels, window_size, metric):
    """Fit on the first window, predict the second, and score the prediction.

    Parameters
    ----------
    method : object exposing ``fit(data)`` and ``predict(data)``.
    data : sliceable sequence of samples.
    labels : sliceable sequence of ground-truth labels aligned with ``data``.
    window_size : int
        Length of both the training window ``[0, window_size)`` and the
        test window ``[window_size, 2 * window_size)``.
    metric : callable
        Called as ``metric(true_labels, predicted_labels)``.

    Returns
    -------
    Whatever ``metric`` returns for the test window.
    """
    test_span = slice(window_size, window_size * 2)
    fit_chunk = data[:window_size]
    eval_chunk = data[test_span]

    method.fit(fit_chunk)
    predicted = method.predict(eval_chunk)

    return metric(labels[test_span], predicted)


### Evolving

In [0]:
#@title
def clustering_objective(params):
  """Objective function handed to hyperopt's ``fmin``.

  ``params`` is a dict holding ``'variance_limit'``. A model is built with that
  value and scored through ``fit_predict`` using the adjusted Rand score on the
  tuning window. The negated score is returned as ``'loss'``. If building or
  scoring raises, the traceback is printed and the score falls back to -1
  (i.e. a loss of 1).
  """
  print(params)

  score = -1
  try:
    candidate = EvolvingClustering.EvolvingClustering(
      variance_limit=params['variance_limit'],
      debug=False,
    )
    score = fit_predict(candidate, X, y, tuning_size, adjusted_rand_score)
  except Exception:
    traceback.print_exc()
    score = -1

  return {'loss': -score, 'status': STATUS_OK}

In [0]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import space_eval
import itertools
import traceback

###### OPTIMIZATION ROUTINES ###########
# Candidate variance limits, one list per order of magnitude.
# Only l3, l4 and l5 are part of the search space below; l1 and l2 are defined
# but not searched.
l1 = list(np.arange(0.1, 1, 0.1))
l2 = list(np.arange(0.01, 0.1, 0.01))
l3 = list(np.arange(0.001, 0.01, 0.001))
l4 = list(np.arange(0.0001, 0.001, 0.0005))
l5 = list(np.arange(0.00001, 0.0001, 0.00005))

space = {
    'variance_limit': hp.choice(
        'variance_limit',
        list(itertools.chain(l3, l4, l5)),
    ),
}

# Run 300 TPE evaluations of the objective and report the selected value.
trials = Trials()
best = fmin(
    clustering_objective,
    space,
    algo=tpe.suggest,
    max_evals=300,
    trials=trials,
)
print('best: ')
print(space_eval(space, best))

### DenStream

In [0]:
# Candidate values for the DenStream grid search; each list feeds the
# constructor argument of the matching name (lambd, eps, mu, beta).
lambdas = [0.001, 0.01, 0.1, 0.25, 0.5 , 1]
epsilons = [0.01, 0.1, 1, 10, 20]
mus = [2, 3, 4, 8, 10, 15, 20]
betas = [0.001, 0.01, 0.1, 0.4, 0.5, 0.6, 1, 2]

In [0]:
# Exhaustive grid search over the DenStream candidate values.
#
# Each successful configuration is recorded as one dict and the DataFrame is
# built once at the end. Growing the frame with DataFrame.append inside the
# loop copied the whole frame on every iteration and no longer works on
# pandas >= 2.0, where that method was removed.
denstream_tuning_records = []

for lambd in lambdas:
  for epsilon in epsilons:
    for mu in mus:
      for beta in betas:
        print("Parameters: lambda=",lambd, " eps=",epsilon, " mu=",mu, " beta=", beta)
        try:
          denstream_model = DenStream(lambd=lambd, eps=epsilon, beta=beta, mu=mu)
          # 'Error' holds the adjusted Rand score of the tuning window.
          error = fit_predict(denstream_model, X, y, tuning_size, adjusted_rand_score)
          denstream_tuning_records.append({'Lambda': lambd, 'Epsilon': epsilon, 'Mu': mu, 'Beta': beta, 'Error': error})
        except Exception as e:
          # A configuration that fails is reported and skipped so the search continues.
          print("Erro nos parametros: ",e)

# Passing `columns` keeps the expected schema even when every configuration failed.
tuning_df = pd.DataFrame(denstream_tuning_records, columns=['Lambda', 'Epsilon', 'Mu', 'Beta', 'Error'])
# Drop the temporary list so it does not linger in kernel memory.
del denstream_tuning_records

In [0]:
# Sort the tuning results by score (ascending), keep the rows whose score is
# strictly positive, and take the last one, i.e. the highest-scoring row.
denstream_best_result = (
    tuning_df
    .sort_values(by=['Error'])
    .loc[tuning_df['Error'] > 0]
    .iloc[-1]
)

# Unpack the winning configuration for the final run.
denstream_best_lambda = denstream_best_result['Lambda']
denstream_best_eps = denstream_best_result['Epsilon']
denstream_best_mu = denstream_best_result['Mu']
denstream_best_beta = denstream_best_result['Beta']

In [0]:
# Drop the tuning artefacts; only the denstream_best_* values are needed from here on.
del denstream_model, denstream_best_result, tuning_df

### CluStream

In [0]:
# Candidate values for the CluStream grid search:
#   microclusters -> passed as both `q` and `m`
#   horizons      -> passed as `delta`
#   ts            -> passed as `radius_factor`
microclusters = [10, 100, 200, 300, 400, 500]
horizons = [10, 100, 500, 1000]
ts = [1,2,4,8, 10, 20]

In [0]:
# Exhaustive grid search over the CluStream candidate values.
#
# Each successful configuration is recorded as one dict and the DataFrame is
# built once at the end. Growing the frame with DataFrame.append inside the
# loop copied the whole frame on every iteration and no longer works on
# pandas >= 2.0, where that method was removed.
clustream_tuning_records = []

for mc in microclusters:
  for h in horizons:
    for t in ts:
      print("Parameters: MC=",mc, " Horizon=",h, " Radius=", t)
      try:
        # NOTE(review): k=5 is fixed here while the R generator is created with
        # k=3 — confirm this is intended.
        clustream_model = CluStream(q=mc, m=mc, radius_factor = t, delta=h, k=5, init_number=100)
        # 'Error' holds the adjusted Rand score of the tuning window.
        error = fit_predict(clustream_model, X, y, tuning_size, adjusted_rand_score)
        clustream_tuning_records.append({'MC': mc, 'Horizon': h, 'RadiusFactor': t, 'Error': error})
      except Exception as e:
        # Catch Exception rather than everything so an interrupt still stops the
        # search, and show the cause instead of hiding it.
        print("Error in parameter configuration: ", e)

# Passing `columns` keeps the expected schema even when every configuration failed.
clustream_tuning_df = pd.DataFrame(clustream_tuning_records, columns=['MC', 'Horizon', 'RadiusFactor', 'Error'])
# Drop the temporary list so it does not linger in kernel memory.
del clustream_tuning_records

In [0]:
# Sort the tuning results by score (ascending), keep the rows whose score is
# strictly positive, and take the last one, i.e. the highest-scoring row.
clustream_best_result = (
    clustream_tuning_df
    .sort_values(by=['Error'])
    .loc[clustream_tuning_df['Error'] > 0]
    .iloc[-1]
)

# Unpack the winning configuration; MC and Horizon are cast back to int.
clustream_best_mc = int(clustream_best_result['MC'])
clustream_best_horizon = int(clustream_best_result['Horizon'])
clustream_best_radius = clustream_best_result['RadiusFactor']

In [0]:
# Drop the tuning artefacts; only the clustream_best_* values are needed from here on.
del clustream_model, clustream_best_result, clustream_tuning_df

## Run Optimized Models

In [0]:
import os
import psutil

### Get Memory Usage before the execution

In [0]:
# Handle on the current kernel process, used to take the baseline reading.
process = psutil.Process(os.getpid())

# rss is reported in bytes; dividing by 1024 * 1024 expresses the baseline in MB.
bytes_per_mb = 1024 * 1024
base_memory_usage = process.memory_info().rss / bytes_per_mb

In [0]:
# Show the baseline reading (in MB) as the cell output.
base_memory_usage

### Run models

In [0]:
# NOTE(review): variance_limit and decay are fixed literals here rather than
# taken from the hyperopt result computed earlier — confirm this is intended.
evol_model = EvolvingClustering.EvolvingClustering(
    variance_limit=0.0001,
    decay=1000,
    debug=True,
)
evolving_results = Benchmarks.prequential_evaluation(
    evol_model, X, y, adjusted_rand_score,
    train_size, window_size,
    elapsed_time=True,
)

In [0]:
# DenStream configured with the best parameters found by the grid search.
denstream_model = DenStream(
    lambd=denstream_best_lambda,
    eps=denstream_best_eps,
    mu=denstream_best_mu,
    beta=denstream_best_beta,
)
denstream_results = Benchmarks.prequential_evaluation(
    denstream_model, X, y, adjusted_rand_score,
    train_size, window_size,
    elapsed_time=True,
)

In [0]:
# CluStream configured with the best parameters found by the grid search;
# k and init_number keep the same fixed values used during tuning.
clustream_model = CluStream(
    q=clustream_best_mc,
    m=clustream_best_mc,
    radius_factor=clustream_best_radius,
    delta=clustream_best_horizon,
    k=5,
    init_number=100,
)
clustream_results = Benchmarks.prequential_evaluation(
    clustream_model, X, y, adjusted_rand_score,
    train_size, window_size,
    elapsed_time=True,
)

### Plot Results

### Memory Consumption vs Samples

In [0]:
# Memory readings from the prequential evaluation, plotted against sample count.
fig, ax = plt.subplots(figsize=(14, 6))

# x positions: sample counts spaced by window_size, starting at train_size + window_size.
windows = np.arange(train_size + window_size, len(X) + window_size, window_size)

# Only the microTEDAclus curve is drawn; the two baselines are left disabled.
#ax.plot(windows, denstream_results['memory_usage_list'], 'o-', color='blue', label='DenStream')
ax.plot(windows, evolving_results['memory_usage_list'], 'o-', color='orange', label='microTEDAclus')
#ax.plot(windows, clustream_results['memory_usage_list'], 'o-', color='green', label='CluStream')

ax.set_xlabel('Samples')
ax.set_ylabel('Memory Consumption (MB)')
ax.legend()
plt.show()

### Adjusted Rand vs Samples

In [0]:
# Adjusted Rand scores from the prequential evaluation, plotted against sample count.
fig, ax = plt.subplots(figsize=(18, 6))

# x positions: sample counts spaced by window_size, starting at train_size + window_size.
windows = np.arange(train_size + window_size, len(X) + window_size, window_size)

# Only the microTEDAclus curve is drawn; the two baselines are left disabled.
#ax.plot(windows, denstream_results['error_list'], 'o-', color='blue', label='DenStream')
ax.plot(windows, evolving_results['error_list'], 'o-', color='orange', label='microTEDAclus')
#ax.plot(windows, clustream_results['error_list'], 'o-', color='green', label='CluStream')

ax.set_xlabel('Samples')
ax.set_ylabel('Adj Rand Index')
ax.legend()
plt.show()

### Save results to csv

In [0]:
from google.colab import files

In [0]:
# Rescale every memory reading and subtract the baseline taken before the runs.
# NOTE(review): the (x * 1000) / 2**20 factor presumably converts the unit
# reported by the benchmark into MB — confirm against Benchmarks.
memory_usage = []
for reading in evolving_results['memory_usage_list']:
    memory_usage.append(((reading * 1000) / (2 ** 20)) - base_memory_usage)

# One row per evaluation window: memory, adjusted Rand score and elapsed time.
evolving_results_df = pd.DataFrame({
    'Memory': memory_usage,
    'cRand': evolving_results['error_list'],
    'Time': evolving_results['elapsed_time_list'],
})
evolving_results_df.to_csv(path + exp_name + "_memory_usage_evolving.csv")

In [0]:
# Rescale every memory reading and subtract the baseline taken before the runs.
# NOTE(review): the (x * 1000) / 2**20 factor presumably converts the unit
# reported by the benchmark into MB — confirm against Benchmarks.
memory_usage = []
for reading in denstream_results['memory_usage_list']:
    memory_usage.append(((reading * 1000) / (2 ** 20)) - base_memory_usage)

# One row per evaluation window: memory, adjusted Rand score and elapsed time.
denstream_results_df = pd.DataFrame({
    'Memory': memory_usage,
    'cRand': denstream_results['error_list'],
    'Time': denstream_results['elapsed_time_list'],
})
denstream_results_df.to_csv(path + exp_name + "_memory_usage_denstream.csv")

In [0]:
# Rescale every memory reading and subtract the baseline taken before the runs.
# NOTE(review): the (x * 1000) / 2**20 factor presumably converts the unit
# reported by the benchmark into MB — confirm against Benchmarks.
memory_usage = []
for reading in clustream_results['memory_usage_list']:
    memory_usage.append(((reading * 1000) / (2 ** 20)) - base_memory_usage)

# One row per evaluation window: memory, adjusted Rand score and elapsed time.
clustream_results_df = pd.DataFrame({
    'Memory': memory_usage,
    'cRand': clustream_results['error_list'],
    'Time': clustream_results['elapsed_time_list'],
})
clustream_results_df.to_csv(path + exp_name + "_memory_usage_clustream.csv")