In [1]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random as rdm
from tqdm.notebook import tqdm
import time

In [2]:
cd ../

/home/guillaume/recimpute


In [3]:
%load_ext autoreload
%autoreload 2
from Datasets.Dataset import Dataset
from Clustering.AbstractClustering import AbstractClustering
from Clustering.ShapeBasedClustering import ShapeBasedClustering
import Clustering.ConFree_kClustering as cfkc

In [None]:
# Make sure the directory that receives this experiment's results is present.
# `exist_ok=True` makes the call idempotent and removes the check-then-create race
# of a separate `os.path.exists` test.
os.makedirs('Experiments/results', exist_ok=True)

In [4]:
# Set the 'USE_ALL' flag in the class-level Dataset configuration before the data sets are instantiated below.
Dataset.CONF['USE_ALL'] = True
# Single clusterer instance; the helper functions in later cells read it through the global name `clusterer`.
clusterer = ShapeBasedClustering()
# Build the data sets, handing the clusterer to the factory method.
ALL_DATASETS = Dataset.instantiate_from_dir(clusterer)

In [None]:
# Draw a reproducible random subset of the data sets for the comparison.
# A dedicated seeded generator is used so that "Restart & Run All" always selects
# the same subset, without altering the global `random` state.
SAMPLE_SEED = 42
SAMPLE_SIZE = 3
FILTERED_DATASETS = rdm.Random(SAMPLE_SEED).sample(
    ALL_DATASETS,
    # `sample` raises ValueError when asked for more items than are available.
    min(SAMPLE_SIZE, len(ALL_DATASETS)),
)
FILTERED_DATASETS

# Run comparisons

In [7]:
# Path intended for persisting the comparison table.
# NOTE(review): in the visible cells only a commented-out `to_csv` call refers to this name,
# while the extension is `.json` -- confirm the intended output format.
comparisons_filename = 'Experiments/results/clusteringexperiment_comparisons.json'

In [8]:
# Empty results table: one row per sampled data set, and one column for every
# (clustering method, metric) combination.
my_columns = pd.MultiIndex.from_product([
    ['8-Shape', 'Gridsearch', 'Iterative', 'IncrementalClustering'],
    ['Clusters Assignment', 'Runtime', 'NCC Score', 'Average Correlation', 'Nb mono-sequence clusters', 'Nb clusters'],
])
comparison_df = pd.DataFrame(
    index=[dataset.name for dataset in FILTERED_DATASETS],
    columns=my_columns,
)

In [9]:
def get_avg_mean_corr(ds, timeseries, clusters_assignment):
    """Average the per-cluster score of `clusterer._get_dataset_mean_corr` over all clusters.

    Args:
        ds: data set object; must provide `get_cluster_by_id`.
        timeseries: the data set's time series, forwarded to `ds.get_cluster_by_id`.
        clusters_assignment: frame with a 'Cluster ID' column listing the cluster of each series.

    Returns:
        `np.mean` of the values returned by `clusterer._get_dataset_mean_corr` for each
        distinct cluster id (presumably a mean correlation per cluster -- confirm in the clusterer).
    """
    per_cluster_scores = []
    for cluster_id in clusters_assignment['Cluster ID'].unique():
        # Extract the members of this cluster, then score them with the shared clusterer.
        members = ds.get_cluster_by_id(timeseries, cluster_id, clusters_assignment)
        per_cluster_scores.append(clusterer._get_dataset_mean_corr(members))
    return np.mean(per_cluster_scores)

In [10]:
def save_results(comparison_df, column, ds, timeseries, runtime):
    """Write the metrics of one clustering run into the comparison table.

    The clusters assignment is re-loaded with `ds.load_cassignment(clusterer)`; presumably this
    is the assignment stored by the clustering run that was just timed (confirm with caller).

    Args:
        comparison_df: results table with (method, metric) columns, indexed by data set name.
        column: first-level column label (the clustering method) to fill.
        ds: data set that was clustered; `ds.name` selects the row.
        timeseries: the data set's time series, used to compute the scores.
        runtime: measured duration of the clustering run.

    Returns:
        The `comparison_df` object that was passed in, with the row for `ds` updated.
    """
    assignment = ds.load_cassignment(clusterer)
    cluster_sizes = assignment['Cluster ID'].value_counts()

    # Compute every metric first; the table is only touched once all of them succeeded.
    metrics = {
        'Runtime': runtime,
        'NCC Score': clusterer._compute_run_score(timeseries, assignment),
        'Average Correlation': get_avg_mean_corr(ds, timeseries, assignment),
        'Nb mono-sequence clusters': sum(cluster_sizes == 1),  # clusters holding a single series
        'Nb clusters': assignment['Cluster ID'].nunique(),
    }

    # The 'Clusters Assignment' cell is left empty; the disabled statement is kept for reference:
    #comparison_df.at[ds.name, (column, 'Clusters Assignment')] = cass
    for metric, value in metrics.items():
        comparison_df.at[ds.name, (column, metric)] = value

    #comparison_df.to_csv(comparisons_filename)
    return comparison_df

In [11]:
def k_shape_8(ds, timeseries):
    """Cluster `timeseries` via `clusterer.kshape_helper` with 8 clusters and save the assignment.

    Args:
        ds: data set the time series belong to; passed to `clusterer.save_clusters`.
        timeseries: object whose `index` holds the time series ids (one label is expected per id).

    Returns:
        `ds`, the same object that was passed in.
    """
    cluster_labels = clusterer.kshape_helper(8, timeseries)

    # Pair each time series id with the cluster label found at the same position.
    id_label_pairs = list(zip(timeseries.index, cluster_labels))
    assignment = pd.DataFrame(data=id_label_pairs, columns=['Time Series ID', 'Cluster ID'])
    assignment = assignment.sort_values('Time Series ID')

    clusterer.save_clusters(ds, assignment)
    return ds

In [None]:
# Run the four clustering strategies on every sampled data set and record their metrics.
# A strategy that fails leaves its cells empty in `comparison_df`, but the failure is now
# reported instead of being silently swallowed, and Ctrl-C / kernel interrupts are no longer
# caught (the previous bare `except:` intercepted them too).
for ds in tqdm(FILTERED_DATASETS, total=len(FILTERED_DATASETS)):

    timeseries = ds.load_timeseries(transpose=True)

    # (results column, value for CONF['APPLY_MERGING'] or None to leave it untouched, runner)
    strategies = [
        ('IncrementalClustering', True, lambda: clusterer.cluster(ds)),
        ('Iterative', False, lambda: clusterer.cluster(ds)),
        ('Gridsearch', None, lambda: clusterer.old__cluster(ds)),
        ('8-Shape', None, lambda: k_shape_8(ds, timeseries)),
    ]

    for step, (column, apply_merging, run_strategy) in enumerate(strategies, start=1):
        print("%d/%d" % (step, len(strategies)))
        try:
            if apply_merging is not None:
                clusterer.CONF['APPLY_MERGING'] = apply_merging

            # Only the clustering call itself is timed; metric computation is excluded.
            start_time = time.time()
            run_strategy()
            runtime = time.time() - start_time
            comparison_df = save_results(comparison_df, column, ds, timeseries, runtime)
        except Exception as err:
            # Best effort: keep going with the remaining strategies / data sets.
            print("  %s failed on data set %s: %r" % (column, ds.name, err))

In [None]:
# Remove the cap on displayed columns so every (method, metric) column is rendered,
# then preview the first two data sets.
pd.options.display.max_columns = None
comparison_df.head(2)

# Results analysis

In [None]:
# Column-wise mean over the rows (data sets) for every (method, metric) column.
comparison_df.mean()

In [None]:
def bar_plots(xlabels, ylabel):
    """Draw a bar chart of one metric averaged over all data sets, one bar per method.

    Reads the global `comparison_df`, whose columns are (method, metric) pairs.

    Args:
        xlabels: first-level column labels (clustering methods) to plot, in the
            left-to-right order in which the bars should appear.
        ylabel: second-level column label (metric) to average; also used as the
            y-axis label and in the title.

    Raises:
        KeyError: if a (label, ylabel) pair is not a column of `comparison_df`.
    """
    plt.rcdefaults()
    fig, ax = plt.subplots()

    x_pos = np.arange(len(xlabels))
    # The table is filled cell by cell, so cast to float explicitly before averaging.
    metric_means = comparison_df.loc[:, (slice(None), ylabel)].astype(float).mean()
    # Look every bar height up by its label. The means come back in column order, which
    # can differ from the order of `xlabels` (e.g. the sorted `columns.levels[0]`);
    # relying on position would attach values to the wrong method names.
    ys = [metric_means[(label, ylabel)] for label in xlabels]

    ax.bar(x_pos, ys, align='center')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
    ax.set_ylabel(ylabel)
    ax.set_title( 'Average over all data sets of: %s' % (ylabel) )

    plt.show()

In [None]:
# Mean clustering runtime per method (wall-clock seconds measured with `time.time()`).
# NOTE(review): `columns.levels[0]` is sorted lexicographically, which can differ from the
# column order of `comparison_df` -- confirm `bar_plots` matches values to these labels.
bar_plots(
    xlabels=comparison_df.columns.levels[0].tolist(),
    ylabel='Runtime',
)

In [None]:
# Mean 'Average Correlation' value per method across the sampled data sets.
bar_plots(
    xlabels=comparison_df.columns.levels[0].tolist(),
    ylabel='Average Correlation',
)

In [None]:
# Mean number of clusters containing a single time series, per method.
bar_plots(
    xlabels=comparison_df.columns.levels[0].tolist(),
    ylabel='Nb mono-sequence clusters',
)

In [None]:
# Mean number of clusters produced per method.
bar_plots(
    xlabels=comparison_df.columns.levels[0].tolist(),
    ylabel='Nb clusters',
)