# Run Experiments 
<div style="text-align: right"> Lucas Pugens Fernandes</div>

This notebook runs clustering experiments on a configurable set of datasets and algorithms, then plots how well each resulting clustering agrees with the known labels.

## Loading datasets
Comment or uncomment lines according to which datasets you want to test.

Each element of the list is a tuple containing:

1. dataset identifier (string)
2. dataset object (*following the scikit-learn dataset format*)

In [None]:
from pkg.datasets import thin_sections
from sklearn.datasets import load_breast_cancer, load_iris

# (identifier, dataset object) pairs to cluster. Comment entries in or out to
# choose which datasets take part in the experiment.
datasets = [
    ('thin_sections', thin_sections(reduced_data=True)),
    # ('iris', load_iris()),
    # ('breast_cancer', load_breast_cancer()),
]

## Weighting

Append weighted copies of datasets to the test: selected feature columns are multiplied by a constant weight.

In [None]:
import copy

# Per-feature multipliers: every value in the named feature column is scaled by
# the given weight in the weighted copy of the dataset.
weights = [{'feature': 'Porosity', 'weight': 1000},
           {'feature': 'Main/single size mode(mm):', 'weight': 1000}]

# The dataset to weight is looked up by its label rather than by position, so
# editing the list of loaded datasets cannot silently redirect the weights to a
# different dataset.
source_label = 'thin_sections'
weighted_label = 'wgth_thin_sections'

# Keep the cell idempotent: drop any weighted copy appended by a previous run
# so re-running does not stack duplicate entries (and duplicate scenarios).
datasets[:] = [entry for entry in datasets if entry[0] != weighted_label]

source_dataset = next((dataset for label, dataset in datasets if label == source_label), None)

if source_dataset is None:
    print("'" + source_label + "' is not loaded; no weighted dataset was added")
else:
    # Deep copy so the unweighted dataset keeps its original values.
    weightened_thin_sections = copy.deepcopy(source_dataset)
    # list() guarantees .index() is available even if feature_names is an array.
    feature_names = list(source_dataset.feature_names)

    for weight in weights:
        index_weight = feature_names.index(weight['feature'])

        for i, point in enumerate(weightened_thin_sections.data):
            weightened_thin_sections.data[i][index_weight] = point[index_weight] * weight['weight']

    datasets.append((weighted_label, weightened_thin_sections))

## Loading algorithms

Comment or uncomment lines according to which algorithms you want to test.

Each element of the list is a tuple containing:
1. algorithm name identifier (string)
2. algorithm constructor    
3. keyword arguments to be used when calling the constructor

In [None]:
# from pkg.clustering import Wlac
from sklearn.cluster import DBSCAN, MiniBatchKMeans, SpectralClustering, AffinityPropagation, AgglomerativeClustering
from scipy.cluster.hierarchy import linkage
from itertools import product

# (name, constructor, constructor keyword arguments) triples. Comment entries
# in or out to choose which algorithms take part in the experiment.
algorithms = [
    # ('WLAC', Wlac, {}),
    # ('DBSCAN', DBSCAN, {}),  PROBLEMS
    # ('KMeans', MiniBatchKMeans, {'n_clusters' : 10}),
    # ('SC', SpectralClustering, {'n_clusters' : 10}),
    # ('AP', AffinityPropagation, {}),

    # ('AC_complete_euc', AgglomerativeClustering, {'n_clusters' : 10, 'linkage': 'complete', 'affinity': 'euclidean'}),
    ('AC_complete_man',
     AgglomerativeClustering,
     {'n_clusters' : 10, 'linkage': 'complete', 'affinity': 'manhattan'}),
]


## Generating scenarios
Generates _N_ scenarios for each combination of algorithms and datasets into the _scenarios_ array, each with a different random initializer.

Only **_random-dependent_** algorithms will be run _N_ times.

Each scenario is a tuple composed of:

1. The dataset
2. The algorithm
3. The random seed initializer (**in the cases of _random-dependent_ algorithm**)

In [None]:
from itertools import product
from inspect import signature

# Number of runs (one per seed in range(N)) for each random-dependent algorithm.
N = 1000

# An algorithm is random-dependent when its constructor has a parameter named
# exactly `random_state`. Looking the name up in the parsed parameters is exact,
# unlike a substring search over the printed signature, and the signature is
# inspected only once per algorithm.
takes_seed = ['random_state' in signature(algorithm[1]).parameters for algorithm in algorithms]
random_algorithms = [algorithm for algorithm, seeded in zip(algorithms, takes_seed) if seeded]
predictable_algorithms = [algorithm for algorithm, seeded in zip(algorithms, takes_seed) if not seeded]

# Random-dependent algorithms yield (dataset, algorithm, seed) for every seed;
# the others yield a single (dataset, algorithm) pair with no seed element.
scenarios = list(product(datasets, random_algorithms, range(N)))
scenarios += list(product(datasets, predictable_algorithms))

# Temporary partitions are not needed by later cells.
del takes_seed
del random_algorithms
del predictable_algorithms

## Parallel running
The scenarios are mapped to a pool of worker processes running in parallel.

The *CPU_USAGE* defines the fraction (between 0 and 1) of available hardware threads to be used during the process.

In [None]:
from multiprocessing import Pool, cpu_count
from pkg import track_job, execute_clustering
from math import floor

# Fraction of the available hardware threads handed to the worker pool.
CPU_USAGE = 1

# A small CPU_USAGE could floor to zero workers, which Pool rejects; always
# keep at least one.
cores = max(1, floor(cpu_count() * CPU_USAGE))

# The context manager shuts the worker processes down when the block exits, so
# re-running this cell does not leave old pools alive in the kernel.
with Pool(cores) as p:
    print(str(cores) + ' CORES')
    print('Starting clusters:')
    job = p.map_async(execute_clustering, scenarios, error_callback=lambda x: print(str(x)))
    track_job(job)
    # get() blocks until the whole map is finished and returns the list of
    # per-scenario results; if a worker failed it re-raises that exception
    # instead of leaving an empty result list behind.
    results = job.get()
print('Done clustering')

## Plotting results
Plots are generated from the results obtained

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_score, completeness_score, fowlkes_mallows_score

# Scores comparing a predicted labeling with the dataset's target labels.
# One box plot is drawn per (metric, dataset) pair, with one box per algorithm.
metrics = [{'label': 'Adjusted Rand score', 'func': adjusted_rand_score},
           {'label': 'Adjusted mutual info score', 'func': adjusted_mutual_info_score},
           {'label': 'Homogeneity score', 'func': homogeneity_score},
           {'label': 'Completness score', 'func': completeness_score},
           {'label': 'Fowlkes mallows score', 'func': fowlkes_mallows_score}]

algorithm_labels = [label for label, _, _ in algorithms]

# Group the results by (dataset, algorithm) once, instead of re-filtering the
# whole results list for every metric.
results_by_scenario = {}
for result in results:
    scenario_key = (result['dataset_label'], result['algorithm_label'])
    results_by_scenario.setdefault(scenario_key, []).append(result)

for metric in metrics:
    for dataset_label, dataset_obj in datasets:
        # One list of scores per algorithm, in the same order as `algorithms`.
        metric_by_algorithm = [
            [metric['func'](dataset_obj.target, result['algorithm_obj'].labels_)
             for result in results_by_scenario.get((dataset_label, algorithm_label), [])]
            for algorithm_label in algorithm_labels
        ]

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.boxplot(metric_by_algorithm,
                   showmeans=True,
                   meanline=True)
        # Tick labels are set on the axes rather than through boxplot's
        # `labels` keyword, which newer matplotlib releases have renamed.
        ax.set_xticklabels(algorithm_labels)
        ax.set_title(dataset_label)
        ax.set_ylabel(metric['label'])
plt.show()
