 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="http://localhost:8888/notebooks/kmeans.ipynb#Cell-Formatting" data-toc-modified-id="Cell-Formatting-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Cell Formatting</a></span></li><li><span><a href="http://localhost:8888/notebooks/kmeans.ipynb#Data-Collection" data-toc-modified-id="Data-Collection-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Collection</a></span><ul class="toc-item"><li><span><a href="http://localhost:8888/notebooks/kmeans.ipynb#Weightening" data-toc-modified-id="Weightening-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Weightening</a></span></li><li><span><a href="http://localhost:8888/notebooks/kmeans.ipynb#Data-scalling" data-toc-modified-id="Data-scalling-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Data scalling</a></span></li><li><span><a href="http://localhost:8888/notebooks/kmeans.ipynb#Data-display" data-toc-modified-id="Data-display-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Data display</a></span></li></ul></li><li><span><a href="http://localhost:8888/notebooks/kmeans.ipynb#Clustering-Stage" data-toc-modified-id="Clustering-Stage-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Clustering Stage</a></span><ul class="toc-item"><li><span><a href="http://localhost:8888/notebooks/kmeans.ipynb#display-features-list-with-indexes" data-toc-modified-id="display-features-list-with-indexes-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>display features list with indexes</a></span></li><li><span><a href="http://localhost:8888/notebooks/kmeans.ipynb#data-plot-per-cluster" data-toc-modified-id="data-plot-per-cluster-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>data plot per cluster</a></span></li></ul></li><li><span><a href="http://localhost:8888/notebooks/kmeans.ipynb#Clustering-Metrics" data-toc-modified-id="Clustering-Metrics-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Clustering 
Metrics</a></span></li></ul></div>

# Cell Formatting

In [None]:
# Make the notebook's `.container` element span the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

# Data Collection

In [None]:
import pandas as pd
from IPython.display import display

# Load the samples table. The 'Petrofacie' column is split off from the
# feature columns below and is used later as the reference labelling for the
# clustering metrics.
dataset = pd.read_csv(
    './../POLVO/IA_FILES/reduced_thin_sections.csv', delimiter=',')

# Lift pandas' display limits so frames are rendered in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

target_labels = dataset['Petrofacie']
targets = list(target_labels)

# Distinct petrofacies in a stable order. Iterating a plain `set` of strings
# can yield a different order in every kernel session (string hashing is
# randomised per process), which would reshuffle the petrofacie -> colour
# mapping built from this list. `key=str` keeps the sort safe if the column
# mixes value types.
petrofacies = sorted(set(targets), key=str)

# Row labels of the form "<row position>-<petrofacie>".
dataset.index = [
    str(position) + '-' + str(petrofacie)
    for position, petrofacie in enumerate(targets)
]

# Keep only the feature columns in `dataset`.
del dataset['Petrofacie']
# del dataset['Main/single size mode(mm):']

feature_names = list(dataset.columns.values)

## Weighting

In [None]:
# dataset.loc[:, 'Main/single size mode(mm):'] *= 10

## Data scaling

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from copy import deepcopy

# Standardise every feature column with StandardScaler.
# The result is wrapped in a new DataFrame (same row labels and column names)
# instead of being assigned through `dataset[:] = ...`: slice assignment writes
# into the existing columns and tries to keep their dtypes, so non-float
# columns (e.g. integer counts) trigger an incompatible-dtype warning/error on
# recent pandas versions. The new frame is always float.
dataset = pd.DataFrame(
    StandardScaler().fit_transform(X=dataset),
    index=dataset.index,
    columns=dataset.columns,
)

## Data display

In [None]:
# Render the feature table as it stands after the scaling step. The pandas
# row/column display limits were set to None when the data was loaded, so the
# whole frame is shown.
display(dataset)

# Clustering Stage

In [None]:
from sklearn.cluster import KMeans
import numpy as np

np.set_printoptions(linewidth=162)

# Seed for the random centroid initialisation (init='random'). Without it the
# cluster assignments, and every plot and metric derived from them, change on
# each run.
RANDOM_STATE = 0

# NOTE(review): `algorithm='fwkm'` is not one of the algorithm options of stock
# scikit-learn, and `n_jobs` was removed from KMeans in scikit-learn 1.0.
# Presumably this notebook runs against a modified/older scikit-learn build --
# confirm before upgrading the environment.
km = KMeans(
    n_clusters=10,
    init='random',
    n_init=2,
    max_iter=30,
    n_jobs=8,
    algorithm='fwkm',
    random_state=RANDOM_STATE,
).fit(dataset)

In [None]:
# Row positions of the samples: 0 .. n_samples - 1.
leaves = [*range(len(dataset.index))]

# Cluster id assigned to each sample, in row order.
labels = list(km.labels_)

# Distinct cluster ids, ordered by the first sample in which each one appears
# (a dict keeps its keys in insertion order).
set_labels = list(dict.fromkeys(labels))

## display features list with indexes

In [None]:
# Widen the column display limit so long feature names are not truncated, then
# list the features; the frame's integer index is the "feature index" used on
# the plot axes.
pd.set_option('display.max_colwidth', 180)
display(pd.DataFrame({'FEATURE': feature_names}))

## data plot per cluster
Box plots of each feature, grouped by the cluster the samples were assigned to.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# For every cluster, draw one figure with two panels:
#   left  - a box plot of each feature over the cluster's samples;
#   right - one line per sample across the features, coloured by petrofacie.
# `clusters` collects the per-cluster value arrays (samples x features).

number_attributes = len(dataset.columns)
feature_positions = list(range(number_attributes))
feature_tick_labels = [str(position) for position in feature_positions]

# One colour per petrofacie. The mapping does not depend on the cluster, so it
# is built once instead of on every loop iteration.
cmap = plt.get_cmap('tab10')
colors_mapping = dict(
    zip(petrofacies, cmap(np.linspace(0, 1, len(petrofacies)))))

clusters = []
for label in set_labels:
    cluster_indexes = [leaves[i] for i, x in enumerate(labels) if x == label]
    # Plain 2-D array (samples x features) rather than the discouraged
    # np.matrix; iterating it yields 1-D rows directly.
    cluster = np.asarray(dataset.iloc[cluster_indexes, :])
    clusters.append(cluster)

    instances_labels = list(dataset.index[cluster_indexes])
    # Read each sample's petrofacie from `targets` instead of parsing it back
    # out of the "<position>-<petrofacie>" row label: splitting on '-' breaks
    # for names that contain a dash and always yields strings, which would not
    # match non-string keys of `colors_mapping`.
    ordered_petrofacies = [targets[i] for i in cluster_indexes]

    # Tick positions spanning the cluster's whole value range. The data is
    # standardised at this point, so values below zero must be covered too.
    y_ticks = np.arange(np.floor(cluster.min()), cluster.max() + 1, 2.0)

    fig, (box_ax, line_ax) = plt.subplots(1, 2, figsize=(30, 7))
    fig.suptitle("Cluster " + str(label) + '; ' + str(cluster.shape[0]) +
                 ' instances: ' + ','.join(instances_labels))

    # --- left panel: box plot per feature --------------------------------
    box_data = cluster
    if cluster.shape[0] == 1:
        # TEMPORARY WORKAROUND TO https://github.com/matplotlib/matplotlib/pull/8116
        # (duplicate the single row so the box plot receives 2-D input).
        box_data = np.vstack((cluster, cluster))
    box_ax.boxplot(box_data)
    # Boxes sit at positions 1..n; label them with the 0-based feature index.
    # Setting the tick labels explicitly avoids the boxplot `labels=` keyword,
    # which recent matplotlib versions have renamed.
    box_ax.set_xticks(range(1, number_attributes + 1))
    box_ax.set_xticklabels(feature_tick_labels)
    box_ax.set_xlabel('feature index')
    # The plotted values are StandardScaler output, not percentages.
    box_ax.set_ylabel('standardized value')
    box_ax.set_yticks(y_ticks)

    # --- right panel: one profile per sample -----------------------------
    petrofacies_in_legend = set()
    for petrofacie, instance in zip(ordered_petrofacies, cluster):
        color = colors_mapping[petrofacie]
        # Only the first sample of each petrofacie gets a legend label, so the
        # legend shows one entry per petrofacie instead of one per sample.
        legend_label = None
        if petrofacie not in petrofacies_in_legend:
            petrofacies_in_legend.add(petrofacie)
            legend_label = petrofacie
        # `color=` (not `c=`): a single RGBA value passed as `c` is ambiguous
        # and is interpreted as per-point data when there are exactly four
        # points.
        line_ax.plot(feature_positions, instance, color=color, linewidth=.5)
        line_ax.scatter(
            feature_positions, instance, color=color, label=legend_label)

    line_ax.set_xlabel('feature index')
    line_ax.set_ylabel('standardized value')
    line_ax.set_xticks(np.arange(0, number_attributes, 1.0))
    line_ax.set_yticks(y_ticks)
    line_ax.grid()
    line_ax.legend()

    plt.show()
    # Release the figure so that looping over many clusters does not
    # accumulate open figures.
    plt.close(fig)

# Clustering Metrics

In [None]:
from sklearn import metrics

# Put the cluster label of every sample at that sample's row position, so the
# list lines up element-by-element with `targets`.
ordered_labels = [-1 for _ in targets]
for leaf, cluster_label in zip(leaves, labels):
    ordered_labels[leaf] = cluster_label

# Shift every label down by one. The scores below compare partitions, so a
# uniform relabelling does not affect them.
ordered_labels = [cluster_label - 1 for cluster_label in ordered_labels]

# (caption, scoring function) pairs, printed in this order.
score_table = (
    ('adjusted_rand_score :\t\t', metrics.cluster.adjusted_rand_score),
    ('adjusted_mutual_info_score :\t',
     metrics.cluster.adjusted_mutual_info_score),
    ('homogeneity_score :\t\t', metrics.cluster.homogeneity_score),
    ('completeness_score :\t\t', metrics.cluster.completeness_score),
    ('v_measure_score :\t\t', metrics.cluster.v_measure_score),
)
for caption, score_function in score_table:
    print(caption + str(score_function(targets, ordered_labels)))
