In [None]:
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Read the CSV at data/data_ellsberg.csv into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/data_ellsberg.csv')

In [None]:
# Emotion names; each one is expected to exist as a column once per suffix
# listed in the loop below (e.g. 'Hopeful1A').
emotions = [
    'Hopeful',
    'Curiosity',
    'Enlightenment',
    'Thrilled',
    'Anticipatory',
    'Satisfied',
]

# emotions_cols: flat list of full column names, grouped suffix by suffix.
# x_labels: one list per suffix holding shortened tick labels (first four
#           letters of the emotion plus the suffix), used when plotting.
x_labels = []
emotions_cols = []
for x in ['1A', '1B', '2A', '2B']:
    emotions_cols.extend(f'{emotion}{x}' for emotion in emotions)
    x_labels.append([f'{emotion[:4]}{x}' for emotion in emotions])

# Keep only the emotion columns, in the order built above.
df = df[emotions_cols]

In [None]:
# Clustering configuration shared by the cells below.
K = 2                          # cluster count for the single detailed K-means run
K_range = range(2, 11)         # candidate cluster counts (2..10) for the silhouette sweep
distance_metric = 'euclidean'  # metric handed to silhouette_score

In [None]:
def calc_silhouette(df, preds):
    """Score a clustering with scikit-learn's ``silhouette_score``.

    Args:
        df: The samples that were clustered; passed through unchanged.
        preds: The cluster label assigned to each sample of ``df``.

    Returns:
        Whatever ``silhouette_score`` returns for ``df`` and ``preds`` when
        evaluated with the module-level ``distance_metric``.
    """
    score = silhouette_score(df, preds, metric=distance_metric)
    return score

In [None]:
def build_kmeans(points, k, random_state=42):
    """Fit a K-means model on ``points`` and return it with its labels.

    Args:
        points: Samples to cluster; handed directly to ``KMeans.fit_predict``.
        k: Number of clusters to fit.
        random_state: Seed for the centroid initialisation. It is fixed by
            default so that re-running the notebook reproduces the same
            clusters (and therefore the same silhouette scores); pass
            ``None`` to get the previous unseeded behaviour.

    Returns:
        A ``(clusterer, preds)`` tuple: the fitted ``KMeans`` estimator and
        the cluster label predicted for each sample in ``points``.
    """
    clusterer = KMeans(n_clusters=k, n_init='auto', random_state=random_state)
    preds = clusterer.fit_predict(points)
    return clusterer, preds

In [None]:
import numpy as np

def show_results(clusterer, k, plot_graph=False):
    """Print a fitted clusterer's labels and centroids, optionally plotting them.

    Args:
        clusterer: Fitted estimator exposing ``labels_`` and
            ``cluster_centers_``.
        k: Number of centroids to print and plot; rows ``0 .. k-1`` of
            ``cluster_centers_`` are read, so ``k`` must not exceed the
            number of centroids.
        plot_graph: When true, draw one subplot per entry of the
            module-level ``x_labels`` with one line per cluster.

    Returns:
        None. Output goes to stdout and, optionally, to a matplotlib figure.

    Notes:
        The plotting path reads the module-level ``x_labels`` and
        ``emotions``. It slices the centroid columns into consecutive groups
        of ``len(emotions)``, i.e. it assumes the clustered features are
        ordered group by group in the same order as ``x_labels``.
    """
    print('labels:', clusterer.labels_, '\n')
    centers = clusterer.cluster_centers_

    print('centroids:')
    for i in range(k):
        print(i, ':', centers[i, :], '\n')

    if not plot_graph:
        return

    games_count = len(x_labels)
    labels_count = len(emotions)
    # squeeze=False keeps `axs` a 2-D array even when there is a single
    # group; without it plt.subplots returns a bare Axes and indexing fails.
    _, axs = plt.subplots(games_count, figsize=(10, 10), squeeze=False)
    for ig in range(games_count):
        ax = axs[ig, 0]
        graph_labels = x_labels[ig]
        # Centroid coordinates belonging to this group's block of columns.
        graph_centers = centers[:, ig * labels_count:(ig + 1) * labels_count]
        for ik in range(k):
            ax.plot(graph_labels, graph_centers[ik, :], label=f'Cluster #{ik}')
        ax.legend(loc='upper right')

In [None]:
# Fit K-means with K clusters, print and plot the centroids, then leave the
# silhouette value as the cell's last expression so the notebook shows it.
clusterer, preds = build_kmeans(points=df, k=K)
show_results(clusterer, k=K, plot_graph=True)
calc_silhouette(df=df, preds=preds)

In [None]:
# Silhouette sweep: fit one K-means model per candidate k and score it.
# The comprehension keeps each per-k model and its labels local, so the
# module-level `clusterer` / `preds` from the K-cluster run are not
# silently replaced by the last model of the sweep.
sils = [calc_silhouette(df, build_kmeans(df, k)[1]) for k in K_range]

plt.plot(K_range, sils, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.title('Silhouette score per number of clusters (k)')
plt.show()

# Typically, mean silhouette over 0.6 is considered a "good" clustering solution
