In [None]:
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the raw dataset from the repository-relative data directory.
data_path = 'data/data_ellsberg.csv'
df = pd.read_csv(data_path)

In [None]:
# Emotion items; each one appears in the data once per game, suffixed with the game id.
emotions = ['Hopeful', 'Curiosity', 'Enlightenment', 'Thrilled', 'Anticipatory', 'Satisfied']

# Personality-trait columns used for the first-level clustering
# (spelled exactly as they are referenced when indexing the frame).
feature_cols = [
    'Openess',
    'Conciensiousness',
    'Extroversion',
    'Agreability',
    'Stability',
    'Locus',
]

# Game identifiers appended to every emotion name.
game_ids = ('1A', '1B', '2A', '2B')

# Flat list of emotion column names, grouped game by game (all of 1A, then 1B, ...).
emotions_cols = [f'{name}{game}' for game in game_ids for name in emotions]

# One list of shortened tick labels per game: first four letters of the emotion + game id.
x_labels = [[f'{name[:4]}{game}' for name in emotions] for game in game_ids]

# Keep only the personality-trait and emotion columns.
# .copy() makes the subset an independent frame, so adding a column to it
# later does not trigger pandas' SettingWithCopyWarning.
df = df[feature_cols + emotions_cols].copy()

#### Step 1: Find optimal K when clustering by personality traits

In [None]:
# Candidate numbers of clusters to evaluate: k = 2 .. 10 inclusive.
K_range = range(2, 10 + 1)

# Distance metric handed to silhouette_score when scoring a clustering.
distance_metric = 'euclidean'

In [None]:
def calc_silhouette(df, preds):
    """Score a cluster assignment with scikit-learn's ``silhouette_score``.

    Args:
        df: The samples that were clustered (one row per sample).
        preds: Cluster label of each row of ``df``.

    Returns:
        The value returned by ``silhouette_score`` for these samples and
        labels, computed with the notebook-level ``distance_metric``.
    """
    score = silhouette_score(X=df, labels=preds, metric=distance_metric)
    return score

In [None]:
def build_kmeans(points, k, random_state=0):
    """Fit a KMeans model with ``k`` clusters on ``points``.

    Args:
        points: Samples to cluster (one row per sample).
        k: Number of clusters to fit.
        random_state: Seed forwarded to ``KMeans``. A fixed default keeps the
            centroid initialisation, and therefore the cluster ids and any
            score derived from them, identical across kernel restarts.
            Pass ``None`` to get an unseeded fit.

    Returns:
        A ``(clusterer, preds)`` tuple: the fitted ``KMeans`` instance and the
        cluster label it assigned to each row of ``points``.
    """
    clusterer = KMeans(n_clusters=k, n_init='auto', random_state=random_state)
    preds = clusterer.fit_predict(points)
    return clusterer, preds

In [None]:
# Sweep the candidate cluster counts and record the silhouette score of each fit.
# The trait columns are selected once; the selection does not change between iterations.
personality_features = df[feature_cols]

sils = []
for k in K_range:
    preds = build_kmeans(personality_features, k)[1]  # only the labels are needed here
    sils.append(calc_silhouette(personality_features, preds))

# Silhouette score as a function of k; a higher score means better-separated clusters.
plt.plot(K_range, sils, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.show()

# Typically, mean silhouette over 0.6 is considered a "good" clustering solution


#### Use optimal K (= 2)

In [None]:
# Number of clusters used for the personality-trait clustering and, within
# each personality cluster, for the emotion-based clustering as well.
K = 2

#### Step 2: For each of the clusters we found, perform second level clustering based on emotions

In [None]:
# Fit the personality-level clustering on the trait columns.
feature_clusterer, preds = build_kmeans(df[feature_cols], K)

# Replace the trait columns with each row's personality-cluster id.
# drop(...).assign(...) builds a new frame instead of writing a column into
# `df` in place, which avoids pandas' SettingWithCopyWarning on a sliced frame.
# NOTE: the trait columns are gone after this cell, so it cannot be re-run
# on its own; re-run the notebook from the data-loading cell instead.
df = df.drop(columns=feature_cols).assign(fcluster=feature_clusterer.labels_)

In [None]:
# Quick sanity check: first rows of the reduced frame...
preview = df.head()
print(preview)

# ...followed by the number of rows in each personality cluster.
cluster_sizes = df['fcluster'].value_counts()
print(cluster_sizes)

In [None]:
def show_results(clusterer, k, print_labels=False, print_centers=False, plot_graph=False):
    """Print and/or plot the outcome of a fitted clustering.

    Args:
        clusterer: Fitted model exposing ``labels_`` and ``cluster_centers_``.
        k: Number of clusters to report; the first ``k`` rows of
            ``cluster_centers_`` are printed / plotted.
        print_labels: When true, print the per-sample cluster labels.
        print_centers: When true, print each of the ``k`` centroids.
        plot_graph: When true, draw one subplot per game (one entry of the
            notebook-level ``x_labels``) with one line per cluster showing
            that cluster's centroid values for the game's emotions.

    Note:
        Plotting slices the centroid columns in consecutive groups of
        ``len(emotions)``, so it assumes the model was fit on columns ordered
        game by game, in the same order as ``x_labels``.
    """
    if print_labels:
        print('labels:', clusterer.labels_, '\n')

    centers = clusterer.cluster_centers_

    if print_centers:
        print('centroids:')
        for i in range(k):
            print(i, ':', centers[i, :], '\n')

    if plot_graph:
        games_count = len(x_labels)
        labels_count = len(emotions)
        # squeeze=False keeps `axs` two-dimensional (rows x 1) even when there
        # is a single game; otherwise plt.subplots returns a bare Axes and
        # indexing it fails.
        _, axs = plt.subplots(games_count, figsize=(10, 10), squeeze=False)
        for ig in range(games_count):
            ax = axs[ig, 0]
            graph_labels = x_labels[ig]
            # Centroid coordinates belonging to this game's block of emotion columns.
            graph_centers = centers[:, ig * labels_count:(ig + 1) * labels_count]
            for ik in range(k):
                ax.plot(graph_labels, graph_centers[ik, :], label=f'Emotion cluster #{ik}')
            # Build the legend once per subplot, after all of its lines exist.
            ax.legend(loc='upper right')

In [None]:
def show_k_results(df, k, plot_graph=True):
    """Cluster the rows of one personality cluster by emotion and show the result.

    Args:
        df: Frame holding an ``fcluster`` column plus the columns to cluster on.
        k: Personality-cluster id to select (rows where ``fcluster == k``).
            This is a cluster *label*, not a cluster count; the number of
            emotion clusters is taken from the notebook-level ``K``.
        plot_graph: Forwarded to ``show_results`` to enable or disable plotting.
    """
    in_cluster = df['fcluster'] == k
    members = df.loc[in_cluster].drop(columns='fcluster')
    emotion_clusterer = build_kmeans(members, K)[0]
    show_results(emotion_clusterer, K, plot_graph=plot_graph)

#### Emotion-based clusters within personality cluster #1 (`fcluster == 0`)

In [None]:
# Second-level (emotion) clustering for the rows labelled fcluster == 0.
show_k_results(df, k=0)

#### Emotion-based clusters within personality cluster #2 (`fcluster == 1`)

In [None]:
# Second-level (emotion) clustering for the rows labelled fcluster == 1.
show_k_results(df, k=1)