In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from kmeans import KMeans
from kmeans_printer import print_per_game_kmeans
from pca import print_pca

from symmetric_uncertainty import SUT

from data_common import PERSONALITY_FEATURES, EMOTIONS_FEATURES, EMOTIONS_LABELS, DAA, CLUSTER

# Load data/data_ellsberg.csv and keep only the columns this notebook uses:
# the personality features, the emotion features, and the DAA column.
# .copy() makes `df` an independent frame, so the cluster-label columns that
# later cells add do not trigger pandas' SettingWithCopyWarning (the
# per-cluster subsets further down are copied for the same reason).
df = pd.read_csv('data/data_ellsberg.csv')
df = df[PERSONALITY_FEATURES + EMOTIONS_FEATURES + [DAA]].copy()

#### Step 1: Find optimal K when clustering by personality traits

In [None]:
# Seed reused by KMeans fits in later cells, and the candidate cluster
# counts to scan (2 through 10 inclusive).
random_state = 18
K_range = range(2, 11)

# Fit one model per candidate k on the personality features and record
# its silhouette.
sils = []
for k in K_range:
    model = KMeans(df, k, features=PERSONALITY_FEATURES, random_state=random_state)
    sils.append(model.silhouette)

# Silhouette versus k.
plt.plot(K_range, sils, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouettes')
plt.show()

#### Use optimal K (= 2)

In [None]:
# Cluster count read off the silhouette plot above; later cells pass it to KMeans.
K = 2

#### Step 2: For each personality cluster found above, perform second-level clustering based on emotions

In [None]:
# Column that stores each row's personality-level cluster label.
PERSONALITY_CLUSTER = 'PersonalityCluster'

# Pass the seed explicitly, as every other KMeans call in this notebook does.
# Later cells split the data on the literal labels 0 and 1, so the label
# assignment should not depend on an unseeded initialisation.
km = KMeans(df, K, features=PERSONALITY_FEATURES, random_state=random_state)
df[PERSONALITY_CLUSTER] = km.labels

# Number of rows per personality cluster.
print(df[PERSONALITY_CLUSTER].value_counts())

In [None]:
# Hand the frame plus the fitted model's feature list and labels to print_pca.
print_pca(
    df,
    features=km.features,
    labels=km.labels,
)

In [None]:
# Expose the personality labels under the generic CLUSTER column, then print
# the result of SUT for the frame.
# NOTE(review): SUT presumably reads the CLUSTER column - confirm in symmetric_uncertainty.
df[CLUSTER] = df[PERSONALITY_CLUSTER]
personality_sut = SUT(df)
print(personality_sut)

In [None]:
# The emotion-level scans sweep the same candidate cluster counts as the
# personality-level scan: every k from first_k to last_k inclusive.
first_k, last_k = 2, 10
K_range = range(first_k, last_k + 1)

#### Emotion-based clusters within personality cluster #1 (label 0)

In [None]:
# Rows whose personality label is 0, copied so columns added later stay off `df`.
in_cluster_0 = df[PERSONALITY_CLUSTER] == 0
df0 = df[in_cluster_0].copy()

# Silhouette for each candidate k when this subset is clustered on the
# emotion features.
sils = []
for k in K_range:
    model = KMeans(df0, k, features=EMOTIONS_FEATURES, random_state=random_state)
    sils.append(model.silhouette)
print(sils)

# Silhouette versus k.
plt.plot(K_range, sils, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette')
plt.show()

In [None]:
# Cluster the label-0 subset on the emotion features and print the per-game report.
# NOTE(review): K is the value picked from the personality-level scan; confirm
# it is also the intended cluster count for this emotion-level fit.
km = KMeans(
    df0,
    K,
    features=EMOTIONS_FEATURES,
    random_state=random_state,
)
print_per_game_kmeans(km, EMOTIONS_LABELS, plot_graph=True)

In [None]:
# Store the emotion-level labels under CLUSTER on the subset, then print the
# result of SUT for it.
df0[CLUSTER] = km.labels
sut_cluster_0 = SUT(df0)
print(sut_cluster_0)

#### Emotion-based clusters within personality cluster #2 (label 1)

In [None]:
# Rows whose personality label is 1, copied so columns added later stay off `df`.
df1 = df[df[PERSONALITY_CLUSTER] == 1].copy()

# Silhouette for each candidate k when this subset is clustered on the
# emotion features. The shared `random_state` variable replaces a repeated
# literal so this scan cannot drift from the one run on the label-0 subset.
sils = [
    KMeans(df1, k, features=EMOTIONS_FEATURES, random_state=random_state).silhouette
    for k in K_range
]
print(sils)

# Silhouette versus k.
plt.plot(K_range, sils, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette')
plt.show()

In [None]:
# Cluster the label-1 subset on the emotion features and print the per-game report.
# NOTE(review): K is the value picked from the personality-level scan; confirm
# it is also the intended cluster count for this emotion-level fit.
km = KMeans(
    df1,
    K,
    features=EMOTIONS_FEATURES,
    random_state=random_state,
)
print_per_game_kmeans(km, EMOTIONS_LABELS, plot_graph=True)

In [None]:
# Store the emotion-level labels under CLUSTER on the subset, then print the
# result of SUT for it.
df1[CLUSTER] = km.labels
sut_cluster_1 = SUT(df1)
print(sut_cluster_1)