In [None]:
import numpy as np
import pandas as pd

from kmeans import KMeans
from pca import print_pca, print_nmf

from data_common import EMOTIONS_FEATURES, PERSONALITY_FEATURES, DAA, SEX

# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/data_ellsberg.csv')

In [None]:
# Encode sex as an integer: 'Male' -> 0, every other value (including missing) -> 1.
# The numeric-dtype guard keeps this cell safe to re-run: without it, a second
# run compares the already-encoded 0/1 values with 'Male' and turns every row into 1.
# NOTE(review): the literal 'Sex' is presumably the same column as data_common.SEX — confirm.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = np.where(df['Sex'] == 'Male', 0, 1)

# Feature columns shared by both DAA groups (the DAA flag itself is excluded).
group_columns = PERSONALITY_FEATURES + EMOTIONS_FEATURES + [SEX]

# Keep only the analysed columns; the overall frame also carries the DAA flag.
df = df[PERSONALITY_FEATURES + EMOTIONS_FEATURES + [DAA, SEX]]

# Split the rows on the DAA flag and keep only the shared feature columns.
not_daa_df = df.loc[df[DAA] == 0, group_columns]
daa_df = df.loc[df[DAA] == 1, group_columns]

In [None]:
# Column-wise mean and standard deviation over every row of the prepared frame.
all_mean, all_std = df.mean(axis=0), df.std(axis=0)

# The same statistics restricted to the rows where the DAA flag is 0.
not_daa_mean, not_daa_std = not_daa_df.mean(axis=0), not_daa_df.std(axis=0)

# The same statistics restricted to the rows where the DAA flag is 1.
daa_mean, daa_std = daa_df.mean(axis=0), daa_df.std(axis=0)

In [None]:
def to_table(not_daa, daa):
    """Stack two per-feature results into a two-row table.

    Args:
        not_daa: values for the non-DAA group (one entry per feature).
        daa: values for the DAA group, in the same layout.

    Returns:
        A DataFrame with one column per feature and the rows labelled
        'notDAA' and 'DAA', in that order.
    """
    # Place the inputs side by side, then flip so that each input becomes a row.
    table = pd.concat([not_daa, daa], axis=1).T
    table.index = ['notDAA', 'DAA']
    return table

def to_summary(all_mean, all_std, not_daa_mean, not_daa_std, daa_mean, daa_std):
    """Print one line per feature comparing the overall, non-DAA and DAA statistics.

    Each line shows the feature label, then ``mean(std)`` for all rows, for the
    non-DAA group and for the DAA group, and finally the relative difference
    ``(daa_mean - not_daa_mean) / daa_mean`` as a percentage.

    Values are looked up by label rather than by position, so inputs whose
    indexes differ (for example overall statistics that carry a column the
    group statistics lack) still produce correctly labelled rows. A label that
    is missing from any input is skipped.

    Args:
        all_mean: pandas Series of per-feature means over all rows.
        all_std: pandas Series of per-feature standard deviations over all rows.
        not_daa_mean: per-feature means of the non-DAA group.
        not_daa_std: per-feature standard deviations of the non-DAA group.
        daa_mean: per-feature means of the DAA group.
        daa_std: per-feature standard deviations of the DAA group.

    Returns:
        None. The table is written to stdout.
    """
    # Series arithmetic aligns on labels, so this is correct even when the
    # two group indexes are ordered differently.
    diff = ((daa_mean - not_daa_mean) / daa_mean) * 100
    columns = (all_mean, all_std, not_daa_mean, not_daa_std, daa_mean, daa_std, diff)
    for label in all_mean.index:
        # Zipping the series positionally would pair this label with another
        # feature's group values whenever the indexes differ; skip instead.
        if not all(label in column.index for column in columns):
            continue
        alm, als, ndm, nds, dm, ds, pct = (column.loc[label] for column in columns)
        print(f'{label:20} {alm:6.3f}({als:5.3f})\t\t{ndm:6.3f}({nds:5.3f})\t\t{dm:6.3f}({ds:5.3f})\t\t{pct:6.2f}%')

# Print the per-feature mean(std) comparison of all rows, the non-DAA group and the DAA group.
to_summary(all_mean, all_std, not_daa_mean, not_daa_std, daa_mean, daa_std)

In [None]:
# Run the project's KMeans helper on the personality feature columns.
# NOTE(review): the positional argument 1 is presumably the number of clusters — confirm against kmeans.KMeans.
# random_state is pinned, presumably so that repeated runs give the same result — confirm.
person_km = KMeans(df, 1, features=PERSONALITY_FEATURES, random_state=69)
# Pass the frame plus the helper's features and labels to the PCA and NMF reporting helpers.
print_pca(df, person_km.features, person_km.labels)
print_nmf(df, person_km.features, person_km.labels)

In [None]:
# Run the project's KMeans helper on the emotion feature columns.
# NOTE(review): the positional argument 1 is presumably the number of clusters — confirm against kmeans.KMeans.
# random_state is pinned, presumably so that repeated runs give the same result — confirm.
emotions_km = KMeans(df, 1, features=EMOTIONS_FEATURES, random_state=69)
# Pass the frame plus the helper's features and labels to the PCA and NMF reporting helpers.
print_pca(df, emotions_km.features, emotions_km.labels)
print_nmf(df, emotions_km.features, emotions_km.labels)