# Cluster Comparison - T vs B in km and bmm for k=5 (plus bmm with k=9)

The km5 clusters do a better job of distinguishing T and B than bmm5.

* km5 - S2
    * witness
      * strong duchenne   **(p = 0.0018)** 
      * 6 only            **(p = 0.0056)**
    * interrogator
      * polite            (p = 0.0620)

* bmm5
    * witness
      * strong duchenne   (p = 0.155)
      * 6 only            (p = 0.0718)
    * interrogator
      * polite            **(p = 0.0213)**
      
* bmm9
    * witness
      * strong duchenne(8)   **(p = 0.0008)**
      * 6 only(0)            (p = 0.0372)
      * 6 only(6)            (p = 0.0615)
      * med hi duchenne(1)   **(p = 0.0391) Truth higher**
    * interrogator
      * polite(2)            **(p = 0.0211)**
      * polite(4)            (p = 0.2175)
      * 6 only(0)            **(p = 0.0111)**
      

In [18]:
# Input files: per-frame data with an 'AU06_AU12_cluster' assignment column.
#datafile = 'all_frames.pkl.xz' # FG/UBICOMP data N=151
datafile_km = '../data/all_frames_wclust.pkl.xz' # k-means, with AU6_AU12 clusters
datafile_bmm = '../data/all_frames_clust.bmm.pkl.xz' # beta mixture (bmm) with 5 clusters
datafile_bmm9 = '../data/all_frames_clust.bmm9.pkl.xz' # beta mixture (bmm) with 9 clusters

CONFIDENCE_TOL = 0.90 # only use frames with confidence >= this (see load_cluster_data)

#-----------------

import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.options.display.float_format = '{:,.4f}'.format
from IPython.display import display
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind
# import in scripts dir
import sys
sys.path.append('../scripts')
import compare 

# Cluster label -> integer id as stored in the 'AU06_AU12_cluster' column.
# Used for every data file that is not a bmm9 file (i.e. the 5-cluster models).
cluster_name2i = {
    label: idx
    for idx, label in enumerate(
        ['duchenne', 'neutral', 'strong duchenne', '6 only', 'polite']
    )
}
# The 9-cluster model has no descriptive labels: each cluster is named by its
# id rendered as a string ('0' .. '8').
cluster_name2i_bmm9 = {str(idx): idx for idx in range(9)}

def load_cluster_data(datafile):
    """Load a per-frame cluster file and add one 0/1 indicator column per cluster.

    : datafile: path to the data; read with pd.read_pickle when the path
                contains 'pkl', otherwise with pd.read_csv
    : returns: a DataFrame restricted to rows with confidence >= CONFIDENCE_TOL,
               with one extra int column per cluster name that is 1 where
               'AU06_AU12_cluster' equals that cluster's id and 0 elsewhere

    Paths containing 'bmm9' use the cluster_name2i_bmm9 mapping; every other
    path uses cluster_name2i. A short summary (row count, number of distinct
    'Filename' values, column names) is printed as a side effect.
    """
    print('\n...loading cluster data: ', datafile)
    if 'pkl' in datafile:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        df = pd.read_pickle(datafile)
    else:
        df = pd.read_csv(datafile, skipinitialspace=True)
    # Take an explicit copy of the filtered rows: the indicator columns added
    # below must land on an independent frame, not on a slice of the loaded
    # one (which can raise SettingWithCopyWarning).
    df = df[df['confidence'] >= CONFIDENCE_TOL].copy()
    # confidence of .90 causes nfiles 302-->298

    # Pick the label -> id mapping once instead of duplicating the loop per branch.
    name2i = cluster_name2i_bmm9 if 'bmm9' in datafile else cluster_name2i
    for cluster_name, cluster_id in name2i.items():
        df[cluster_name] = (df['AU06_AU12_cluster'] == cluster_id).astype(int)

    print('n:',df.shape[0])
    print('# files = ', df['Filename'].nunique())

    print('\ncolumns: ')
    for c in df.columns:
        print(c,end=',')
    print('-------')
    return df
        
# Load each clustering; every frame is filtered by CONFIDENCE_TOL and gains
# per-cluster indicator columns inside load_cluster_data.
df_km = load_cluster_data(datafile_km)      # k-means clusters
df_bmm = load_cluster_data(datafile_bmm)    # beta mixture, 5 clusters
df_bmm9 = load_cluster_data(datafile_bmm9)  # beta mixture, 9 clusters



...loading cluster data:  ../data/all_frames_wclust.pkl.xz
n: 1246319
# files =  298

columns: 
Filename,filetype,segment,timestamp,confidence,success,pose_Tx,pose_Ty,pose_Tz,pose_Rx,pose_Ry,pose_Rz,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,AU09_r,AU10_r,AU12_r,AU14_r,AU15_r,AU17_r,AU20_r,AU23_r,AU25_r,AU26_r,AU45_r,AU01_c,AU02_c,AU04_c,AU05_c,AU06_c,AU07_c,AU09_c,AU10_c,AU12_c,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,AU06_AU12_cluster,duchenne,neutral,strong duchenne,6 only,polite,-------

...loading cluster data:  ../data/all_frames_clust.bmm.pkl.xz
n: 1246319
# files =  298

columns: 
Filename,filetype,confidence,segment,timestamp,AU06_AU12_cluster,duchenne,neutral,strong duchenne,6 only,polite,-------

...loading cluster data:  ../data/all_frames_clust.bmm9.pkl.xz
n: 1246319
# files =  298

columns: 
Filename,filetype,confidence,segment,timestamp,AU06_AU12_cluster,0,1,2,3,4,5,6,7,8,-------


In [19]:
# cluster statistics: mean of each 0/1 cluster indicator per filetype,
# i.e. the share of frames of that filetype assigned to each cluster.
print('Average cluster dist over all frames')
for _heading, _frames, _name2i in (
    ('K-MEANS', df_km, cluster_name2i),
    ('\n\nBETA MIXTURE bmm5', df_bmm, cluster_name2i),
    ('\n\nBETA MIXTURE bmm9', df_bmm9, cluster_name2i_bmm9),
):
    print(_heading)
    g = _frames.groupby('filetype')
    cluster_dist = g[list(_name2i.keys())].mean()
    display(cluster_dist)

Average cluster dist over all frames
K-MEANS


Unnamed: 0_level_0,duchenne,neutral,strong duchenne,6 only,polite
filetype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
I-B,0.171,0.37,0.058,0.163,0.237
I-T,0.157,0.44,0.059,0.184,0.16
W-B,0.134,0.439,0.073,0.127,0.228
W-T,0.13,0.417,0.044,0.179,0.231




BETA MIXTURE bmm5


Unnamed: 0_level_0,duchenne,neutral,strong duchenne,6 only,polite
filetype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
I-B,0.44,0.143,0.199,0.056,0.162
I-T,0.418,0.191,0.196,0.089,0.106
W-B,0.403,0.169,0.177,0.073,0.178
W-T,0.478,0.137,0.147,0.08,0.158




BETA MIXTURE bmm9


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
filetype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
I-B,0.045,0.064,0.153,0.143,0.081,0.227,0.052,0.211,0.024
I-T,0.066,0.064,0.099,0.191,0.06,0.213,0.082,0.2,0.024
W-B,0.049,0.065,0.167,0.169,0.091,0.206,0.067,0.149,0.037
W-T,0.072,0.051,0.147,0.137,0.085,0.256,0.072,0.161,0.019


### Statistical comparisons (Mann-Whitney U test & Cohen's d)

In [24]:
def compare_t_b(df_seg,filetype_t,filetype_b, cluster_names):
    """ averages each cluster column per file for two groups, then compares them
    : df_seg: the subset of df for a given segment, e.g. S2
    : filetype_t: the filetype which forms the t group, e.g. I-T or W-T
    : filetype_b: the filetype which forms the b group, e.g. I-B or W-B
    : cluster_names: the columns of df_seg to compare, one table row each

    Prints the two filetypes and displays a table indexed by cluster name;
    nothing is returned.
    """
    def per_file_means(filetype):
        # One row per Filename: the mean of every requested cluster column.
        rows = df_seg[df_seg['filetype'] == filetype]
        return rows.groupby('Filename')[cluster_names].mean()

    t_means = per_file_means(filetype_t)
    b_means = per_file_means(filetype_b)

    # compare.Compare lives in ../scripts; its calc_stats() output is assumed
    # to line up with the five column labels below - TODO confirm there.
    stat_rows = [
        np.array(compare.Compare(t_means[name], b_means[name]).calc_stats())
        for name in cluster_names
    ]

    result = pd.DataFrame(np.array(stat_rows))
    result.columns = ['T mean','B mean','tt p-val','MW p-val','d']
    result.index = cluster_names

    print(filetype_t,' vs ', filetype_b)
    display(result)
    print()
    
def do_compare(df_seg, cluster_names):
    """ runs the T vs B comparison for the 'W-' filetypes, then the 'I-' filetypes
    : df_seg: the subset of df to compare, e.g. a single segment
    : cluster_names: the cluster columns passed through to compare_t_b
    """
    for role in ('W', 'I'):
        compare_t_b(df_seg, role + '-T', role + '-B', cluster_names)

#---------------------------------------------
# comparison of T and B for different segments
# Same S2-only comparison for each clustering; only the frame and the
# label -> id mapping (which supplies the column names) differ.
for _title, _frames, _name2i in (
    ('SEGMENT = S2 ONLY - KMEANS', df_km, cluster_name2i),
    ('SEGMENT = S2 ONLY - BMM5', df_bmm, cluster_name2i),
    ('SEGMENT = S2 ONLY - BMM9', df_bmm9, cluster_name2i_bmm9),
):
    print(_title)
    df_seg = _frames[_frames['segment'] == 'S2']
    cluster_names = list(_name2i.keys())
    do_compare(df_seg, cluster_names)

SEGMENT = S2 ONLY - KMEANS
W-T  vs  W-B


Unnamed: 0,T mean,B mean,tt p-val,MW p-val,d
duchenne,0.1361,0.137,0.9739,0.2982,0.0055
neutral,0.3725,0.378,0.922,0.4977,0.0163
strong duchenne,0.0206,0.0625,0.0068,0.0018,0.4562
6 only,0.2203,0.1643,0.221,0.0056,-0.205
polite,0.2506,0.2582,0.8581,0.3988,0.0298



I-T  vs  I-B


Unnamed: 0,T mean,B mean,tt p-val,MW p-val,d
duchenne,0.1477,0.1625,0.6544,0.104,0.0741
neutral,0.4274,0.3225,0.0647,0.0766,-0.3073
strong duchenne,0.0394,0.0517,0.4893,0.0935,0.1142
6 only,0.1744,0.1817,0.8639,0.2095,0.0283
polite,0.2111,0.2816,0.1021,0.062,0.2712



SEGMENT = S2 ONLY - BMM5
W-T  vs  W-B


Unnamed: 0,T mean,B mean,tt p-val,MW p-val,d
duchenne,0.5338,0.4682,0.1617,0.0856,-0.2342
neutral,0.1113,0.1215,0.7794,0.4764,0.0467
strong duchenne,0.1276,0.1689,0.2328,0.1549,0.1991
6 only,0.0789,0.0572,0.3567,0.0718,-0.1545
polite,0.1483,0.1843,0.3097,0.1839,0.1695



I-T  vs  I-B


Unnamed: 0,T mean,B mean,tt p-val,MW p-val,d
duchenne,0.5124,0.4742,0.4307,0.2216,-0.1304
neutral,0.1505,0.0943,0.1616,0.198,-0.2324
strong duchenne,0.1539,0.1802,0.4753,0.1386,0.118
6 only,0.0657,0.0611,0.866,0.0537,-0.0278
polite,0.1174,0.1901,0.0385,0.0213,0.3442



SEGMENT = S2 ONLY - BMM9
W-T  vs  W-B


Unnamed: 0,T mean,B mean,tt p-val,MW p-val,d
0,0.0737,0.0522,0.119,0.0372,-0.2616
1,0.038,0.0605,0.1359,0.0391,0.2489
2,0.1384,0.1707,0.3435,0.2209,0.1581
3,0.1113,0.1214,0.7793,0.4764,0.0468
4,0.0862,0.104,0.2865,0.203,0.178
5,0.2808,0.2415,0.2312,0.1045,-0.2005
6,0.0716,0.0517,0.3812,0.0615,-0.1468
7,0.1951,0.1706,0.4741,0.2764,-0.1199
8,0.0049,0.0274,0.0033,0.0008,0.4982



I-T  vs  I-B


Unnamed: 0,T mean,B mean,tt p-val,MW p-val,d
0,0.0832,0.0448,0.0571,0.0111,-0.3177
1,0.0424,0.0559,0.4032,0.1349,0.1381
2,0.1096,0.1791,0.0421,0.0211,0.3379
3,0.1504,0.0944,0.1625,0.198,-0.2319
4,0.0795,0.0877,0.618,0.2175,0.0825
5,0.267,0.2491,0.6302,0.4284,-0.0796
6,0.0589,0.057,0.9454,0.0693,-0.0113
7,0.1934,0.2146,0.6009,0.079,0.0866
8,0.0155,0.0173,0.8005,0.1604,0.0417





### S2 + S3

In [7]:
# do_compare takes the cluster column names as a required second argument.
# Both frames compared here (k-means and the 5-cluster BMM) carry the
# cluster_name2i indicator columns.
cluster_names = list(cluster_name2i.keys())

print('\nSEGMENT = S2 & S3 - KMEANS')
df_seg = df_km[df_km['segment'].isin(['S2', 'S3'])]
do_compare(df_seg, cluster_names)

print('\nSEGMENT = S2 & S3 - BMM')
df_seg = df_bmm[df_bmm['segment'].isin(['S2', 'S3'])]
do_compare(df_seg, cluster_names)


SEGMENT = S2 & S3 - KMEANS
W-T  vs  W-B


Unnamed: 0,T mean,B mean,tt p-val,MW p-val,d
duchenne,0.126,0.136,0.675,0.422,0.069
neutral,0.4,0.406,0.91,0.425,0.019
strong duchenne,0.031,0.057,0.039,0.063,0.345
6 only,0.212,0.161,0.216,0.009,-0.206
polite,0.232,0.24,0.833,0.363,0.035



I-T  vs  I-B


Unnamed: 0,T mean,B mean,tt p-val,MW p-val,d
duchenne,0.157,0.155,0.945,0.296,-0.011
neutral,0.424,0.348,0.172,0.138,-0.226
strong duchenne,0.045,0.049,0.741,0.127,0.055
6 only,0.201,0.196,0.91,0.177,-0.019
polite,0.174,0.252,0.032,0.026,0.356




SEGMENT = S2 & S3 - BMM
W-T  vs  W-B


Unnamed: 0,T mean,B mean,tt p-val,MW p-val,d
duchenne,0.508,0.447,0.151,0.081,-0.239
neutral,0.118,0.139,0.567,0.425,0.095
strong duchenne,0.133,0.166,0.248,0.241,0.192
6 only,0.089,0.065,0.321,0.011,-0.165
polite,0.152,0.184,0.355,0.166,0.153



I-T  vs  I-B


Unnamed: 0,T mean,B mean,tt p-val,MW p-val,d
duchenne,0.456,0.46,0.925,0.48,0.016
neutral,0.171,0.122,0.241,0.178,-0.194
strong duchenne,0.174,0.181,0.809,0.203,0.04
6 only,0.087,0.069,0.508,0.041,-0.109
polite,0.112,0.168,0.066,0.033,0.305



