In [71]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm

from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
import scikit_posthocs as sp

# Load the experiment data once; the analysis cells below all read from this frame.
datadf = pd.read_csv(filepath_or_buffer='data/data-and-metrics.csv')

def anovaAndTukey(metric):
    """Run raw and decile-ranked two-way ANOVAs plus a Tukey HSD test for one metric.

    Works on a copy of the module-level ``datadf`` frame. The metric column is
    binned into ten quantile bins (``q_rank``) and three CSV files are written
    under ``data/``:

    * ``anova_table_<metric>_raw.csv``    - type II ANOVA of ``timetaken`` on the
      raw metric values, ``target`` and their interaction.
    * ``anova_table_<metric>_ranked.csv`` - the same model with ``q_rank`` in
      place of the raw metric.
    * ``tukey_adhoc_<metric>.csv``        - Tukey HSD (alpha = 0.05) on
      ``timetaken`` across the combined ``<q_rank>_<target>`` groups.

    Parameters
    ----------
    metric : str
        Name of a numeric column of ``datadf``. It is spliced into the model
        formula and into the output file names.

    Returns
    -------
    None
    """
    # reset_index() returns a new frame, so the columns added below never touch datadf.
    dflim = datadf.reset_index()
    dflim['q_rank'] = pd.qcut(dflim[metric], 10, labels=False)
    dflim['q_rank_cat'] = dflim['q_rank'].astype(str) + '_' + dflim['target'].astype(str)

    # Raw model: the interaction must be metric x target. Crossing q_rank with the
    # metric it was derived from adds no information and leaves the
    # metric x target effect untested.
    raw_formula = 'timetaken ~ C(' + metric + ') + C(target) +  C(' + metric + '):C(target)'
    model = ols(raw_formula, data=dflim).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    anova_table.to_csv('data/anova_table_' + metric + '_raw.csv')

    # Ranked model: same structure with the quantile bin replacing the raw metric.
    model = ols('timetaken ~ C(q_rank) + C(target) +  C(q_rank):C(target)', data=dflim).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    anova_table.to_csv('data/anova_table_' + metric + '_ranked.csv')

    # Post hoc: compare timetaken between every pair of <q_rank>_<target> groups.
    tkhsd_results = pairwise_tukeyhsd(endog=dflim['timetaken'], groups=dflim['q_rank_cat'], alpha=0.05)
    # summary() is the public accessor for the results table; its first row is the header.
    results_rows = tkhsd_results.summary().data
    tkdf = pd.DataFrame(data=results_rows[1:], columns=results_rows[0])
    tkdf.to_csv('data/tukey_adhoc_' + metric + '.csv')

          
def anovaAndTukeyByBaseMap(metric, basemap):
    """Run the raw/ranked ANOVAs and Tukey HSD for one metric on a single basemap.

    Filters the module-level ``datadf`` to rows whose ``basemap`` column equals
    ``basemap``, bins the metric into ten quantile bins (``q_rank``) within that
    subset, and writes three CSV files under ``data/``:

    * ``anova_table_<metric>_raw_<basemap>.csv``    - type II ANOVA of
      ``timetaken`` on the raw metric values, ``target`` and their interaction.
    * ``anova_table_<metric>_ranked_<basemap>.csv`` - the same model with
      ``q_rank`` in place of the raw metric.
    * ``tukey_adhoc_<metric>_<basemap>.csv``        - Tukey HSD (alpha = 0.05)
      on ``timetaken`` across the combined ``<q_rank>_<target>`` groups.

    Parameters
    ----------
    metric : str
        Name of a numeric column of ``datadf``. It is spliced into the model
        formula and into the output file names.
    basemap : str
        Value of the ``basemap`` column to keep; also used in the file names.

    Returns
    -------
    None
    """
    # reset_index() yields a fresh frame, so adding columns below is safe
    # (no SettingWithCopy on the filtered view) and datadf stays untouched.
    dflim = datadf.loc[datadf['basemap'] == basemap].reset_index()
    dflim['q_rank'] = pd.qcut(dflim[metric], 10, labels=False)
    dflim['q_rank_cat'] = dflim['q_rank'].astype(str) + '_' + dflim['target'].astype(str)

    # Raw model: metric, target and their interaction, all treated as categorical.
    raw_formula = 'timetaken ~ C(' + metric + ') + C(target) +  C(' + metric + '):C(target)'
    model = ols(raw_formula, data=dflim).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    anova_table.to_csv('data/anova_table_' + metric + '_raw_' + basemap + '.csv')

    # Ranked model: same structure with the quantile bin replacing the raw metric.
    model = ols('timetaken ~ C(q_rank) + C(target) +  C(q_rank):C(target)', data=dflim).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    anova_table.to_csv('data/anova_table_' + metric + '_ranked_' + basemap + '.csv')

    # Post hoc on the response variable. Using q_rank as endog would compare the
    # bin index against groups built from that same index, which is meaningless.
    tkhsd_results = pairwise_tukeyhsd(endog=dflim['timetaken'], groups=dflim['q_rank_cat'], alpha=0.05)
    # summary() is the public accessor for the results table; its first row is the header.
    results_rows = tkhsd_results.summary().data
    tkdf = pd.DataFrame(data=results_rows[1:], columns=results_rows[0])
    tkdf.to_csv('data/tukey_adhoc_' + metric + '_' + basemap + '.csv')


In [29]:
# Tukey HSD (alpha = 0.05): pairwise differences in `timetaken` between `target` groups.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['timetaken'],
    groups=datadf['target'],
    alpha=0.05,
)
# Row 0 of the results table holds the column names; the remaining rows are the comparisons.
tkdf = pd.DataFrame(
    tkhsd_results._results_table.data[1:],
    columns=tkhsd_results._results_table.data[0],
)
tkdf


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt,less gestalt,163.0674,0.001,140.4308,185.704,True
1,gestalt,no target,364.8881,0.001,345.3794,384.3968,True
2,less gestalt,no target,201.8206,0.001,182.1211,221.5202,True


In [54]:
# Restrict to trials whose `distractor` value is 'random'.
dflim = datadf[datadf['distractor'].eq('random')]

# Tukey HSD (alpha = 0.05) on `timetaken` between `target` groups within that subset.
tkhsd_results = pairwise_tukeyhsd(
    endog=dflim['timetaken'],
    groups=dflim['target'],
    alpha=0.05,
)
# Row 0 of the results table holds the column names; the remaining rows are the comparisons.
tkdf = pd.DataFrame(
    tkhsd_results._results_table.data[1:],
    columns=tkhsd_results._results_table.data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt,less gestalt,124.463,0.001,89.3011,159.625,True
1,gestalt,no target,328.9537,0.001,298.6713,359.2361,True
2,less gestalt,no target,204.4907,0.001,173.8301,235.1513,True


In [4]:
# Restrict to trials whose `distractor` value is 'clustered'.
dflim = datadf[datadf['distractor'].eq('clustered')]

# Tukey HSD (alpha = 0.05) on `timetaken` between `target` groups within that subset.
tkhsd_results = pairwise_tukeyhsd(
    endog=dflim['timetaken'],
    groups=dflim['target'],
    alpha=0.05,
)
# Row 0 of the results table holds the column names; the remaining rows are the comparisons.
tkdf = pd.DataFrame(
    tkhsd_results._results_table.data[1:],
    columns=tkhsd_results._results_table.data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt,less gestalt,192.3323,0.001,162.2121,222.4525,True
1,gestalt,no target,393.5649,0.001,367.6028,419.527,True
2,less gestalt,no target,201.2326,0.001,175.0466,227.4186,True


In [5]:
# Tukey HSD (alpha = 0.05): pairwise differences in `timetaken` between `basemap` groups.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['timetaken'],
    groups=datadf['basemap'],
    alpha=0.05,
)
# Row 0 of the results table holds the column names; the remaining rows are the comparisons.
tkdf = pd.DataFrame(
    tkhsd_results._results_table.data[1:],
    columns=tkhsd_results._results_table.data[0],
)
tkdf


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,dark,imagery,123.8426,0.001,98.4011,149.2841,True
1,dark,none,22.8542,0.076,-1.5477,47.2561,False
2,dark,streets,328.091,0.001,302.6121,353.57,True
3,imagery,none,-100.9885,0.001,-125.412,-76.5649,True
4,imagery,streets,204.2484,0.001,178.7487,229.7481,True
5,none,streets,305.2369,0.001,280.7743,329.6994,True


In [6]:
# Restrict to trials whose `distractor` value is 'clustered'.
dflim = datadf[datadf['distractor'].eq('clustered')]

# Tukey HSD (alpha = 0.05) on `timetaken` between `basemap` groups within that subset.
tkhsd_results = pairwise_tukeyhsd(
    endog=dflim['timetaken'],
    groups=dflim['basemap'],
    alpha=0.05,
)
# Row 0 of the results table holds the column names; the remaining rows are the comparisons.
tkdf = pd.DataFrame(
    tkhsd_results._results_table.data[1:],
    columns=tkhsd_results._results_table.data[0],
)
tkdf


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,dark,imagery,116.3502,0.001,82.1414,150.559,True
1,dark,none,28.4975,0.106,-3.8073,60.8023,False
2,dark,streets,341.7534,0.001,307.501,376.0058,True
3,imagery,none,-87.8527,0.001,-120.1798,-55.5256,True
4,imagery,streets,225.4032,0.001,191.1298,259.6766,True
5,none,streets,313.2559,0.001,280.8827,345.6291,True


In [7]:
# Restrict to trials whose `distractor` value is 'random'.
dflim = datadf[datadf['distractor'].eq('random')]

# Tukey HSD (alpha = 0.05) on `timetaken` between `basemap` groups within that subset.
tkhsd_results = pairwise_tukeyhsd(
    endog=dflim['timetaken'],
    groups=dflim['basemap'],
    alpha=0.05,
)
# Row 0 of the results table holds the column names; the remaining rows are the comparisons.
tkdf = pd.DataFrame(
    tkhsd_results._results_table.data[1:],
    columns=tkhsd_results._results_table.data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,dark,imagery,132.5768,0.001,93.2367,171.917,True
1,dark,none,-9.1276,0.9,-47.0728,28.8175,False
2,dark,streets,310.9828,0.001,271.5741,350.3916,True
3,imagery,none,-141.7044,0.001,-179.7156,-103.6933,True
4,imagery,streets,178.406,0.001,138.9338,217.8783,True
5,none,streets,320.1105,0.001,282.0284,358.1926,True


In [8]:
# Tukey HSD (alpha = 0.05): pairwise differences in `correct` between `target` groups.
# NOTE(review): `correct` looks like a 0-1 outcome; confirm Tukey HSD is appropriate for it.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['correct'],
    groups=datadf['target'],
    alpha=0.05,
)
# Row 0 of the results table holds the column names; the remaining rows are the comparisons.
tkdf = pd.DataFrame(
    tkhsd_results._results_table.data[1:],
    columns=tkhsd_results._results_table.data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt,less gestalt,-0.0636,0.001,-0.0686,-0.0587,True
1,gestalt,no target,0.0487,0.001,0.0444,0.0529,True
2,less gestalt,no target,0.1123,0.001,0.108,0.1166,True


In [9]:
# Tukey HSD (alpha = 0.05): pairwise differences in `correct` between `basemap` groups.
# NOTE(review): `correct` looks like a 0-1 outcome; confirm Tukey HSD is appropriate for it.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['correct'],
    groups=datadf['basemap'],
    alpha=0.05,
)
# Row 0 of the results table holds the column names; the remaining rows are the comparisons.
tkdf = pd.DataFrame(
    tkhsd_results._results_table.data[1:],
    columns=tkhsd_results._results_table.data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,dark,imagery,-0.0002,0.9,-0.0058,0.0054,False
1,dark,none,-0.0123,0.001,-0.0177,-0.0069,True
2,dark,streets,-0.035,0.001,-0.0406,-0.0294,True
3,imagery,none,-0.0121,0.001,-0.0175,-0.0067,True
4,imagery,streets,-0.0348,0.001,-0.0404,-0.0292,True
5,none,streets,-0.0227,0.001,-0.0281,-0.0173,True


In [10]:
# Two-way ANOVA (type II) of `timetaken` on `posthoccat2`, `target` and their interaction.
model = ols(
    'timetaken ~ C(posthoccat2) + C(target) +  C(posthoccat2):C(target)',
    data=datadf,
).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
# Heading and table go to stdout on separate lines.
print('ANOVA Table', anova_table, sep='\n')



ANOVA Table
                                sum_sq        df            F        PR(>F)
C(posthoccat2)            4.926793e+09      40.0   104.607669  0.000000e+00
C(target)                 2.469626e+09       2.0  1048.721840  0.000000e+00
C(posthoccat2):C(target)  8.043156e+08      80.0     8.538777  1.217267e-92
Residual                  1.236859e+11  105046.0          NaN           NaN




In [11]:
# Tukey HSD (alpha = 0.05) on `timetaken` across `posthoccat` groups, saved to CSV
# rather than displayed.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['timetaken'],
    groups=datadf['posthoccat'],
    alpha=0.05,
)
# Row 0 of the results table holds the column names; the remaining rows are the comparisons.
tkdf = pd.DataFrame(
    tkhsd_results._results_table.data[1:],
    columns=tkhsd_results._results_table.data[0],
)
tkdf.to_csv('data/tukey_adhoc.csv')




In [12]:
# Conover pairwise post hoc test on `timetaken` across `posthoccat` groups,
# with Holm-adjusted p-values; the result is saved to CSV.
posthoc = sp.posthoc_conover(
    datadf,
    val_col='timetaken',
    group_col='posthoccat',
    p_adjust='holm',
)
posthoc.to_csv('data/kruskal_adhoc.csv')



In [72]:
# Write the ANOVA and Tukey CSV outputs for each metric column, in this order.
for metric_name in (
    'number_of_patches',
    'area_mn',
    'shape_index_mn',
    'euclidean_nearest_neighbor_mn',
    'contagion',
):
    anovaAndTukey(metric_name)



