In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm

from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
import scikit_posthocs as sp

# Shared input table for every cell and helper below. Later code indexes it by
# columns such as 'timetaken', 'correct', 'target', 'distractor', 'color',
# 'basemap', 'posthoccat' and 'posthoccat2' (presumably all present in the CSV —
# TODO confirm). The path is relative to the notebook's working directory.
datadf = pd.read_csv('data/data-and-metrics.csv')

def anovaAndTukey(metric):
    """Export two ANOVA tables and a Tukey HSD table of ``timetaken`` for one metric.

    The module-level ``datadf`` is read but never modified; every derived
    column lives on a local copy.

    Steps:
      1. Bin ``metric`` into 10 quantile ranks with ``pd.qcut`` (``q_rank``) and
         build ``q_rank_cat`` as ``"<q_rank>_<target>"``.
      2. Fit ``timetaken ~ C(metric) + C(target) + C(metric):C(target)`` and
         write its type-II ANOVA table to ``data/anova_table_<metric>_raw.csv``.
      3. Fit the same design with ``q_rank`` in place of the raw metric and
         write it to ``data/anova_table_<metric>_ranked.csv``.
      4. Run Tukey HSD (alpha = 0.05) on ``timetaken`` grouped by
         ``q_rank_cat`` and write it to ``data/tukey_adhoc_<metric>.csv``.

    Parameters
    ----------
    metric : str
        Name of a ``datadf`` column. It is spliced verbatim into the model
        formula, so it must be usable as a formula identifier.

    Returns
    -------
    None
        Results are only written to CSV files under ``data/``.
    """
    # reset_index returns a new frame, so the assignments below cannot leak
    # into ``datadf``. drop=True avoids inserting an unused 'index' column
    # (which would also fail if such a column already existed).
    dflim = datadf.reset_index(drop=True)
    dflim['q_rank'] = pd.qcut(dflim[metric], 10, labels=False)
    dflim['q_rank_cat'] = dflim['q_rank'].astype(str) + '_' + dflim['target'].astype(str)

    # Raw-metric model. The interaction crosses the metric with `target`, the
    # same design as the ranked model below. The previous term,
    # C(q_rank):C(metric), crossed the metric with its own binning, which is
    # fully determined by the metric main effect.
    metric_term = 'C(' + metric + ')'
    raw_formula = 'timetaken ~ ' + metric_term + ' + C(target) + ' + metric_term + ':C(target)'
    model = ols(raw_formula, data=dflim).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    anova_table.to_csv('data/anova_table_' + metric + '_raw.csv')

    # Quantile-ranked model.
    model = ols('timetaken ~ C(q_rank) + C(target) + C(q_rank):C(target)', data=dflim).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    anova_table.to_csv('data/anova_table_' + metric + '_ranked.csv')

    # Post-hoc pairwise comparison of every (rank, target) combination.
    tkhsd_results = pairwise_tukeyhsd(endog=dflim['timetaken'], groups=dflim['q_rank_cat'], alpha=0.05)
    # summary() is the public accessor for the results table; row 0 is the header.
    summary_rows = tkhsd_results.summary().data
    tkdf = pd.DataFrame(data=summary_rows[1:], columns=summary_rows[0])
    tkdf.to_csv('data/tukey_adhoc_' + metric + '.csv')

          
def anovaAndTukeyByBaseMap(metric, basemap):
    """Run the per-metric ANOVA / Tukey HSD analysis restricted to one basemap.

    The module-level ``datadf`` is read but never modified. Quantile ranks are
    computed over *all* rows (every basemap) before the rows for ``basemap``
    are selected, so rank boundaries are comparable between basemaps.

    Steps:
      1. Bin ``metric`` into 10 quantile ranks (``q_rank``) and build
         ``q_rank_cat`` as ``"<q_rank>_<target>"`` on a copy of ``datadf``.
      2. Keep only rows whose ``basemap`` column equals ``basemap``.
      3. Fit ``timetaken ~ C(metric) + C(target) + C(metric):C(target)`` and
         write its type-II ANOVA table to
         ``data/anova_table_<metric>_raw_<basemap>.csv``.
      4. Fit the same design with ``q_rank`` in place of the raw metric and
         write it to ``data/anova_table_<metric>_ranked_<basemap>.csv``.
      5. Run Tukey HSD (alpha = 0.05) on ``timetaken`` grouped by ``q_rank_cat``.

    Parameters
    ----------
    metric : str
        Name of a ``datadf`` column; spliced verbatim into the model formula.
    basemap : str
        Value of ``datadf['basemap']`` to analyse; also used in output file names.

    Returns
    -------
    pandas.DataFrame
        The Tukey HSD comparison table (one row per group pair). It is not
        written to disk; the CSV export below is intentionally left disabled.
    """
    # Copy first: assigning columns on `datadf` itself would add q_rank /
    # q_rank_cat to the shared frame as a hidden side effect.
    ranked = datadf.copy()
    ranked['q_rank'] = pd.qcut(ranked[metric], 10, labels=False)
    ranked['q_rank_cat'] = ranked['q_rank'].astype(str) + '_' + ranked['target'].astype(str)

    dflim = ranked.loc[ranked['basemap'] == basemap].reset_index(drop=True)

    # Raw-metric model.
    metric_term = 'C(' + metric + ')'
    raw_formula = 'timetaken ~ ' + metric_term + ' + C(target) + ' + metric_term + ':C(target)'
    model = ols(raw_formula, data=dflim).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    anova_table.to_csv('data/anova_table_' + metric + '_raw_' + basemap + '.csv')

    # Quantile-ranked model.
    model = ols('timetaken ~ C(q_rank) + C(target) + C(q_rank):C(target)', data=dflim).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    anova_table.to_csv('data/anova_table_' + metric + '_ranked_' + basemap + '.csv')

    # The response is `timetaken`. Using `q_rank` here (as before) would only
    # compare the bin index across groups that are themselves defined by it.
    tkhsd_results = pairwise_tukeyhsd(endog=dflim['timetaken'], groups=dflim['q_rank_cat'], alpha=0.05)
    # summary() is the public accessor for the results table; row 0 is the header.
    summary_rows = tkhsd_results.summary().data
    tkdf = pd.DataFrame(data=summary_rows[1:], columns=summary_rows[0])
    # CSV export kept disabled, as in the original:
    # tkdf.to_csv('data/tukey_adhoc_' + metric + '_' + basemap + '.csv')
    return tkdf


In [2]:
# Tukey HSD (alpha = 0.05): pairwise mean differences of `correct` between `target` groups.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['correct'],
    groups=datadf['target'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt,less gestalt,-0.0625,0.001,-0.0673,-0.0576,True
1,gestalt,no target,0.0476,0.001,0.0435,0.0518,True
2,less gestalt,no target,0.1101,0.001,0.1059,0.1143,True


In [3]:
# Tukey HSD (alpha = 0.05): pairwise mean differences of `timetaken` between `target` groups.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['timetaken'],
    groups=datadf['target'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt,less gestalt,165.9848,0.001,143.7538,188.2158,True
1,gestalt,no target,372.7876,0.001,353.6121,391.9631,True
2,less gestalt,no target,206.8028,0.001,187.4727,226.133,True


In [4]:
# Combined grouping key "<target>-<distractor>"; this overwrites any existing `gp` column.
datadf['gp'] = datadf['target'] + '-' + datadf['distractor']

# Tukey HSD (alpha = 0.05) of `timetaken` across every target/distractor combination.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['timetaken'],
    groups=datadf['gp'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt-clustered,gestalt-no distractor,-68.0037,0.8358,-207.9144,71.9071,False
1,gestalt-clustered,gestalt-random,-80.0353,0.001,-122.7086,-37.362,True
2,gestalt-clustered,less gestalt-clustered,196.6551,0.001,158.5334,234.7769,True
3,gestalt-clustered,less gestalt-no distractor,-23.3556,0.9,-162.4149,115.7038,False
4,gestalt-clustered,less gestalt-random,44.2421,0.04,1.0378,87.4465,True
5,gestalt-clustered,no target-clustered,408.4634,0.001,375.5401,441.3868,True
6,gestalt-clustered,no target-no distractor,137.0227,0.001,36.4707,237.5747,True
7,gestalt-clustered,no target-random,246.6285,0.001,210.7947,282.4624,True
8,gestalt-no distractor,gestalt-random,-12.0317,0.9,-153.2886,129.2253,False
9,gestalt-no distractor,less gestalt-clustered,264.6588,0.001,124.7095,404.608,True


In [5]:
# Combined grouping key "<target>-<color>"; this overwrites any existing `gp` column.
datadf['gp'] = datadf['target'] + '-' + datadf['color']

# Tukey HSD (alpha = 0.05) of `timetaken` across every target/color combination.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['timetaken'],
    groups=datadf['gp'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
# Persist the full comparison table; the notebook display below is truncated.
tkdf.to_csv('data/target-color-posthoc.csv')
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt-blue,gestalt-matches,208.5133,0.0010,147.9697,269.0570,True
1,gestalt-blue,gestalt-missing,46.7708,0.9000,-108.3640,201.9056,False
2,gestalt-blue,gestalt-mixed,56.9119,0.3285,-15.5819,129.4056,False
3,gestalt-blue,gestalt-red,60.9200,0.0399,1.2497,120.5903,True
4,gestalt-blue,less gestalt-blue,55.0998,0.1174,-5.1187,115.3182,False
...,...,...,...,...,...,...,...
100,no target-matches,no target-mixed,-156.1426,0.0010,-208.2956,-103.9896,True
101,no target-matches,no target-red,-107.2179,0.0010,-150.1420,-64.2937,True
102,no target-missing,no target-mixed,218.0607,0.0010,104.5494,331.5720,True
103,no target-missing,no target-red,266.9854,0.0010,157.4073,376.5635,True


In [6]:
# Restrict to rows whose distractor is 'random'.
dflim = datadf[datadf['distractor'] == 'random']

# Tukey HSD (alpha = 0.05) of `timetaken` between `target` groups within that subset.
tkhsd_results = pairwise_tukeyhsd(
    endog=dflim['timetaken'],
    groups=dflim['target'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt,less gestalt,124.2774,0.001,90.0217,158.5332,True
1,gestalt,no target,326.6638,0.001,297.1855,356.1422,True
2,less gestalt,no target,202.3864,0.001,172.5065,232.2663,True


In [7]:
# Restrict to rows whose distractor is 'clustered'.
dflim = datadf[datadf['distractor'] == 'clustered']

# Tukey HSD (alpha = 0.05) of `timetaken` between `target` groups within that subset.
tkhsd_results = pairwise_tukeyhsd(
    endog=dflim['timetaken'],
    groups=dflim['target'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt,less gestalt,196.6551,0.001,166.9447,226.3655,True
1,gestalt,no target,408.4634,0.001,382.8044,434.1224,True
2,less gestalt,no target,211.8083,0.001,186.0222,237.5944,True


In [8]:
# Tukey HSD (alpha = 0.05): pairwise mean differences of `timetaken` between `basemap` groups.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['timetaken'],
    groups=datadf['basemap'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,dark,imagery,119.3026,0.001,94.2999,144.3054,True
1,dark,none,21.614,0.0943,-2.3623,45.5904,False
2,dark,streets,324.1004,0.001,299.0612,349.1397,True
3,imagery,none,-97.6886,0.001,-121.6858,-73.6914,True
4,imagery,streets,204.7978,0.001,179.7386,229.857,True
5,none,streets,302.4864,0.001,278.4511,326.5216,True


In [9]:
# Combined grouping key "<target>-<basemap>"; this overwrites any existing `gp` column.
datadf['gp'] = datadf['target'] + '-' + datadf['basemap']

# Tukey HSD (alpha = 0.05) of `timetaken` across every target/basemap combination.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['timetaken'],
    groups=datadf['gp'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
# The table is only written to disk here, not displayed.
tkdf.to_csv('data/basemap-target-posthoc.csv')


In [10]:
# Restrict to rows whose distractor is 'clustered'.
dflim = datadf[datadf['distractor'] == 'clustered']

# Tukey HSD (alpha = 0.05) of `timetaken` between `basemap` groups within that subset.
tkhsd_results = pairwise_tukeyhsd(
    endog=dflim['timetaken'],
    groups=dflim['basemap'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,dark,imagery,113.2299,0.001,79.4463,147.0135,True
1,dark,none,28.425,0.1004,-3.4687,60.3187,False
2,dark,streets,340.1497,0.001,306.3255,373.974,True
3,imagery,none,-84.8049,0.001,-116.7233,-52.8866,True
4,imagery,streets,226.9198,0.001,193.0723,260.7673,True
5,none,streets,311.7247,0.001,279.7633,343.6862,True


In [11]:
# Restrict to rows whose distractor is 'random'.
dflim = datadf[datadf['distractor'] == 'random']

# Tukey HSD (alpha = 0.05) of `timetaken` between `basemap` groups within that subset.
tkhsd_results = pairwise_tukeyhsd(
    endog=dflim['timetaken'],
    groups=dflim['basemap'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,dark,imagery,125.8746,0.001,87.5545,164.1948,True
1,dark,none,-13.4813,0.7592,-50.4443,23.4817,False
2,dark,streets,302.6512,0.001,264.2584,341.044,True
3,imagery,none,-139.3559,0.001,-176.3737,-102.3381,True
4,imagery,streets,176.7766,0.001,138.3311,215.2222,True
5,none,streets,316.1325,0.001,279.0395,353.2255,True


In [12]:
# Tukey HSD (alpha = 0.05): pairwise mean differences of `correct` between `target` groups.
# NOTE(review): this repeats the earlier `correct`-by-`target` cell verbatim.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['correct'],
    groups=datadf['target'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,gestalt,less gestalt,-0.0625,0.001,-0.0673,-0.0576,True
1,gestalt,no target,0.0476,0.001,0.0435,0.0518,True
2,less gestalt,no target,0.1101,0.001,0.1059,0.1143,True


In [13]:
# Tukey HSD (alpha = 0.05): pairwise mean differences of `correct` between `basemap` groups.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['correct'],
    groups=datadf['basemap'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,dark,imagery,0.0001,0.9,-0.0054,0.0056,False
1,dark,none,-0.012,0.001,-0.0173,-0.0067,True
2,dark,streets,-0.0345,0.001,-0.04,-0.029,True
3,imagery,none,-0.0121,0.001,-0.0174,-0.0068,True
4,imagery,streets,-0.0346,0.001,-0.0401,-0.0291,True
5,none,streets,-0.0225,0.001,-0.0278,-0.0172,True


In [14]:
# Tukey HSD (alpha = 0.05): pairwise mean differences of `correct` between `color` groups.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['correct'],
    groups=datadf['color'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
tkdf

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,blue,matches,-0.0826,0.001,-0.088,-0.0772,True
1,blue,missing,0.0004,0.9,-0.0133,0.0142,False
2,blue,mixed,-0.0173,0.001,-0.0238,-0.0108,True
3,blue,red,-0.0231,0.001,-0.0284,-0.0177,True
4,matches,missing,0.0831,0.001,0.0693,0.0968,True
5,matches,mixed,0.0654,0.001,0.0588,0.0719,True
6,matches,red,0.0595,0.001,0.0542,0.0649,True
7,missing,mixed,-0.0177,0.0063,-0.032,-0.0035,True
8,missing,red,-0.0235,0.001,-0.0373,-0.0098,True
9,mixed,red,-0.0058,0.1069,-0.0123,0.0007,False


In [15]:
# Two-factor OLS model of `timetaken` on `posthoccat2`, `target` and their interaction.
model = ols(
    'timetaken ~ C(posthoccat2) + C(target) +  C(posthoccat2):C(target)',
    data=datadf,
).fit()
# Type-II sums of squares.
anova_table = sm.stats.anova_lm(model, typ=2)
# Title and table are printed on separate lines, as plain text.
print('ANOVA Table', anova_table, sep='\n')



ANOVA Table
                                sum_sq        df            F         PR(>F)
C(posthoccat2)            5.384048e+09      40.0   113.839770   0.000000e+00
C(target)                 2.682808e+09       2.0  1134.500516   0.000000e+00
C(posthoccat2):C(target)  9.028637e+08      80.0     9.545030  1.410190e-107
Residual                  1.295977e+11  109608.0          NaN            NaN




In [16]:
# Tukey HSD (alpha = 0.05): pairwise mean differences of `timetaken` between `posthoccat` groups.
tkhsd_results = pairwise_tukeyhsd(
    endog=datadf['timetaken'],
    groups=datadf['posthoccat'],
    alpha=0.05,
)
# Row 0 of the summary table is the header; each later row is one group pair.
tkdf = pd.DataFrame(
    data=tkhsd_results.summary().data[1:],
    columns=tkhsd_results.summary().data[0],
)
# The table is only written to disk here, not displayed.
tkdf.to_csv('data/tukey_adhoc.csv')




In [17]:
# Conover post-hoc comparison of `timetaken` between `posthoccat` groups,
# with Holm-adjusted p-values; the result is saved under the 'kruskal' file name.
posthoc = sp.posthoc_conover(
    datadf,
    val_col='timetaken',
    group_col='posthoccat',
    p_adjust='holm',
)
posthoc.to_csv('data/kruskal_adhoc.csv')



In [18]:
# Run the ANOVA / Tukey export once per metric column, in the original order.
for metric_name in (
    'number_of_patches',
    'area_mn',
    'shape_index_mn',
    'euclidean_nearest_neighbor_mn',
    'contagion',
):
    anovaAndTukey(metric_name)



