In [30]:
from __future__ import division

import numpy as np
import pandas as pd

from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests 

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import combinations

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [27]:
# Load the tab-separated table from 'AUCs.txt'.
data = pd.read_csv('AUCs.txt', sep='\t')

In [46]:
# Show the column labels of the loaded table.
data.columns

Index(['Unnamed: 0', 'C4.5', 'C4.5+m', 'C4.5+cf', 'C4.5+m+cf'], dtype='object')

In [43]:
# Mean of the 'C4.5' column.
data['C4.5'].values.mean()

0.8049285714285714

In [50]:
# Mean of the 'C4.5+m+cf' column.
data['C4.5+m+cf'].values.mean()

0.8272142857142858

In [19]:
# Keep every column except the unnamed first one.
columns = data.drop(labels=['Unnamed: 0'], axis='columns')

In [23]:
%%time 
corr_data = []

for i, lhs_column in enumerate(columns.columns):
    for j, rhs_column in enumerate(columns.columns):
        if i >= j:
            continue
        
        corr, p = pearsonr(columns[lhs_column], columns[rhs_column])
        corr_data.append([lhs_column, rhs_column, corr, p])
        
#stats.wilcoxon

Wall time: 20 ms


In [31]:
%%time 
corr_data = []

for i, lhs_column in enumerate(columns.columns):
    for j, rhs_column in enumerate(columns.columns):
        if i >= j:
            continue
        
        corr, p = stats.wilcoxon(columns[lhs_column], columns[rhs_column])
        corr_data.append([lhs_column, rhs_column, corr, p])

Wall time: 7.99 ms


In [34]:
# Tabulate the pairwise records, then show the pairs whose raw p-value is below 0.05.
correlation = pd.DataFrame.from_records(
    corr_data,
    columns=['product_A', 'product_B', 'corr', 'p'],
)
correlation.loc[correlation['p'] < 0.05]

Unnamed: 0,product_A,product_B,corr,p
0,C4.5,C4.5+m,6.5,0.010757
2,C4.5,C4.5+m+cf,11.0,0.015906
3,C4.5+m,C4.5+cf,17.0,0.046333
5,C4.5+cf,C4.5+m+cf,10.0,0.022909


In [57]:
# Adjust the pairwise p-values with the 'holm' method at alpha = 0.05.
# Only `reject` and `p_corrected` are used by the following cell.
reject, p_corrected, a1, a2 = multipletests(pvals=correlation['p'], alpha=0.05, method='holm')

In [58]:
# Attach the adjusted p-values and rejection flags, then show the full table.
correlation = correlation.assign(p_corrected=p_corrected, reject=reject)
correlation

Unnamed: 0,product_A,product_B,corr,p,p_corrected,reject
0,C4.5,C4.5+m,6.5,0.010757,0.064543,False
1,C4.5,C4.5+cf,43.0,0.861262,0.861262,False
2,C4.5,C4.5+m+cf,11.0,0.015906,0.079532,False
3,C4.5+m,C4.5+cf,17.0,0.046333,0.138998,False
4,C4.5+m,C4.5+m+cf,22.0,0.327826,0.655651,False
5,C4.5+cf,C4.5+m+cf,10.0,0.022909,0.091636,False


In [59]:
# Adjust the pairwise p-values with the 'fdr_bh' method at alpha = 0.05.
# Only `reject` and `p_corrected` are used by the following cell.
reject, p_corrected, a1, a2 = multipletests(pvals=correlation['p'], alpha=0.05, method='fdr_bh')

In [60]:
# Store the latest adjusted p-values and rejection flags (replacing any
# existing columns of the same names), then show the full table.
correlation['p_corrected'], correlation['reject'] = p_corrected, reject
correlation

Unnamed: 0,product_A,product_B,corr,p,p_corrected,reject
0,C4.5,C4.5+m,6.5,0.010757,0.045818,True
1,C4.5,C4.5+cf,43.0,0.861262,0.861262,False
2,C4.5,C4.5+m+cf,11.0,0.015906,0.045818,True
3,C4.5+m,C4.5+cf,17.0,0.046333,0.069499,False
4,C4.5+m,C4.5+m+cf,22.0,0.327826,0.393391,False
5,C4.5+cf,C4.5+m+cf,10.0,0.022909,0.045818,True
