In [1]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr
from statsmodels.sandbox.stats.multicomp import multipletests 

In [12]:
import itertools
from statsmodels.stats.descriptivestats import sign_test 
from statsmodels.stats.weightstats import zconfint
from scipy.stats import wilcoxon

In [2]:
ds = pd.read_csv('AUCs.txt', sep = '\t', header = 0)
ds.head()

Unnamed: 0.1,Unnamed: 0,C4.5,C4.5+m,C4.5+cf,C4.5+m+cf
0,adult (sample),0.763,0.768,0.771,0.798
1,breast cancer,0.599,0.591,0.59,0.569
2,breast cancer wisconsin,0.954,0.971,0.968,0.967
3,cmc,0.628,0.661,0.654,0.657
4,ionosphere,0.882,0.888,0.886,0.898


In [4]:
ds_columns = [u'C4.5', u'C4.5+m', u'C4.5+cf', u'C4.5+m+cf']
print ds_columns

[u'C4.5', u'C4.5+m', u'C4.5+cf', u'C4.5+m+cf']


In [28]:
%%time 
corr_data = []

for i, lhs_column in enumerate(ds_columns):
    for j, rhs_column in enumerate(ds_columns):
        if i >= j:
            continue
        
        corr, p = wilcoxon(ds[lhs_column], ds[rhs_column])
        corr_data.append([lhs_column, rhs_column, corr, p])

Wall time: 3 ms


In [29]:
ds_correlation = pd.DataFrame.from_records(corr_data)
ds_correlation.columns = ['method_A', 'method_B', 'corr', 'p']
ds_correlation.head()

Unnamed: 0,method_A,method_B,corr,p
0,C4.5,C4.5+m,6.5,0.010757
1,C4.5,C4.5+cf,43.0,0.861262
2,C4.5,C4.5+m+cf,11.0,0.015906
3,C4.5+m,C4.5+cf,17.0,0.046333
4,C4.5+m,C4.5+m+cf,22.0,0.327826


In [31]:
ds_correlation

Unnamed: 0,method_A,method_B,corr,p
0,C4.5,C4.5+m,6.5,0.010757
1,C4.5,C4.5+cf,43.0,0.861262
2,C4.5,C4.5+m+cf,11.0,0.015906
3,C4.5+m,C4.5+cf,17.0,0.046333
4,C4.5+m,C4.5+m+cf,22.0,0.327826
5,C4.5+cf,C4.5+m+cf,10.0,0.022909


In [9]:
sign_test(ds, 0.5)

(35.0, 1.6940658945086007e-21)

In [11]:
np.mean(ds, axis = 0)

C4.5         0.804929
C4.5+m       0.820429
C4.5+cf      0.808786
C4.5+m+cf    0.827214
dtype: float64

In [30]:
(ds_correlation.p < 0.05).value_counts()

True     4
False    2
Name: p, dtype: int64

In [16]:
wilcoxon(ds['C4.5'], ds['C4.5+m'])

WilcoxonResult(statistic=6.5, pvalue=0.01075713311978963)

In [17]:
wilcoxon(ds['C4.5'], ds['C4.5+cf'])

WilcoxonResult(statistic=43.0, pvalue=0.86126233009534803)

In [18]:
wilcoxon(ds['C4.5'], ds['C4.5+m+cf'])

WilcoxonResult(statistic=11.0, pvalue=0.015906444101703374)

In [19]:
wilcoxon(ds['C4.5+m'], ds['C4.5+cf'])

WilcoxonResult(statistic=17.0, pvalue=0.046332729793395394)

In [20]:
wilcoxon(ds['C4.5+m'], ds['C4.5+m+cf'])

WilcoxonResult(statistic=22.0, pvalue=0.32782567584464062)

In [21]:
wilcoxon(ds['C4.5+cf'], ds['C4.5+m+cf'])

WilcoxonResult(statistic=10.0, pvalue=0.022909099354356588)

In [32]:
reject, p_corrected, a1, a2 = multipletests(ds_correlation.p, 
                                            alpha = 0.05, 
                                            method = 'holm') 

In [33]:
ds_correlation['p_corrected'] = p_corrected
ds_correlation['reject'] = reject

In [34]:
ds_correlation.reject.value_counts()

False    6
Name: reject, dtype: int64

In [35]:
reject, p_corrected, a1, a2 = multipletests(ds_correlation.p, 
                                            alpha = 0.05, 
                                            method = 'fdr_bh') 

In [36]:
ds_correlation['p_corrected'] = p_corrected
ds_correlation['reject'] = reject

In [37]:
ds_correlation.reject.value_counts()

True     3
False    3
Name: reject, dtype: int64