In [38]:
import pandas as pd
import rpy2.robjects.numpy2ri
import rpy2.robjects as R
from rpy2.robjects.packages import importr
import numpy as np
# Turn on rpy2's numpy conversion so numpy arrays can be handed to R functions
# (the Fisher-test loop further down passes an np.array to R).
rpy2.robjects.numpy2ri.activate()

# Handle to R's 'stats' package; fisher_exact_2x2 calls R_STATS.fisher_test.
R_STATS = importr('stats')

In [10]:
# Per-sample class scores read from the confidence-adjusted table; the first
# column of the file holds the sample IDs and becomes the index.
train_preds = pd.read_csv(
    '../../evaluation_validation_set/confidence_adjusted_tables/NN_reducedV3.2_removeN5_nfeatures21_pMax0.94248563.tsv',
    sep='\t',
    index_col=0,
)

# Relabel the five score columns as C1..C5, then record for every sample the
# label of the column holding its largest score.
train_preds.columns = [f'C{k}' for k in range(1, 6)]
train_preds = train_preds.assign(PredictedCluster=train_preds.idxmax(axis='columns'))

train_preds

Unnamed: 0,C1,C2,C3,C4,C5,PredictedCluster
DLBCL10969,0.504256,0.108601,0.110899,0.045825,0.230419,C1
DLBCL10915,0.055453,0.128668,0.060657,0.511689,0.243533,C4
DLBCL11471,0.247199,0.338038,0.093798,0.142958,0.178006,C2
DLBCL10973,0.108441,0.019201,0.187602,0.562698,0.122059,C4
DLBCL_RICOVER_151,0.069136,0.222397,0.056773,0.595760,0.055935,C4
...,...,...,...,...,...,...
DLBCL11667,0.005229,0.005537,0.977830,0.008373,0.003031,C3
DLBCL_C_D_1163_NULLPAIR,0.003185,0.004233,0.003372,0.007574,0.981636,C5
DLBCL_RICOVER_1248,0.002678,0.003860,0.001586,0.003233,0.988643,C5
DLBCL11000,0.001828,0.002737,0.001959,0.003682,0.989794,C5


In [12]:
# Test-set evaluation table, indexed by sample ID. PredictedCluster is cast to
# int and prefixed with 'C' so its labels have the same 'C<n>' form as the
# PredictedCluster column built for train_preds.
test_preds = (
    pd.read_csv(
        '../../evaluation_test_set/NN_reducedV3.2_removeN5_nfeatures21_testsetEval.tsv',
        sep='\t',
        index_col=0,
    )
    .assign(
        PredictedCluster=lambda preds: 'C' + preds['PredictedCluster'].astype(int).astype(str)
    )
)
test_preds

Unnamed: 0,C1,C2,C3,C4,C5,Confidence,PredictedCluster,TrueCluster,Correctness
DLBCL_LS3808,0.195475,0.029776,0.244105,0.217299,0.313345,0.313345,C5,5,True
DLBCL_LS3387,0.344948,0.011999,0.305924,0.306299,0.030830,0.344948,C1,3,False
DLBCL10470,0.033844,0.291798,0.260483,0.370571,0.043304,0.370571,C4,2,False
DLBCL10538,0.034717,0.284288,0.040720,0.253860,0.386415,0.386415,C5,2,False
DLBCL11193,0.068798,0.403388,0.127777,0.351442,0.048595,0.403388,C2,2,True
...,...,...,...,...,...,...,...,...,...
DLBCL11508,0.996331,0.000502,0.000684,0.001736,0.000746,0.996331,C1,1,True
DLBCL_RICOVER_290,0.996651,0.000412,0.000705,0.001457,0.000776,0.996651,C1,1,True
DLBCL10477,0.000444,0.997602,0.000839,0.000392,0.000723,0.997602,C2,2,True
DLBCL10507,0.000442,0.997948,0.000757,0.000377,0.000476,0.997948,C2,2,True


In [14]:
# Stack the predicted-cluster labels of both tables into a single Series
# (train_preds rows first, then test_preds rows), keyed by sample ID.
all_preds = pd.concat(
    [preds['PredictedCluster'] for preds in (train_preds, test_preds)]
)
all_preds

DLBCL10969           C1
DLBCL10915           C4
DLBCL11471           C2
DLBCL10973           C4
DLBCL_RICOVER_151    C4
                     ..
DLBCL11508           C1
DLBCL_RICOVER_290    C1
DLBCL10477           C2
DLBCL10507           C2
DLBCL11438           C2
Name: PredictedCluster, Length: 699, dtype: object

In [16]:
# LymphGen phenotype table, restricted to (and ordered like) the samples in
# all_preds. The .loc lookup raises if a sample ID is absent from the file.
lg_preds = (
    pd.read_csv('../../data_tables/phenotypes/lymphgenclasses.tsv', sep='\t', index_col=0)
    .loc[all_preds.index]
)
lg_preds

Unnamed: 0,Cohort,COO,Sample Preparation,LymphGenClass [Wright] call
DLBCL10969,Schmitz et al.,ABC,Frozen,BN2
DLBCL10915,Schmitz et al.,ABC,Frozen,N1
DLBCL11471,Schmitz et al.,Unclass,Frozen,Other
DLBCL10973,Schmitz et al.,ABC,Frozen,N1
DLBCL_RICOVER_151,Chapuy et al.,,FFPE,ST2
...,...,...,...,...
DLBCL11508,Schmitz et al.,GCB,Frozen,BN2
DLBCL_RICOVER_290,Chapuy et al.,ABC,FFPE,BN2
DLBCL10477,Schmitz et al.,ABC,Frozen,A53
DLBCL10507,Schmitz et al.,ABC,Frozen,A53


In [20]:
# Attach the predicted cluster of every sample as a 'DLBclass' column
# (values are aligned on the sample-ID index).
lg_preds = lg_preds.assign(DLBclass=all_preds)
lg_preds

Unnamed: 0,Cohort,COO,Sample Preparation,LymphGenClass [Wright] call,DLBclass
DLBCL10969,Schmitz et al.,ABC,Frozen,BN2,C1
DLBCL10915,Schmitz et al.,ABC,Frozen,N1,C4
DLBCL11471,Schmitz et al.,Unclass,Frozen,Other,C2
DLBCL10973,Schmitz et al.,ABC,Frozen,N1,C4
DLBCL_RICOVER_151,Chapuy et al.,,FFPE,ST2,C4
...,...,...,...,...,...
DLBCL11508,Schmitz et al.,GCB,Frozen,BN2,C1
DLBCL_RICOVER_290,Chapuy et al.,ABC,FFPE,BN2,C1
DLBCL10477,Schmitz et al.,ABC,Frozen,A53,C2
DLBCL10507,Schmitz et al.,ABC,Frozen,A53,C2


In [36]:
def fisher_exact_2x2(matrix, alt='greater'):
    """Run the R ``stats`` package's Fisher test on a contingency matrix.

    Parameters
    ----------
    matrix
        Contingency table forwarded unchanged to ``R_STATS.fisher_test``
        (the notebook passes a 2x2 numpy integer array).
    alt
        Forwarded as the ``alternative`` argument of the R call;
        defaults to ``'greater'``.

    Returns
    -------
    The object returned by ``R_STATS.fisher_test``, unmodified. The caller in
    this notebook reads the p-value from it as ``result[0][0]``.
    """
    test_result = R_STATS.fisher_test(matrix, alternative=alt)
    return test_result

In [41]:
# Empty result grid: one row per predicted cluster label, one column per
# LymphGen call, both in sorted order. Cells are filled by the Fisher-test loop.
stats_table = pd.DataFrame(
    index=sorted(set(all_preds)),
    columns=sorted(set(lg_preds['LymphGenClass [Wright] call'])),
)

stats_table

Unnamed: 0,A53,BN2,BN2/A53,BN2/EZB,BN2/EZB/N1/ST2,BN2/MCD,BN2/ST2,EZB,EZB/A53,EZB/MCD,EZB/N1,EZB/N1/ST2/A53,EZB/ST2,EZB/ST2/A53,MCD,MCD/A53,N1,Other,ST2,ST2/A53
C1,,,,,,,,,,,,,,,,,,,,
C2,,,,,,,,,,,,,,,,,,,,
C3,,,,,,,,,,,,,,,,,,,,
C4,,,,,,,,,,,,,,,,,,,,
C5,,,,,,,,,,,,,,,,,,,,


In [42]:
# Fisher's exact test for every (DLBclass cluster, LymphGen call) pair, using
# fisher_exact_2x2 with its default alternative.
# The two label columns are looked up once, and each membership mask is built
# once per label instead of being recomputed for every cell of the 2x2 table.
lg_calls = lg_preds['LymphGenClass [Wright] call']
dlb_calls = lg_preds['DLBclass']

# sorted() makes the iteration order deterministic; the values written to
# stats_table do not depend on the order.
for c in sorted(set(all_preds)):
    in_cluster = dlb_calls == c
    for lg_c in sorted(set(lg_calls)):
        in_lg_class = lg_calls == lg_c

        # 2x2 contingency table of sample counts:
        #   rows    -> LymphGen call is lg_c / is not lg_c
        #   columns -> DLBclass is c / is not c
        m = np.zeros((2, 2), dtype=int)
        m[0][0] = (in_lg_class & in_cluster).sum()
        m[0][1] = (in_lg_class & ~in_cluster).sum()
        m[1][0] = (~in_lg_class & in_cluster).sum()
        m[1][1] = (~in_lg_class & ~in_cluster).sum()

        # The p-value is read positionally from the R result.
        p = fisher_exact_2x2(m)[0][0]

        stats_table.loc[c, lg_c] = p
    


In [50]:
# Spot-check three p-values at full precision (the rendered table rounds them).
tuple(
    stats_table.loc[cluster, lg_class]
    for cluster, lg_class in [('C1', 'BN2'), ('C3', 'EZB'), ('C5', 'MCD')]
)

(8.825059417191187e-49, 5.86834079938898e-53, 1.2424741850812898e-34)

In [44]:
# Label the row index so the first column of the TSV gets a header, then write
# the p-value table to disk and display it.
stats_table = stats_table.rename_axis('DLBclass')
stats_table.to_csv(
    '../../data_tables/qval_dfs/lymphgen_dlbclass_fisher2x2.tsv',
    sep='\t',
)
stats_table

Unnamed: 0_level_0,A53,BN2,BN2/A53,BN2/EZB,BN2/EZB/N1/ST2,BN2/MCD,BN2/ST2,EZB,EZB/A53,EZB/MCD,EZB/N1,EZB/N1/ST2/A53,EZB/ST2,EZB/ST2/A53,MCD,MCD/A53,N1,Other,ST2,ST2/A53
DLBclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
C1,0.999996,0.0,0.101127,0.101127,1.0,0.101127,0.585979,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999999,1.0,0.798461,0.976565,0.99316,1.0
C2,0.0,0.999998,0.562215,0.562215,1.0,1.0,1.0,0.999932,5e-06,1.0,0.240343,0.240343,1.0,0.240343,1.0,0.917127,0.444771,0.003642,1.0,1.0
C3,0.999936,1.0,1.0,1.0,0.16309,1.0,1.0,0.0,0.376127,1.0,1.0,1.0,0.014941,1.0,1.0,1.0,0.919402,0.999985,0.977943,1.0
C4,0.998272,0.999239,1.0,1.0,1.0,1.0,0.010519,0.983753,0.950573,1.0,1.0,1.0,0.46511,1.0,0.990783,1.0,0.038926,0.343478,0.0,0.144492
C5,0.836813,1.0,1.0,1.0,1.0,0.586531,1.0,1.0,1.0,0.254649,1.0,1.0,1.0,1.0,0.0,0.00011,0.907808,0.036376,1.0,1.0
