In [57]:
# Notebook setup: imports, R configuration and the handle to R's `stats` package.
import pandas as pd
import os
# Point rpy2 at an R installation when the environment has not already done so.
# This is set before the rpy2 imports below. The fallback is the macOS
# R.framework location and will not exist on other platforms -- export R_HOME
# in the environment there instead of editing this path.
if "R_HOME" not in os.environ:
    os.environ['R_HOME'] = '/Library/Frameworks/R.framework/Resources/'
import numpy as np
import rpy2.robjects.numpy2ri
import rpy2.robjects as R
from rpy2.robjects.packages import importr
# Globally enable rpy2's numpy conversion so numpy arrays can be handed to R calls.
# NOTE(review): newer rpy2 releases discourage the global activate() in favour of
# a local converter context -- confirm against the installed rpy2 version.
rpy2.robjects.numpy2ri.activate()
# NOTE: `sm` here is statsmodels.stats.multitest (not statsmodels.api).
import statsmodels.stats.multitest as sm

# Seed R's random number generator (R's set.seed(1)).
R.r('set.seed')(1)

# Handle to R's `stats` package; used by fisher_exact_2x2 to call fisher.test.
R_STATS = importr('stats')

In [15]:
def fisher_exact_2x2(matrix, alt='greater'):
    """Run R's ``stats::fisher.test`` on a contingency table.

    Parameters
    ----------
    matrix : array-like
        Table of counts, forwarded unchanged to R. Callers in this notebook
        pass a 2x2 integer numpy array; the shape is not checked here.
    alt : str, default 'greater'
        Forwarded as R's ``alternative`` argument (this notebook passes
        ``'two.sided'``).

    Returns
    -------
    object
        Whatever rpy2 returns for the R call. Callers in this notebook read
        the p-value positionally as ``result[0][0]``.
    """
    test_result = R_STATS.fisher_test(matrix, alternative=alt)
    return test_result

In [3]:
# Per-sample cluster table, indexed by sample ID (first column of the file).
# The preview below shows columns C1..C5, `cluster` and `confidence`.
labels = pd.read_csv(
    '../../data_tables/confidence_tables/baseline_probabilities.connectivity_based.sensitivity_power2.Sep_23_2022.tsv',
    sep='\t',
    index_col=0,
)
labels.head()

Unnamed: 0,C1,C2,C3,C4,C5,cluster,confidence
DLBCL10877,0.985597,0.00221,0.000684,0.010885,0.000625,1,0.985597
DLBCL10928,0.984814,0.001006,0.000431,0.013195,0.000553,1,0.984814
DLBCL_MC_F648_JKW,0.98474,0.003482,0.000594,0.010454,0.00073,1,0.98474
DLBCL10844,0.984523,0.000563,0.000372,0.011383,0.003159,1,0.984523
DLBCL11501,0.983851,0.002258,0.000601,0.012017,0.001273,1,0.983851


In [69]:
# Table S1, indexed by its first column.
table_s1 = pd.read_csv(
    '../../data_tables/tableS1_classifier_merged.tsv',
    sep='\t',
    index_col=0,
)
# Keep the IDs of rows whose overall-survival status is not missing.
has_os_status = table_s1['OS.status (1=dead)'].notna()
samples_with_os = table_s1.loc[has_os_status].index

In [23]:
# IDs of every sample whose `cluster` value is 3.
in_cluster_3 = labels['cluster'].eq(3)
c3_samples = labels.loc[in_cluster_3].index
c3_samples

Index(['DLBCL_RICOVER_1283', 'DLBCL_C_D_1127_NULLPAIR', 'DLBCL11656',
       'DLBCL_C_D_1157_NULLPAIR', 'DLBCL10465', 'DLBCL11558', 'DLBCL11670',
       'DLBCL_LS4592', 'DLBCL_LS1620', 'DLBCL_RICOVER_744',
       ...
       'DLBCL_RICOVER_522', 'DLBCL_MAYO_DLBCL_234', 'DLBCL11590',
       'DLBCL_RICOVER_866', 'DLBCL_LS2325', 'DLBCL_RICOVER_258', 'DLBCL10936',
       'DLBCL_RICOVER_408', 'DLBCL_RICOVER_978', 'DLBCL11455'],
      dtype='object', length=117)

In [72]:
# Sample-set table; the `cohort` column separates 'Shipp' from everything else.
samples = pd.read_csv(
    '../../data_tables/sample_sets/ShippStaudtSets.purity0.2.txt',
    sep='\t',
    index_col=0,
)

# Row mask: sample is one of the cluster-3 samples.
in_c3 = samples.index.isin(c3_samples)

# Shipp cohort: C3 sample IDs, then the subset that also has OS status.
shipp_samples_c3 = samples.loc[(samples['cohort'] == 'Shipp') & in_c3].index
shipp_samples_c3_os = shipp_samples_c3[shipp_samples_c3.isin(samples_with_os)]

# Every cohort value other than 'Shipp' is treated as the Staudt group.
staudt_samples_c3 = samples.loc[(samples['cohort'] != 'Shipp') & in_c3].index
staudt_samples_c3_os = staudt_samples_c3[staudt_samples_c3.isin(samples_with_os)]

In [76]:
# Number of C3 samples with OS status per group: (Shipp, Staudt).
(len(shipp_samples_c3_os), len(staudt_samples_c3_os))

(50, 43)

In [26]:
# q-value table from the combined 5x2 Fisher analysis, indexed by feature name.
qval_df = pd.read_csv(
    '../../data_tables/qval_dfs/fisher_exact_5x2.Sep_23_2022.combined.tsv',
    sep='\t',
    index_col=0,
)
# Features with q <= 0.1 are the ones tested further down.
is_driver = qval_df['q'].le(0.1)
drivers = qval_df.loc[is_driver].index
drivers

Index(['SV.BCL6', 'DTX1', 'CD70', 'TNFAIP3', 'NOTCH2', 'BCL10', 'B2M', 'CD58',
       'X5Q.AMP', 'FAS',
       ...
       'X6Q14.1.DEL', 'SF3B1', 'ZC3H12A', 'ATP2A2', 'X6P21.33.DEL',
       'X1Q32.1.AMP', 'BCL11A', 'IRF4', 'LYN', 'ETS1'],
      dtype='object', length=163)

In [27]:
# Load the gene-sample matrix: rows are features, columns are sample IDs
# (see the preview below).
gsm = pd.read_csv('../../data_tables/gsm/DLBCL.699.fullGSM.Sep_23_2022.tsv', sep='\t', index_col=0)
# Binarise ploidy: 1 when the value is above 2.5, otherwise 0.
gsm.loc['PLOIDY'] = (gsm.loc['PLOIDY'].astype(float) > 2.5).astype(int)
# One-hot encode the COO row into three indicator rows. The string 'na' is
# kept as 'na', so the later `== 0` / `== 1` counts include those samples in
# neither group. Any COO value not listed in the mapping becomes NaN
# (pandas Series.map behaviour).
gsm.loc['COO_ABC'] = gsm.loc['COO'].map({'ABC': 1, 'GCB': 0, 'UNC': 0, 'na': 'na'})
gsm.loc['COO_GCB'] = gsm.loc['COO'].map({'ABC': 0, 'GCB': 1, 'UNC': 0, 'na': 'na'})
gsm.loc['COO_UNC'] = gsm.loc['COO'].map({'ABC': 0, 'GCB': 0, 'UNC': 1, 'na': 'na'})
# Drop the original COO row (replaced by the indicators above) and the PURITY row.
gsm = gsm.drop('COO')
gsm = gsm.drop('PURITY')
# Remove every row whose label contains 'CCF'.
gsm = gsm.loc[~gsm.index.str.contains('CCF')]
# Cast all remaining rows to int, skipping the COO_* rows because they can
# hold the string 'na'. The cast goes through float first -- presumably the
# values are read as strings such as '2.0'; TODO confirm.
rows = [i for i in gsm.index if i not in ['COO_ABC', 'COO_GCB', 'COO_UNC']]
for idx in rows:
    gsm.loc[idx] = gsm.loc[idx].astype(float).astype(int)
gsm.head()

Unnamed: 0_level_0,DLBCL11470,DLBCL10900,DLBC_FF_A7CQ_TP_NB,DLBCL10462,DLBCL_RICOVER_1081,DLBCL_LS1098,DLBCL_RICOVER_299,DLBCL11558,DLBCL_C_D_1105_NULLPAIR,DLBCL11447,...,DLBCL11455,DLBCL_RICOVER_685,DLBCL_LS146,DLBCL_RICOVER_111,DLBCL_RICOVER_173,DLBCL11515,DLBCL10491,DLBCL_RICOVER_1046,DLBCL10547,DLBCL10998
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
STAT3,0,0,0,0,2,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
STK33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OSBPL10,0,2,1,0,0,0,0,0,0,1,...,2,0,0,0,0,2,0,0,0,0
BCL11A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
PIM1,2,2,2,2,0,0,2,0,0,2,...,0,2,0,0,0,0,0,0,2,0


In [31]:
# Restrict the matrix to C3 samples: Shipp columns first, then Staudt columns.
all_c3 = [*shipp_samples_c3, *staudt_samples_c3]
gsm_c3 = gsm.loc[:, all_c3]
gsm_c3.head()

Unnamed: 0_level_0,DLBCL_MAYO_DLBCL_234,DLBCL_RICOVER_978,DLBCL_C_D_1110_NULLPAIR,DLBCL_LS4593,DLBCL_LS4592,DLBCL_LS3820,DLBCL_LS3615,DLBCL_LS297,DLBCL_LS2328,DLBCL_LS2325,...,DLBCL11656,DLBCL11666,DLBCL11667,DLBCL11669,DLBCL11670,DLBCL11672,DLBCL11675,DLBCL11680,DLBCL11683,DLBCL11685
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
STAT3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
STK33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
OSBPL10,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,2
BCL11A,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PIM1,0,2,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0


In [77]:
# Rows of the COO / ploidy comparison table.
coo_ploidy = ['COO_ABC', 'COO_GCB', 'COO_UNC', 'PLOIDY']
groups = ['Staudt', 'Shipp']
sets = [staudt_samples_c3, shipp_samples_c3]

# Result columns: p-value plus wt/mt counts per group, once for all C3 samples
# and once ('_os') for the C3 samples that have OS status.
cols = ['p', 'Shipp_wt', 'Shipp_mt', 'Staudt_wt', 'Staudt_mt',
        'p_os', 'Shipp_os_wt', 'Shipp_os_mt', 'Staudt_os_wt', 'Staudt_os_mt']

# Zero-filled placeholder, one row per entry of `coo_ploidy` (row count is
# derived from the list rather than hard-coded). The count columns stay
# integer, but the p-value columns are created as float because float
# p-values are written into them later; assigning a float into an int64
# column relies on silent upcasting, which recent pandas versions deprecate.
stats_table_coo = (
    pd.DataFrame(0, index=coo_ploidy, columns=cols)
    .astype({'p': float, 'p_os': float})
)

stats_table_coo

Unnamed: 0,p,Shipp_wt,Shipp_mt,Staudt_wt,Staudt_mt,p_os,Shipp_os_wt,Shipp_os_mt,Staudt_os_wt,Staudt_os_mt
COO_ABC,0,0,0,0,0,0,0,0,0,0
COO_GCB,0,0,0,0,0,0,0,0,0,0
COO_UNC,0,0,0,0,0,0,0,0,0,0
PLOIDY,0,0,0,0,0,0,0,0,0,0


In [78]:
# Fill `stats_table_coo`: for each COO / ploidy indicator, count samples with
# the indicator set (mt) and unset (wt) in the Shipp and Staudt C3 groups and
# compare the two groups with a two-sided Fisher's exact test.

# Float p-values are written below; make sure their columns are float first.
# Writing a float into an int64 column relies on silent upcasting, which
# recent pandas versions deprecate.
stats_table_coo = stats_table_coo.astype({'p': float, 'p_os': float})

# Column-name suffix -> (Shipp sample IDs, Staudt sample IDs).
# ''    : all C3 samples
# '_os' : only C3 samples with OS status
coo_sample_sets = {
    '': (shipp_samples_c3, staudt_samples_c3),
    '_os': (shipp_samples_c3_os, staudt_samples_c3_os),
}

for suffix, (shipp_ids, staudt_ids) in coo_sample_sets.items():
    for c in stats_table_coo.index:
        sh_g = gsm.loc[c, shipp_ids]
        st_g = gsm.loc[c, staudt_ids]

        # Only exact 1 / 0 values are counted, so samples holding anything
        # else (e.g. the 'na' COO marker) fall into neither cell.
        sh_mt = (sh_g == 1).sum()
        sh_wt = (sh_g == 0).sum()
        st_mt = (st_g == 1).sum()
        st_wt = (st_g == 0).sum()

        # 2x2 table: rows = Shipp, Staudt; columns = mt, wt.
        m = np.array([[sh_mt, sh_wt],
                      [st_mt, st_wt]])

        # First element of the first component of the R result is the p-value.
        p = fisher_exact_2x2(m, 'two.sided')[0][0]

        stats_table_coo.loc[c, f'p{suffix}'] = p
        stats_table_coo.loc[c, f'Shipp{suffix}_wt'] = sh_wt
        stats_table_coo.loc[c, f'Shipp{suffix}_mt'] = sh_mt
        stats_table_coo.loc[c, f'Staudt{suffix}_wt'] = st_wt
        stats_table_coo.loc[c, f'Staudt{suffix}_mt'] = st_mt

stats_table_coo

Unnamed: 0,p,Shipp_wt,Shipp_mt,Staudt_wt,Staudt_mt,p_os,Shipp_os_wt,Shipp_os_mt,Staudt_os_wt,Staudt_os_mt
COO_ABC,0.29771,43,2,55,7,1.0,39,2,41,2
COO_GCB,0.315974,6,39,14,48,0.754031,6,35,5,38
COO_UNC,0.7576,41,4,55,7,0.709551,37,4,40,3
PLOIDY,1.0,52,3,58,4,1.0,47,3,40,3


In [79]:
# Zero-filled placeholder with one row per driver feature and one column per
# entry of `cols`. Building it directly from the index/columns also works when
# `drivers` is empty (the nested-list numpy construction collapses to a 1-D
# empty array in that case and the column assignment then fails).
# The count columns stay integer; the p-value columns are float because float
# p-values are written into them later, and assigning a float into an int64
# column relies on silent upcasting, which recent pandas versions deprecate.
stats_table = (
    pd.DataFrame(0, index=drivers, columns=cols)
    .astype({'p': float, 'p_os': float})
)

stats_table

Unnamed: 0,p,Shipp_wt,Shipp_mt,Staudt_wt,Staudt_mt,p_os,Shipp_os_wt,Shipp_os_mt,Staudt_os_wt,Staudt_os_mt
SV.BCL6,0,0,0,0,0,0,0,0,0,0
DTX1,0,0,0,0,0,0,0,0,0,0
CD70,0,0,0,0,0,0,0,0,0,0
TNFAIP3,0,0,0,0,0,0,0,0,0,0
NOTCH2,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
X1Q32.1.AMP,0,0,0,0,0,0,0,0,0,0
BCL11A,0,0,0,0,0,0,0,0,0,0
IRF4,0,0,0,0,0,0,0,0,0,0
LYN,0,0,0,0,0,0,0,0,0,0


In [80]:
# Fill `stats_table`: for each driver feature, count altered (mt) and
# unaltered (wt) samples in the Shipp and Staudt C3 groups and compare the two
# groups with a two-sided Fisher's exact test.

# Float p-values are written below; make sure their columns are float first.
# Writing a float into an int64 column relies on silent upcasting, which
# recent pandas versions deprecate.
stats_table = stats_table.astype({'p': float, 'p_os': float})

# Column-name suffix -> (Shipp sample IDs, Staudt sample IDs).
# ''    : all C3 samples
# '_os' : only C3 samples with OS status
driver_sample_sets = {
    '': (shipp_samples_c3, staudt_samples_c3),
    '_os': (shipp_samples_c3_os, staudt_samples_c3_os),
}

for suffix, (shipp_ids, staudt_ids) in driver_sample_sets.items():
    for c in stats_table.index:
        sh_g = gsm.loc[c, shipp_ids]
        st_g = gsm.loc[c, staudt_ids]

        # Any non-zero entry counts as altered (the matrix preview shows
        # values such as 1 and 2); only an exact 0 counts as wild type.
        sh_mt = (sh_g != 0).sum()
        sh_wt = (sh_g == 0).sum()
        st_mt = (st_g != 0).sum()
        st_wt = (st_g == 0).sum()

        # 2x2 table: rows = Shipp, Staudt; columns = mt, wt.
        m = np.array([[sh_mt, sh_wt],
                      [st_mt, st_wt]])

        # First element of the first component of the R result is the p-value.
        p = fisher_exact_2x2(m, 'two.sided')[0][0]

        stats_table.loc[c, f'p{suffix}'] = p
        stats_table.loc[c, f'Shipp{suffix}_wt'] = sh_wt
        stats_table.loc[c, f'Shipp{suffix}_mt'] = sh_mt
        stats_table.loc[c, f'Staudt{suffix}_wt'] = st_wt
        stats_table.loc[c, f'Staudt{suffix}_mt'] = st_mt

stats_table

Unnamed: 0,p,Shipp_wt,Shipp_mt,Staudt_wt,Staudt_mt,p_os,Shipp_os_wt,Shipp_os_mt,Staudt_os_wt,Staudt_os_mt
SV.BCL6,0.621224,54,1,59,3,0.462366,50,0,42,1
DTX1,0.044275,47,8,60,2,0.013945,43,7,43,0
CD70,0.600181,53,2,61,1,1.000000,48,2,42,1
TNFAIP3,0.302527,49,6,59,3,0.444672,45,5,41,2
NOTCH2,1.000000,52,3,59,3,1.000000,48,2,41,2
...,...,...,...,...,...,...,...,...,...,...
X1Q32.1.AMP,1.000000,51,4,57,5,1.000000,46,4,40,3
BCL11A,1.000000,53,2,60,2,1.000000,48,2,42,1
IRF4,0.444825,53,2,57,5,0.659450,48,2,40,3
LYN,0.470085,54,1,62,0,1.000000,49,1,43,0


In [85]:
# Stack the driver rows on top of the COO / ploidy rows so the multiple-testing
# correction is applied across every test at once.
stats_table_all = pd.concat([stats_table, stats_table_coo])

# Benjamini-Hochberg adjusted p-values for the all-sample comparison;
# element 1 of the multipletests result holds the corrected values.
q_all = sm.multipletests(stats_table_all['p'], method='fdr_bh')[1]
stats_table_all.insert(0, 'q', q_all)

# Same correction for the OS-subset comparison. Position 6 is the slot
# directly in front of `p_os` once `q` has been inserted at position 0.
q_os = sm.multipletests(stats_table_all['p_os'], method='fdr_bh')[1]
stats_table_all.insert(6, 'q_os', q_os)

# Most significant all-sample comparisons first.
stats_table_all = stats_table_all.sort_values(by='p')
stats_table_all

Unnamed: 0,q,p,Shipp_wt,Shipp_mt,Staudt_wt,Staudt_mt,q_os,p_os,Shipp_os_wt,Shipp_os_mt,Staudt_os_wt,Staudt_os_mt
X12P13.2.DEL,0.563093,0.004092,48,7,62,0,0.900124,0.013945,43,7,43,0
IRF2BP2,0.563093,0.006744,55,0,54,8,0.900124,0.018522,50,0,38,5
X6P21.1.AMP,0.682411,0.020763,50,5,62,0,0.900124,0.059290,45,5,43,0
X9P21.3.DEL,0.682411,0.023488,46,9,60,2,1.000000,0.169546,43,7,41,2
BCL2,0.682411,0.024520,16,39,31,31,1.000000,0.084782,14,36,20,23
...,...,...,...,...,...,...,...,...,...,...,...,...
X17P.DEL,1.000000,1.000000,47,8,53,9,1.000000,0.767447,43,7,38,5
MYD88.L265P,1.000000,1.000000,55,0,62,0,1.000000,1.000000,50,0,43,0
X18P.AMP,1.000000,1.000000,45,10,50,12,1.000000,1.000000,41,9,36,7
X2Q22.2.DEL,1.000000,1.000000,54,1,60,2,1.000000,0.211080,50,0,41,2


In [86]:
# Write the combined, sorted comparison table as a tab-separated file.
stats_table_all.to_csv(
    '../../data_tables/qval_dfs/c3_staudt_vs_shipp.tsv',
    sep='\t',
)