In [1]:
import pandas as pd
from scipy.stats import fisher_exact as fe
import numpy as np
import os

In [2]:
# Make sure R_HOME is set before rpy2 is imported (presumably so rpy2 can locate
# the R installation at import time). The fallback is a macOS framework path;
# on other systems export R_HOME before starting the kernel.
if "R_HOME" not in os.environ:
    os.environ['R_HOME'] = '/Library/Frameworks/R.framework/Resources/'
import rpy2.robjects.numpy2ri
import rpy2.robjects as R
from rpy2.robjects.packages import importr

# Turn on numpy <-> R conversion so numpy arrays can be passed to R functions.
rpy2.robjects.numpy2ri.activate()
# Seed R's random number generator: fisher_exact_5x2 below uses a simulated
# (Monte Carlo) p-value, so results depend on the RNG state and call order.
R.r('set.seed')(1)
# Handle to R's `stats` package, used for fisher.test.
R_STATS = importr('stats')

def fisher_exact_5x2(matrix, numiter=100000):
    """Run R's ``stats::fisher.test`` on a contingency table with a simulated p-value.

    Despite the ``5x2`` in the name, nothing here depends on the table shape;
    the Table 1b cell in this notebook calls it with a 2x4 matrix.

    Parameters
    ----------
    matrix : array-like of counts
        Contingency table handed to R as-is (numpy conversion is activated
        in the setup cell).
    numiter : int, default 100000
        Number of Monte Carlo replicates, passed to R as ``B``.

    Returns
    -------
    The R result object returned by ``fisher.test``. Callers in this notebook
    read the p-value as ``result[0][0]``.
    """
    simulation_options = {'simulate_p_value': True, 'B': numiter}
    return R_STATS.fisher_test(matrix, **simulation_options)

In [3]:
# Load the merged per-sample table (tab-separated; the first column becomes the index).
table_s1 = pd.read_csv(
    '../../data_tables/tableS1_classifier_merged.tsv',
    sep='\t',
    index_col=0,
)
# List the available columns as a sanity check.
print(table_s1.columns)

Index(['order', 'MatchID', 'Gender', 'Cohort', 'COO', 'Sample Preparation',
       'Mean Target Coverage', 'Median Target Coverage', 'Ploidy (ABSOLUTE)',
       'Purity (ABSOLUTE)', 'Pair Status (Tumor Only/Normal)',
       'Number of Mutations', 'Fraction Genome Deleted',
       'Fraction Genome Amplified', 'Number of Drivers - Mutations',
       'Number of Drivers - Non-Silent Mutations', 'Number of Drivers - SCNAs',
       'Number of Drivers - Amplifications', 'Number of Drivers - Deletions',
       'Number of Drivers - SVs', 'Number of WT Mutations (0)',
       'Number of WT SCNAs (0)', 'Number of WT Amplifications (0)',
       'Number of WT Deletions (0)', 'Number of WT SVs (0)',
       'Mutation Density', 'Mutation Density (Non Silent)',
       'Age-at first diagnosis', 'IPI_AGE', 'IPI_ECOG', 'IPI_STAGE', 'IPI_LDH',
       'IPI_EXBM', 'IPI', 'Biopsy Type', 'R-CHOp-like\nChemo', 'PFS-years',
       'PFS_STAT', 'OS.time', 'OS.status (1=dead)',
       'LymphGenClass [Schmitz]\ncall'

In [4]:
# Quick look at the IPI score and predicted-cluster columns that the table cells below filter on.
table_s1['IPI'], table_s1['PredictedCluster']

(ID
 DLBCL_C_D_1128_NULLPAIR    0.0
 DLBCL_C_D_1156_NULLPAIR    0.0
 DLBCL11539                 NaN
 DLBCL10515                 NaN
 DLBCL10942                 NaN
                           ... 
 DLBCL_LS3499               NaN
 DFCIDL008_DT               NaN
 DLBCL_C_D_1163_NULLPAIR    NaN
 DFCIDL004_DT               NaN
 DLBCL_C_D_1125_NULLPAIR    NaN
 Name: IPI, Length: 699, dtype: float64,
 ID
 DLBCL_C_D_1128_NULLPAIR    1.0
 DLBCL_C_D_1156_NULLPAIR    1.0
 DLBCL11539                 1.0
 DLBCL10515                 1.0
 DLBCL10942                 1.0
                           ... 
 DLBCL_LS3499               4.0
 DFCIDL008_DT               5.0
 DLBCL_C_D_1163_NULLPAIR    5.0
 DFCIDL004_DT               5.0
 DLBCL_C_D_1125_NULLPAIR    5.0
 Name: PredictedCluster, Length: 699, dtype: float64)

In [5]:
# NOTE(review): `tab_1b` is only created in the following cell, so on a fresh kernel this
# cell raises the NameError shown below. Remove it or move it after the Table 1b cell.
tab_1b

NameError: name 'tab_1b' is not defined

In [6]:
# Table 1b: IPI risk-group counts per DLBclass cluster, overall and per cohort,
# plus a one-vs-rest Fisher test of the IPI distribution for every row.
# Target layout (one 'all' / 'Chapuy' / 'Schmitz' row per cluster 1-5):
# DLBclass  Cohort   n-cohort  IPI low  IPI low int  IPI high int  IPI high
# 1         all
#           Chapuy
#           Schmitz
# ...

# Single source of truth for the row and column definitions used below.
CLUSTER_IDS = ['1', '2', '3', '4', '5']
# Row-label cohort key -> values of the 'Cohort' column it covers.
COHORT_LABELS = {
    'all': ['Chapuy et al.', 'Schmitz et al.'],
    'Chapuy': ['Chapuy et al.'],
    'Schmitz': ['Schmitz et al.'],
}
# Output column -> IPI scores that fall into that risk group.
IPI_GROUPS = {
    'IPI low': [0.0, 1.0],
    'IPI low int': [2.0],
    'IPI high int': [3.0],
    'IPI high': [4.0, 5.0],
}

tab_1b = pd.DataFrame(
    index=[cluster_id + '_' + cohort_key
           for cluster_id in CLUSTER_IDS
           for cohort_key in COHORT_LABELS],
    columns=['n'] + list(IPI_GROUPS) + ['missing data', 'p value (missing excluded)'],
)

# Pass 1: counts per (cluster, cohort) row.
for idx in tab_1b.index:
    cluster_id, cohort = idx.split('_')
    cluster = float(cluster_id)

    subset_tab = table_s1.loc[table_s1['PredictedCluster'] == cluster]
    cohort_condition = subset_tab['Cohort'].isin(COHORT_LABELS[cohort])
    cohort_ipi = subset_tab.loc[cohort_condition, 'IPI']

    tab_1b.loc[idx, 'n'] = cohort_ipi.shape[0]
    for column, scores in IPI_GROUPS.items():
        # NaN never matches a score, so missing IPI values are counted separately below.
        tab_1b.loc[idx, column] = int(cohort_ipi.isin(scores).sum())
    tab_1b.loc[idx, 'missing data'] = int(cohort_ipi.isna().sum())

# Pass 2: one-vs-rest test. Row 0 of `m` is this cluster, row 1 is the sum of all
# other clusters within the same cohort. Samples with missing IPI are not included.
# The iteration order is kept fixed because the simulated p-values depend on the
# R random number stream.
for idx in tab_1b.index:
    suffix = idx[1:]  # e.g. '_all'; shared by the matching rows of the other clusters
    m = np.zeros((2, len(IPI_GROUPS)), dtype=int)
    for j, column in enumerate(IPI_GROUPS):
        own = tab_1b.loc[idx, column]
        m[0, j] = own
        m[1, j] = sum(tab_1b.loc[cluster_id + suffix, column] for cluster_id in CLUSTER_IDS) - own
    p = fisher_exact_5x2(m)
    # First element of the R result object is read as the p-value.
    tab_1b.loc[idx, 'p value (missing excluded)'] = p[0][0]


tab_1b.to_csv('../../data_tables/table1B_IPIvsCohort.tsv', sep='\t')
tab_1b
    

Unnamed: 0,n,IPI low,IPI low int,IPI high int,IPI high,missing data,p value (missing excluded)
1_all,139,43,21,20,11,44,0.647414
1_Chapuy,43,10,5,10,8,10,0.316747
1_Schmitz,96,33,16,10,3,34,0.944471
2_all,168,55,31,26,16,40,0.478375
2_Chapuy,79,23,16,17,11,12,0.826442
2_Schmitz,89,32,15,9,5,28,0.420176
3_all,113,36,28,21,7,21,0.651903
3_Chapuy,52,14,15,13,6,4,0.443986
3_Schmitz,61,22,13,8,1,17,0.888851
4_all,99,38,23,10,3,25,0.055709


In [180]:
# Scratch check of the p-value extraction. `p` is whatever the most recently run test loop
# left behind, so the displayed value depends on cell execution order.
p[0][0]

0.6475735242647573

In [165]:
# Scratch inspection of the contingency matrix `m` left over from a test loop; the value
# shown depends on which cell (or partial run) last assigned it.
m

array([[43, 21],
       [ 0,  0]])

In [75]:
# Number of cluster-1 samples with a non-missing COO call (value_counts() skips NaN by default).
table_s1.loc[(table_s1['PredictedCluster'] == 1.0), 'COO'].value_counts().sum()

133

In [102]:
# Distribution of COO calls over all samples (missing values are not counted).
table_s1['COO'].value_counts()

ABC        308
GCB        224
Unclass    114
Name: COO, dtype: int64

In [139]:
# Table 1c: cell-of-origin (COO) counts per DLBclass cluster and cohort, overall and
# stratified by IPI risk group, with a one-vs-rest ABC-vs-GCB Fisher exact test per row.
# Target layout:
# DLBclass  Cohort            n-cohort n(%)  ABC      GCB      unclass  missing data  p value for ABC or GCB
# 1         all               139 (100)      63 (45)  25 (18)  45 (32)  6 (4)
#               IPI low
#               IPI low int
#               IPI high int
#               IPI high
#           Chapuy            (same IPI sub-rows)
#           Schmitz           (same IPI sub-rows)
# 2         all
# ...

# Single source of truth for the row and column definitions used below.
CLUSTER_IDS = ['1', '2', '3', '4', '5']
# Row-label cohort key -> values of the 'Cohort' column it covers.
COHORT_LABELS = {
    'all': ['Chapuy et al.', 'Schmitz et al.'],
    'Chapuy': ['Chapuy et al.'],
    'Schmitz': ['Schmitz et al.'],
}
# Row-label IPI key -> IPI scores that fall into that risk group.
IPI_GROUPS = {
    'IPI low': [0.0, 1.0],
    'IPI low int': [2.0],
    'IPI high int': [3.0],
    'IPI high': [4.0, 5.0],
}
# Output column -> value of the 'COO' column it counts.
COO_LABELS = {'ABC': 'ABC', 'GCB': 'GCB', 'unclassified': 'Unclass'}

tab_1c = pd.DataFrame(
    index=[cluster_id + '_' + cohort_key + ipi_suffix
           for cluster_id in CLUSTER_IDS
           for cohort_key in COHORT_LABELS
           for ipi_suffix in [''] + ['_' + ipi_key for ipi_key in IPI_GROUPS]],
    columns=['n', 'ABC', 'GCB', 'unclassified', 'missing COO', 'missing IPI', 'p value (ABC vs GCB)'],
)

# Pass 1: counts per (cluster, cohort[, IPI group]) row.
for idx in tab_1c.index:
    ss = idx.split('_')
    cluster = float(ss[0])
    cohort = ss[1]
    # Rows without an IPI part ('1_all', ...) summarise the whole cohort.
    ipi_val = ss[2] if len(ss) == 3 else 'none'

    subset_tab = table_s1.loc[table_s1['PredictedCluster'] == cluster]
    cohort_condition = subset_tab['Cohort'].isin(COHORT_LABELS[cohort])
    if ipi_val == 'none':
        # True for every row, including missing IPI (NaN != -1 evaluates to True).
        ipi_condition = subset_tab['IPI'] != -1
    else:
        ipi_condition = subset_tab['IPI'].isin(IPI_GROUPS[ipi_val])
    selected = subset_tab.loc[cohort_condition & ipi_condition]

    tab_1c.loc[idx, 'n'] = selected.shape[0]
    if ipi_val == 'none':
        # Only meaningful for the unstratified rows; left empty for IPI sub-rows.
        tab_1c.loc[idx, 'missing IPI'] = selected['IPI'].isna().sum()
    tab_1c.loc[idx, 'missing COO'] = selected['COO'].isna().sum()
    for column, coo_label in COO_LABELS.items():
        tab_1c.loc[idx, column] = int((selected['COO'] == coo_label).sum())

# Pass 2: 2x2 one-vs-rest test. Row 0 of `m` is this cluster, row 1 is the sum of all
# other clusters for the same cohort / IPI stratum; columns are ABC and GCB.
for idx in tab_1c.index:
    suffix = idx[1:]  # e.g. '_all_IPI low'; shared by the matching rows of the other clusters
    m = np.zeros((2, 2), dtype=int)
    for j, column in enumerate(['ABC', 'GCB']):
        own = tab_1c.loc[idx, column]
        m[0, j] = own
        m[1, j] = sum(tab_1c.loc[cluster_id + suffix, column] for cluster_id in CLUSTER_IDS) - own
    p = fe(m)
    # scipy's fisher_exact result is (odds ratio, p-value).
    tab_1c.loc[idx, 'p value (ABC vs GCB)'] = p[1]


tab_1c.to_csv('../../data_tables/table1C_COOvsIPI.tsv', sep='\t')
tab_1c

Unnamed: 0,n,ABC,GCB,unclassified,missing COO,missing IPI,p value (ABC vs GCB)
1_all,139,63,25,45,6,44,0.00451
1_all_IPI low,43,18,8,16,1,,0.032289
1_all_IPI low int,21,10,1,9,1,,0.02096
1_all_IPI high int,20,8,4,5,3,,1.0
1_all_IPI high,11,8,0,3,0,,0.146011
...,...,...,...,...,...,...,...
5_Schmitz,116,101,5,10,0,37,0.0
5_Schmitz_IPI low,37,30,2,5,0,,0.0
5_Schmitz_IPI low int,22,17,1,4,0,,0.000368
5_Schmitz_IPI high int,16,15,1,0,0,,0.05011


In [158]:
# Table 2b: cell-of-origin (COO) counts per DLBclass cluster and cohort, overall and
# split into high-confidence (HC) and low-confidence (LC) calls, with a one-vs-rest
# ABC-vs-GCB Fisher exact test per row.
# Target layout:
# DLBclass  Cohort   n-cohort   ABC      GCB      unclass  missing data
# 1         all      139 (100)  63 (45)  25 (18)  45 (32)  6 (4)
#              HC    106 (100)  47 (44)  20 (19)  36 (34)  3 (3)
#              LC    33 (100)   16 (48)  5 (15)   9 (27)   3 (9)
#           Chapuy   (same HC / LC sub-rows)
#           Schmitz  (same HC / LC sub-rows)
# 2         all      ...

# Single source of truth for the row and column definitions used below.
CLUSTER_IDS = ['1', '2', '3', '4', '5']
# Row-label cohort key -> values of the 'Cohort' column it covers.
COHORT_LABELS = {
    'all': ['Chapuy et al.', 'Schmitz et al.'],
    'Chapuy': ['Chapuy et al.'],
    'Schmitz': ['Schmitz et al.'],
}
# Output column -> value of the 'COO' column it counts.
COO_LABELS = {'ABC': 'ABC', 'GCB': 'GCB', 'unclassified': 'Unclass'}
# Calls with Confidence >= this value are HC, calls below it are LC.
CONFIDENCE_CUTOFF = 0.70

tab_2b = pd.DataFrame(
    index=[cluster_id + '_' + cohort_key + confidence_suffix
           for cluster_id in CLUSTER_IDS
           for cohort_key in COHORT_LABELS
           for confidence_suffix in ['', '_HC', '_LC']],
    columns=['n', 'ABC', 'GCB', 'unclassified', 'missing COO', 'p value (ABC vs GCB)'],
)

# Pass 1: counts per (cluster, cohort[, confidence level]) row.
for idx in tab_2b.index:
    ss = idx.split('_')
    cluster = float(ss[0])
    cohort = ss[1]
    # Rows without a confidence part ('1_all', ...) summarise the whole cohort.
    hc_val = ss[2] if len(ss) == 3 else 'none'

    subset_tab = table_s1.loc[table_s1['PredictedCluster'] == cluster]
    cohort_condition = subset_tab['Cohort'].isin(COHORT_LABELS[cohort])

    confidence = subset_tab['Confidence']
    if hc_val == 'HC':
        hc_condition = confidence >= CONFIDENCE_CUTOFF
    elif hc_val == 'LC':
        hc_condition = confidence < CONFIDENCE_CUTOFF
    else:
        # Union of HC and LC: keeps every sample with a confidence value and
        # drops samples whose Confidence is missing (NaN fails both comparisons).
        hc_condition = (confidence >= CONFIDENCE_CUTOFF) | (confidence < CONFIDENCE_CUTOFF)
    selected = subset_tab.loc[cohort_condition & hc_condition]

    tab_2b.loc[idx, 'n'] = selected.shape[0]
    tab_2b.loc[idx, 'missing COO'] = selected['COO'].isna().sum()
    for column, coo_label in COO_LABELS.items():
        tab_2b.loc[idx, column] = int((selected['COO'] == coo_label).sum())

# Pass 2: 2x2 one-vs-rest test. Row 0 of `m` is this cluster, row 1 is the sum of all
# other clusters for the same cohort / confidence level; columns are ABC and GCB.
for idx in tab_2b.index:
    suffix = idx[1:]  # e.g. '_all_HC'; shared by the matching rows of the other clusters
    m = np.zeros((2, 2), dtype=int)
    for j, column in enumerate(['ABC', 'GCB']):
        own = tab_2b.loc[idx, column]
        m[0, j] = own
        m[1, j] = sum(tab_2b.loc[cluster_id + suffix, column] for cluster_id in CLUSTER_IDS) - own
    p = fe(m)
    # scipy's fisher_exact result is (odds ratio, p-value).
    tab_2b.loc[idx, 'p value (ABC vs GCB)'] = p[1]

tab_2b.to_csv('../../data_tables/table2B_COOvsConfidence.tsv', sep='\t')
tab_2b

Unnamed: 0,n,ABC,GCB,unclassified,missing COO,p value (ABC vs GCB)
1_all,139,63,25,45,6,0.00451
1_all_HC,106,47,20,36,3,0.022148
1_all_LC,33,16,5,9,3,0.093858
1_Chapuy,43,23,7,7,6,0.004833
1_Chapuy_HC,34,20,5,6,3,0.001813
1_Chapuy_LC,9,3,2,1,3,1.0
1_Schmitz,96,40,18,38,0,0.186587
1_Schmitz_HC,72,27,15,30,0,0.730865
1_Schmitz_LC,24,13,3,8,0,0.086151
2_all,168,73,55,25,15,0.837715


In [163]:
# Table 4: LymphGen class counts per DLBclass cluster and cohort, overall and split
# into high-confidence (HC) and low-confidence (LC) DLBclass calls.
# Target layout:
# DLBclass  Cohort   n-cohort   BN2      A53     EZB    ST2    MCD    N1     Ambiguous  Other
# 1         All      139 (100)  79 (57)  1 (1)   2 (1)  2 (1)  3 (2)  2 (1)  6 (4)      44 (32)
#              HC    106 (100)  68 (64)  0 (0)   1 (1)  0 (0)  2 (2)  2 (2)  5 (5)      28 (26)
#              LC    33 (100)   11 (33)  1 (3)   1 (3)  2 (6)  1 (3)  0 (0)  1 (3)      16 (48)
#           Chapuy   43 (100)   24 (56)  1 (2)   0 (0)  1 (2)  0 (0)  1 (2)  1 (2)      15 (35)
#           Schmitz  96 (100)   55 (57)  0 (0)   2 (2)  1 (1)  3 (3)  1 (1)  5 (5)      29 (30)
# 2         All      168 (100)  7 (4)    46 (27) ...

# Single source of truth for the row and column definitions used below.
CLUSTER_IDS = ['1', '2', '3', '4', '5']
# Row-label cohort key -> values of the 'Cohort' column it covers.
COHORT_LABELS = {
    'all': ['Chapuy et al.', 'Schmitz et al.'],
    'Chapuy': ['Chapuy et al.'],
    'Schmitz': ['Schmitz et al.'],
}
# Calls with Confidence >= this value are HC, calls below it are LC.
CONFIDENCE_CUTOFF = 0.70
# Column of table_s1 holding the LymphGen call, and the classes counted by exact match.
LYMPHGEN_COLUMN = 'LymphGenClass [Wright]\ncall'
LYMPHGEN_CLASSES = ['BN2', 'A53', 'EZB', 'ST2', 'MCD', 'N1', 'Other']

tab_4 = pd.DataFrame(
    index=[cluster_id + '_' + cohort_key + confidence_suffix
           for cluster_id in CLUSTER_IDS
           for cohort_key in COHORT_LABELS
           for confidence_suffix in ['', '_HC', '_LC']],
    columns=['n', 'BN2', 'A53', 'EZB', 'ST2', 'MCD', 'N1', 'Ambiguous', 'Other'],
)

for idx in tab_4.index:
    ss = idx.split('_')
    cluster = float(ss[0])
    cohort = ss[1]
    # Rows without a confidence part ('1_all', ...) summarise the whole cohort.
    hc_val = ss[2] if len(ss) == 3 else 'none'

    subset_tab = table_s1.loc[table_s1['PredictedCluster'] == cluster]
    cohort_condition = subset_tab['Cohort'].isin(COHORT_LABELS[cohort])

    confidence = subset_tab['Confidence']
    if hc_val == 'HC':
        hc_condition = confidence >= CONFIDENCE_CUTOFF
    elif hc_val == 'LC':
        hc_condition = confidence < CONFIDENCE_CUTOFF
    else:
        # Union of HC and LC: keeps every sample with a confidence value and
        # drops samples whose Confidence is missing (NaN fails both comparisons).
        hc_condition = (confidence >= CONFIDENCE_CUTOFF) | (confidence < CONFIDENCE_CUTOFF)
    selected = subset_tab.loc[cohort_condition & hc_condition]
    lymphgen_call = selected[LYMPHGEN_COLUMN]

    tab_4.loc[idx, 'n'] = selected.shape[0]
    for lymphgen_class in LYMPHGEN_CLASSES:
        tab_4.loc[idx, lymphgen_class] = int((lymphgen_call == lymphgen_class).sum())
    # Calls containing '/' (more than one class listed) are counted as ambiguous.
    # na=False makes a missing call count as "not ambiguous" instead of putting
    # NaN into the boolean mask.
    tab_4.loc[idx, 'Ambiguous'] = int(lymphgen_call.str.contains('/', na=False).sum())


tab_4.to_csv('../../data_tables/table4_DLBCLASSvsLymphgen.tsv', sep='\t')
tab_4

Unnamed: 0,n,BN2,A53,EZB,ST2,MCD,N1,Ambiguous,Other
1_all,139,79,1,2,2,3,2,6,44
1_all_HC,106,68,0,1,0,2,2,5,28
1_all_LC,33,11,1,1,2,1,0,1,16
1_Chapuy,43,24,1,0,1,0,1,1,15
1_Chapuy_HC,34,22,0,0,0,0,1,0,11
1_Chapuy_LC,9,2,1,0,1,0,0,1,4
1_Schmitz,96,55,0,2,1,3,1,5,29
1_Schmitz_HC,72,46,0,1,0,2,1,5,17
1_Schmitz_LC,24,9,0,1,1,1,0,0,12
2_all,168,7,46,10,1,4,4,19,77


In [161]:
# NOTE(review): `subset_tab` is the loop variable leaked from a table-building cell (the
# cluster subset of its last iteration), so this inspection depends on that hidden state.
subset_tab['LymphGenClass [Wright]\ncall']

ID
DLBCL10971                 Other
DLBCL10492                 Other
DLBCL10856                   A53
DLBCL10858                   MCD
DLBCL_RICOVER_258          Other
                           ...  
DLBCL11514                   MCD
DFCIDL008_DT               Other
DLBCL_C_D_1163_NULLPAIR      MCD
DFCIDL004_DT               Other
DLBCL_C_D_1125_NULLPAIR      MCD
Name: LymphGenClass [Wright]\ncall, Length: 180, dtype: object