In [64]:
import numpy as np
import pandas as pd
# Notebook display settings: show up to 999 rows and never truncate columns,
# so the wide per-mutation tables below render in full.
pd.options.display.max_rows = 999
pd.set_option('display.max_columns', None)

In [75]:
# Inputs, both read relative to the notebook's working directory:
#   ratios_df - one row per cell (column 'cells'); the mutation columns hold
#               "variant:total" read-count strings (parsed further down).
#   meta      - per-cell metadata; only 'cell_id' and 'sample_name' are used here.
ratios_df = pd.read_csv(filepath_or_buffer='coverage_ratios_tumor_cells.csv')
meta = pd.read_csv(filepath_or_buffer='../metadata_all_cells_4.10.19.csv')

In [76]:
# Restrict the ratios table to the cell identifier plus every column whose
# name mentions one of the genes of interest.
target_genes = ('EGFR', 'KRAS', 'BRAF')
sub_l = ['cells'] + [
    name for name in ratios_df.columns
    if any(gene in name for gene in target_genes)
]

# '0:0' entries are turned into NaN so that later steps can skip them with
# a plain missing-value check.
ratios_df_sub = ratios_df[sub_l].replace('0:0', np.nan)

In [77]:
# Mutation column names: every column of the subset except the 'cells' identifier.
muts_list = ratios_df_sub.columns.tolist()
del muts_list[muts_list.index('cells')]

In [78]:
# Collect the distinct samples that the cells in ratios_df belong to.
#
# This list becomes the row index of the per-sample count tables, so its order
# is the row order of the CSVs written later. It is built in order of first
# appearance rather than via set(), whose ordering of strings changes between
# kernel sessions and made the output files non-reproducible.

# cell_id -> sample_name lookup, built once instead of scanning `meta` for every
# cell. When a cell_id occurs more than once the first metadata row wins, which
# matches the previous `.iloc[0]` behaviour.
cell_to_sample = (
    meta.drop_duplicates(subset='cell_id')
        .set_index('cell_id')['sample_name']
)

cells = ratios_df['cells']

# Fail up front, with a readable message, if any cell has no metadata row.
# IndexError is kept because that is what the previous per-row lookup raised.
unknown_cells = cells[~cells.isin(cell_to_sample.index)]
if not unknown_cells.empty:
    raise IndexError(
        f"{len(unknown_cells)} cell(s) in ratios_df have no row in meta, "
        f"e.g. {unknown_cells.iloc[0]!r}"
    )

samples_list = cells.map(cell_to_sample).drop_duplicates().tolist()

In [79]:
# Zero-initialised accumulators with one row per sample and one column per
# mutation. They are filled in place by the accumulation loop in the next cell.
#
# Built directly from the scalar 0 so the frames are integer-typed from the
# start; the earlier all-NaN frame + DataFrame.applymap(lambda x: 0) round trip
# relied on applymap, which is deprecated in recent pandas releases.
total_counts = pd.DataFrame(0, index=samples_list, columns=muts_list)

variant_counts = pd.DataFrame(0, index=samples_list, columns=muts_list)

In [83]:
# Accumulate per-sample read counts for every mutation column.
#
# Each non-missing mutation entry of ratios_df_sub is a "variant:total" string;
# the first number is added to variant_counts and the second to total_counts,
# in the row of the sample the cell belongs to.
#
# NOTE: both tables are incremented in place, so re-running this cell without
# first re-creating total_counts / variant_counts double-counts every entry.

# cell_id -> sample_name lookup, built once instead of scanning `meta` for every
# row. When a cell_id occurs more than once the first metadata row wins, which
# matches the previous `.iloc[0]` behaviour.
cell_to_sample = (
    meta.drop_duplicates(subset='cell_id')
        .set_index('cell_id')['sample_name']
)

# Validate before touching the counters so a missing cell cannot leave them
# half-updated. IndexError is kept because that is what the previous per-row
# lookup raised.
unknown_cells = ratios_df_sub.loc[
    ~ratios_df_sub['cells'].isin(cell_to_sample.index), 'cells'
]
if not unknown_cells.empty:
    raise IndexError(
        f"{len(unknown_cells)} cell(s) in ratios_df_sub have no row in meta, "
        f"e.g. {unknown_cells.iloc[0]!r}"
    )

for idx, row in ratios_df_sub.iterrows():
    sample = cell_to_sample[row['cells']]

    for mut, ratio in row.items():
        # Skip the identifier column by name (instead of guessing from the
        # presence of '_' in the value) and skip missing ratios.
        if mut == 'cells' or pd.isna(ratio):
            continue

        parts = ratio.split(':')
        variant_count = int(parts[0])
        total_count = int(parts[1])

        # Single .loc[row, col] access: chained indexing such as
        # variant_counts[mut][sample] += ... is not guaranteed to write back
        # into the frame.
        variant_counts.loc[sample, mut] += variant_count
        # Previously total_count was parsed but never stored, which left
        # total_counts all zeros after a clean run.
        total_counts.loc[sample, mut] += total_count

In [81]:
# Persist the per-sample total read counts; the sample names (the frame's
# index) are written as the first column under the header 'sample'.
total_counts.to_csv(path_or_buf='total_counts_by_sample.csv', index_label='sample')

# Show the table that was just written.
total_counts

Unnamed: 0,EGFR_R521K,EGFR_E746_A750delELREA,KRAS_G12C,EGFR_K745_A750T,EGFR_G42D,EGFR_L858R,KRAS_G13D,EGFR_A237V,KRAS_A146P,KRAS_Q61H,BRAF_V600E,EGFR_R1100S,KRAS_Q61L,KRAS_A146V,BRAF_G466E,BRAF_A762E,EGFR_Q1020H,EGFR_V536M,EGFR_P589L,EGFR_D1014N,KRAS_G12A,KRAS_C185S,KRAS_G12F,EGFR_P1019L,KRAS_G13C,EGFR_K754E,EGFR_F856L,EGFR_G857V,EGFR_V843L,EGFR_R831H
LT_S48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LT_S66,246,0,0,0,17,0,0,0,0,149,176,0,0,0,0,0,0,0,0,22,0,0,0,0,29,0,34,0,0,0
LT_S56,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LT_S21,1937,12,30,5,34,800,53,48,30,0,0,0,0,10,0,6,0,4,0,0,4,0,30,0,13,0,9,0,20,0
LT_S07,54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LT_S53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26,0,0,0,0,0,0,24,0,0,0
LT_S71,0,0,0,405,0,0,20,0,0,0,0,59,0,0,0,0,0,0,0,0,0,0,0,37,0,0,0,0,0,0
LT_S55,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LT_S45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LT_S49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [84]:
# Persist the per-sample variant read counts; the sample names (the frame's
# index) are written as the first column under the header 'sample'.
variant_counts.to_csv(path_or_buf='variant_counts_by_sample.csv', index_label='sample')

# Show the table that was just written.
variant_counts

Unnamed: 0,EGFR_R521K,EGFR_E746_A750delELREA,KRAS_G12C,EGFR_K745_A750T,EGFR_G42D,EGFR_L858R,KRAS_G13D,EGFR_A237V,KRAS_A146P,KRAS_Q61H,BRAF_V600E,EGFR_R1100S,KRAS_Q61L,KRAS_A146V,BRAF_G466E,BRAF_A762E,EGFR_Q1020H,EGFR_V536M,EGFR_P589L,EGFR_D1014N,KRAS_G12A,KRAS_C185S,KRAS_G12F,EGFR_P1019L,KRAS_G13C,EGFR_K754E,EGFR_F856L,EGFR_G857V,EGFR_V843L,EGFR_R831H
LT_S48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LT_S66,157,0,0,0,10,0,0,0,0,40,176,0,0,0,0,0,0,0,0,13,0,0,0,0,5,0,5,0,0,0
LT_S56,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LT_S21,1937,4,8,2,12,638,14,14,4,0,0,0,0,3,0,2,0,2,0,0,3,0,8,0,2,0,2,0,3,0
LT_S07,54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LT_S53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,4,0,0,0
LT_S71,0,0,0,373,0,0,9,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0
LT_S55,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LT_S45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LT_S49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [86]:
# Read the variant-count file back from disk to check what was written.
# Point this at 'total_counts_by_sample.csv' to inspect the totals instead.
pd.read_csv(filepath_or_buffer='variant_counts_by_sample.csv')

Unnamed: 0,sample,EGFR_R521K,EGFR_E746_A750delELREA,KRAS_G12C,EGFR_K745_A750T,EGFR_G42D,EGFR_L858R,KRAS_G13D,EGFR_A237V,KRAS_A146P,KRAS_Q61H,BRAF_V600E,EGFR_R1100S,KRAS_Q61L,KRAS_A146V,BRAF_G466E,BRAF_A762E,EGFR_Q1020H,EGFR_V536M,EGFR_P589L,EGFR_D1014N,KRAS_G12A,KRAS_C185S,KRAS_G12F,EGFR_P1019L,KRAS_G13C,EGFR_K754E,EGFR_F856L,EGFR_G857V,EGFR_V843L,EGFR_R831H
0,LT_S48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,LT_S66,157,0,0,0,10,0,0,0,0,40,176,0,0,0,0,0,0,0,0,13,0,0,0,0,5,0,5,0,0,0
2,LT_S56,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,LT_S21,1937,4,8,2,12,638,14,14,4,0,0,0,0,3,0,2,0,2,0,0,3,0,8,0,2,0,2,0,3,0
4,LT_S07,54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,LT_S53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,4,0,0,0
6,LT_S71,0,0,0,373,0,0,9,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0
7,LT_S55,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,LT_S45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,LT_S49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
