In [1]:
# This notebook takes as input GOI_out_AA.csv files (from getMutationCounts_overall_and_GOI.py),
# patient metadata, seurat metadata, fusionsDF, and creates both by CELL and 
# by SAMPLE summaryTables. The goal with this table is to provide an answer to questions like
# 'which patients have which mutations?', and 'how many cells have clinically relevant
# mutations?' 

In [2]:
import summarizeModule
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # want to disable this SettingWithCopyWarning

In [3]:
# READ IN ALL OF THESE BY-GENE AMINO-ACID LEVEL MUTATION COUNTS OBJECTS
mutsPATH = '/Users/lincoln.harris/code/SNP_calling_pipeline/getMutationCounts/'
egfrPATH = mutsPATH + 'egfr_germline_out_AA.csv'
brafPATH = mutsPATH + 'braf_germline_out_AA.csv'
krasPATH = mutsPATH + 'kras_germline_out_AA.csv'

egfr_df = pd.read_csv(egfrPATH, header=None, names=['cell', 'mutations'])
braf_df = pd.read_csv(brafPATH, header=None, names=['cell', 'mutations'])
kras_df = pd.read_csv(krasPATH, header=None, names=['cell', 'mutations'])
egfr_df

Unnamed: 0,cell,mutations
0,K21_B003995,[]
1,L22_1001000408,['Q787Q']
2,A8_B001557,[]
3,L22_B001016,[]
4,C20_B002073,[]
5,D8_B001474,[]
6,C5_B002572,[]
7,E17_B003116,[]
8,A15_B000420,[]
9,H10_B002573,[]


In [4]:
# FIRST STEP IS TO GENERATE THE mutationsDF
mutationsDF = pd.DataFrame(columns=['cell', 'brafMut', 'egfrMut', 'krasMut'])
mutationsDF['cell'] = egfr_df['cell']
mutationsDF['egfrMut'] = egfr_df['mutations'] # fill in EGFR first -- this is ok bc the cell order is based on egfr_df
summarizeModule.mutationsDF_fillIn('braf', braf_df, mutationsDF) 
summarizeModule.mutationsDF_fillIn('kras', kras_df, mutationsDF)
mutationsDF

Unnamed: 0,cell,brafMut,egfrMut,krasMut
0,K21_B003995,[],[],[]
1,L22_1001000408,[],['Q787Q'],[]
2,A8_B001557,[],[],[]
3,L22_B001016,[],[],[]
4,C20_B002073,[],[],[]
5,D8_B001474,[],[],[]
6,C5_B002572,[],[],[]
7,E17_B003116,[],[],[]
8,A15_B000420,[],[],[]
9,H10_B002573,[],[],[]


In [5]:
# CONVERTING LISTS INTO STRS. MAKES DOWNSTEAM ANALYSIS EASIER
summarizeModule.removeExtraCharacters_mutationsDF('egfr', mutationsDF)
summarizeModule.removeExtraCharacters_mutationsDF('braf', mutationsDF)
summarizeModule.removeExtraCharacters_mutationsDF('kras', mutationsDF)
mutationsDF

Unnamed: 0,cell,brafMut,egfrMut,krasMut
0,K21_B003995,,,
1,L22_1001000408,,Q787Q,
2,A8_B001557,,,
3,L22_B001016,,,
4,C20_B002073,,,
5,D8_B001474,,,
6,C5_B002572,,,
7,E17_B003116,,,
8,A15_B000420,,,
9,H10_B002573,,,


In [6]:
# READ IN patientMetadata
patientMetadata = pd.read_csv('/Users/lincoln.harris/code/SNP_calling_pipeline/metadata_all_cells_4.10.19.csv')
patientMetadata = patientMetadata.drop([0,1]) # first two rows are wierd
patientMetadata

Unnamed: 0.1,Unnamed: 0,nGene,nReads,orig.ident,well,plate,cell_id,sample_name,patient_id,DOB,...,res.0.1,res.0.3,res.0.5,res.0.7,res.0.9,S.Score,G2M.Score,Phase,main_seurat_cluster,immune_annotation
2,A10_1001000408,7085,602263,SeuratProject,A10,1001000408,A10_1001000408,LT_S21,TH185,1961-12-29,...,6,7,8,8,9,-0.180289,-0.535851,G1,8,non-immune
3,A10_1001000410,1914,185720,SeuratProject,A10,1001000410,A10_1001000410,LT_S21,TH185,1961-12-29,...,1,0,10,10,8,-0.005365,-0.231767,G1,10,immune
4,A10_1001000412,7274,914254,SeuratProject,A10,1001000412,A10_1001000412,LT_S21,TH185,1961-12-29,...,6,7,8,8,9,-0.377089,-0.492793,G1,8,non-immune
5,A10_B000420,2464,101565,SeuratProject,A10,B000420,A10_B000420,LT_S66,TH238,1949-08-25,...,0,11,13,13,13,-0.200227,0.030644,G2M,13,immune
6,A10_B000422,3424,748360,SeuratProject,A10,B000422,A10_B000422,LT_S66,TH238,1949-08-25,...,0,1,0,0,0,0.141943,0.025751,S,0,immune
7,A10_B000580,3984,3184663,SeuratProject,A10,B000580,A10_B000580,LT_S50,TH225,1940-06-25,...,0,1,0,0,0,0.117926,-0.303089,S,0,immune
8,A10_B000863,1093,391731,SeuratProject,A10,B000863,A10_B000863,LT_S47,TH220,1946-09-18,...,2,4,3,4,2,-0.124229,-0.203540,G1,3,non-immune
9,A10_B001007,717,399863,SeuratProject,A10,B001007,A10_B001007,LT_S82,TH226,1954-12-26,...,10,15,19,19,22,0.151958,-0.111904,S,19,non-immune
10,A10_B001008,3483,993763,SeuratProject,A10,B001008,A10_B001008,LT_S82,TH226,1954-12-26,...,0,1,0,0,0,-0.257014,-0.217378,G1,0,immune
11,A10_B001010,1911,730050,SeuratProject,A10,B001010,A10_B001010,LT_S82,TH226,1954-12-26,...,1,0,1,1,1,0.004001,0.422330,G2M,1,immune


In [7]:
set(patientMetadata['patient_id'])

{'TH067',
 'TH103',
 'TH146',
 'TH153',
 'TH155',
 'TH156',
 'TH157',
 'TH158',
 'TH169',
 'TH171',
 'TH179',
 'TH179_NAT',
 'TH185',
 'TH205',
 'TH210',
 'TH217',
 'TH218',
 'TH220',
 'TH222',
 'TH223',
 'TH225',
 'TH226',
 'TH227',
 'TH231',
 'TH236',
 'TH238',
 'TH238_NAT',
 'TH248',
 'TH266'}

In [8]:
# INIT THE SUMMARY TABLE
cols = ['cell', 'patient', 'clinical_driver_gene', 'clinical_mutation', 'coverage_to_ROI', 'clin_mut_found_bool', 'mutations_found_EGFR', 'mutations_found_BRAF', 'mutations_found_KRAS', 'fusions_found', 'tumorCell_bool']
summaryTable = pd.DataFrame(columns=cols)
summaryTable['cell'] = mutationsDF['cell']
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,K21_B003995,,,,,,,,,,
1,L22_1001000408,,,,,,,,,,
2,A8_B001557,,,,,,,,,,
3,L22_B001016,,,,,,,,,,
4,C20_B002073,,,,,,,,,,
5,D8_B001474,,,,,,,,,,
6,C5_B002572,,,,,,,,,,
7,E17_B003116,,,,,,,,,,
8,A15_B000420,,,,,,,,,,
9,H10_B002573,,,,,,,,,,


In [9]:
# FILL IN VARIOUS METADATA COLS
summarizeModule.genericSummaryTableFillIn('patient_id', 'patient', summaryTable, patientMetadata)
summarizeModule.genericSummaryTableFillIn('driver_gene', 'clinical_driver_gene', summaryTable, patientMetadata)
summarizeModule.genericSummaryTableFillIn('driver_mutation', 'clinical_mutation', summaryTable, patientMetadata)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,K21_B003995,TH236,EGFR,del19,,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,,,,,,
2,A8_B001557,TH179,BRAF,V600E,,,,,,,
3,L22_B001016,TH226,EGFR,del19,,,,,,,
4,C20_B002073,TH238_NAT,BRAF,V600E,,,,,,,
5,D8_B001474,TH248,EGFR,del19,,,,,,,
6,C5_B002572,TH266,ALK,fusion,,,,,,,
7,E17_B003116,TH231,ALK,fusion,,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,,,,,,
9,H10_B002573,TH266,ALK,fusion,,,,,,,


In [10]:
# FILL IN MUTATIONS FOUND COL 
summaryTable['mutations_found_EGFR'] = mutationsDF['egfrMut']
summaryTable['mutations_found_KRAS'] = mutationsDF['krasMut']
summaryTable['mutations_found_BRAF'] = mutationsDF['brafMut']
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,K21_B003995,TH236,EGFR,del19,,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,,Q787Q,,,,
2,A8_B001557,TH179,BRAF,V600E,,,,,,,
3,L22_B001016,TH226,EGFR,del19,,,,,,,
4,C20_B002073,TH238_NAT,BRAF,V600E,,,,,,,
5,D8_B001474,TH248,EGFR,del19,,,,,,,
6,C5_B002572,TH266,ALK,fusion,,,,,,,
7,E17_B003116,TH231,ALK,fusion,,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,,,,,,
9,H10_B002573,TH266,ALK,fusion,,,,,,,


In [11]:
# READ IN FUSIONS DATAFRAME, THEN FILL IN summaryTable
fusionsDF = pd.read_csv('/Users/lincoln.harris/code/SNP_calling_pipeline/summaryTable/fusion_dataframe.csv')
fusionsDF

Unnamed: 0,ALK--EML4,ALK_any,EML4_any,NTRK_any,RET_any,ROS1_any
0,C2_B000862,C2_B000862,L18_B003120,,,D10_B003523
1,P1_B001464,P1_B001464,D10_B003523,,,G5_1001000327
2,M11_B003522,M11_B003522,I22_B000276,,,O24_1001000377
3,G8_1001000317,G8_1001000317,A4_B001607,,,O23_1001000377
4,A7_10001000325,A7_10001000325,I6_B003642,,,A6_B003132
5,M12_B003522,M12_B003522,I4_B001607,,,H7_1001000377
6,B11_10001000325,B11_10001000325,P20_B002571,,,E10_B003528
7,G2_B000862,G2_B000862,E5_B001545,,,M19_B003777
8,J15_B000862,J15_B000862,E1_B003117,,,B3_B003187
9,E7_1001000317,E7_1001000317,O2_B003067,,,H2_B003126


In [33]:
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,K21_B003995,TH236,EGFR,del19,,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,,Q787Q,,,,
2,A8_B001557,TH179,BRAF,V600E,,,,,,,
3,L22_B001016,TH226,EGFR,del19,,,,,,,
4,C20_B002073,TH238_NAT,BRAF,V600E,,,,,,,
5,D8_B001474,TH248,EGFR,del19,,,,,,EML4_any,
6,C5_B002572,TH266,ALK,fusion,,,,,,,
7,E17_B003116,TH231,ALK,fusion,,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,,,,,,
9,H10_B002573,TH266,ALK,fusion,,,,,,,


In [40]:
summarizeModule.fusionsFillIn(fusionsDF, summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,K21_B003995,TH236,EGFR,del19,,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,,Q787Q,,,,
2,A8_B001557,TH179,BRAF,V600E,,,,,,,
3,L22_B001016,TH226,EGFR,del19,,,,,,,
4,C20_B002073,TH238_NAT,BRAF,V600E,,,,,,,
5,D8_B001474,TH248,EGFR,del19,,,,,,,
6,C5_B002572,TH266,ALK,fusion,,,,,,,
7,E17_B003116,TH231,ALK,fusion,,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,,,,,,
9,H10_B002573,TH266,ALK,fusion,,,,,,,


In [41]:
# SET UP A COL TO TRANSLATE 'RAW' MUTATION CALLS TO 'CLINICAL'
summaryTable['mutations_found_translated'] = ""
summarizeModule.translatedMutsFillIn_EGFR(summaryTable)
summarizeModule.translatedMutsFillIn_nonEGFR('KRAS', summaryTable)
summarizeModule.translatedMutsFillIn_nonEGFR('BRAF', summaryTable)
summarizeModule.translatedMutsFillIn_fusions(summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool,mutations_found_translated
0,K21_B003995,TH236,EGFR,del19,,,,,,,,[]
1,L22_1001000408,TH185,EGFR,L858R,,,Q787Q,,,,,[EGFR Q787Q]
2,A8_B001557,TH179,BRAF,V600E,,,,,,,,[]
3,L22_B001016,TH226,EGFR,del19,,,,,,,,[]
4,C20_B002073,TH238_NAT,BRAF,V600E,,,,,,,,[]
5,D8_B001474,TH248,EGFR,del19,,,,,,,,[]
6,C5_B002572,TH266,ALK,fusion,,,,,,,,[]
7,E17_B003116,TH231,ALK,fusion,,,,,,,,[]
8,A15_B000420,TH238,BRAF,V600E,,,,,,,,[]
9,H10_B002573,TH266,ALK,fusion,,,,,,,,[]


In [42]:
# CONVERT LISTS TO STRING, SO I CAN GET SET -- probably not necessary, actually 
summarizeModule.convertToString(summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool,mutations_found_translated
0,K21_B003995,TH236,EGFR,del19,,,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,,Q787Q,,,,,EGFR Q787Q
2,A8_B001557,TH179,BRAF,V600E,,,,,,,,
3,L22_B001016,TH226,EGFR,del19,,,,,,,,
4,C20_B002073,TH238_NAT,BRAF,V600E,,,,,,,,
5,D8_B001474,TH248,EGFR,del19,,,,,,,,
6,C5_B002572,TH266,ALK,fusion,,,,,,,,
7,E17_B003116,TH231,ALK,fusion,,,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,,,,,,,
9,H10_B002573,TH266,ALK,fusion,,,,,,,,


In [43]:
# FILL IN clin_mut_found_bool
summarizeModule.clinMutFound_fillIn(summaryTable)
summarizeModule.clinMutFound_fillIn_fus(summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool,mutations_found_translated
0,K21_B003995,TH236,EGFR,del19,,0,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,0,Q787Q,,,,,EGFR Q787Q
2,A8_B001557,TH179,BRAF,V600E,,0,,,,,,
3,L22_B001016,TH226,EGFR,del19,,0,,,,,,
4,C20_B002073,TH238_NAT,BRAF,V600E,,0,,,,,,
5,D8_B001474,TH248,EGFR,del19,,0,,,,,,
6,C5_B002572,TH266,ALK,fusion,,0,,,,,,
7,E17_B003116,TH231,ALK,fusion,,0,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,0,,,,,,
9,H10_B002573,TH266,ALK,fusion,,0,,,,,,


In [44]:
# FILL IN  tumorCellBool
summarizeModule.tumorCellBoolFillIn(summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool,mutations_found_translated
0,K21_B003995,TH236,EGFR,del19,,0,,,,,0,
1,L22_1001000408,TH185,EGFR,L858R,,0,Q787Q,,,,1,EGFR Q787Q
2,A8_B001557,TH179,BRAF,V600E,,0,,,,,0,
3,L22_B001016,TH226,EGFR,del19,,0,,,,,0,
4,C20_B002073,TH238_NAT,BRAF,V600E,,0,,,,,0,
5,D8_B001474,TH248,EGFR,del19,,0,,,,,1,
6,C5_B002572,TH266,ALK,fusion,,0,,,,,0,
7,E17_B003116,TH231,ALK,fusion,,0,,,,,1,
8,A15_B000420,TH238,BRAF,V600E,,0,,,,,0,
9,H10_B002573,TH266,ALK,fusion,,0,,,,,0,


In [45]:
# GET PER-CELL ROI COVERAGE DFs
braf_V600E_cov_nonZero = summarizeModule.getNonZeroCovROI('braf', 'V600E')
egfr_L858R_cov_nonZero = summarizeModule.getNonZeroCovROI('egfr', 'L858R')
egfr_exon19del_cov_nonZero = summarizeModule.getNonZeroCovROI('egfr', 'exon19del')
egfr_exon20ins_cov_nonZero = summarizeModule.getNonZeroCovROI('egfr', 'exon20ins') # this guy is totally empty...
egfr_G719X_cov_nonZero = summarizeModule.getNonZeroCovROI('egfr', 'G719X')
egfr_L861Q_cov_nonZero = summarizeModule.getNonZeroCovROI('egfr', 'L861Q')
egfr_S768I_cov_nonZero = summarizeModule.getNonZeroCovROI('egfr', 'S768I')
egfr_T790M_cov_nonZero = summarizeModule.getNonZeroCovROI('egfr', 'T790M')
kras_G12C_cov_nonZero = summarizeModule.getNonZeroCovROI('kras', 'G12C')
kras_G13X_cov_nonZero = summarizeModule.getNonZeroCovROI('kras', 'G13X')
kras_Q61X_cov_nonZero = summarizeModule.getNonZeroCovROI('kras', 'Q61X')
egfr_L858R_cov_nonZero

Unnamed: 0,cellName,coverage_bool_vcf,depth_vcf,coverage_bool_gvcf,depth_gvcf
66,A13_1001000407,1,12,1,23
67,A13_1001000408,0,0,1,2
121,A15_1001000408,0,0,1,3
153,A16_1001000407,1,6,1,15
155,A16_1001000412,1,16,1,29
185,A17_1001000407,0,0,1,5
212,A18_1001000408,0,0,1,2
285,A20_1001000408,1,6,1,14
384,A3_1001000412,0,0,1,3
542,B10_1001000412,1,3,1,7


In [46]:
# FIX UP SOME OF THE WEIRD ONES
kras_G13X_cov_nonZero['depth_gvcf'][4202] = 34
kras_Q61X_cov_nonZero['depth_gvcf'][6431] = 92
egfr_exon19del_cov_nonZero['depth_gvcf'] = egfr_exon19del_cov_nonZero['depth_gvcf'].str.strip('[')
egfr_exon19del_cov_nonZero['depth_gvcf'] = egfr_exon19del_cov_nonZero['depth_gvcf'].str.strip(']')
egfr_exon19del_cov_nonZero['depth_gvcf'] = egfr_exon19del_cov_nonZero['depth_gvcf'].str.strip("'")
egfr_exon19del_cov_nonZero

Unnamed: 0,cellName,coverage_bool_vcf,depth_vcf,coverage_bool_gvcf,depth_gvcf
0,A10_1001000407,0,0,0,0
1,A10_1001000408,0,0,0,0
2,A10_1001000412,0,0,0,0
3,A10_B000863,0,0,0,0
4,A10_B001007,0,0,0,0
5,A10_B001470,0,0,0,0
6,A10_B001474,0,0,0,0
7,A10_B001545,0,0,0,0
8,A10_B001548,0,0,0,0
9,A10_B001554,0,0,0,0


In [47]:
# FILL IN ROI COVERAGE TO SUMMARY TABLE
summarizeModule.ROI_coverage_fillIn(braf_V600E_cov_nonZero, 'BRAF', 'V600E', summaryTable)
summarizeModule.ROI_coverage_fillIn(egfr_G719X_cov_nonZero, 'EGFR', 'G719X', summaryTable)
summarizeModule.ROI_coverage_fillIn(egfr_L858R_cov_nonZero, 'EGFR', 'L858R', summaryTable)
summarizeModule.ROI_coverage_fillIn(egfr_L861Q_cov_nonZero, 'EGFR', 'L861Q', summaryTable)
summarizeModule.ROI_coverage_fillIn(egfr_S768I_cov_nonZero, 'EGFR', 'S768I', summaryTable)
summarizeModule.ROI_coverage_fillIn(egfr_T790M_cov_nonZero, 'EGFR', 'T790M', summaryTable)
summarizeModule.ROI_coverage_fillIn(kras_G12C_cov_nonZero, 'KRAS', 'G12C', summaryTable)
summarizeModule.ROI_coverage_fillIn(kras_G13X_cov_nonZero, 'KRAS', 'G13X', summaryTable)
summarizeModule.ROI_coverage_fillIn(kras_Q61X_cov_nonZero, 'KRAS', 'Q61X', summaryTable)
summarizeModule.ROI_coverage_fillIn(egfr_exon19del_cov_nonZero, 'EGFR', 'del19', summaryTable)
summarizeModule.ROI_coverage_fillIn(egfr_exon20ins_cov_nonZero, 'EGFR', 'ins20', summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool,mutations_found_translated
0,K21_B003995,TH236,EGFR,del19,0,0,,,,,0,
1,L22_1001000408,TH185,EGFR,L858R,0,0,Q787Q,,,,1,EGFR Q787Q
2,A8_B001557,TH179,BRAF,V600E,0,0,,,,,0,
3,L22_B001016,TH226,EGFR,del19,0,0,,,,,0,
4,C20_B002073,TH238_NAT,BRAF,V600E,0,0,,,,,0,
5,D8_B001474,TH248,EGFR,del19,0,0,,,,,1,
6,C5_B002572,TH266,ALK,fusion,,0,,,,,0,
7,E17_B003116,TH231,ALK,fusion,,0,,,,,1,
8,A15_B000420,TH238,BRAF,V600E,0,0,,,,,0,
9,H10_B002573,TH266,ALK,fusion,,0,,,,,0,


In [48]:
# TRIM IT DOWN
summaryTable_trimmed = summaryTable[['cell', 'patient', 'clinical_driver_gene', 'clinical_mutation', 'coverage_to_ROI', 'clin_mut_found_bool', 'tumorCell_bool', 'mutations_found_translated']]
summaryTable_trimmed.columns = ['cell', 'patient', 'clinical_driver_gene', 'clinical_mutation', 'coverage_to_ROI', 'clinical_mutation_found_bool', 'tumorCell_bool', 'mutations_found']
summaryTable_trimmed = summaryTable_trimmed[['cell', 'patient', 'clinical_driver_gene', 'clinical_mutation', 'mutations_found', 'coverage_to_ROI', 'clinical_mutation_found_bool', 'tumorCell_bool']]
summaryTable_trimmed

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,mutations_found,coverage_to_ROI,clinical_mutation_found_bool,tumorCell_bool
0,K21_B003995,TH236,EGFR,del19,,0,0,0
1,L22_1001000408,TH185,EGFR,L858R,EGFR Q787Q,0,0,1
2,A8_B001557,TH179,BRAF,V600E,,0,0,0
3,L22_B001016,TH226,EGFR,del19,,0,0,0
4,C20_B002073,TH238_NAT,BRAF,V600E,,0,0,0
5,D8_B001474,TH248,EGFR,del19,,0,0,1
6,C5_B002572,TH266,ALK,fusion,,,0,0
7,E17_B003116,TH231,ALK,fusion,,,0,1
8,A15_B000420,TH238,BRAF,V600E,,0,0,0
9,H10_B002573,TH266,ALK,fusion,,,0,0


In [49]:
# ADD SAMPLE_NAME COL TO SUMMARYTABLE
summaryTable_trimmed['sample_name'] = ''
summarizeModule.genericSummaryTableFillIn('sample_name', 'sample_name', summaryTable_trimmed, patientMetadata)
summaryTable_trimmed

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,mutations_found,coverage_to_ROI,clinical_mutation_found_bool,tumorCell_bool,sample_name
0,K21_B003995,TH236,EGFR,del19,,0,0,0,LT_S71
1,L22_1001000408,TH185,EGFR,L858R,EGFR Q787Q,0,0,1,LT_S21
2,A8_B001557,TH179,BRAF,V600E,,0,0,0,LT_S80
3,L22_B001016,TH226,EGFR,del19,,0,0,0,LT_S82
4,C20_B002073,TH238_NAT,BRAF,V600E,,0,0,0,LT_S65
5,D8_B001474,TH248,EGFR,del19,,0,0,1,LT_S74
6,C5_B002572,TH266,ALK,fusion,,,0,0,LT_S81
7,E17_B003116,TH231,ALK,fusion,,,0,1,LT_S56
8,A15_B000420,TH238,BRAF,V600E,,0,0,0,LT_S66
9,H10_B002573,TH266,ALK,fusion,,,0,0,LT_S81


In [50]:
keepRows = summaryTable_trimmed['patient'] == 'TH146'
summaryTable_trimmed[keepRows]

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,mutations_found,coverage_to_ROI,clinical_mutation_found_bool,tumorCell_bool,sample_name
1154,O23_1001000377,TH146,ROS1,ROS1-CD74,ROS1 fusion,,1,0,LT_S16
2978,O24_1001000377,TH146,ROS1,ROS1-CD74,ROS1 fusion,,1,0,LT_S16
6843,D6_1001000378,TH146,ROS1,ROS1-CD74,,,0,0,LT_S16
7510,H7_1001000377,TH146,ROS1,ROS1-CD74,ROS1 fusion,,1,0,LT_S16


In [52]:
keepRows = summaryTable_trimmed['sample_name'] == 'LT_S82'
summaryTable_trimmed[keepRows]

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,mutations_found,coverage_to_ROI,clinical_mutation_found_bool,tumorCell_bool,sample_name
3,L22_B001016,TH226,EGFR,del19,,0,0,0,LT_S82
22,I19_B001007,TH226,EGFR,del19,,0,0,0,LT_S82
28,M14_B001008,TH226,EGFR,del19,"EGFR R521K, EGFR Q787Q",0,0,0,LT_S82
31,D12_B001007,TH226,EGFR,del19,,0,0,0,LT_S82
35,C21_B001010,TH226,EGFR,del19,,0,0,0,LT_S82
60,E1_B001007,TH226,EGFR,del19,,0,0,0,LT_S82
70,O12_B001008,TH226,EGFR,del19,,0,0,0,LT_S82
169,N14_B001007,TH226,EGFR,del19,,0,0,0,LT_S82
179,C21_B001008,TH226,EGFR,del19,,0,0,0,LT_S82
228,H16_B001008,TH226,EGFR,del19,,0,0,0,LT_S82


In [53]:
# WRITE TO FILE
summaryTable_trimmed.to_csv('/Users/lincoln.harris/Desktop/validationTable_cells.4.10.19.csv', index=False)

In [None]:
#/////////////////////////////////////////////////////////////////
#
# LETS MAKE A BY-SAMPLE SUMMARY TABLE
#
#/////////////////////////////////////////////////////////////////

In [30]:
# GET MIN SET OF SAMPLE NAMES
relevantSamplesSet = set(summaryTable_trimmed['sample_name'])
relevantSamplesList = list(relevantSamplesSet)
relevantSamplesSeries = pd.Series(relevantSamplesList)
relevantSamplesSeries

0           
1     LT_S09
2     LT_S48
3     LT_S41
4     LT_S80
5     LT_S13
6     LT_S16
7     LT_S79
8     LT_S81
9     LT_S66
10    LT_S49
11    LT_S03
12    LT_S69
13    LT_S17
14    LT_S05
15    LT_S63
16    LT_S55
17    LT_S35
18    LT_S12
19    LT_S67
20    LT_S74
21    LT_S51
22    LT_S23
23    LT_S78
24    LT_S50
25    LT_S72
26    LT_S42
27    LT_S21
28    LT_S07
29    LT_S22
30    LT_S65
31    LT_S01
32    LT_S02
33    LT_S11
34    LT_S53
35    LT_S57
36    LT_S19
37    LT_S34
38    LT_S44
39    LT_S75
40    LT_S58
41    LT_S08
42    LT_S54
43    LT_S28
44    LT_S71
45    LT_S47
46    LT_S52
47    LT_S43
48    LT_S56
49    LT_S45
50    LT_S37
51    LT_S38
52    LT_S29
dtype: object

In [31]:
# INIT VALIDATIONTABLE_SAMPLES
cols = ['sample', 'patient', 'driver_gene', 'driver_mutation', 'mutations_found', 'numCells', 'numTumorCells', 'numTumorCells_w_coverage_to_ROI', 'numTumorCells_clinMut_found']
validationTable_samples = pd.DataFrame(columns=cols)
validationTable_samples['sample'] = relevantSamplesSeries
validationTable_samples

Unnamed: 0,sample,patient,driver_gene,driver_mutation,mutations_found,numCells,numTumorCells,numTumorCells_w_coverage_to_ROI,numTumorCells_clinMut_found
0,,,,,,,,,
1,LT_S09,,,,,,,,
2,LT_S48,,,,,,,,
3,LT_S41,,,,,,,,
4,LT_S80,,,,,,,,
5,LT_S13,,,,,,,,
6,LT_S16,,,,,,,,
7,LT_S79,,,,,,,,
8,LT_S81,,,,,,,,
9,LT_S66,,,,,,,,


In [32]:
# FILL IN METADATA FIELDS
summarizeModule.validationTable_metadata_fillIn('patient_id', 'patient', validationTable_samples, patientMetadata)
summarizeModule.validationTable_metadata_fillIn('driver_gene', 'driver_gene', validationTable_samples, patientMetadata)
summarizeModule.validationTable_metadata_fillIn('driver_mutation', 'driver_mutation', validationTable_samples, patientMetadata)
validationTable_samples

Unnamed: 0,sample,patient,driver_gene,driver_mutation,mutations_found,numCells,numTumorCells,numTumorCells_w_coverage_to_ROI,numTumorCells_clinMut_found
0,,,,,,,,,
1,LT_S09,TH067,EGFR,del19,,,,,
2,LT_S48,TH155,EGFR,del19,,,,,
3,LT_S41,TH210,ALK,fusion,,,,,
4,LT_S80,TH179,BRAF,V600E,,,,,
5,LT_S13,TH169,EGFR,L858R,,,,,
6,LT_S16,TH146,ROS1,ROS1-CD74,,,,,
7,LT_S79,TH179,BRAF,V600E,,,,,
8,LT_S81,TH266,ALK,fusion,,,,,
9,LT_S66,TH238,BRAF,V600E,,,,,


In [33]:
# FILL IN MUTATIONS FOUND
muts_dict = summarizeModule.validationTable_dict_muts(validationTable_samples, summaryTable_trimmed)
validationTable_samples['mutations_found'] = muts_dict.values()
validationTable_samples

Unnamed: 0,sample,patient,driver_gene,driver_mutation,mutations_found,numCells,numTumorCells,numTumorCells_w_coverage_to_ROI,numTumorCells_clinMut_found
0,,,,,"EGFR Q787Q, KRAS G13D, EGFR R521K, BRAF V600E...",,,,
1,LT_S09,TH067,EGFR,del19,,,,,
2,LT_S48,TH155,EGFR,del19,"EML4 fusion, BRAF G469R,",,,,
3,LT_S41,TH210,ALK,fusion,,,,,
4,LT_S80,TH179,BRAF,V600E,"KRAS A146V,",,,,
5,LT_S13,TH169,EGFR,L858R,,,,,
6,LT_S16,TH146,ROS1,ROS1-CD74,"ROS1 fusion,",,,,
7,LT_S79,TH179,BRAF,V600E,"EML4 fusion, BRAF V600E, KRAS A146P,",,,,
8,LT_S81,TH266,ALK,fusion,"BRAF G643G, KRAS G13C, BRAF Q609H, EML4 fusion...",,,,
9,LT_S66,TH238,BRAF,V600E,"KRAS G12C, EGFR F856L, EGFR G598V, EML4 fusion...",,,,


In [34]:
# FILL IN NUMTUMORCELLS (various)
tc_dict = summarizeModule.validationTable_dict_generic(validationTable_samples, summaryTable_trimmed, 'tumorCell_bool')
tc_cov_dict = summarizeModule.validationTable_dict_generic(validationTable_samples, summaryTable_trimmed, 'coverage_to_ROI')
clinMut_dict = summarizeModule.validationTable_dict_generic(validationTable_samples, summaryTable_trimmed, 'clinical_mutation_found_bool')

validationTable_samples['numTumorCells'] = tc_dict.values()
validationTable_samples['numTumorCells_w_coverage_to_ROI'] = tc_cov_dict.values()
validationTable_samples['numTumorCells_clinMut_found'] = clinMut_dict.values()
validationTable_samples

Unnamed: 0,sample,patient,driver_gene,driver_mutation,mutations_found,numCells,numTumorCells,numTumorCells_w_coverage_to_ROI,numTumorCells_clinMut_found
0,,,,,"EGFR Q787Q, KRAS G13D, EGFR R521K, BRAF V600E...",,6,0,0
1,LT_S09,TH067,EGFR,del19,,,0,0,0
2,LT_S48,TH155,EGFR,del19,"EML4 fusion, BRAF G469R,",,0,0,0
3,LT_S41,TH210,ALK,fusion,,,15,0,0
4,LT_S80,TH179,BRAF,V600E,"KRAS A146V,",,13,0,0
5,LT_S13,TH169,EGFR,L858R,,,1,0,0
6,LT_S16,TH146,ROS1,ROS1-CD74,"ROS1 fusion,",,0,0,0
7,LT_S79,TH179,BRAF,V600E,"EML4 fusion, BRAF V600E, KRAS A146P,",,60,3,3
8,LT_S81,TH266,ALK,fusion,"BRAF G643G, KRAS G13C, BRAF Q609H, EML4 fusion...",,0,0,0
9,LT_S66,TH238,BRAF,V600E,"KRAS G12C, EGFR F856L, EGFR G598V, EML4 fusion...",,260,2,2


In [35]:
# CLEAN UP A BIT
validationTable_samples = validationTable_samples.drop([0]) # this can change
cols = ['sample', 'patient', 'driver_gene', 'driver_mutation', 'mutations_found', 'numTumorCells', 'numTumorCells_w_coverage_to_ROI', 'numTumorCells_clinMut_found']
validationTable_samples = validationTable_samples[cols]
validationTable_samples

Unnamed: 0,sample,patient,driver_gene,driver_mutation,mutations_found,numTumorCells,numTumorCells_w_coverage_to_ROI,numTumorCells_clinMut_found
1,LT_S09,TH067,EGFR,del19,,0,0,0
2,LT_S48,TH155,EGFR,del19,"EML4 fusion, BRAF G469R,",0,0,0
3,LT_S41,TH210,ALK,fusion,,15,0,0
4,LT_S80,TH179,BRAF,V600E,"KRAS A146V,",13,0,0
5,LT_S13,TH169,EGFR,L858R,,1,0,0
6,LT_S16,TH146,ROS1,ROS1-CD74,"ROS1 fusion,",0,0,0
7,LT_S79,TH179,BRAF,V600E,"EML4 fusion, BRAF V600E, KRAS A146P,",60,3,3
8,LT_S81,TH266,ALK,fusion,"BRAF G643G, KRAS G13C, BRAF Q609H, EML4 fusion...",0,0,0
9,LT_S66,TH238,BRAF,V600E,"KRAS G12C, EGFR F856L, EGFR G598V, EML4 fusion...",260,2,2
10,LT_S49,TH223,EGFR,del19,"EGFR T903T,",7,0,0


In [36]:
# write this bitch
validationTable_samples.to_csv('./validationTable_samples_4.1.19.csv', index=False)