In [1]:
#///////////////////////////////////////////////////////////////////
# script: mutations_by_patient_GOI
# author: Lincoln
# date: 2/27/19
#
# This script takes the output of getMutationCounts_overall_and_GOI.py 
#    (option 4) and parses it into a readable, useable table, where 
#    the number of each possible mutation is given, on a per-patient
#   basis
#
#  adapted from SNP_calling_pipeline/lolliplots/creating_lollipop_cmds.ipynb
#
#///////////////////////////////////////////////////////////////////

In [2]:
import pandas as pd

In [3]:
egfr_df = pd.read_csv('/Users/lincoln.harris/Desktop/egfr_out_AA.csv', header=None, names=['cell', 'mutations'])
egfr_df

Unnamed: 0,cell,mutations
0,P13_1001000339,[]
1,C1_B002572,[]
2,J2_B002573,[]
3,P22_B003116,[]
4,O22_B003191,[]
5,J12_B003529,['T903T']
6,D14_1001000407,"['R521K', 'T903T', 'T629T', 'Q787Q']"
7,F14_B003105,[]
8,P11_1001000340,[]
9,E21_B003930,['T903T']


In [4]:
egfr_df['mutations'] = egfr_df['mutations'].str.replace("'", "") # remove quotes
egfr_df['mutations'] = egfr_df['mutations'].str.replace("[", "") # remove brackets
egfr_df['mutations'] = egfr_df['mutations'].str.replace("]", "") # remove brackets
egfr_df['mutations'] = egfr_df['mutations'].str.replace(" ", "") # remove whitespace?
egfr_df['mutations']

0                                    
1                                    
2                                    
3                                    
4                                    
5                               T903T
6             R521K,T903T,T629T,Q787Q
7                                    
8                                    
9                               T903T
10                                   
11                                   
12                        Q787Q,T629T
13                                   
14                                   
15                              T903T
16                                   
17                                   
18                                   
19                                   
20                                   
21                                   
22                                   
23                                   
24                                   
25                                   
26          

In [5]:
# create a dict for all of the unique mutations 

mutations_dict = {}

for i in range(0,len(egfr_df.index)):
    currRow = egfr_df['mutations'].iloc[i]
    currRowSplit = currRow.split(',')
    for currMut in currRowSplit:
        if currMut in mutations_dict.keys():     # existing entry
            currVal = mutations_dict.get(currMut)
            currVal += 1
            mutations_dict.update({currMut : currVal})
        else:     # new entry
            mutations_dict.update({currMut : 1})

In [6]:
mutations_dict

{'': 8352,
 'A237>?': 1,
 'A237V': 7,
 'D1014N': 24,
 'D994D': 41,
 'E709A': 1,
 'E746_A750delELREA': 53,
 'E746_T751delELREAT': 53,
 'F856L': 3,
 'G221V': 1,
 'G331R': 1,
 'G598V': 1,
 'G652G': 3,
 'G719A': 1,
 'G857V': 2,
 'H1129Y': 1,
 'I1093M': 1,
 'K745_A750>T': 21,
 'K754E': 2,
 'L1034I': 1,
 'L1167V': 2,
 'L387M': 1,
 'L747S': 1,
 'L858R': 148,
 'L861Q': 1,
 'L907M': 1,
 'P1019L': 2,
 'P589L': 3,
 'Q1020H': 3,
 'Q787Q': 840,
 'R1100S': 6,
 'R521K': 541,
 'R776H': 1,
 'R831H': 2,
 'S442I': 1,
 'S768I': 1,
 'S811F': 1,
 'S921R': 5,
 'T629T': 562,
 'T903T': 667,
 'V1142V': 2,
 'V536M': 3,
 'V769L': 2,
 'V774_C775insHV': 1,
 'V843L': 2}

In [7]:
mutations_dict_items = list(mutations_dict.items())
mutations_dict_items

[('', 8352),
 ('T903T', 667),
 ('R521K', 541),
 ('T629T', 562),
 ('Q787Q', 840),
 ('L858R', 148),
 ('E746_T751delELREAT', 53),
 ('E746_A750delELREA', 53),
 ('D1014N', 24),
 ('G857V', 2),
 ('D994D', 41),
 ('K745_A750>T', 21),
 ('S811F', 1),
 ('V1142V', 2),
 ('Q1020H', 3),
 ('A237V', 7),
 ('G221V', 1),
 ('P589L', 3),
 ('L747S', 1),
 ('S442I', 1),
 ('R1100S', 6),
 ('S921R', 5),
 ('L1167V', 2),
 ('L1034I', 1),
 ('F856L', 3),
 ('V536M', 3),
 ('R831H', 2),
 ('L861Q', 1),
 ('V769L', 2),
 ('V843L', 2),
 ('G652G', 3),
 ('I1093M', 1),
 ('P1019L', 2),
 ('R776H', 1),
 ('V774_C775insHV', 1),
 ('A237>?', 1),
 ('L907M', 1),
 ('G331R', 1),
 ('H1129Y', 1),
 ('K754E', 2),
 ('S768I', 1),
 ('G598V', 1),
 ('L387M', 1),
 ('G719A', 1),
 ('E709A', 1)]

In [8]:
mutations_dict_items[10]

('D994D', 41)

In [9]:
#////////////////////////////////////////////////////////////////////////////
#////////////////////////////////////////////////////////////////////////////
#
# want to create a per-patient EGFR mutations dataframe
#
#////////////////////////////////////////////////////////////////////////////
#////////////////////////////////////////////////////////////////////////////

In [10]:
patientMetadata = pd.read_csv('/Users/lincoln.harris/Desktop/152-LAUD_cell_lists_and_various_shit/cDNA_plate_metadata.csv')
patientMetadata

Unnamed: 0,plate,sample_type,patient_id,DOB,gender,race,smokingHx,histolgy,driver_gene,driver_mutation,...,sample_name,processing_status,physical_description,sort_data_exported,cell_density,cDNA_cells,sequenced_cells_passQC,Sequence_Run1,Sequence_Run2,Sequence_Run3
0,,cell_line,Ewing,,,,,,,,...,CL_S1,Sequenced,,,,43.0,,170504_NS500126_0691_AHC22JBGX2,,
1,,cell_line,Fibroblasts,,,,,,,,...,CL_S1,Sequenced,,,,43.0,,170504_NS500126_0691_AHC22JBGX2,,
2,1001000332,cell_line,H1975,,,,,,,,...,CL_S1,Sequenced,,,,38.0,,170504_NS500126_0691_AHC22JBGX2,,
3,1001000330,cell_line,TPH1,,,,,,,,...,CL_S1,Sequenced,,,,37.0,,170504_NS500126_0691_AHC22JBGX2,,
4,1001000302,Lung_tumor,TH158,1959-11-23,Female,Native Hawaiian or Other Pacific Island,Never,Adenocarcinoma,EGFR,del19,...,LT_S01,Sequenced,,,,,44,170215_NS500126_0658_AH7TLYBGX2,,
5,1001000301,Lung_tumor,TH157,2016-11-25,Male,Asian,Never,Adenocarcinoma,ALK,fusion,...,LT_S02,Sequenced,,,,,27,170215_NS500126_0658_AH7TLYBGX2,,
6,1001000292,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,63,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
7,1001000293,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,64,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
8,1001000294,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,59,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
9,1001000295,Lung_tumor,TH156,1973-03-17,Male,B003109,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,89,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,


In [11]:
patientMetadata = patientMetadata.drop([0,1])
patientMetadata

Unnamed: 0,plate,sample_type,patient_id,DOB,gender,race,smokingHx,histolgy,driver_gene,driver_mutation,...,sample_name,processing_status,physical_description,sort_data_exported,cell_density,cDNA_cells,sequenced_cells_passQC,Sequence_Run1,Sequence_Run2,Sequence_Run3
2,1001000332,cell_line,H1975,,,,,,,,...,CL_S1,Sequenced,,,,38.0,,170504_NS500126_0691_AHC22JBGX2,,
3,1001000330,cell_line,TPH1,,,,,,,,...,CL_S1,Sequenced,,,,37.0,,170504_NS500126_0691_AHC22JBGX2,,
4,1001000302,Lung_tumor,TH158,1959-11-23,Female,Native Hawaiian or Other Pacific Island,Never,Adenocarcinoma,EGFR,del19,...,LT_S01,Sequenced,,,,,44,170215_NS500126_0658_AH7TLYBGX2,,
5,1001000301,Lung_tumor,TH157,2016-11-25,Male,Asian,Never,Adenocarcinoma,ALK,fusion,...,LT_S02,Sequenced,,,,,27,170215_NS500126_0658_AH7TLYBGX2,,
6,1001000292,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,63,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
7,1001000293,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,64,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
8,1001000294,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,59,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
9,1001000295,Lung_tumor,TH156,1973-03-17,Male,B003109,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,89,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
10,1001000296,Lung_tumor,TH166,1974-12-15,Male,Hispanic or Latino,<5py,Adenocarcinoma,BRAF,V600E,...,LT_S04,Sequenced,,y,,,71,170129_NS500126_0650_AHVM75BGXY,170202_NS500126_0653_AHVL5NBGXY,
11,1001000297,Lung_tumor,TH166,1974-12-15,Male,Hispanic or Latino,<5py,Adenocarcinoma,BRAF,V600E,...,LT_S04,Sequenced,,y,,,79,170129_NS500126_0650_AHVM75BGXY,170202_NS500126_0653_AHVL5NBGXY,


In [12]:
egfr_df

Unnamed: 0,cell,mutations
0,P13_1001000339,
1,C1_B002572,
2,J2_B002573,
3,P22_B003116,
4,O22_B003191,
5,J12_B003529,T903T
6,D14_1001000407,"R521K,T903T,T629T,Q787Q"
7,F14_B003105,
8,P11_1001000340,
9,E21_B003930,T903T


In [13]:
colNames = ['patientID'] + list(mutations_dict.keys())
colNames

['patientID',
 '',
 'T903T',
 'R521K',
 'T629T',
 'Q787Q',
 'L858R',
 'E746_T751delELREAT',
 'E746_A750delELREA',
 'D1014N',
 'G857V',
 'D994D',
 'K745_A750>T',
 'S811F',
 'V1142V',
 'Q1020H',
 'A237V',
 'G221V',
 'P589L',
 'L747S',
 'S442I',
 'R1100S',
 'S921R',
 'L1167V',
 'L1034I',
 'F856L',
 'V536M',
 'R831H',
 'L861Q',
 'V769L',
 'V843L',
 'G652G',
 'I1093M',
 'P1019L',
 'R776H',
 'V774_C775insHV',
 'A237>?',
 'L907M',
 'G331R',
 'H1129Y',
 'K754E',
 'S768I',
 'G598V',
 'L387M',
 'G719A',
 'E709A']

In [14]:
# want to make a dataframe that has every patient and the specific EGFR mutations for that patient
egfr_muts_by_patient = pd.DataFrame(columns=colNames)
egfr_muts_by_patient

Unnamed: 0,patientID,Unnamed: 2,T903T,R521K,T629T,Q787Q,L858R,E746_T751delELREAT,E746_A750delELREA,D1014N,...,A237>?,L907M,G331R,H1129Y,K754E,S768I,G598V,L387M,G719A,E709A


In [15]:
uniquePatientIDs = set(patientMetadata['patient_id'])
uniquePatientIDs

{'H1975',
 'TH041',
 'TH067',
 'TH082',
 'TH100',
 'TH103',
 'TH107',
 'TH116',
 'TH124',
 'TH134_PDX',
 'TH143B',
 'TH144',
 'TH146',
 'TH150',
 'TH153',
 'TH155',
 'TH156',
 'TH157',
 'TH158',
 'TH166',
 'TH169',
 'TH171',
 'TH172',
 'TH174',
 'TH178',
 'TH179',
 'TH179_Normal',
 'TH183',
 'TH185',
 'TH187',
 'TH188',
 'TH199',
 'TH205',
 'TH208',
 'TH210',
 'TH214',
 'TH217',
 'TH218',
 'TH220',
 'TH222',
 'TH223',
 'TH225',
 'TH226',
 'TH227',
 'TH231',
 'TH236',
 'TH238',
 'TH238_Normal',
 'TH248',
 'TH249',
 'TH257',
 'TH266',
 'TPH1'}

In [16]:
list(uniquePatientIDs)

['TH100',
 'TH225',
 'TH103',
 'TH238_Normal',
 'TH188',
 'TH169',
 'TH179_Normal',
 'TH185',
 'TH208',
 'TH266',
 'TH205',
 'TH178',
 'TH144',
 'TH157',
 'TH222',
 'TH249',
 'TH257',
 'TH236',
 'TH143B',
 'H1975',
 'TPH1',
 'TH172',
 'TH187',
 'TH155',
 'TH146',
 'TH218',
 'TH179',
 'TH116',
 'TH231',
 'TH171',
 'TH183',
 'TH220',
 'TH227',
 'TH156',
 'TH248',
 'TH150',
 'TH107',
 'TH214',
 'TH153',
 'TH199',
 'TH223',
 'TH158',
 'TH067',
 'TH217',
 'TH134_PDX',
 'TH210',
 'TH041',
 'TH124',
 'TH174',
 'TH238',
 'TH166',
 'TH082',
 'TH226']

In [17]:
egfr_muts_by_patient['patientID'] = list(uniquePatientIDs)
egfr_muts_by_patient

Unnamed: 0,patientID,Unnamed: 2,T903T,R521K,T629T,Q787Q,L858R,E746_T751delELREAT,E746_A750delELREA,D1014N,...,A237>?,L907M,G331R,H1129Y,K754E,S768I,G598V,L387M,G719A,E709A
0,TH100,,,,,,,,,,...,,,,,,,,,,
1,TH225,,,,,,,,,,...,,,,,,,,,,
2,TH103,,,,,,,,,,...,,,,,,,,,,
3,TH238_Normal,,,,,,,,,,...,,,,,,,,,,
4,TH188,,,,,,,,,,...,,,,,,,,,,
5,TH169,,,,,,,,,,...,,,,,,,,,,
6,TH179_Normal,,,,,,,,,,...,,,,,,,,,,
7,TH185,,,,,,,,,,...,,,,,,,,,,
8,TH208,,,,,,,,,,...,,,,,,,,,,
9,TH266,,,,,,,,,,...,,,,,,,,,,


In [18]:
# set all values to 0
egfr_muts_by_patient[:] = 0
egfr_muts_by_patient

Unnamed: 0,patientID,Unnamed: 2,T903T,R521K,T629T,Q787Q,L858R,E746_T751delELREAT,E746_A750delELREA,D1014N,...,A237>?,L907M,G331R,H1129Y,K754E,S768I,G598V,L387M,G719A,E709A
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# dataframe looking good
egfr_muts_by_patient['patientID'] = list(uniquePatientIDs)
egfr_muts_by_patient

Unnamed: 0,patientID,Unnamed: 2,T903T,R521K,T629T,Q787Q,L858R,E746_T751delELREAT,E746_A750delELREA,D1014N,...,A237>?,L907M,G331R,H1129Y,K754E,S768I,G598V,L387M,G719A,E709A
0,TH100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TH225,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TH103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TH238_Normal,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TH188,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,TH169,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,TH179_Normal,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,TH185,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,TH208,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,TH266,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# trying to fill it in 

for i in range(0,len(egfr_df.index)):
    currCell = egfr_df['cell'].iloc[i]
    currPlate = currCell.split('_')[1]
    
    index_to_keep = patientMetadata['plate'] == currPlate
    keepRow = patientMetadata[index_to_keep]
    try:
        currPatient = list(keepRow['patient_id'])[0]
    except IndexError:
        pass
        #print('ERROR: plate not found') # maybe these are the plates that were NOT including in the analysis? 
        
    currRow = egfr_df['mutations'].iloc[i]
    currRowSplit = currRow.split(',')
    
    match_row = egfr_muts_by_patient[egfr_muts_by_patient['patientID'] == currPatient]
    match_row_index = match_row.index[0]
    try:
        for currMut in currRowSplit:
            egfr_muts_by_patient[currMut][match_row_index] += 1
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plat

ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plate not found
ERROR: plat

In [21]:
egfr_muts_by_patient

Unnamed: 0,patientID,Unnamed: 2,T903T,R521K,T629T,Q787Q,L858R,E746_T751delELREAT,E746_A750delELREA,D1014N,...,A237>?,L907M,G331R,H1129Y,K754E,S768I,G598V,L387M,G719A,E709A
0,TH100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TH225,82,6,3,3,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TH103,51,0,16,18,14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TH238_Normal,269,11,4,18,22,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TH188,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,TH169,126,26,44,37,0,0,23,23,0,...,0,0,0,0,0,0,0,0,0,0
6,TH179_Normal,227,11,0,0,26,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,TH185,369,283,394,360,435,146,0,0,0,...,1,0,1,1,0,1,0,0,0,1
8,TH208,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,TH266,625,19,0,0,43,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
#///////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////
#
# MAKE THE SUMMARY TABLE
#
#///////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////

In [23]:
#egfr_df
summaryTable = pd.DataFrame(columns=['cell', 'patient', 'clinical_driver_gene', 'clinical_mutation', 'coverage_to_ROI', 'clin_mut_found_bool', 'mutations_found_EGFR', 'mutations_found_BRAF', 'mutations_found_KRAS', 'fusions_found', 'tumorCell_bool'])
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool


In [24]:
summaryTable['cell'] = egfr_df['cell']
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,,,,,,,,,,
1,C1_B002572,,,,,,,,,,
2,J2_B002573,,,,,,,,,,
3,P22_B003116,,,,,,,,,,
4,O22_B003191,,,,,,,,,,
5,J12_B003529,,,,,,,,,,
6,D14_1001000407,,,,,,,,,,
7,F14_B003105,,,,,,,,,,
8,P11_1001000340,,,,,,,,,,
9,E21_B003930,,,,,,,,,,


In [25]:
# fill in patient
for i in range(0,len(summaryTable.index)):
    currCell = summaryTable['cell'].iloc[i]
    currPlate = currCell.split('_')[1]
    
    index_to_keep = patientMetadata['plate'] == currPlate
    keepRow = patientMetadata[index_to_keep]
    try:
        currPatient = list(keepRow['patient_id'])[0]
        summaryTable['patient'][i] = currPatient
    except IndexError:
        continue
        #print('ERROR: plate not found') # maybe these are the plates that were NOT including in the analysis? 

In [26]:
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,,,,,,,,,
1,C1_B002572,TH266,,,,,,,,,
2,J2_B002573,TH266,,,,,,,,,
3,P22_B003116,TH231,,,,,,,,,
4,O22_B003191,TH226,,,,,,,,,
5,J12_B003529,TH171,,,,,,,,,
6,D14_1001000407,TH185,,,,,,,,,
7,F14_B003105,TH222,,,,,,,,,
8,P11_1001000340,TH169,,,,,,,,,
9,E21_B003930,TH222,,,,,,,,,


In [27]:
summaryTable['mutations_found_EGFR'] = egfr_df['mutations']
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,,,,,,,,,
1,C1_B002572,TH266,,,,,,,,,
2,J2_B002573,TH266,,,,,,,,,
3,P22_B003116,TH231,,,,,,,,,
4,O22_B003191,TH226,,,,,,,,,
5,J12_B003529,TH171,,,,,T903T,,,,
6,D14_1001000407,TH185,,,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,,,,,,,,,
8,P11_1001000340,TH169,,,,,,,,,
9,E21_B003930,TH222,,,,,T903T,,,,


In [28]:
# read-in fusion dataframe
fusionsDF = pd.read_csv('./fusion_dataframe.csv')
fusionsDF

Unnamed: 0,ALK--EML4,ALK_any,EML4_any,NTRK_any,RET_any,ROS1_any
0,C2_B000862,K14_B003132,L18_B003120,C8_B003191,E2_B003920,D10_B003523
1,P1_B001464,G19_B003121,D10_B003523,D8_B003126,B15_B003529,L17_B003116
2,M11_B003522,O13_1001000339,I22_B000276,J11_B003528,M21_B002572,F5_B001556
3,G8_1001000317,I13_B003046,A4_B001607,B5_B002078,J6_B001470,I5_B003070
4,A7_10001000325,M8_B003119,I6_B003642,J18_B002571,J19_B003070,J5_B003125
5,M12_B003522,K2_B003522,I4_B001607,P6_B001545,G13_B000862,G5_1001000327
6,B11_10001000325,N8_B001556,P20_B002571,O17_B002097,D15_B002078,J22_B003120
7,G2_B000862,D18_B002073,E5_B001545,K10_B003528,P19_B002572,O24_1001000377
8,J15_B000862,O17_1001000409,E1_B003117,C17_B003071,D19_B003132,A3_B003067
9,E7_1001000317,C2_B000862,O2_B003067,,G11_B001543,L20_B002572


In [29]:
# this works
for i in range(0, len(summaryTable.index)):
    currCell = summaryTable['cell'][i]
    fusionsListCurr = []
    
    colList0 = list(fusionsDF['ALK--EML4'])
    colList1 = list(fusionsDF['ALK_any'])
    colList2 = list(fusionsDF['EML4_any'])
    colList3 = list(fusionsDF['NTRK_any'])
    colList4 = list(fusionsDF['RET_any'])
    colList5 = list(fusionsDF['ROS1_any'])

    if currCell in colList0:
        fusionsListCurr.append('ALK-EML4')
    elif currCell in colList1:
        fusionsListCurr.append('ALK_any')
    elif currCell in colList2:
        fusionsListCurr.append('EML4_any')
    elif currCell in colList3:
        fusionsListCurr.append('NTRK_any')
    elif currCell in colList4:
        fusionsListCurr.append('RET_any')
    elif currCell in colList5:
        fusionsListCurr.append('ROS1_any')
    else:
        fusionsListCurr = ""
        
    fusionsListCurr = str(fusionsListCurr)
    fusionsListCurr = fusionsListCurr.strip(']')
    fusionsListCurr = fusionsListCurr.strip('[')
    fusionsListCurr = fusionsListCurr.strip("'")
    fusionsListCurr = fusionsListCurr.strip(" ")
    #print(fusionsListCurr)
 
    summaryTable['fusions_found'][i] = fusionsListCurr

In [30]:
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,,,,,,,,,
1,C1_B002572,TH266,,,,,,,,,
2,J2_B002573,TH266,,,,,,,,,
3,P22_B003116,TH231,,,,,,,,,
4,O22_B003191,TH226,,,,,,,,,
5,J12_B003529,TH171,,,,,T903T,,,,
6,D14_1001000407,TH185,,,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,,,,,,,,,
8,P11_1001000340,TH169,,,,,,,,,
9,E21_B003930,TH222,,,,,T903T,,,,


In [31]:
# lookin good!
set(summaryTable['fusions_found'])

{'', 'ALK-EML4', 'ALK_any', 'EML4_any', 'NTRK_any', 'RET_any', 'ROS1_any'}

In [32]:
#//////////////////////////////////////////////////////////////////////////
#//////////////////////////////////////////////////////////////////////////
#
# FILL IN driver_gene AND driver_mutation
#
#//////////////////////////////////////////////////////////////////////////
#//////////////////////////////////////////////////////////////////////////

In [33]:
patientMetadata

Unnamed: 0,plate,sample_type,patient_id,DOB,gender,race,smokingHx,histolgy,driver_gene,driver_mutation,...,sample_name,processing_status,physical_description,sort_data_exported,cell_density,cDNA_cells,sequenced_cells_passQC,Sequence_Run1,Sequence_Run2,Sequence_Run3
2,1001000332,cell_line,H1975,,,,,,,,...,CL_S1,Sequenced,,,,38.0,,170504_NS500126_0691_AHC22JBGX2,,
3,1001000330,cell_line,TPH1,,,,,,,,...,CL_S1,Sequenced,,,,37.0,,170504_NS500126_0691_AHC22JBGX2,,
4,1001000302,Lung_tumor,TH158,1959-11-23,Female,Native Hawaiian or Other Pacific Island,Never,Adenocarcinoma,EGFR,del19,...,LT_S01,Sequenced,,,,,44,170215_NS500126_0658_AH7TLYBGX2,,
5,1001000301,Lung_tumor,TH157,2016-11-25,Male,Asian,Never,Adenocarcinoma,ALK,fusion,...,LT_S02,Sequenced,,,,,27,170215_NS500126_0658_AH7TLYBGX2,,
6,1001000292,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,63,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
7,1001000293,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,64,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
8,1001000294,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,59,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
9,1001000295,Lung_tumor,TH156,1973-03-17,Male,B003109,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,89,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
10,1001000296,Lung_tumor,TH166,1974-12-15,Male,Hispanic or Latino,<5py,Adenocarcinoma,BRAF,V600E,...,LT_S04,Sequenced,,y,,,71,170129_NS500126_0650_AHVM75BGXY,170202_NS500126_0653_AHVL5NBGXY,
11,1001000297,Lung_tumor,TH166,1974-12-15,Male,Hispanic or Latino,<5py,Adenocarcinoma,BRAF,V600E,...,LT_S04,Sequenced,,y,,,79,170129_NS500126_0650_AHVM75BGXY,170202_NS500126_0653_AHVL5NBGXY,


In [34]:
list(patientMetadata.columns)

['plate',
 'sample_type',
 'patient_id',
 'DOB',
 'gender',
 'race',
 'smokingHx',
 'histolgy',
 'driver_gene',
 'driver_mutation',
 'secondary_mutation',
 'Notes',
 'stage',
 'pathlogy_review',
 'biopsy_date',
 'biopsy_type',
 'biopsy_site',
 'treatment_status',
 'treatment_navie',
 'treatment_type',
 'treatment',
 'facs_selection',
 'sort_date',
 'plate_number',
 'sample_name',
 'processing_status',
 'physical_description',
 'sort_data_exported',
 'cell_density',
 'cDNA_cells',
 'sequenced_cells_passQC',
 'Sequence_Run1',
 'Sequence_Run2',
 'Sequence_Run3']

In [35]:
patientMetadata.driver_mutation

2          NaN
3          NaN
4        del19
5       fusion
6       fusion
7       fusion
8       fusion
9       fusion
10       V600E
11       V600E
12       V600E
13       V600E
14       V600E
15       L858R
16       L858R
17       L858R
18       L858R
19       L858R
20      fusion
21      fusion
22      fusion
23      fusion
24      fusion
25      fusion
26      fusion
27      fusion
28      fusion
29      fusion
30      fusion
31      fusion
        ...   
271    Unknown
272    Unknown
273      del19
274      del19
275      del19
276     fusion
277     fusion
278     fusion
279     fusion
280     fusion
281     fusion
282     fusion
283      del19
284      del19
285      del19
286      del19
287     fusion
288     fusion
289     fusion
290     fusion
291      V600E
292      V600E
293      V600E
294      V600E
295      V600E
296      V600E
297      V600E
298     fusion
299     fusion
300     fusion
Name: driver_mutation, Length: 299, dtype: object

In [36]:
patientMetadata.driver_gene

2          NaN
3          NaN
4         EGFR
5          ALK
6          ALK
7          ALK
8          ALK
9          ALK
10        BRAF
11        BRAF
12        BRAF
13        BRAF
14        BRAF
15        EGFR
16        EGFR
17        EGFR
18        EGFR
19        EGFR
20         ALK
21         MET
22         ALK
23         ALK
24         ALK
25         ALK
26         ALK
27         ALK
28         ALK
29         ALK
30         ALK
31         ALK
        ...   
271    Unknown
272    Unknown
273       EGFR
274       EGFR
275       EGFR
276        ALK
277        ALK
278        ALK
279        ALK
280        ALK
281        ALK
282        ALK
283       EGFR
284       EGFR
285       EGFR
286       EGFR
287        ALK
288        ALK
289        ALK
290        ALK
291       BRAF
292       BRAF
293       BRAF
294       BRAF
295       BRAF
296       BRAF
297       BRAF
298        ALK
299        ALK
300        ALK
Name: driver_gene, Length: 299, dtype: object

In [37]:
# fill in driver_gene
for i in range(0,len(summaryTable.index)):
    currCell = summaryTable['cell'].iloc[i]
    currPlate = currCell.split('_')[1]
    
    index_to_keep = patientMetadata['plate'] == currPlate
    keepRow = patientMetadata[index_to_keep]
    try:
        currDriver = list(keepRow['driver_gene'])[0]
        summaryTable['clinical_driver_gene'][i] = currDriver
    except IndexError:
        continue
        #print('ERROR: plate not found') # maybe these are the plates that were NOT including in the analysis? 

In [38]:
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,EGFR,,,,,,,,
1,C1_B002572,TH266,ALK,,,,,,,,
2,J2_B002573,TH266,ALK,,,,,,,,
3,P22_B003116,TH231,ALK,,,,,,,,
4,O22_B003191,TH226,EGFR,,,,,,,,
5,J12_B003529,TH171,ALK,,,,T903T,,,,
6,D14_1001000407,TH185,EGFR,,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,ROS1,,,,,,,,
8,P11_1001000340,TH169,EGFR,,,,,,,,
9,E21_B003930,TH222,ROS1,,,,T903T,,,,


In [39]:
# now the same, for driver_mutation
for i in range(0,len(summaryTable.index)):
    currCell = summaryTable['cell'].iloc[i]
    currPlate = currCell.split('_')[1]
    
    index_to_keep = patientMetadata['plate'] == currPlate
    keepRow = patientMetadata[index_to_keep]
    try:
        currDriver = list(keepRow['driver_mutation'])[0]
        summaryTable['clinical_mutation'][i] = currDriver
    except IndexError:
        continue
        #print('ERROR: plate not found') # maybe these are the plates that were NOT including in the analysis? 

In [40]:
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,EGFR,del19,,,,,,,
1,C1_B002572,TH266,ALK,fusion,,,,,,,
2,J2_B002573,TH266,ALK,fusion,,,,,,,
3,P22_B003116,TH231,ALK,fusion,,,,,,,
4,O22_B003191,TH226,EGFR,del19,,,,,,,
5,J12_B003529,TH171,ALK,fusion,,,T903T,,,,
6,D14_1001000407,TH185,EGFR,L858R,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,ROS1,fusion,,,,,,,
8,P11_1001000340,TH169,EGFR,del19,,,,,,,
9,E21_B003930,TH222,ROS1,fusion,,,T903T,,,,


In [41]:
#///////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////
#
# WHAT CAN I DO WITH coverage_bool? 
#
#///////////////////////////////////////////////////////////////////////
#///////////////////////////////////////////////////////////////////////

In [42]:
egfr_L858R_allCells = pd.read_csv('../coverage/out/egfr_L858R_allCells.csv')
egfr_L858R_allCells

Unnamed: 0,cellName,coverage_bool_vcf,depth_vcf,coverage_bool_gvcf,depth_gvcf
0,A10_1001000407,0,0,0,0
1,A10_1001000408,0,0,0,0
2,A10_1001000412,0,0,0,0
3,A10_B000863,0,0,0,0
4,A10_B001007,0,0,0,0
5,A10_B001470,0,0,0,0
6,A10_B001474,0,0,0,0
7,A10_B001545,0,0,0,0
8,A10_B001548,0,0,0,0
9,A10_B001554,0,0,0,0


In [43]:
egfr_L858R_allCells['depth_gvcf']

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
9318    0
9319    0
9320    0
9321    0
9322    0
9323    2
9324    6
9325    0
9326    0
9327    0
9328    0
9329    0
9330    0
9331    0
9332    0
9333    0
9334    0
9335    0
9336    0
9337    0
9338    0
9339    0
9340    0
9341    0
9342    0
9343    0
9344    0
9345    0
9346    0
9347    0
Name: depth_gvcf, Length: 9348, dtype: int64

In [44]:
indices_to_keep = egfr_L858R_allCells['depth_gvcf'] != 0
indices_to_keep

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
9318    False
9319    False
9320    False
9321    False
9322    False
9323     True
9324     True
9325    False
9326    False
9327    False
9328    False
9329    False
9330    False
9331    False
9332    False
9333    False
9334    False
9335    False
9336    False
9337    False
9338    False
9339    False
9340    False
9341    False
9342    False
9343    False
9344    False
9345    False
9346    False
9347    False
Name: depth_gvcf, Length: 9348, dtype: bool

In [45]:
# these are the badboys that have coverage to the ROI
#     only 200 of them...wow
egfr_L858R_allCells_wCov_df = egfr_L858R_allCells[indices_to_keep]
egfr_L858R_allCells_wCov_df

Unnamed: 0,cellName,coverage_bool_vcf,depth_vcf,coverage_bool_gvcf,depth_gvcf
66,A13_1001000407,1,12,1,23
67,A13_1001000408,0,0,1,2
121,A15_1001000408,0,0,1,3
153,A16_1001000407,1,6,1,15
155,A16_1001000412,1,16,1,29
185,A17_1001000407,0,0,1,5
212,A18_1001000408,0,0,1,2
285,A20_1001000408,1,6,1,14
384,A3_1001000412,0,0,1,3
542,B10_1001000412,1,3,1,7


In [46]:
# fill in summaryTable.coverage_to_ROI, for cells where 'clinical_driver_gene' is EGFR 
#     and 'clinical_mutation' is L858R
#
# lookin good

for i in range(0,len(summaryTable.index)):
    currCell = summaryTable['cell'].iloc[i]
    currDriverGene = summaryTable['clinical_driver_gene'].iloc[i]
    currDriverMut = summaryTable['clinical_mutation'].iloc[i]
        
    if currDriverGene == 'EGFR' and currDriverMut == 'L858R':
    
        if currCell in list(egfr_L858R_allCells_wCov_df['cellName']): 
            egfr_L858R_df_index = egfr_L858R_allCells_wCov_df['cellName'] == currCell
            egfr_L858R_df_row = egfr_L858R_allCells_wCov_df[egfr_L858R_df_index]
            depth = list(egfr_L858R_df_row['depth_gvcf'])[0]
            #print(depth)
            summaryTable['coverage_to_ROI'][i] = depth
        

In [47]:
#summaryTable['coverage_to_ROI'] = 0 # reset
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,EGFR,del19,,,,,,,
1,C1_B002572,TH266,ALK,fusion,,,,,,,
2,J2_B002573,TH266,ALK,fusion,,,,,,,
3,P22_B003116,TH231,ALK,fusion,,,,,,,
4,O22_B003191,TH226,EGFR,del19,,,,,,,
5,J12_B003529,TH171,ALK,fusion,,,T903T,,,,
6,D14_1001000407,TH185,EGFR,L858R,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,ROS1,fusion,,,,,,,
8,P11_1001000340,TH169,EGFR,del19,,,,,,,
9,E21_B003930,TH222,ROS1,fusion,,,T903T,,,,


In [68]:
# can i make this into a formal function? 
def fillInCoverage_func(summaryTable, GOI_allCells_wCov_df, GOI, mutation):
    
    for i in range(0,len(summaryTable.index)):
        currCell = summaryTable['cell'].iloc[i]
        currDriverGene = summaryTable['clinical_driver_gene'].iloc[i]
        currDriverMut = summaryTable['clinical_mutation'].iloc[i]
        
        if currDriverGene == GOI and currDriverMut == mutation:
    
            if currCell in list(GOI_allCells_wCov_df['cellName']): 
                index = GOI_allCells_wCov_df['cellName'] == currCell
                row = GOI_allCells_wCov_df[index]
                depth = list(row['depth_gvcf'])[0]
                #print(depth)
                summaryTable['coverage_to_ROI'][i] = depth

In [69]:
# do this more generically, so we dont have to duplicate too many chunks
GOI_ = 'EGFR'
mut_ = 'L858R'

GOI_mut_allCells = pd.read_csv('../coverage/out/egfr_L858R_allCells.csv')      
indices_to_keep = GOI_mut_allCells['depth_gvcf'] != 0
GOI_mut_allCells_wCov_df = GOI_mut_allCells[indices_to_keep]
        
fillInCoverage_func(summaryTable, GOI_mut_allCells_wCov_df, GOI_, mut_)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,EGFR,del19,,,,,,,
1,C1_B002572,TH266,ALK,fusion,,,,,,,
2,J2_B002573,TH266,ALK,fusion,,,,,,,
3,P22_B003116,TH231,ALK,fusion,,,,,,,
4,O22_B003191,TH226,EGFR,del19,,,,,,,
5,J12_B003529,TH171,ALK,fusion,,,T903T,,,,
6,D14_1001000407,TH185,EGFR,L858R,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,ROS1,fusion,,,,,,,
8,P11_1001000340,TH169,EGFR,del19,,,,,,,
9,E21_B003930,TH222,ROS1,fusion,,,T903T,,,,


In [70]:
# do this more generically, so we dont have to duplicate too many chunks
GOI_ = 'EGFR'
mut_ = 'G719X'

GOI_mut_allCells = pd.read_csv('../coverage/out/egfr_G719X_allCells.csv')      
indices_to_keep = GOI_mut_allCells['depth_gvcf'] != 0
GOI_mut_allCells_wCov_df = GOI_mut_allCells[indices_to_keep]
        
fillInCoverage_func(summaryTable, GOI_mut_allCells_wCov_df, GOI_, mut_)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,EGFR,del19,,,,,,,
1,C1_B002572,TH266,ALK,fusion,,,,,,,
2,J2_B002573,TH266,ALK,fusion,,,,,,,
3,P22_B003116,TH231,ALK,fusion,,,,,,,
4,O22_B003191,TH226,EGFR,del19,,,,,,,
5,J12_B003529,TH171,ALK,fusion,,,T903T,,,,
6,D14_1001000407,TH185,EGFR,L858R,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,ROS1,fusion,,,,,,,
8,P11_1001000340,TH169,EGFR,del19,,,,,,,
9,E21_B003930,TH222,ROS1,fusion,,,T903T,,,,


In [71]:
# do this more generically, so we dont have to duplicate too many chunks
GOI_ = 'EGFR'
mut_ = 'L861Q'

GOI_mut_allCells = pd.read_csv('../coverage/out/egfr_L861Q_allCells.csv')      
indices_to_keep = GOI_mut_allCells['depth_gvcf'] != 0
GOI_mut_allCells_wCov_df = GOI_mut_allCells[indices_to_keep]
        
fillInCoverage_func(summaryTable, GOI_mut_allCells_wCov_df, GOI_, mut_)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,EGFR,del19,,,,,,,
1,C1_B002572,TH266,ALK,fusion,,,,,,,
2,J2_B002573,TH266,ALK,fusion,,,,,,,
3,P22_B003116,TH231,ALK,fusion,,,,,,,
4,O22_B003191,TH226,EGFR,del19,,,,,,,
5,J12_B003529,TH171,ALK,fusion,,,T903T,,,,
6,D14_1001000407,TH185,EGFR,L858R,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,ROS1,fusion,,,,,,,
8,P11_1001000340,TH169,EGFR,del19,,,,,,,
9,E21_B003930,TH222,ROS1,fusion,,,T903T,,,,


In [74]:
# do this more generically, so we dont have to duplicate too many chunks
GOI_ = 'EGFR'
mut_ = 'T790M'

GOI_mut_allCells = pd.read_csv('../coverage/out/egfr_T790M_allCells.csv')      
indices_to_keep = GOI_mut_allCells['depth_gvcf'] != 0
GOI_mut_allCells_wCov_df = GOI_mut_allCells[indices_to_keep]
        
fillInCoverage_func(summaryTable, GOI_mut_allCells_wCov_df, GOI_, mut_)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,EGFR,del19,,,,,,,
1,C1_B002572,TH266,ALK,fusion,,,,,,,
2,J2_B002573,TH266,ALK,fusion,,,,,,,
3,P22_B003116,TH231,ALK,fusion,,,,,,,
4,O22_B003191,TH226,EGFR,del19,,,,,,,
5,J12_B003529,TH171,ALK,fusion,,,T903T,,,,
6,D14_1001000407,TH185,EGFR,L858R,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,ROS1,fusion,,,,,,,
8,P11_1001000340,TH169,EGFR,del19,,,,,,,
9,E21_B003930,TH222,ROS1,fusion,,,T903T,,,,


In [75]:
# quick check
GOI_mut_allCells_wCov_df

Unnamed: 0,cellName,coverage_bool_vcf,depth_vcf,coverage_bool_gvcf,depth_gvcf
1181,C14_1001000407,1,1,1,3
4666,H9_B003522,1,21,1,21
5979,K15_B001008,1,2,1,2
6182,K21_B001010,1,51,1,51


In [79]:
summaryTable[summaryTable['cell'] == 'K21_B001010']

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
9641,K21_B001010,,,,,,"R521K,T629T,Q787Q",,,,


In [None]:
#//////////////////////////////////////////////////////////////////
#//////////////////////////////////////////////////////////////////
#
# WANT TO FILL IN THE REST OF THE mutations_found_* cols!
#
#//////////////////////////////////////////////////////////////////
#//////////////////////////////////////////////////////////////////

In [80]:
braf_df = pd.read_csv('/Users/lincoln.harris/code/SNP_calling_pipeline/getMutationCounts/braf_out_AA.csv', header=None, names=['cell', 'mutations'])
braf_df

Unnamed: 0,cell,mutations
0,K20_B003894,[]
1,B2_B001558,[]
2,M19_B003125,[]
3,E14_B003776,[]
4,L8_1001000412,[]
5,H13_1001000339,[]
6,A5_B001007,[]
7,L4_B003079,[]
8,B15_1001000339,[]
9,N8_B003132,[]


In [81]:
braf_df['mutations'] = braf_df['mutations'].str.replace("'", "") # remove quotes
braf_df['mutations'] = braf_df['mutations'].str.replace("[", "") # remove brackets
braf_df['mutations'] = braf_df['mutations'].str.replace("]", "") # remove brackets
braf_df['mutations'] = braf_df['mutations'].str.replace(" ", "") # remove whitespace?
braf_df['mutations']

0            
1            
2            
3            
4            
5            
6            
7            
8            
9            
10           
11           
12           
13           
14           
15           
16           
17           
18           
19           
20           
21           
22           
23           
24           
25           
26           
27           
28           
29           
        ...  
9851         
9852         
9853         
9854         
9855         
9856         
9857         
9858         
9859    G643G
9860         
9861         
9862         
9863         
9864         
9865         
9866         
9867         
9868         
9869         
9870         
9871         
9872         
9873         
9874         
9875         
9876         
9877         
9878         
9879         
9880         
Name: mutations, Length: 9881, dtype: object

In [82]:
summaryTable['mutations_found_BRAF'] = braf_df['mutations']
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,EGFR,del19,,,,,,,
1,C1_B002572,TH266,ALK,fusion,,,,,,,
2,J2_B002573,TH266,ALK,fusion,,,,,,,
3,P22_B003116,TH231,ALK,fusion,,,,,,,
4,O22_B003191,TH226,EGFR,del19,,,,,,,
5,J12_B003529,TH171,ALK,fusion,,,T903T,,,,
6,D14_1001000407,TH185,EGFR,L858R,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,ROS1,fusion,,,,,,,
8,P11_1001000340,TH169,EGFR,del19,,,,,,,
9,E21_B003930,TH222,ROS1,fusion,,,T903T,,,,


In [83]:
kras_df = pd.read_csv('/Users/lincoln.harris/code/SNP_calling_pipeline/getMutationCounts/kras_out_AA.csv', header=None, names=['cell', 'mutations'])
kras_df

Unnamed: 0,cell,mutations
0,D6_B003648,[]
1,H13_B001008,[]
2,G21_B001010,[]
3,E13_B000420,[]
4,H13_B003529,[]
5,K12_B002078,[]
6,E1_B003071,[]
7,J3_B001007,[]
8,P15_1001000412,[]
9,E7_B003125,[]


In [84]:
kras_df['mutations'] = kras_df['mutations'].str.replace("'", "") # remove quotes
kras_df['mutations'] = kras_df['mutations'].str.replace("[", "") # remove brackets
kras_df['mutations'] = kras_df['mutations'].str.replace("]", "") # remove brackets
kras_df['mutations'] = kras_df['mutations'].str.replace(" ", "") # remove whitespace?
kras_df['mutations']

0           
1           
2           
3           
4           
5           
6           
7           
8           
9           
10          
11          
12          
13          
14          
15          
16          
17          
18          
19          
20          
21          
22          
23          
24          
25          
26          
27          
28          
29          
        ... 
9851        
9852        
9853        
9854        
9855        
9856        
9857        
9858        
9859        
9860        
9861    A11A
9862    K88K
9863        
9864        
9865        
9866        
9867        
9868        
9869        
9870        
9871        
9872        
9873        
9874        
9875        
9876        
9877        
9878        
9879        
9880        
Name: mutations, Length: 9881, dtype: object

In [85]:
summaryTable['mutations_found_KRAS'] = kras_df['mutations']
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,P13_1001000339,TH169,EGFR,del19,,,,,,,
1,C1_B002572,TH266,ALK,fusion,,,,,,,
2,J2_B002573,TH266,ALK,fusion,,,,,,,
3,P22_B003116,TH231,ALK,fusion,,,,,,,
4,O22_B003191,TH226,EGFR,del19,,,,,,,
5,J12_B003529,TH171,ALK,fusion,,,T903T,,,,
6,D14_1001000407,TH185,EGFR,L858R,,,"R521K,T903T,T629T,Q787Q",,,,
7,F14_B003105,TH222,ROS1,fusion,,,,,,,
8,P11_1001000340,TH169,EGFR,del19,,,,,,,
9,E21_B003930,TH222,ROS1,fusion,,,T903T,,,,
