# <span style='font-family:"Times New Roman"'> <span styel=''>**COHORT MAF FILE CREATION**
## <span style='font-family:"Times New Roman"'> <span styel=''>*Emile Cohen*
 *February 2020*

**Goal:** Through this notebook, we create a maf file composed of all patients in the cohort. As we do not have the mutations for all patients in impact-facets-tp53 datasets, we will merge the cohort file with cbioportal data.

The notebook is composed of 2 parts:
   * **1. Extraction of patients from the cohort**
   * **2. Creation of the MAF file from CbioPortal raw datasets**
---

In [53]:
%run -i '../../utils/setup_environment.ipy'

import warnings
warnings.filterwarnings('ignore')

data_path = '../../data/'

Setup environment... done!


<span style="color:green">✅ Working on **mskimpact_env** conda environment.</span>

---
## Extraction of patients from the cohort

In [54]:
cohort = pd.read_csv(data_path + 'impact-facets-tp53/raw/default_qc_pass.cohort.txt', sep='\t')

In [55]:
len(cohort)

29304

In [56]:
cohort['Patient_Id'] = cohort['sample_id'].str[:9]
cohort['Tumor_Id'] = cohort['sample_id'].str[:17]

In [57]:
cohort.head()

Unnamed: 0,sample_id,sample_path,tumor_sample_id,path,fit_name,purity_run_version,purity_run_prefix,purity_run_Seed,purity_run_cval,purity_run_nhet,purity_run_Purity,purity_run_Ploidy,purity_run_dipLogR,purity_run_alBalLogR,hisens_run_version,hisens_run_prefix,hisens_run_Seed,hisens_run_cval,hisens_run_nhet,hisens_run_hisens,hisens_run_Ploidy,hisens_run_dipLogR,manual_note,is_best_fit,purity,ploidy,dipLogR,dipLogR_flag,n_alternative_dipLogR,n_dip_bal_segs,frac_dip_bal_segs,n_dip_imbal_segs,frac_dip_imbal_segs,n_amps,n_homdels,frac_homdels,n_homdels_clonal,frac_homdels_clonal,n_cn_states,n_segs,n_cnlr_clusters,n_lcn_na,n_loh,frac_loh,n_segs_subclonal,frac_segs_subclonal,n_snps,n_het_snps,frac_het_snps,n_het_snps_hom_in_tumor_1pct,n_het_snps_hom_in_tumor_5pct,frac_het_snps_hom_in_tumor_1pct,frac_het_snps_hom_in_tumor_5pct,mean_cnlr_residual,sd_cnlr_residual,n_segs_discordant_tcn,frac_discordant_tcn,n_segs_discordant_lcn,frac_discordant_lcn,n_segs_discordant_both,frac_discordant_both,n_segs_icn_cnlor_discordant,frac_icn_cnlor_discordant,homdel_filter_pass,diploid_bal_seg_filter_pass,diploid_imbal_seg_filter_pass,waterfall_filter_pass,hyper_seg_filter_pass,high_ploidy_filter_pass,valid_purity_filter_pass,diploid_seg_filter_pass,facets_suite_qc,arm_level_file,gene_level_file,ccf_file,arm_level_file_exists,gene_level_file_exists,ccf_file_exists,Patient_Id,Tumor_Id
0,P-0034223-T01-IM6_P-0034223-N01-IM6,/juno/work/ccs/resources/impact/facets/all/P-00342/P-0034223-T01-IM6_P-0034223-N01-IM6/,P-0034223-T01-IM6_P-0034223-N01-IM6,/juno/work/ccs/resources/impact/facets/all/P-00342/P-0034223-T01-IM6_P-0034223-N01-IM6/,default,0.5.14,/juno/work/ccs/resources/impact/facets/all/P-00342/P-0034223-T01-IM6_P-0034223-N01-IM6//default/P-0034223-T01-IM6_P-0034223-N01-IM6_purity,100.0,100,15,0.94,2.24,-0.16,-0.16,0.5.14,/juno/work/ccs/resources/impact/facets/all/P-00342/P-0034223-T01-IM6_P-0034223-N01-IM6//default/P-0034223-T01-IM6_P-0034223-N01-IM6_hisens,100.0,50.0,15.0,,2.24,-0.16,,False,0.941111,2.24183,-0.155483,False,0,12,0.59,0,0.0,0,0,0.0,0,0.0,6,31,10,2,4,0.062,1,1e-05,22963,2655,0.12,43,129,0.016,0.049,-0.19,0.63,1,0.0038,0.0,0.0,0,0.0,1,0.042,True,True,False,True,True,True,True,True,True,/juno/work/ccs/resources/impact/facets/all/P-00342/P-0034223-T01-IM6_P-0034223-N01-IM6//default/P-0034223-T01-IM6_P-0034223-N01-IM6.arm_level.txt,/juno/work/ccs/resources/impact/facets/all/P-00342/P-0034223-T01-IM6_P-0034223-N01-IM6//default/P-0034223-T01-IM6_P-0034223-N01-IM6.gene_level.txt,/juno/work/ccs/resources/impact/facets/all/P-00342/P-0034223-T01-IM6_P-0034223-N01-IM6//default/P-0034223-T01-IM6_P-0034223-N01-IM6.ccf.maf,True,True,True,P-0034223,P-0034223-T01-IM6
1,P-0009819-T01-IM5_P-0009819-N01-IM5,/juno/work/ccs/resources/impact/facets/all/P-00098/P-0009819-T01-IM5_P-0009819-N01-IM5/,P-0009819-T01-IM5_P-0009819-N01-IM5,/juno/work/ccs/resources/impact/facets/all/P-00098/P-0009819-T01-IM5_P-0009819-N01-IM5/,default,0.5.14,/juno/work/ccs/resources/impact/facets/all/P-00098/P-0009819-T01-IM5_P-0009819-N01-IM5//default/P-0009819-T01-IM5_P-0009819-N01-IM5_purity,100.0,100,15,0.28,2.68,-0.13,-0.13,0.5.14,/juno/work/ccs/resources/impact/facets/all/P-00098/P-0009819-T01-IM5_P-0009819-N01-IM5//default/P-0009819-T01-IM5_P-0009819-N01-IM5_hisens,100.0,50.0,15.0,,2.77,-0.13,,False,0.275237,2.681075,-0.129255,False,0,7,0.43,0,0.0,0,1,0.0062,1,0.0062,3,25,6,0,5,0.094,0,0.0,16527,2041,0.12,10,10,0.0049,0.0049,-0.25,0.83,1,0.0062,0.0,0.0,0,0.0,2,0.0063,True,True,False,True,True,True,True,True,True,/juno/work/ccs/resources/impact/facets/all/P-00098/P-0009819-T01-IM5_P-0009819-N01-IM5//default/P-0009819-T01-IM5_P-0009819-N01-IM5.arm_level.txt,/juno/work/ccs/resources/impact/facets/all/P-00098/P-0009819-T01-IM5_P-0009819-N01-IM5//default/P-0009819-T01-IM5_P-0009819-N01-IM5.gene_level.txt,/juno/work/ccs/resources/impact/facets/all/P-00098/P-0009819-T01-IM5_P-0009819-N01-IM5//default/P-0009819-T01-IM5_P-0009819-N01-IM5.ccf.maf,True,True,True,P-0009819,P-0009819-T01-IM5
2,P-0025956-T01-IM6_P-0025956-N01-IM6,/juno/work/ccs/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6/,P-0025956-T01-IM6_P-0025956-N01-IM6,/juno/work/ccs/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6/,default,0.5.14,/juno/work/ccs/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6_purity,100.0,100,15,0.19,3.5,-0.19,"-0.19, 0.02",0.5.14,/juno/work/ccs/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6_hisens,100.0,50.0,15.0,,3.45,-0.19,,False,0.185874,3.496971,-0.187925,False,0,2,0.096,4,0.18,0,0,0.0,0,0.0,6,26,6,0,5,0.19,0,0.0,17971,2159,0.12,0,0,0.0,0.0,-0.052,0.3,2,0.015,0.0,0.0,3,0.12,8,0.3,True,True,True,True,True,True,True,True,True,/juno/work/ccs/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6.arm_level.txt,/juno/work/ccs/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6.gene_level.txt,/juno/work/ccs/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6.ccf.maf,False,False,False,P-0025956,P-0025956-T01-IM6
3,P-0027408-T01-IM6_P-0027408-N01-IM6,/juno/work/ccs/resources/impact/facets/all/P-00274/P-0027408-T01-IM6_P-0027408-N01-IM6/,P-0027408-T01-IM6_P-0027408-N01-IM6,/juno/work/ccs/resources/impact/facets/all/P-00274/P-0027408-T01-IM6_P-0027408-N01-IM6/,default,0.5.14,/juno/work/ccs/resources/impact/facets/all/P-00274/P-0027408-T01-IM6_P-0027408-N01-IM6//default/P-0027408-T01-IM6_P-0027408-N01-IM6_purity,100.0,100,15,0.31,1.81,0.04,0.04,0.5.14,/juno/work/ccs/resources/impact/facets/all/P-00274/P-0027408-T01-IM6_P-0027408-N01-IM6//default/P-0027408-T01-IM6_P-0027408-N01-IM6_hisens,100.0,50.0,15.0,,1.82,0.04,,False,0.308886,1.811066,0.042724,False,0,7,0.28,1,0.035,0,0,0.0,0,0.0,4,31,6,0,12,0.34,0,0.0,18633,2163,0.12,0,0,0.0,0.0,-0.058,0.29,2,0.096,0.0,0.0,0,0.0,4,0.21,True,True,False,True,True,True,True,True,True,/juno/work/ccs/resources/impact/facets/all/P-00274/P-0027408-T01-IM6_P-0027408-N01-IM6//default/P-0027408-T01-IM6_P-0027408-N01-IM6.arm_level.txt,/juno/work/ccs/resources/impact/facets/all/P-00274/P-0027408-T01-IM6_P-0027408-N01-IM6//default/P-0027408-T01-IM6_P-0027408-N01-IM6.gene_level.txt,/juno/work/ccs/resources/impact/facets/all/P-00274/P-0027408-T01-IM6_P-0027408-N01-IM6//default/P-0027408-T01-IM6_P-0027408-N01-IM6.ccf.maf,True,True,True,P-0027408,P-0027408-T01-IM6
4,P-0006554-T01-IM5_P-0006554-N01-IM5,/juno/work/ccs/resources/impact/facets/all/P-00065/P-0006554-T01-IM5_P-0006554-N01-IM5/,P-0006554-T01-IM5_P-0006554-N01-IM5,/juno/work/ccs/resources/impact/facets/all/P-00065/P-0006554-T01-IM5_P-0006554-N01-IM5/,default,0.5.14,/juno/work/ccs/resources/impact/facets/all/P-00065/P-0006554-T01-IM5_P-0006554-N01-IM5//default/P-0006554-T01-IM5_P-0006554-N01-IM5_purity,100.0,100,15,0.72,1.91,0.05,0.05,0.5.14,/juno/work/ccs/resources/impact/facets/all/P-00065/P-0006554-T01-IM5_P-0006554-N01-IM5//default/P-0006554-T01-IM5_P-0006554-N01-IM5_hisens,100.0,50.0,15.0,,1.92,0.05,,False,0.715208,1.910719,0.046812,False,0,11,0.49,0,0.0,0,1,0.0074,0,0.0,6,30,12,2,6,0.088,6,0.15,16557,2041,0.12,1,7,0.00049,0.0034,-0.024,0.38,0,0.0,160000000.0,0.058,3,0.054,3,0.11,True,True,False,True,True,True,True,True,True,/juno/work/ccs/resources/impact/facets/all/P-00065/P-0006554-T01-IM5_P-0006554-N01-IM5//default/P-0006554-T01-IM5_P-0006554-N01-IM5.arm_level.txt,/juno/work/ccs/resources/impact/facets/all/P-00065/P-0006554-T01-IM5_P-0006554-N01-IM5//default/P-0006554-T01-IM5_P-0006554-N01-IM5.gene_level.txt,/juno/work/ccs/resources/impact/facets/all/P-00065/P-0006554-T01-IM5_P-0006554-N01-IM5//default/P-0006554-T01-IM5_P-0006554-N01-IM5.ccf.maf,True,True,True,P-0006554,P-0006554-T01-IM5


In [58]:
print_md('cohort columns:','green')
for column in cohort.columns: print(column)

<span style="color:green">cohort columns:</span>

sample_id
sample_path
tumor_sample_id
path
fit_name
purity_run_version
purity_run_prefix
purity_run_Seed
purity_run_cval
purity_run_nhet
purity_run_Purity
purity_run_Ploidy
purity_run_dipLogR
purity_run_alBalLogR
hisens_run_version
hisens_run_prefix
hisens_run_Seed
hisens_run_cval
hisens_run_nhet
hisens_run_hisens
hisens_run_Ploidy
hisens_run_dipLogR
manual_note
is_best_fit
purity
ploidy
dipLogR
dipLogR_flag
n_alternative_dipLogR
n_dip_bal_segs
frac_dip_bal_segs
n_dip_imbal_segs
frac_dip_imbal_segs
n_amps
n_homdels
frac_homdels
n_homdels_clonal
frac_homdels_clonal
n_cn_states
n_segs
n_cnlr_clusters
n_lcn_na
n_loh
frac_loh
n_segs_subclonal
frac_segs_subclonal
n_snps
n_het_snps
frac_het_snps
n_het_snps_hom_in_tumor_1pct
n_het_snps_hom_in_tumor_5pct
frac_het_snps_hom_in_tumor_1pct
frac_het_snps_hom_in_tumor_5pct
mean_cnlr_residual
sd_cnlr_residual
n_segs_discordant_tcn
frac_discordant_tcn
n_segs_discordant_lcn
frac_discordant_lcn
n_segs_discordant_both
frac_discordant_both
n_segs_icn_cnlo

In [59]:
s = cohort.Tumor_Id.duplicated()
s[s == True]

2415     True
5268     True
6171     True
6240     True
7181     True
12459    True
13791    True
13793    True
21235    True
21236    True
21240    True
21245    True
21278    True
21304    True
21588    True
21710    True
21714    True
21894    True
22016    True
22351    True
22783    True
23317    True
23771    True
23883    True
23884    True
23885    True
23935    True
24306    True
24645    True
24830    True
24863    True
25799    True
26561    True
26885    True
27255    True
28090    True
28100    True
28107    True
28317    True
28325    True
28629    True
28704    True
28796    True
28798    True
29296    True
Name: Tumor_Id, dtype: bool

In [60]:
# We verify that each sample is unique (we have an equal number of lines and unique samples )
assert(len(cohort) == len(set(cohort.sample_id)))

---
## Creation of the MAF file from CbioPortal raw datasets

In [61]:
clinical_data = pd.read_csv(data_path + 'cbioportal/raw/mskimpact_clinical_data-2.tsv', sep= '\t')
mutations = pd.read_pickle(data_path + 'cbioportal/raw/mutations_cohort.pkl')

In [62]:
mutations

Unnamed: 0,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,patientId,entrezGeneId,gene,studyId,center,mutationStatus,validationStatus,tumorAltCount,tumorRefCount,normalAltCount,normalRefCount,startPosition,endPosition,referenceAllele,proteinChange,mutationType,functionalImpactScore,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,driverFilter,driverFilterAnnotation,driverTiersFilter,driverTiersFilterAnnotation,chr,variantAllele,refseqMrnaId,proteinPosStart,proteinPosEnd,hugoGeneSymbol,type
0,UC0wMDAwMDA0LVQwMS1JTTM6bXNraW1wYWN0,UC0wMDAwMDA0Om1za2ltcGFjdA,mskimpact_mutations,P-0000004-T01-IM3,P-0000004,207,"{'entrezGeneId': 207, 'hugoGeneSymbol': 'AKT1', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,244.0,202.0,1.0,711.0,105246551,105246551,C,E17K,Missense_Mutation,M,2.175000e+00,"getma.org/?cm=var&var=hg19,14,105246551,C,T&fts=all",getma.org/pdb.php?prot=AKT1_HUMAN&from=6&to=108&var=E17K,getma.org/?cm=msa&ty=f&p=AKT1_HUMAN&rb=6&re=108&var=E17K,GRCh37,SNP,AKT1 E17 missense,,,,,14,T,,17,17,AKT1,protein-coding
1,UC0wMDAwMDA0LVQwMS1JTTM6bXNraW1wYWN0,UC0wMDAwMDA0Om1za2ltcGFjdA,mskimpact_mutations,P-0000004-T01-IM3,P-0000004,7157,"{'entrezGeneId': 7157, 'hugoGeneSymbol': 'TP53', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,58.0,209.0,0.0,600.0,7578503,7578518,CAGGGCAGGTCTTGGC,A138Cfs*27,Frame_Shift_Del,,1.401300e-45,,,,GRCh37,DEL,TP53 truncating,,,,,17,-,"NM_001126112.2,NM_001276761.1,NM_001276760.1,NM_000546.5,NM_0011",138,143,TP53,protein-coding
2,UC0wMDAwMDA0LVQwMS1JTTM6bXNraW1wYWN0,UC0wMDAwMDA0Om1za2ltcGFjdA,mskimpact_mutations,P-0000004-T01-IM3,P-0000004,23013,"{'entrezGeneId': 23013, 'hugoGeneSymbol': 'SPEN', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,73.0,400.0,0.0,1071.0,16265908,16265908,A,I3661F,Missense_Mutation,M,2.275000e+00,"getma.org/?cm=var&var=hg19,1,16265908,A,T&fts=all",getma.org/pdb.php?prot=MINT_HUMAN&from=3498&to=3664&var=I3661F,getma.org/?cm=msa&ty=f&p=MINT_HUMAN&rb=3498&re=3664&var=I3661F,GRCh37,SNP,SPEN I3661 missense,,,,,1,T,NM_015001.2,3661,3661,SPEN,protein-coding
3,UC0wMDAwMDA0LVQwMS1JTTM6bXNraW1wYWN0,UC0wMDAwMDA0Om1za2ltcGFjdA,mskimpact_mutations,P-0000004-T01-IM3,P-0000004,58508,"{'entrezGeneId': 58508, 'hugoGeneSymbol': 'KMT2C', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,11.0,84.0,0.0,193.0,151945083,151945083,C,M812I,Missense_Mutation,L,8.050000e-01,"getma.org/?cm=var&var=hg19,7,151945083,C,T&fts=all",,getma.org/?cm=msa&ty=f&p=MLL3_HUMAN&rb=639&re=838&var=M812I,GRCh37,SNP,KMT2C M812 missense,,,,,7,T,NM_170606.2,812,812,KMT2C,protein-coding
4,UC0wMDAwMDEyLVQwMi1JTTM6bXNraW1wYWN0,UC0wMDAwMDEyOm1za2ltcGFjdA,mskimpact_mutations,P-0000012-T02-IM3,P-0000012,7157,"{'entrezGeneId': 7157, 'hugoGeneSymbol': 'TP53', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,114.0,113.0,0.0,569.0,7577515,7577515,T,T256P,Missense_Mutation,M,3.140000e+00,"getma.org/?cm=var&var=hg19,17,7577515,T,G&fts=all",getma.org/pdb.php?prot=P53_HUMAN&from=95&to=289&var=T256P,getma.org/?cm=msa&ty=f&p=P53_HUMAN&rb=95&re=289&var=T256P,GRCh37,SNP,TP53 T256 missense,,,,,17,G,"NM_001126112.2,NM_001276761.1,NM_001276760.1,NM_000546.5,NM_0011",256,256,TP53,protein-coding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411354,UC0wMDUyMzY5LVQwMS1YUzE6bXNraW1wYWN0,UC0wMDUyMzY5Om1za2ltcGFjdA,mskimpact_mutations,P-0052369-T01-XS1,P-0052369,3845,"{'entrezGeneId': 3845, 'hugoGeneSymbol': 'KRAS', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,57.0,3616.0,0.0,1306.0,25398281,25398282,CC,G13F,Missense_Mutation,,1.401300e-45,,,,GRCh37,DNP,KRAS G13 missense,,,,,12,AA,NM_033360.2,13,13,KRAS,protein-coding
411355,UC0wMDUyMzY5LVQwMS1YUzE6bXNraW1wYWN0,UC0wMDUyMzY5Om1za2ltcGFjdA,mskimpact_mutations,P-0052369-T01-XS1,P-0052369,4089,"{'entrezGeneId': 4089, 'hugoGeneSymbol': 'SMAD4', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,11.0,3529.0,0.0,985.0,48573529,48573529,G,R38T,Missense_Mutation,,1.401300e-45,,,,GRCh37,SNP,SMAD4 R38 missense,,,,,18,C,NM_005359.5,38,38,SMAD4,protein-coding
411356,UC0wMDUyMzY5LVQwMS1YUzE6bXNraW1wYWN0,UC0wMDUyMzY5Om1za2ltcGFjdA,mskimpact_mutations,P-0052369-T01-XS1,P-0052369,55294,"{'entrezGeneId': 55294, 'hugoGeneSymbol': 'FBXW7', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,16.0,3519.0,0.0,1178.0,153253766,153253766,C,E323Q,Missense_Mutation,,1.401300e-45,,,,GRCh37,SNP,FBXW7 E323 missense,,,,,4,G,NM_033632.3,323,323,FBXW7,protein-coding
411357,UC0wMDUyMzY5LVQwMS1YUzE6bXNraW1wYWN0,UC0wMDUyMzY5Om1za2ltcGFjdA,mskimpact_mutations,P-0052369-T01-XS1,P-0052369,10320,"{'entrezGeneId': 10320, 'hugoGeneSymbol': 'IKZF1', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,11.0,4200.0,1.0,1212.0,50468030,50468030,C,A422E,Missense_Mutation,,1.401300e-45,,,,GRCh37,SNP,IKZF1 A422 missense,,,,,7,A,NM_006060.4,422,422,IKZF1,protein-coding


In [63]:
clinical_data.head()

Unnamed: 0,Study ID,Patient ID,Sample ID,Diagnosis Age,Age at Which Sequencing was Reported (Days),Patient Current Age,Archer Panel,Cancer Type,Cancer Type Detailed,CRDB_ADJ_TXT,CRDB_BASIC_COMMENTS,CRDB_BRAINMET,CRDB_CONSENT_DATE_DAYS,CRDB_ECOG,CRDB_NOSYSTXT,CRDB_OFF_STUDY_DAYS,CRDB_PRIOR_RX,CRDB_SURVEY_COMMENTS,CRDB_SURVIVAL_STATUS,CRDB_TREATMENT_END_DAYS,Impact TMB Percentile (Across All Tumor Types),Impact TMB Score,Impact TMB Percentile (Within Tumor Type),Date added to cBioPortal,Disease Free (Months),Disease Free Status,Ethnicity Category,Fraction Genome Altered,Gene Panel,Neoplasm Histologic Type Name,Institute Source,Metastatic Site,MGMT Status,Month added to cBioPortal,MSI Comment,MSI Score,MSI Type,MSK Slide ID,Mutation Count,Oncotree Code,Overall Survival (Months),Overall Survival Months Reported by DMT,Overall Survival Status,Overall Status Reported by DMT,Other Patient ID,12-245 Part A Consented,12-245 Part C Consented,MSK Pathology Slide Available,Pediatric Case Indicator,Primary Tumor Site,Race Category,Religion,Sample Class,Number of Samples Per Patient,Sample coverage,Sample Type,Sex,Sex Reported by DMT,Somatic Status,SO comments,Tumor Purity,Week added to cBioPortal,WHO Grade
0,mskimpact,P-0000004,P-0000004-T01-IM3,,40,40.0,NO,Breast Cancer,Breast Invasive Ductal Carcinoma,YES,,NO,14484.0,0.0,6.0,14631.0,YES,,Dead,14631.0,58.6,4.5,67.8,2015/04/07,,,Non-Spanish; Non-Hispanic,0.2782,IMPACT341,,MSKCC,,,2015/04,,2.5,Stable,,4,IDC,3.551,,DECEASED,,DMP0004,YES,NO,NO,No,Breast,WHITE,NONE,Tumor,1,428,Primary,Female,,Matched,,50.0,"2015, Wk. 15",
1,mskimpact,P-0000012,P-0000012-T02-IM3,,59,64.0,NO,Breast Cancer,Breast Invasive Ductal Carcinoma,,,,21192.0,,,,,,Alive,,17.1,1.1,13.5,2015/04/07,,,Non-Spanish; Non-Hispanic,0.3146,IMPACT341,,MSKCC,,,2015/04,MICROSATELLITE INSTABILITY-INDETERMINATE. See MSI note below.,4.1,Indeterminate,,1,IDC,72.46,,LIVING,,,YES,NO,NO,No,Breast,WHITE,NONE,Tumor,3,344,Primary,Female,,Matched,,,"2015, Wk. 15",
2,mskimpact,P-0000012,P-0000012-T03-IM3,,59,64.0,NO,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,,21192.0,,,,,,Alive,,96.7,33.5,98.3,2015/04/07,,,Non-Spanish; Non-Hispanic,0.1844,IMPACT341,,MSKCC,Neck,,2015/04,MICROSATELLITE STABLE (MSS). See MSI note below.,0.47,Stable,,30,LUAD,72.46,,LIVING,,,YES,NO,NO,No,Lung,WHITE,NONE,Tumor,3,428,Metastasis,Female,,Matched,,,"2015, Wk. 15",
3,mskimpact,P-0000012,P-0000012-T04-IM6,,64,64.0,YES,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,,21192.0,,,,,,Alive,,79.1,7.9,63.8,2018/08/01,,,Non-Spanish; Non-Hispanic,0.225,IMPACT468,,MSKCC,,,2018/08,MICROSATELLITE STABLE (MSS). See MSI note below.,0.2,Stable,1103618.0,9,LUAD,72.46,,LIVING,,,YES,NO,YES,No,Lung,WHITE,NONE,Tumor,3,713,Primary,Female,,Matched,Note: The mutations and copy number profile suggests that this sample may be clonally unrelated to the prior sample M13-9122. Note: Allele specific copy number analysis by FACETS suggests copy neutral loss of heterozygosity for BRCA1. FACETS results are f,20.0,"2018, Wk. 31",
4,mskimpact,P-0000015,P-0000015-T01-IM3,,45,45.0,NO,Breast Cancer,Breast Invasive Ductal Carcinoma,NO,,NO,16221.0,1.0,6.0,16656.0,NO,,Dead,16656.0,78.6,7.8,88.2,2015/04/07,,,Non-Spanish; Non-Hispanic,0.3503,IMPACT341,,MSKCC,Liver,,2015/04,,2.55,Stable,,7,IDC,13.677,,DECEASED,,DMP0003,YES,NO,NO,No,Breast,WHITE,NONE,Tumor,1,281,Metastasis,Female,,Matched,,40.0,"2015, Wk. 15",


In [64]:
# Here are the columns we will select for the three different files

filter_cohort = ['sample_id','Tumor_Id', 'purity', 'ploidy', 'dipLogR', 'frac_loh']

filter_mut = ['sampleId',
             'patientId',
             'gene',
             'entrezGeneId',
             'mutationType',
             'proteinChange',
             'startPosition',
             'endPosition',
             'referenceAllele',
             'variantAllele',
             'chr',
             'hugoGeneSymbol',
             'tumorAltCount',
             'tumorRefCount']

filter_clinical = [  'Sample ID',
                     'Patient ID',
                     'Patient Current Age',
                     'Cancer Type',
                     'Cancer Type Detailed',
                     'Ethnicity Category' ,
                     'Sex',
                     'Mutation Count',
                     'Sample Type',
                     'Number of Samples Per Patient',
                     'Overall Survival Status',
                     'Overall Survival (Months)',
                     'MSI Score',
                     'Impact TMB Score'
                      ]

cohort_filtered = cohort[filter_cohort]
mutations_filtered = mutations[filter_mut]
clinical_data_filtered = clinical_data[filter_clinical]

---
We create 3 new columns in mutations_filtered:
* *mut_key*: mutation key that describes entirely the mutation
* *sample_mut_key*: sample mutation key that adds information about the sample (it allows to filter out duplicates)
* *mut_spot*: number representing the location of the amino acid mutated

In [65]:
# Create a mutation Key
mutations_filtered['mut_key'] = pd.Series([str(i)+'_'+str(j)+'_'+str(k)+'_'+str(l) for i,j,k,l in zip(mutations_filtered.chr, mutations_filtered.startPosition, mutations_filtered.referenceAllele, mutations_filtered.variantAllele)]) 
# Create a sample key to differentiate duplicates
mutations_filtered['sample_mut_key'] = pd.Series([str(j)+'_'+str(i) for i,j in zip( mutations_filtered.mut_key, mutations_filtered.sampleId)])
# Extract the mutation spot from HGVSp
mutations_filtered['mut_spot'] = mutations_filtered.proteinChange.str.extract('(\d+)')
#Create the vaf column
mutations_filtered['vaf'] = mutations_filtered.apply(lambda x: x.tumorAltCount/(x.tumorAltCount + x.tumorRefCount) if (x.tumorAltCount + x.tumorRefCount)>0 else 'None' , axis=1)

In [66]:
# We merge the three dataframes
# Left Join on 'patient_Id' and 'Patient ID'
maf = pd.merge(left=cohort_filtered,right=clinical_data_filtered, how='left', left_on='Tumor_Id', right_on='Sample ID')
maf_cohort = pd.merge(left=maf, right=mutations_filtered, how='left', left_on='Tumor_Id', right_on='sampleId')
# We drop column duplicates
maf_cohort = maf_cohort.drop(['sampleId', 'Sample ID','patientId'], axis=1)
# We rename the columns to be consistent with other maf files created
maf_cohort.columns = ['Sample_Id', 'Tumor_Id','purity', 'ploidy', 'dipLogR', 'frac_loh', 'Patient_Id', 'Patient_Current_Age', 'Cancer_Type',
                    'Cancer_Type_Detailed', 'Ethnicity_Category', 'Sex', 'Mutation_Count', 'Sample_Type', 'samples_per_patient','Overall Survival Status',
                     'Overall Survival (Months)', 'MSI Score','TMB_Score', 'gene','Gene_Id','Variant_Classification', 'proteinChange',
                    'Start_Position', 'End_Position', 'Reference_Allele','Variant_Allele', 'Chromosome',  
                    'Hugo_Symbol','alt_count', 'ref_count', 'mut_key', 'sample_mut_key', 'mut_spot', 'vaf']
maf_cohort

Unnamed: 0,Sample_Id,Tumor_Id,purity,ploidy,dipLogR,frac_loh,Patient_Id,Patient_Current_Age,Cancer_Type,Cancer_Type_Detailed,Ethnicity_Category,Sex,Mutation_Count,Sample_Type,samples_per_patient,Overall Survival Status,Overall Survival (Months),MSI Score,TMB_Score,gene,Gene_Id,Variant_Classification,proteinChange,Start_Position,End_Position,Reference_Allele,Variant_Allele,Chromosome,Hugo_Symbol,alt_count,ref_count,mut_key,sample_mut_key,mut_spot,vaf
0,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,5.3,"{'entrezGeneId': 5290, 'hugoGeneSymbol': 'PIK3CA', 'type': 'protein-coding'}",5290.0,Missense_Mutation,E545K,178936091.0,178936091.0,G,A,3,PIK3CA,284.0,334.0,3_178936091_G_A,P-0034223-T01-IM6_3_178936091_G_A,545,0.459547
1,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,5.3,"{'entrezGeneId': 2064, 'hugoGeneSymbol': 'ERBB2', 'type': 'protein-coding'}",2064.0,Missense_Mutation,L755S,37880220.0,37880220.0,T,C,17,ERBB2,224.0,262.0,17_37880220_T_C,P-0034223-T01-IM6_17_37880220_T_C,755,0.460905
2,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,5.3,"{'entrezGeneId': 9641, 'hugoGeneSymbol': 'IKBKE', 'type': 'protein-coding'}",9641.0,Missense_Mutation,R27H,206646650.0,206646650.0,G,A,1,IKBKE,252.0,1027.0,1_206646650_G_A,P-0034223-T01-IM6_1_206646650_G_A,27,0.197029
3,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,5.3,"{'entrezGeneId': 6926, 'hugoGeneSymbol': 'TBX3', 'type': 'protein-coding'}",6926.0,Frame_Shift_Ins,S321Vfs*6,115114257.0,115114258.0,-,T,12,TBX3,358.0,384.0,12_115114257_-_T,P-0034223-T01-IM6_12_115114257_-_T,321,0.48248
4,P-0034223-T01-IM6_P-0034223-N01-IM6,P-0034223-T01-IM6,0.941111,2.241830,-0.155483,0.062,P-0034223,63.0,Breast Cancer,Invasive Breast Carcinoma,,Female,6.0,Metastasis,1.0,LIVING,,0.55,5.3,"{'entrezGeneId': 3169, 'hugoGeneSymbol': 'FOXA1', 'type': 'protein-coding'}",3169.0,Missense_Mutation,C227Y,38061309.0,38061309.0,C,T,14,FOXA1,410.0,462.0,14_38061309_C_T,P-0034223-T01-IM6_14_38061309_C_T,227,0.470183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260808,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,5.3,"{'entrezGeneId': 5290, 'hugoGeneSymbol': 'PIK3CA', 'type': 'protein-coding'}",5290.0,Missense_Mutation,A987D,178951905.0,178951905.0,C,A,3,PIK3CA,158.0,409.0,3_178951905_C_A,P-0050745-T01-IM6_3_178951905_C_A,987,0.27866
260809,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,5.3,"{'entrezGeneId': 63978, 'hugoGeneSymbol': 'PRDM14', 'type': 'protein-coding'}",63978.0,Missense_Mutation,D375H,70978530.0,70978530.0,C,G,8,PRDM14,185.0,261.0,8_70978530_C_G,P-0050745-T01-IM6_8_70978530_C_G,375,0.414798
260810,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,5.3,"{'entrezGeneId': 54880, 'hugoGeneSymbol': 'BCOR', 'type': 'protein-coding'}",54880.0,Missense_Mutation,Q231E,39933908.0,39933908.0,G,C,23,BCOR,274.0,333.0,23_39933908_G_C,P-0050745-T01-IM6_23_39933908_G_C,231,0.4514
260811,P-0050745-T01-IM6_P-0050745-N01-IM6,P-0050745-T01-IM6,0.597798,1.808634,0.084975,0.560,P-0050745,68.0,Breast Cancer,Breast Invasive Ductal Carcinoma,Non-Spanish; Non-Hispanic,Female,6.0,Primary,1.0,LIVING,1.841,1.85,5.3,"{'entrezGeneId': 5058, 'hugoGeneSymbol': 'PAK1', 'type': 'protein-coding'}",5058.0,Fusion,GDPD4-PAK1 fusion,-1.0,-1.0,,,,PAK1,-1.0,-1.0,NA_-1_NA_,P-0050745-T01-IM6_NA_-1_NA_,4,


In [58]:
maf_cohort.vaf.isna().sum()

1070

In [59]:
maf_cohort[maf_cohort['Tumor_Id'] == 'P-0009819-T01-IM5']

Unnamed: 0,Sample_Id,Tumor_Id,purity,ploidy,dipLogR,frac_loh,Patient_Id,Patient_Current_Age,Cancer_Type,Cancer_Type_Detailed,Ethnicity_Category,Sex,Mutation_Count,Sample_Type,samples_per_patient,Overall Survival Status,Overall Survival (Months),MSI Score,gene,Gene_Id,Variant_Classification,proteinChange,Start_Position,End_Position,Reference_Allele,Variant_Allele,Chromosome,Hugo_Symbol,alt_count,ref_count,mut_key,sample_mut_key,mut_spot,vaf
6,P-0009819-T01-IM5_P-0009819-N01-IM5,P-0009819-T01-IM5,0.275237,2.681075,-0.129255,0.094,P-0009819,72.0,Prostate Cancer,Prostate Adenocarcinoma,Non-Spanish; Non-Hispanic,Male,1.0,Primary,1.0,LIVING,23.441,0.0,"{'entrezGeneId': 3169, 'hugoGeneSymbol': 'FOXA1', 'type': 'protein-coding'}",3169.0,In_Frame_Ins,G157dup,38061516.0,38061517.0,-,CGC,14.0,FOXA1,41.0,236.0,14_38061516_-_CGC,P-0009819-T01-IM5_14_38061516_-_CGC,157,0.148014
7,P-0009819-T01-IM5_P-0009819-N01-IM5,P-0009819-T01-IM5,0.275237,2.681075,-0.129255,0.094,P-0009819,72.0,Prostate Cancer,Prostate Adenocarcinoma,Non-Spanish; Non-Hispanic,Male,1.0,Primary,1.0,LIVING,23.441,0.0,"{'entrezGeneId': 2078, 'hugoGeneSymbol': 'ERG', 'type': 'protein-coding'}",2078.0,Fusion,TMPRSS2-ERG fusion,-1.0,-1.0,,,,ERG,-1.0,-1.0,NA_-1_NA_,P-0009819-T01-IM5_NA_-1_NA_,2,
8,P-0009819-T01-IM5_P-0009819-N01-IM5,P-0009819-T01-IM5,0.275237,2.681075,-0.129255,0.094,P-0009819,72.0,Prostate Cancer,Prostate Adenocarcinoma,Non-Spanish; Non-Hispanic,Male,1.0,Primary,1.0,LIVING,23.441,0.0,"{'entrezGeneId': 7113, 'hugoGeneSymbol': 'TMPRSS2', 'type': 'protein-coding'}",7113.0,Fusion,TMPRSS2-ERG fusion,-1.0,-1.0,,,,TMPRSS2,-1.0,-1.0,NA_-1_NA_,P-0009819-T01-IM5_NA_-1_NA_,2,


In [67]:
# Saving to pickle File
maf_cohort.to_pickle(data_path + 'merged_data/maf_cohort.pkl')

In [205]:
maf_cohort_unique = maf_cohort.drop_duplicates('Patient_Id')
print('Number of cohort patients without cancer type information: '+str(maf_cohort_unique['Cancer_Type'].isna().sum()))

Number of cohort patients without cancer type information: 7


In [203]:
len(set(maf_cohort.Patient_Id))

26972

In [213]:
cohort_patients = set(cohort.Patient_Id)
cbioportal_patients = set(clinical_data['Patient ID'])
maf_patients = set(maf_cohort['Patient_Id'])
mutation_patients = set(mutations_filtered['patientId'])
print(len(cohort_patients - mutation_patients))

884


In [200]:
clinical_data = pd.read_csv(data_path + 'cbioportal/raw/mskimpact_clinical_data-2.tsv', sep= '\t')
clinical_data[clinical_data['Patient ID'] == 'P-0003702']

Unnamed: 0,Patient ID,Sample ID,Cancer Type,Cancer Type Detailed,Number of Samples Per Patient,Mutation Count,Fraction Genome Altered,Sex,Ethnicity Category,Race Category,Sample Type,12-245 Part C Consented,Gene Panel,Impact TMB Score,Institute Source,MSI Score,MSI Type,Overall Survival Status,Patient Current Age,Sample coverage,Somatic Status,Tumor Purity
3924,P-0003702,P-0003702-T02-IM5,Breast Cancer,Breast Invasive Ductal Carcinoma,1,5,0.1945,Female,Non-Spanish; Non-Hispanic,WHITE,Metastasis,NO,IMPACT410,4.9,MSKCC,0.08,Stable,LIVING,55.0,191,Matched,20


In [198]:
mutations_filtered[mutations_filtered['patientId'] == 'P-0002760']

Unnamed: 0,sampleId,patientId,gene,mutationType,proteinChange,startPosition,endPosition,referenceAllele,variantAllele,chr,hugoGeneSymbol,mut_key,sample_mut_key,mut_spot


In [61]:
maf_cohort.vaf.isna().sum()

1070

In [43]:
mutations = pd.read_pickle(data_path + 'cbioportal/raw/mutations_cohort.pkl')

def cond(x):
    return list(x.gene.values())[1]

mutations['hugo_gene_symbol'] = mutations.apply(cond, axis=1)

In [51]:
KRAS = get_groupby(mutations[mutations['hugo_gene_symbol'] == 'KRAS'],'mutationType', 'KRAS')
PIK3CA = get_groupby(mutations[mutations['hugo_gene_symbol'] == 'PIK3CA'],'mutationType', 'PIK3CA')
ARID1A = get_groupby(mutations[mutations['hugo_gene_symbol'] == 'ARID1A'],'mutationType', 'ARID1A')

display_side_by_side(KRAS,PIK3CA, ARID1A)
type_set = set(mutations.mutationType)
print(type_set)

Unnamed: 0_level_0,KRAS
mutationType,Unnamed: 1_level_1
Frame_Shift_Del,8
Frame_Shift_Ins,5
Fusion,8
In_Frame_Del,2
In_Frame_Ins,24
Missense_Mutation,7552
Nonsense_Mutation,12
Splice_Site,2

Unnamed: 0_level_0,PIK3CA
mutationType,Unnamed: 1_level_1
Frame_Shift_Del,17
Frame_Shift_Ins,12
Fusion,11
In_Frame_Del,212
In_Frame_Ins,16
Missense_Mutation,6781
Nonsense_Mutation,44
Nonstop_Mutation,4
Splice_Region,1
Splice_Site,13

Unnamed: 0_level_0,ARID1A
mutationType,Unnamed: 1_level_1
Frame_Shift_Del,1700
Frame_Shift_Ins,691
Fusion,107
In_Frame_Del,83
In_Frame_Ins,15
Missense_Mutation,1288
Nonsense_Mutation,1336
Nonstop_Mutation,1
Splice_Region,2
Splice_Site,154


{'Splice_Site', 'frameshift_deletion', 'Nonsense_Mutation', "5'Flank", 'nonsynonymous_SNV', 'Nonstop_Mutation', 'In_Frame_Ins', 'Translation_Start_Site', 'Fusion', 'Frame_Shift_Ins', 'In_Frame_Del', 'Frame_Shift_Del', 'Splice_Region', 'Missense_Mutation'}


Unnamed: 0_level_0,count
mutationType,Unnamed: 1_level_1
Frame_Shift_Del,17
Frame_Shift_Ins,12
Fusion,11
In_Frame_Del,212
In_Frame_Ins,16
Missense_Mutation,6781
Nonsense_Mutation,44
Nonstop_Mutation,4
Splice_Region,1
Splice_Site,13


Unnamed: 0_level_0,count
mutationType,Unnamed: 1_level_1
Frame_Shift_Del,1700
Frame_Shift_Ins,691
Fusion,107
In_Frame_Del,83
In_Frame_Ins,15
Missense_Mutation,1288
Nonsense_Mutation,1336
Nonstop_Mutation,1
Splice_Region,2
Splice_Site,154


{"5'Flank",
 'Frame_Shift_Del',
 'Frame_Shift_Ins',
 'Fusion',
 'In_Frame_Del',
 'In_Frame_Ins',
 'Missense_Mutation',
 'Nonsense_Mutation',
 'Nonstop_Mutation',
 'Splice_Region',
 'Splice_Site',
 'Translation_Start_Site',
 'frameshift_deletion',
 'nonsynonymous_SNV'}

In [45]:
mutations.to_csv(data_path + 'cbioportal/raw/mutations_cohort.tsv')

In [52]:
mutations

Unnamed: 0,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,patientId,entrezGeneId,gene,studyId,center,mutationStatus,validationStatus,tumorAltCount,tumorRefCount,normalAltCount,normalRefCount,startPosition,endPosition,referenceAllele,proteinChange,mutationType,functionalImpactScore,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,driverFilter,driverFilterAnnotation,driverTiersFilter,driverTiersFilterAnnotation,chr,variantAllele,refseqMrnaId,proteinPosStart,proteinPosEnd,hugoGeneSymbol,type,hugo_gene_symbol
0,UC0wMDAwMDA0LVQwMS1JTTM6bXNraW1wYWN0,UC0wMDAwMDA0Om1za2ltcGFjdA,mskimpact_mutations,P-0000004-T01-IM3,P-0000004,207,"{'entrezGeneId': 207, 'hugoGeneSymbol': 'AKT1', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,244.0,202.0,1.0,711.0,105246551,105246551,C,E17K,Missense_Mutation,M,2.175000e+00,"getma.org/?cm=var&var=hg19,14,105246551,C,T&fts=all",getma.org/pdb.php?prot=AKT1_HUMAN&from=6&to=108&var=E17K,getma.org/?cm=msa&ty=f&p=AKT1_HUMAN&rb=6&re=108&var=E17K,GRCh37,SNP,AKT1 E17 missense,,,,,14,T,,17,17,AKT1,protein-coding,AKT1
1,UC0wMDAwMDA0LVQwMS1JTTM6bXNraW1wYWN0,UC0wMDAwMDA0Om1za2ltcGFjdA,mskimpact_mutations,P-0000004-T01-IM3,P-0000004,7157,"{'entrezGeneId': 7157, 'hugoGeneSymbol': 'TP53', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,58.0,209.0,0.0,600.0,7578503,7578518,CAGGGCAGGTCTTGGC,A138Cfs*27,Frame_Shift_Del,,1.401300e-45,,,,GRCh37,DEL,TP53 truncating,,,,,17,-,"NM_001126112.2,NM_001276761.1,NM_001276760.1,NM_000546.5,NM_0011",138,143,TP53,protein-coding,TP53
2,UC0wMDAwMDA0LVQwMS1JTTM6bXNraW1wYWN0,UC0wMDAwMDA0Om1za2ltcGFjdA,mskimpact_mutations,P-0000004-T01-IM3,P-0000004,23013,"{'entrezGeneId': 23013, 'hugoGeneSymbol': 'SPEN', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,73.0,400.0,0.0,1071.0,16265908,16265908,A,I3661F,Missense_Mutation,M,2.275000e+00,"getma.org/?cm=var&var=hg19,1,16265908,A,T&fts=all",getma.org/pdb.php?prot=MINT_HUMAN&from=3498&to=3664&var=I3661F,getma.org/?cm=msa&ty=f&p=MINT_HUMAN&rb=3498&re=3664&var=I3661F,GRCh37,SNP,SPEN I3661 missense,,,,,1,T,NM_015001.2,3661,3661,SPEN,protein-coding,SPEN
3,UC0wMDAwMDA0LVQwMS1JTTM6bXNraW1wYWN0,UC0wMDAwMDA0Om1za2ltcGFjdA,mskimpact_mutations,P-0000004-T01-IM3,P-0000004,58508,"{'entrezGeneId': 58508, 'hugoGeneSymbol': 'KMT2C', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,11.0,84.0,0.0,193.0,151945083,151945083,C,M812I,Missense_Mutation,L,8.050000e-01,"getma.org/?cm=var&var=hg19,7,151945083,C,T&fts=all",,getma.org/?cm=msa&ty=f&p=MLL3_HUMAN&rb=639&re=838&var=M812I,GRCh37,SNP,KMT2C M812 missense,,,,,7,T,NM_170606.2,812,812,KMT2C,protein-coding,KMT2C
4,UC0wMDAwMDEyLVQwMi1JTTM6bXNraW1wYWN0,UC0wMDAwMDEyOm1za2ltcGFjdA,mskimpact_mutations,P-0000012-T02-IM3,P-0000012,7157,"{'entrezGeneId': 7157, 'hugoGeneSymbol': 'TP53', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,114.0,113.0,0.0,569.0,7577515,7577515,T,T256P,Missense_Mutation,M,3.140000e+00,"getma.org/?cm=var&var=hg19,17,7577515,T,G&fts=all",getma.org/pdb.php?prot=P53_HUMAN&from=95&to=289&var=T256P,getma.org/?cm=msa&ty=f&p=P53_HUMAN&rb=95&re=289&var=T256P,GRCh37,SNP,TP53 T256 missense,,,,,17,G,"NM_001126112.2,NM_001276761.1,NM_001276760.1,NM_000546.5,NM_0011",256,256,TP53,protein-coding,TP53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411354,UC0wMDUyMzY5LVQwMS1YUzE6bXNraW1wYWN0,UC0wMDUyMzY5Om1za2ltcGFjdA,mskimpact_mutations,P-0052369-T01-XS1,P-0052369,3845,"{'entrezGeneId': 3845, 'hugoGeneSymbol': 'KRAS', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,57.0,3616.0,0.0,1306.0,25398281,25398282,CC,G13F,Missense_Mutation,,1.401300e-45,,,,GRCh37,DNP,KRAS G13 missense,,,,,12,AA,NM_033360.2,13,13,KRAS,protein-coding,KRAS
411355,UC0wMDUyMzY5LVQwMS1YUzE6bXNraW1wYWN0,UC0wMDUyMzY5Om1za2ltcGFjdA,mskimpact_mutations,P-0052369-T01-XS1,P-0052369,4089,"{'entrezGeneId': 4089, 'hugoGeneSymbol': 'SMAD4', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,11.0,3529.0,0.0,985.0,48573529,48573529,G,R38T,Missense_Mutation,,1.401300e-45,,,,GRCh37,SNP,SMAD4 R38 missense,,,,,18,C,NM_005359.5,38,38,SMAD4,protein-coding,SMAD4
411356,UC0wMDUyMzY5LVQwMS1YUzE6bXNraW1wYWN0,UC0wMDUyMzY5Om1za2ltcGFjdA,mskimpact_mutations,P-0052369-T01-XS1,P-0052369,55294,"{'entrezGeneId': 55294, 'hugoGeneSymbol': 'FBXW7', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,16.0,3519.0,0.0,1178.0,153253766,153253766,C,E323Q,Missense_Mutation,,1.401300e-45,,,,GRCh37,SNP,FBXW7 E323 missense,,,,,4,G,NM_033632.3,323,323,FBXW7,protein-coding,FBXW7
411357,UC0wMDUyMzY5LVQwMS1YUzE6bXNraW1wYWN0,UC0wMDUyMzY5Om1za2ltcGFjdA,mskimpact_mutations,P-0052369-T01-XS1,P-0052369,10320,"{'entrezGeneId': 10320, 'hugoGeneSymbol': 'IKZF1', 'type': 'protein-coding'}",mskimpact,MSKCC,SOMATIC,Unknown,11.0,4200.0,1.0,1212.0,50468030,50468030,C,A422E,Missense_Mutation,,1.401300e-45,,,,GRCh37,SNP,IKZF1 A422 missense,,,,,7,A,NM_006060.4,422,422,IKZF1,protein-coding,IKZF1


In [70]:
h = maf_cohort[maf_cohort['Cancer_Type'] == 'Peripheral Nervous System']
len(set(h.Tumor_Id))

221