In [1]:
import pandas as pd
import csv
import time
from tqdm import tqdm

# Read all the mutations
# filter data
# count total mutations in a gene-export 
# Identify ACK1 mutations and export them

In [2]:
"""
Read all the cBIO mutation data
low_memory=False
"""
cBio_mutations_df= pd.read_csv ('cBIO_mutations.csv', names = ['gene_name', 'gene_id', 'mutationType', 'aminoAcidChange', 'mutproteinPosStart', 'mutproteinPosEnd', 'uniqueSampleKey', 'uniquePatientKey', 'molecularProfileId', 'sampleId', 'patientId', 'studyId', 'center', 'mutationStatus', 'validationStatus', 'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount', 'startPosition', 'endPosition', 'referenceAllele', 'functionalImpactScore', 'fisValue', 'linkXvar', 'linkPdb', 'linkMsa', 'ncbiBuild', 'variantType', 'keyword', 'chrm', 'variantAllele', 'refseqMrnaId'], low_memory=False)


In [3]:
cBio_mutations_df.head(5)

Unnamed: 0,gene_name,gene_id,mutationType,aminoAcidChange,mutproteinPosStart,mutproteinPosEnd,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,...,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,chrm,variantAllele,refseqMrnaId
0,HRAS,3265,Missense_Mutation,G13R,13,13,TFVBRC0yR1VHSy1UdW1vcjpuc2NsY190Y2dhX2Jyb2FkX...,TFVBRC0yR1VHSzpuc2NsY190Y2dhX2Jyb2FkXzIwMTY,nsclc_tcga_broad_2016_mutations,LUAD-2GUGK-Tumor,...,1.4013e-45,,,,GRCh37,SNP,HRAS G13 missense,11,G,
1,PCDH9,5101,Missense_Mutation,E534K,534,534,TFVBRC0yR1VHSy1UdW1vcjpuc2NsY190Y2dhX2Jyb2FkX...,TFVBRC0yR1VHSzpuc2NsY190Y2dhX2Jyb2FkXzIwMTY,nsclc_tcga_broad_2016_mutations,LUAD-2GUGK-Tumor,...,1.4013e-45,,,,GRCh37,SNP,PCDH9 E534 missense,13,T,
2,B3GALT1,8708,Missense_Mutation,S237L,237,237,TFVBRC0yR1VHSy1UdW1vcjpuc2NsY190Y2dhX2Jyb2FkX...,TFVBRC0yR1VHSzpuc2NsY190Y2dhX2Jyb2FkXzIwMTY,nsclc_tcga_broad_2016_mutations,LUAD-2GUGK-Tumor,...,1.4013e-45,,,,GRCh37,SNP,B3GALT1 S237 missense,2,T,
3,TTYH2,94015,Missense_Mutation,R439C,439,439,TFVBRC0yR1VHSy1UdW1vcjpuc2NsY190Y2dhX2Jyb2FkX...,TFVBRC0yR1VHSzpuc2NsY190Y2dhX2Jyb2FkXzIwMTY,nsclc_tcga_broad_2016_mutations,LUAD-2GUGK-Tumor,...,1.4013e-45,,,,GRCh37,SNP,TTYH2 R439 missense,17,T,NM_032646.5
4,ITK,3702,Missense_Mutation,L106I,106,106,TFVBRC0yR1VHSy1UdW1vcjpuc2NsY190Y2dhX2Jyb2FkX...,TFVBRC0yR1VHSzpuc2NsY190Y2dhX2Jyb2FkXzIwMTY,nsclc_tcga_broad_2016_mutations,LUAD-2GUGK-Tumor,...,1.4013e-45,,,,GRCh37,SNP,ITK L106 missense,5,A,NM_005546.3


In [4]:
len(cBio_mutations_df)

46177819

In [5]:
"""
Remove duplicated rows
"""
cBio_mutations_no_duplicates = cBio_mutations_df.drop_duplicates()

In [6]:
len(cBio_mutations_no_duplicates)

8235921

In [7]:
len(cBio_mutations_no_duplicates)

8235921

In [8]:
"""
can use the chuncksize feature when I have lots of data
"""
# chunksize = 10**6
# cBio_mutations_df= pd.read_csv ('cBIO_mutations.csv', names = ['patient_ID', 'sample_ID', 'sample_type', 'unique_patient_key', 'unique_sample_key'], chunksize = chunksize)

'\ncan use the chuncksize feature when I have lots of data\n'

In [9]:
"""
Sort all the mutations by 'gene_id' and 'aminoAcidChange'
"""
# cBio_mutations_sorted_df = cBio_mutations_df.sort_value(by=['gene_id', 'aminoAcidChange'], ascending=[True, True])
cBio_mutations_sorted_df = cBio_mutations_no_duplicates.sort_values(['gene_id', 'mutproteinPosStart'], ascending=[True, True])

In [10]:
cBio_mutations_sorted_df.head(2)

Unnamed: 0,gene_name,gene_id,mutationType,aminoAcidChange,mutproteinPosStart,mutproteinPosEnd,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,...,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,chrm,variantAllele,refseqMrnaId
40396918,MIR-4710/4710,-3334,Splice_Site,MUTATED,-1,-1,Y3NjYy11Y3NmLTIwMjEtU2FtcGxlNjI6Y3NjY191Y3NmX...,Y3NjYy11Y3NmLTIwMjEtUGF0aWVudDU5OmNzY2NfdWNzZ...,cscc_ucsf_2021_mutations,cscc-ucsf-2021-Sample62,...,1.4013e-45,,,,GRCh37,SNP,MIR-4710/4710 truncating,14,T,
7101012,MIR-4436A/4436A,-3084,Splice_Site,X66_splice,66,66,VENHQS0xOC00MDgzLTAxOmx1c2NfdGNnYV9wYW5fY2FuX...,VENHQS0xOC00MDgzOmx1c2NfdGNnYV9wYW5fY2FuX2F0b...,lusc_tcga_pan_can_atlas_2018_mutations,TCGA-18-4083-01,...,1.4013e-45,,,,GRCh37,SNP,MIR-4436A/4436A truncating,2,C,


In [11]:
"""
export cBio_mutations_sorted_df
"""
cBio_mutations_sorted_df.to_csv('cBio_mutations_sorted.csv', columns = ['gene_name', 'gene_id', 'mutationType', 'aminoAcidChange', 'mutproteinPosStart', 'mutproteinPosEnd', 'uniqueSampleKey', 'uniquePatientKey', 'molecularProfileId', 'sampleId', 'patientId', 'studyId', 'center', 'mutationStatus', 'validationStatus', 'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount', 'startPosition', 'endPosition', 'referenceAllele', 'functionalImpactScore', 'fisValue', 'linkXvar', 'linkPdb', 'linkMsa', 'ncbiBuild', 'variantType', 'keyword', 'chrm', 'variantAllele', 'refseqMrnaId'], index = False)

# Start from here

In [12]:
"""
Read all the cBIO mutation data
low_memory=False
"""
cBio_mutations_sorted_df1= pd.read_csv ('cBio_mutations_sorted.csv', names = ['gene_name', 'gene_id', 'mutationType', 'aminoAcidChange', 'mutproteinPosStart', 'mutproteinPosEnd', 'uniqueSampleKey', 'uniquePatientKey', 'molecularProfileId', 'sampleId', 'patientId', 'studyId', 'center', 'mutationStatus', 'validationStatus', 'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount', 'startPosition', 'endPosition', 'referenceAllele', 'functionalImpactScore', 'fisValue', 'linkXvar', 'linkPdb', 'linkMsa', 'ncbiBuild', 'variantType', 'keyword', 'chrm', 'variantAllele', 'refseqMrnaId'], low_memory=False)


In [13]:
len(cBio_mutations_sorted_df1)

8235922

In [14]:
"""
identify any duplicated data ????
"""
duplicate_df = cBio_mutations_sorted_df1.loc[cBio_mutations_sorted_df1.duplicated(), :]

In [15]:
len(duplicate_df)

0

In [16]:
duplicate_df.head(3)

Unnamed: 0,gene_name,gene_id,mutationType,aminoAcidChange,mutproteinPosStart,mutproteinPosEnd,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,...,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,chrm,variantAllele,refseqMrnaId


In [17]:
"""
drop duplicates and keep the 1st 
"""
cBio_mutations_sorted_filt = cBio_mutations_sorted_df1.drop_duplicates()

In [18]:
len(cBio_mutations_sorted_filt)

8235922

In [18]:
len(cBio_mutations_sorted_filt)

8235922

In [70]:
cBio_mutations_sorted_filt.columns

Index(['gene_name', 'gene_id', 'mutationType', 'aminoAcidChange',
       'mutproteinPosStart', 'mutproteinPosEnd', 'uniqueSampleKey',
       'uniquePatientKey', 'molecularProfileId', 'sampleId', 'patientId',
       'studyId', 'center', 'mutationStatus', 'validationStatus',
       'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount',
       'startPosition', 'endPosition', 'referenceAllele',
       'functionalImpactScore', 'fisValue', 'linkXvar', 'linkPdb', 'linkMsa',
       'ncbiBuild', 'variantType', 'keyword', 'chrm', 'variantAllele',
       'refseqMrnaId'],
      dtype='object')

In [71]:
"""
ASSUMPTION
1 PATIENT_ID PER 1 SAMPLE PER MUTATED GENE
"""
cBio_mutations_sorted_filt_final = cBio_mutations_sorted_filt.drop_duplicates(subset =['patientId', 'gene_id']).reset_index(drop = True)


In [84]:
cBio_mutations_sorted_filt_final.head(3)

Unnamed: 0,gene_name,gene_id,mutationType,aminoAcidChange,mutproteinPosStart,mutproteinPosEnd,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,...,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,chrm,variantAllele,refseqMrnaId
0,gene_name,gene_id,mutationType,aminoAcidChange,mutproteinPosStart,mutproteinPosEnd,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,...,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,chrm,variantAllele,refseqMrnaId
1,MIR-4710/4710,-3334,Splice_Site,MUTATED,-1,-1,Y3NjYy11Y3NmLTIwMjEtU2FtcGxlNjI6Y3NjY191Y3NmXz...,Y3NjYy11Y3NmLTIwMjEtUGF0aWVudDU5OmNzY2NfdWNzZ...,cscc_ucsf_2021_mutations,cscc-ucsf-2021-Sample62,...,1.4013e-45,,,,GRCh37,SNP,MIR-4710/4710 truncating,14,T,
2,MIR-4436A/4436A,-3084,Splice_Site,X66_splice,66,66,VENHQS0xOC00MDgzLTAxOmx1c2NfdGNnYV9wYW5fY2FuX2...,VENHQS0xOC00MDgzOmx1c2NfdGNnYV9wYW5fY2FuX2F0b...,lusc_tcga_pan_can_atlas_2018_mutations,TCGA-18-4083-01,...,1.4013e-45,,,,GRCh37,SNP,MIR-4436A/4436A truncating,2,C,


In [85]:
cBio_mutations_sorted_filt_final.to_csv('Final_cBio_mutations_sorted_filt.csv', columns= ['gene_name', 'gene_id', 'mutationType', 'aminoAcidChange',
       'mutproteinPosStart', 'mutproteinPosEnd', 'uniqueSampleKey',
       'uniquePatientKey', 'molecularProfileId', 'sampleId', 'patientId',
       'studyId', 'center', 'mutationStatus', 'validationStatus',
       'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount',
       'startPosition', 'endPosition', 'referenceAllele',
       'functionalImpactScore', 'fisValue', 'linkXvar', 'linkPdb', 'linkMsa',
       'ncbiBuild', 'variantType', 'keyword', 'chrm', 'variantAllele',
       'refseqMrnaId'], index = False)

In [79]:
"""
groupby gene to get the total mutation counts
"""
gene_mutations_count_df_final= cBio_mutations_sorted_filt_final.groupby(['gene_name']).size().reset_index(name='mutation_counts')


In [80]:
gene_mutations_count_df_final.head(3)

Unnamed: 0,gene_name,mutation_counts
0,A1BG,206
1,A1BG-AS1,1
2,A1CF,454


In [72]:
len(cBio_mutations_sorted_filt_final)

4870635

In [20]:
"""
groupby gene to get the total mutation counts
"""
gene_mutations_count_df= cBio_mutations_sorted_df1.groupby(['gene_name']).size().reset_index(name='mutation_counts')


In [21]:
gene_mutations_count_df.head(3)

Unnamed: 0,gene_name,mutation_counts
0,A1BG,286
1,A1BG-AS1,1
2,A1CF,830


In [81]:
len(gene_mutations_count_df_final)

20810

In [22]:
len(gene_mutations_count_df)

20810

In [23]:
"""
export gene mutation counts
"""
gene_mutations_count_df.to_csv('gene_mutations_count.csv', columns= ['gene_name', 'mutation_counts'], index = False)

In [82]:
"""
export gene mutation counts
"""
gene_mutations_count_df_final.to_csv('FINAL_gene_mutations_count.csv', columns= ['gene_name', 'mutation_counts'], index = False)

In [24]:
"""
use filtered data to groupby gene to get the total mutation counts
"""
filtered_gene_mutations_count_df= cBio_mutations_sorted_filt.groupby(['gene_name']).size().reset_index(name='mutation_counts')

In [25]:
filtered_gene_mutations_count_df.head(3)

Unnamed: 0,gene_name,mutation_counts
0,A1BG,286
1,A1BG-AS1,1
2,A1CF,830


In [26]:
len(filtered_gene_mutations_count_df)

20810

In [27]:
"""
export filtered gene mutation counts
"""
filtered_gene_mutations_count_df.to_csv('filtered_gene_mutations_count.csv', columns= ['gene_name', 'mutation_counts'], index = False)

In [69]:
filtered_gene_mutations_count_df.columns

Index(['gene_name', 'mutation_counts'], dtype='object')

In [28]:
"""
select ACk1 mutations
"""
all_ack1_mutations_df = cBio_mutations_sorted_df1.loc[(cBio_mutations_sorted_df1['gene_id'] == '10188' )]

In [29]:
len(all_ack1_mutations_df)

774

In [30]:
"""
Export All ack mutations
"""
all_ack1_mutations_df.to_csv('all_ack1_mutations.csv', columns = ['gene_name', 'gene_id', 'mutationType', 'aminoAcidChange', 'mutproteinPosStart', 'mutproteinPosEnd', 'uniqueSampleKey', 'uniquePatientKey', 'molecularProfileId', 'sampleId', 'patientId', 'studyId', 'center', 'mutationStatus', 'validationStatus', 'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount', 'startPosition', 'endPosition', 'referenceAllele', 'functionalImpactScore', 'fisValue', 'linkXvar', 'linkPdb', 'linkMsa', 'ncbiBuild', 'variantType', 'keyword', 'chrm', 'variantAllele', 'refseqMrnaId'], index = False)



In [31]:
"""
identify ACK1 duplicated data 
"""
ack_duplicate_df = all_ack1_mutations_df.loc[all_ack1_mutations_df.duplicated(), :]

In [32]:
len(ack_duplicate_df)

0

In [33]:
all_ack1_mutations_df_filt = all_ack1_mutations_df.drop_duplicates()

In [34]:
len(all_ack1_mutations_df_filt)

774

In [35]:
"""
Export filtered ack mutations
"""
all_ack1_mutations_df_filt.to_csv('ack1_mutations_filtered.csv', columns = ['gene_name', 'gene_id', 'mutationType', 'aminoAcidChange', 'mutproteinPosStart', 'mutproteinPosEnd', 'uniqueSampleKey', 'uniquePatientKey', 'molecularProfileId', 'sampleId', 'patientId', 'studyId', 'center', 'mutationStatus', 'validationStatus', 'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount', 'startPosition', 'endPosition', 'referenceAllele', 'functionalImpactScore', 'fisValue', 'linkXvar', 'linkPdb', 'linkMsa', 'ncbiBuild', 'variantType', 'keyword', 'chrm', 'variantAllele', 'refseqMrnaId'], index = False)



In [36]:
"""
select 
"""
ack_mut_final_df = all_ack1_mutations_df_filt.drop_duplicates(subset =['patientId']).reset_index(drop = True)


In [89]:
ack_mut_final_df.head(3)

Unnamed: 0,gene_name,gene_id,mutationType,aminoAcidChange,mutproteinPosStart,mutproteinPosEnd,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,...,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,chrm,variantAllele,refseqMrnaId
0,TNK2,10188,Splice_Site,MUTATED,-1,-1,Y3NjYy11Y3NmLTIwMjEtU2FtcGxlNDc6Y3NjY191Y3NmXz...,Y3NjYy11Y3NmLTIwMjEtUGF0aWVudDQ1OmNzY2NfdWNzZl...,cscc_ucsf_2021_mutations,cscc-ucsf-2021-Sample47,...,1.4013e-45,,,,GRCh37,SNP,TNK2 truncating,3,A,
1,TNK2,10188,Translation_Start_Site,M1?,1,1,VENHQS1MTi1BNDlYLTAxOmVzY2FfdGNnYQ,VENHQS1MTi1BNDlYOmVzY2FfdGNnYQ,esca_tcga_mutations,TCGA-LN-A49X-01,...,1.4013e-45,,,,GRCh37,SNP,TNK2 truncating,3,C,NM_005781.4
2,TNK2,10188,Missense_Mutation,E5K,5,5,TUVMLUlQSV9QYXQxNTEtVHVtb3ItU00tN0ExNUE6bWl4ZW...,UGF0MTUxOm1peGVkX2FsbGVuXzIwMTg,mixed_allen_2018_mutations,MEL-IPI_Pat151-Tumor-SM-7A15A,...,1.4013e-45,,,,GRCh37,SNP,TNK2 E5 missense,3,T,NM_005781.4


In [37]:
len(ack_mut_final_df)

535

In [38]:
"""
Export ack_mut_final_dfs
"""
ack_mut_final_df.to_csv('ack_mut_final.csv', columns = ['gene_name', 'gene_id', 'mutationType', 'aminoAcidChange', 'mutproteinPosStart', 'mutproteinPosEnd', 'uniqueSampleKey', 'uniquePatientKey', 'molecularProfileId', 'sampleId', 'patientId', 'studyId', 'center', 'mutationStatus', 'validationStatus', 'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount', 'startPosition', 'endPosition', 'referenceAllele', 'functionalImpactScore', 'fisValue', 'linkXvar', 'linkPdb', 'linkMsa', 'ncbiBuild', 'variantType', 'keyword', 'chrm', 'variantAllele', 'refseqMrnaId'], index = False)



In [39]:
"""
groupby to get the ack1 mutation counts for other genes
"""
ack1_mut_count_df= ack_mut_final_df.groupby(['mutproteinPosStart']).size().reset_index(name='mutation_counts')


In [40]:
ack1_mut_count_df.head(3)

Unnamed: 0,mutproteinPosStart,mutation_counts
0,-1,1
1,1,1
2,1000,1


In [41]:
"""
export as a csv file for the prism graph-lollypo
"""
ack1_mut_count_df.to_csv('ack1_mut_count_prism.csv', columns = ['mutproteinPosStart', 'mutation_counts'], index = False)

In [42]:
"""
sort 
"""
ack1_mut_count_sorted_df = ack1_mut_count_df.sort_values(['mutation_counts'], ascending=[False])

# identify other mutations in ACK1 mutated patiens

In [88]:
"""
remove whitespace
"""
ack_mut_final_df['uniqueSampleKey'] = ack_mut_final_df['uniqueSampleKey'].str.strip()
ack_mut_final_df['uniquePatientKey'] = ack_mut_final_df['uniquePatientKey'].str.strip()
cBio_mutations_sorted_filt['uniqueSampleKey'] = cBio_mutations_sorted_filt['uniqueSampleKey'].str.strip()

In [45]:
"""
create a new df with 'uniqueSampleKey', 'uniquePatientKey'
"""
ack_mut_patients_df= ack_mut_final_df[['uniqueSampleKey', 'uniquePatientKey']]

In [46]:
ack_mut_patients_df.head(3)

Unnamed: 0,uniqueSampleKey,uniquePatientKey
0,Y3NjYy11Y3NmLTIwMjEtU2FtcGxlNDc6Y3NjY191Y3NmXz...,Y3NjYy11Y3NmLTIwMjEtUGF0aWVudDQ1OmNzY2NfdWNzZl...
1,VENHQS1MTi1BNDlYLTAxOmVzY2FfdGNnYQ,VENHQS1MTi1BNDlYOmVzY2FfdGNnYQ
2,TUVMLUlQSV9QYXQxNTEtVHVtb3ItU00tN0ExNUE6bWl4ZW...,UGF0MTUxOm1peGVkX2FsbGVuXzIwMTg


In [47]:
len(ack_mut_patients_df)

535

In [48]:
"""
drop duplicates
"""
ack_mut_patients_df_filt = ack_mut_patients_df.drop_duplicates()

In [49]:
len(ack_mut_patients_df_filt)

535

In [50]:
"""
Sort mutation counts to identify most mutated gene with ack1
"""
mut_ack1_cooccurrence_count_sorted_df = mut_ack1_cooccurrence_count_df.sort_values(['mutation_counts'], ascending=[False])

NameError: name 'mut_ack1_cooccurrence_count_df' is not defined

In [51]:
"""
create a new csv file to store other mutations in ACK1 patients
"""
for index, row in tqdm(ack_mut_patients_df_filt.iterrows()):
    mut_ack1_cooccurrence_df = cBio_mutations_sorted_filt.loc[(cBio_mutations_sorted_filt['uniqueSampleKey'] == row['uniqueSampleKey'])]

535it [03:22,  2.64it/s]


In [52]:
mut_ack1_cooccurrence_df.head(3)

Unnamed: 0,gene_name,gene_id,mutationType,aminoAcidChange,mutproteinPosStart,mutproteinPosEnd,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,...,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,chrm,variantAllele,refseqMrnaId
126103,BIRC3,330,Missense_Mutation,L404V,404,404,TUlBUEFDQTJfUEFOQ1JFQVM6Y2VsbGxpbmVfY2NsZV9icm...,TUlBX1BhQ2EtMjpjZWxsbGluZV9jY2xlX2Jyb2Fk,cellline_ccle_broad_mutations,MIAPACA2_PANCREAS,...,1.4013e-45,,,,GRCh37,SNP,BIRC3 L404 missense,11,G,NM_001165.4
221161,BCR,613,Missense_Mutation,R22L,22,22,TUlBUEFDQTJfUEFOQ1JFQVM6Y2VsbGxpbmVfY2NsZV9icm...,TUlBX1BhQ2EtMjpjZWxsbGluZV9jY2xlX2Jyb2Fk,cellline_ccle_broad_mutations,MIAPACA2_PANCREAS,...,1.4013e-45,,,,GRCh37,SNP,BCR R22 missense,22,T,NM_004327.3
317723,RUNX2,860,Missense_Mutation,T286A,286,286,TUlBUEFDQTJfUEFOQ1JFQVM6Y2VsbGxpbmVfY2NsZV9icm...,TUlBX1BhQ2EtMjpjZWxsbGluZV9jY2xlX2Jyb2Fk,cellline_ccle_broad_mutations,MIAPACA2_PANCREAS,...,1.4013e-45,,,,GRCh37,SNP,RUNX2 T286 missense,6,G,NM_001024630.3


In [53]:
len(mut_ack1_cooccurrence_df)

40

In [54]:
"""
export the mut_ack1_cooccurrence_df
"""
mut_ack1_cooccurrence_df.to_csv('mut_ack1_cooccurrence_df.csv', columns = ['gene_name', 'gene_id', 'mutationType', 'aminoAcidChange', 'mutproteinPosStart', 'mutproteinPosEnd', 'uniqueSampleKey', 'uniquePatientKey', 'molecularProfileId', 'sampleId', 'patientId', 'studyId', 'center', 'mutationStatus', 'validationStatus', 'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount', 'startPosition', 'endPosition', 'referenceAllele', 'functionalImpactScore', 'fisValue', 'linkXvar', 'linkPdb', 'linkMsa', 'ncbiBuild', 'variantType', 'keyword', 'chrm', 'variantAllele', 'refseqMrnaId'], index = False)

In [55]:
"""
use mut_ack1_cooccurrence_df to groupby gene to get the total mutation counts for other genes
"""
mut_ack1_cooccurrence_count_df= mut_ack1_cooccurrence_df.groupby(['gene_id', 'gene_name']).size().reset_index(name='mutation_counts')

In [56]:
mut_ack1_cooccurrence_count_df.head(3)

Unnamed: 0,gene_id,gene_name,mutation_counts
0,10076,PTPRU,1
1,10188,TNK2,1
2,10734,STAG3,1


In [57]:
"""
Sort mutation counts to identify most mutated gene with ack1
"""
mut_ack1_cooccurrence_count_sorted_df = mut_ack1_cooccurrence_count_df.sort_values(['mutation_counts'], ascending=[False])

In [58]:
mut_ack1_cooccurrence_count_sorted_df.head(3)

Unnamed: 0,gene_id,gene_name,mutation_counts
28,7273,TTN,5
4,114788,CSMD3,2
30,8202,NCOA3,2


In [59]:
len(mut_ack1_cooccurrence_count_sorted_df)

34

# Identify other mutations in ACK1 mutated patients

In [60]:
"""cBio_mutations_sorted_df"""
ack_mut_final_df.head(3)


Unnamed: 0,gene_name,gene_id,mutationType,aminoAcidChange,mutproteinPosStart,mutproteinPosEnd,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,...,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,chrm,variantAllele,refseqMrnaId
0,TNK2,10188,Splice_Site,MUTATED,-1,-1,Y3NjYy11Y3NmLTIwMjEtU2FtcGxlNDc6Y3NjY191Y3NmXz...,Y3NjYy11Y3NmLTIwMjEtUGF0aWVudDQ1OmNzY2NfdWNzZl...,cscc_ucsf_2021_mutations,cscc-ucsf-2021-Sample47,...,1.4013e-45,,,,GRCh37,SNP,TNK2 truncating,3,A,
1,TNK2,10188,Translation_Start_Site,M1?,1,1,VENHQS1MTi1BNDlYLTAxOmVzY2FfdGNnYQ,VENHQS1MTi1BNDlYOmVzY2FfdGNnYQ,esca_tcga_mutations,TCGA-LN-A49X-01,...,1.4013e-45,,,,GRCh37,SNP,TNK2 truncating,3,C,NM_005781.4
2,TNK2,10188,Missense_Mutation,E5K,5,5,TUVMLUlQSV9QYXQxNTEtVHVtb3ItU00tN0ExNUE6bWl4ZW...,UGF0MTUxOm1peGVkX2FsbGVuXzIwMTg,mixed_allen_2018_mutations,MEL-IPI_Pat151-Tumor-SM-7A15A,...,1.4013e-45,,,,GRCh37,SNP,TNK2 E5 missense,3,T,NM_005781.4


In [61]:
ack_mut_final_df.columns

Index(['gene_name', 'gene_id', 'mutationType', 'aminoAcidChange',
       'mutproteinPosStart', 'mutproteinPosEnd', 'uniqueSampleKey',
       'uniquePatientKey', 'molecularProfileId', 'sampleId', 'patientId',
       'studyId', 'center', 'mutationStatus', 'validationStatus',
       'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount',
       'startPosition', 'endPosition', 'referenceAllele',
       'functionalImpactScore', 'fisValue', 'linkXvar', 'linkPdb', 'linkMsa',
       'ncbiBuild', 'variantType', 'keyword', 'chrm', 'variantAllele',
       'refseqMrnaId'],
      dtype='object')

In [62]:
"""
remove whitespace
"""
ack_mut_final_df['patientId'] = ack_mut_final_df['patientId'].str.strip()

In [63]:
"""
remove whitespace
"""
cBio_mutations_sorted_df['patientId'] = cBio_mutations_sorted_df['patientId'].str.strip()

In [64]:
"""
identify ACK1 mutated patients in total muation DF
"""
for index, row in tqdm(ack_mut_final_df.iterrows()):
    ack1_and_other_mut_df = cBio_mutations_sorted_df.loc[(cBio_mutations_sorted_df['patientId'] == row['patientId'])]

535it [03:28,  2.56it/s]


In [65]:
len(ack1_and_other_mut_df)

40

Sample ID

In [66]:
"""
remove whitespace
"""
ack_mut_final_df['sampleId'] = ack_mut_final_df['sampleId'].str.strip()
cBio_mutations_sorted_df['patientId'] = cBio_mutations_sorted_df['patientId'].str.strip()

In [67]:
"""
identify ACK1 mutated patients in total muation DF
"""
for index, row in tqdm(ack_mut_final_df.iterrows()):
    ack1_and_other_mut_df_sampleID = cBio_mutations_sorted_df.loc[(cBio_mutations_sorted_df['patientId'] == row['patientId'])]

535it [03:29,  2.56it/s]


In [86]:
len(ack1_and_other_mut_df_sampleID)

40

In [87]:
ack1_and_other_mut_df_sampleID.head(3)

Unnamed: 0,gene_name,gene_id,mutationType,aminoAcidChange,mutproteinPosStart,mutproteinPosEnd,uniqueSampleKey,uniquePatientKey,molecularProfileId,sampleId,...,fisValue,linkXvar,linkPdb,linkMsa,ncbiBuild,variantType,keyword,chrm,variantAllele,refseqMrnaId
18840499,BIRC3,330,Missense_Mutation,L404V,404,404,TUlBUEFDQTJfUEFOQ1JFQVM6Y2VsbGxpbmVfY2NsZV9ic...,TUlBX1BhQ2EtMjpjZWxsbGluZV9jY2xlX2Jyb2Fk,cellline_ccle_broad_mutations,MIAPACA2_PANCREAS,...,1.4013e-45,,,,GRCh37,SNP,BIRC3 L404 missense,11,G,NM_001165.4
18840487,BCR,613,Missense_Mutation,R22L,22,22,TUlBUEFDQTJfUEFOQ1JFQVM6Y2VsbGxpbmVfY2NsZV9ic...,TUlBX1BhQ2EtMjpjZWxsbGluZV9jY2xlX2Jyb2Fk,cellline_ccle_broad_mutations,MIAPACA2_PANCREAS,...,1.4013e-45,,,,GRCh37,SNP,BCR R22 missense,22,T,NM_004327.3
18840506,RUNX2,860,Missense_Mutation,T286A,286,286,TUlBUEFDQTJfUEFOQ1JFQVM6Y2VsbGxpbmVfY2NsZV9ic...,TUlBX1BhQ2EtMjpjZWxsbGluZV9jY2xlX2Jyb2Fk,cellline_ccle_broad_mutations,MIAPACA2_PANCREAS,...,1.4013e-45,,,,GRCh37,SNP,RUNX2 T286 missense,6,G,NM_001024630.3
