In [1]:
import pandas as pd
import numpy as np
import sys
import re
import functools
import json
import random
import os
import math
import datetime

import matplotlib.pyplot as plt

In [2]:
# Record the wall-clock start time; a later cell subtracts it from the end time to report the running time
start_time = datetime.datetime.now()

In [3]:
#os.getcwd()

# Root directory that holds every input data set used below.
# It must end with a path separator: later cells build file paths by plain string concatenation.
# NOTE(review): this is a hard-coded, machine-specific absolute path, and later cells also hard-code
# Windows '\\' separators, so switching to the Linux path below is not enough on its own to run elsewhere.

# Linux workstation
#data_path = '/home/db600/phd/data/'

# Laptop
data_path = 'C:\\Users\\dan\\Documents\\phd\\data\\'

# List the contents of the data directory as a quick check that the path is valid
os.listdir(data_path)

['alphamissense',
 'biomart',
 'dependant',
 'depmap',
 'missing_gene_names.csv',
 'pathway_commons',
 'reactome',
 'reactome_rows_to_drop.csv',
 'string']

In [4]:
# Locations of the input files, all relative to data_path (which already ends with a separator):
#   mut_path  - DepMap somatic mutation calls
#   exp_path  - DepMap protein-coding gene expression (log-transformed TPM, per the file name)
#   conv_path - BioMart identifier conversion table
mut_path = f'{data_path}depmap\\OmicsSomaticMutations.csv'
exp_path = f'{data_path}depmap\\OmicsExpressionProteinCodingGenesTPMLogp1.csv'
conv_path = f'{data_path}biomart\\ensembl_biomart_plus_fasta.csv'

In [5]:
# Load the identifier conversion table from conv_path.
# The first CSV column is used as the index; the leftover 'Unnamed: 0' column is then discarded.
conv = (
    pd.read_csv(conv_path, index_col=0, header=0)
    .drop(columns=['Unnamed: 0'])
)

In [6]:
conv.head()

Unnamed: 0,Gene stable ID,HGNC symbol,Chromosome/scaffold name,Transcript stable ID,Protein stable ID,UniProtKB/TrEMBL ID,UniProtKB/Swiss-Prot ID,Peptide
0,ENSG00000198888,MT-ND1,MT,ENST00000361390,ENSP00000354687,U5Z754,P03886,MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLL...
1,ENSG00000198763,MT-ND2,MT,ENST00000361453,ENSP00000355046,Q7GXY9,P03891,MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTK...
2,ENSG00000198804,MT-CO1,MT,ENST00000361624,ENSP00000354499,U5YWV7,P00395,MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGN...
3,ENSG00000198712,MT-CO2,MT,ENST00000361739,ENSP00000354876,U5Z487,P00403,MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFLTL...
4,ENSG00000228253,MT-ATP8,MT,ENST00000361851,ENSP00000355265,U5YV54,P03928,MPQLNTTVWPTMITPMLLTLFLITQLKMLNTNYHLPPSPKPMKMKN...


In [7]:
def load_list(path):
    """Load and return the JSON document stored at ``path``.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever the file decodes to. The callers in this notebook store the
        result as a gene list, but the type is not enforced here.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Decode explicitly instead of relying on the platform locale encoding
    # (cp1252 on Windows). 'utf-8-sig' reads plain UTF-8 and also strips a
    # leading byte-order mark, which json.load would otherwise reject.
    with open(path, encoding='utf-8-sig') as f:
        return json.load(f)

# Gene-list JSON files (kinases, oncogenes, tumour suppressor genes) kept in the kinase-onc-tsg project.
# NOTE(review): hard-coded, machine-specific absolute paths.
_gene_list_dir = "C:\\Users\\dan\\PycharmProjects\\kinase-onc-tsg\\data\\"
kinases_path = _gene_list_dir + "kinases.json"
oncs_path = _gene_list_dir + "oncs.json"
tsgs_path = _gene_list_dir + "tsgs.json"

# Load the three lists in the same order as before: kinases, oncs, tsgs
kinases, oncs, tsgs = (
    load_list(list_path) for list_path in (kinases_path, oncs_path, tsgs_path)
)

In [8]:
# Training cell lines used by the original program (saved manually to CSV beforehand).
# The 'cell_line' column is renamed to 'CCLEName', the key used to merge with DepMap's Model.csv.
cell_lines = (
    pd.read_csv(data_path + 'dependant\\original_training_cell_lines.csv')
    .rename(columns={'cell_line': 'CCLEName'})
)
cell_lines

Unnamed: 0,CCLEName
0,HCC1428_BREAST
1,HCC1806_BREAST
2,HCC1937_BREAST
3,MDAMB231_BREAST
4,HCC202_BREAST
5,CAL51_BREAST
6,MDAMB468_BREAST
7,KPL1_BREAST
8,MDAMB415_BREAST
9,MDAMB157_BREAST


In [9]:
# DepMap model metadata
depmap_model = pd.read_csv(data_path + 'depmap\\Model.csv')

# Attach the DepMap ModelID to each of the original training cell lines.
# A left join keeps every training cell line, even one with no matching CCLEName (ModelID is then NaN).
model_id_lookup = depmap_model[['ModelID', 'CCLEName']]
cell_lines = cell_lines.merge(model_id_lookup, how='left', on='CCLEName')

# Show the annotated cell line list
cell_lines

Unnamed: 0,CCLEName,ModelID
0,HCC1428_BREAST,ACH-000352
1,HCC1806_BREAST,ACH-000624
2,HCC1937_BREAST,ACH-000223
3,MDAMB231_BREAST,ACH-000768
4,HCC202_BREAST,ACH-000725
5,CAL51_BREAST,ACH-000856
6,MDAMB468_BREAST,ACH-000849
7,KPL1_BREAST,ACH-000028
8,MDAMB415_BREAST,ACH-000876
9,MDAMB157_BREAST,ACH-000621


In [10]:
# Load the expression and mutation data
# low_memory=False makes pandas parse the mutation file in one pass rather than in chunks,
# which avoids mixed-dtype columns at the cost of higher peak memory.
mutation_df = pd.read_csv(mut_path, low_memory=False)
expression_df = pd.read_csv(exp_path)

In [11]:
# Preview expression data
expression_df.head()

Unnamed: 0.1,Unnamed: 0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
0,ACH-001113,4.331992,0.0,7.36466,2.792855,4.471187,0.028569,1.226509,3.044394,6.500005,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.0,0.275007,0.0,0.0,0.0
1,ACH-001289,4.567424,0.584963,7.106641,2.543496,3.50462,0.0,0.189034,3.813525,4.221877,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.0,0.014355,0.0,0.0,0.0
2,ACH-001339,3.15056,0.0,7.379118,2.333424,4.228049,0.056584,1.31034,6.687201,3.682573,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.0,0.084064,0.0,0.0,0.042644
3,ACH-001538,5.08534,0.0,7.154211,2.545968,3.084064,0.0,5.86839,6.165309,4.489928,...,0.214125,0.632268,0.298658,1.641546,0.443607,0.0,0.028569,0.0,0.0,0.0
4,ACH-000242,6.729417,0.0,6.537917,2.456806,3.867896,0.799087,7.208478,5.570159,7.127117,...,1.117695,2.358959,0.084064,1.910733,0.0,0.0,0.464668,0.0,0.0,0.0


In [12]:
# Preview mutation data
mutation_df.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
0,chr1,1242864,GC,CT,0.31,19,8,0/1,,DNP,...,,,,,,,,,ACH-000839,388581.0
1,chr1,10647969,A,G,0.4,29,19,0|1,10647969.0,SNP,...,0.234,3.0,,,,,,,ACH-000839,54897.0
2,chr1,10648097,T,G,0.349,21,10,0/1,,SNP,...,,0.0,,,,,,,ACH-000839,54897.0
3,chr1,13198424,G,A,0.833,0,4,0/1,,SNP,...,0.002,,,,,,,,ACH-000839,400736.0
4,chr1,13225068,A,G,0.396,34,24,0/1,,SNP,...,,0.0,,,,,,,ACH-000839,391003.0


In [13]:
mutation_df.columns

Index(['Chrom', 'Pos', 'Ref', 'Alt', 'AF', 'RefCount', 'AltCount', 'GT', 'PS',
       'VariantType', 'VariantInfo', 'DNAChange', 'ProteinChange',
       'HugoSymbol', 'HgncName', 'HgncFamily', 'Transcript', 'TranscriptExon',
       'TranscriptStrand', 'UniprotID', 'Str', 'DbsnpID', 'DbsnpFilter',
       'Issues', 'GcContent', 'LineageAssociation', 'CancerMolecularGenetics',
       'CCLEDeleterious', 'StructuralRelation', 'CosmicHotspot',
       'CosmicOverlappingMutations', 'AssociatedWith', 'LoF', 'Driver',
       'LikelyDriver', 'TranscriptLikelyLoF', 'CivicID', 'CivicDescription',
       'CivicScore', 'Popaf', 'LikelyGoF', 'LikelyLoF', 'HessDriver',
       'HessSignature', 'CscapeScore', 'DannScore', 'RevelScore',
       'Funseq2Score', 'PharmgkbID', 'DidaID', 'DidaName', 'GwasDisease',
       'GwasPmID', 'GTexGene', 'ModelID', 'EntrezGeneID'],
      dtype='object')

In [14]:
mutation_df['TranscriptStrand'].value_counts()

+    711270
-    696829
Name: TranscriptStrand, dtype: int64

In [15]:
# Take a look at the mutation classes
mutation_df['VariantInfo'].unique()

array(['MISSENSE', 'SILENT', 'IN_FRAME_INS', 'SPLICE_SITE', 'NONSENSE',
       'FRAME_SHIFT_DEL', 'NONSTOP', 'START_CODON_SNP', 'IN_FRAME_DEL',
       'FRAME_SHIFT_INS', 'START_CODON_INS', 'FIVE_PRIME_FLANK', 'INTRON',
       'THREE_PRIME_UTR'], dtype=object)

In [16]:
# Restrict the mutation table to the cell lines of interest, i.e. rows whose 'ModelID'
# appears in cell_lines['ModelID'].
in_training_set = mutation_df['ModelID'].isin(cell_lines['ModelID'])
mutation_df = mutation_df[in_training_set]

mutation_df.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29669,chr1,951180,G,A,0.39,26,15,0/1,,SNP,...,,0.0,,,,,,,ACH-000856,26155.0
29670,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,0.976,3.0,,,,,,,ACH-000856,339451.0
29671,chr1,1046671,C,T,0.25,30,8,0/1,,SNP,...,,0.0,,,,,,,ACH-000856,375790.0
29672,chr1,2358684,C,T,0.565,11,14,0/1,,SNP,...,,0.0,,,,,,,ACH-000856,79906.0
29673,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,0.416,3.0,,,,,,,ACH-000856,5192.0


In [17]:
# Check that every training cell line has mutation records (39 cell lines are expected).
# Report the count and name any missing ModelIDs explicitly, so a shortfall is not left
# to counting the displayed array by hand.
expected_model_ids = set(cell_lines['ModelID'].dropna())
found_model_ids = mutation_df['ModelID'].unique()
missing_model_ids = sorted(expected_model_ids.difference(found_model_ids))
print(f'{len(found_model_ids)} of {len(expected_model_ids)} training cell lines have mutation records; '
      f'missing ModelIDs: {missing_model_ids}')

found_model_ids

array(['ACH-000856', 'ACH-000601', 'ACH-000097', 'ACH-000725',
       'ACH-000621', 'ACH-000223', 'ACH-000248', 'ACH-000910',
       'ACH-000159', 'ACH-000792', 'ACH-000060', 'ACH-000859',
       'ACH-000330', 'ACH-000352', 'ACH-000768', 'ACH-000374',
       'ACH-000178', 'ACH-000573', 'ACH-000624', 'ACH-000907',
       'ACH-000495', 'ACH-000127', 'ACH-000849', 'ACH-000262',
       'ACH-000876', 'ACH-000684', 'ACH-000265', 'ACH-000699',
       'ACH-000459', 'ACH-000222', 'ACH-000138', 'ACH-000246',
       'ACH-000028', 'ACH-000118', 'ACH-000320', 'ACH-000281',
       'ACH-000148', 'ACH-000234'], dtype=object)

In [18]:
# Take a look at the mutation classes
mutation_df['VariantInfo'].unique()

array(['SILENT', 'MISSENSE', 'FRAME_SHIFT_DEL', 'FRAME_SHIFT_INS',
       'SPLICE_SITE', 'NONSENSE', 'IN_FRAME_DEL', 'NONSTOP',
       'IN_FRAME_INS', 'START_CODON_SNP', 'START_CODON_INS'], dtype=object)

In [74]:
# Collect the unique transcript IDs present in the filtered mutation data.
# The cell displays the IDs themselves; len(mutation_df_transcripts) gives the count asked about.
mutation_df_transcripts = mutation_df['Transcript'].unique()
mutation_df_transcripts 

array(['ENST00000327044.7', 'ENST00000338591.8', 'ENST00000379370.7', ...,
       'ENST00000216146.9', 'ENST00000412172.4', 'ENST00000375722.5'],
      dtype=object)

In [19]:
# Variant classes treated as highly pathogenic, i.e. assumed to result in loss of function (LoF).
#
# The original version of the program used:
#   ['Frame_Shift_Del', 'Frame_Shift_Ins', 'Nonsense_Mutation', 'Nonstop_Mutation', 'Stop_Codon_Del']
# The new data has no 'Stop_Codon_Del' class (presumably because it is the same as NONSTOP).
#
# Classes whose treatment is still open:
#   IN_FRAME_DEL    - likely damaging, but could be LoF or GoF; decide by whether the gene is an onc, tsg or kinase?
#   IN_FRAME_INS    - as above
#   START_CODON_INS - likely to prevent translation of the protein, so LoF? (included in the tuple below)
#   START_CODON_SNP - may prevent translation if the SNP changes the start codon from methionine to another amino acid
pathogenic = (
    'FRAME_SHIFT_DEL',
    'FRAME_SHIFT_INS',
    'NONSENSE',
    'NONSTOP',
    'START_CODON_INS',
)

# Keep only the mutations whose variant classification is one of the classes listed above
is_pathogenic = mutation_df['VariantInfo'].isin(pathogenic)
pathogenic_mutations = mutation_df.loc[is_pathogenic]

pathogenic_mutations.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29678,chr1,6197724,CT,C,0.534,13,15,0/1,,DEL,...,,,,,,,,,ACH-000856,6146.0
29685,chr1,12244554,A,AG,0.556,16,20,0/1,,INS,...,,,,,,,,,ACH-000856,55187.0
29699,chr1,20633885,GA,G,0.462,13,11,0/1,,DEL,...,,,,,,,,,ACH-000856,65018.0
29704,chr1,26773456,C,T,0.389,21,13,0/1,,SNP,...,,5.0,,,,,,,ACH-000856,8289.0
29705,chr1,26779439,TG,T,0.59,15,23,0/1,,DEL,...,,,,,,,,,ACH-000856,8289.0


In [20]:
# See if DepMap classification agrees that these are all likely to be LOF (it does)
pathogenic_mutations['LikelyGoF'].value_counts()

False    1345
Name: LikelyGoF, dtype: int64

In [21]:
pathogenic_mutations['LikelyLoF'].value_counts()

True    1345
Name: LikelyLoF, dtype: int64

In [22]:
# Group by ModelID (the DepMap cell-line identifier) so there is one row per cell line, with a column
# holding a comma-separated list of the genes (HugoSymbol) that carry a highly pathogenic mutation in it.
# Missing gene symbols are skipped before joining: str.join raises TypeError on NaN, and skipping them
# also avoids empty entries (consecutive commas) in the list.
path_muts_per_sample = (
    pathogenic_mutations
    .groupby('ModelID')['HugoSymbol']
    .apply(lambda symbols: ', '.join(symbols.dropna().astype(str)))
    .reset_index()
)

path_muts_per_sample

Unnamed: 0,ModelID,HugoSymbol
0,ACH-000028,"ADGRL2, SMCP, FAM228B, NECTIN3, TMEM175, CLCN3..."
1,ACH-000060,"PALMD, TGFBR2, PBRM1, ERVW-1, ERVW-1, MUC12, J..."
2,ACH-000097,"ATAD3C, PLCH2, GBP1, GTDC1, VIL1, VIL1, SLC4A7..."
3,ACH-000118,"CCDC27, CROCC, RSRP1, RHCE, COL8A2, TNS1, PER2..."
4,ACH-000127,"GPATCH3, ZMYM4, P3H1, KANK4, UBAP2L, OR10T2, G..."
5,ACH-000138,"PHGDH, STRIP2, TNS2, NCOR2, TSNAXIP1, NLGN3"
6,ACH-000148,"EPHA10, ITPRID2, PARD3B, POLQ, MEF2C, MYL10, H..."
7,ACH-000159,"SDHB, TRIM33, FAM228B, TRIM43, VHL, FLT4, MUC1..."
8,ACH-000178,"ARID1A, FANCG, GRID1, ANGPTL5, SIPA1L1, FMN1, ..."
9,ACH-000222,"CSMD2, COL8A2, POGK, PRRC2C, RD3, ANKRD36C, MY..."


In [23]:
# Write the per-cell-line pathogenic gene lists to CSV.
# data_path already ends with a separator, so the relative part must not start with another one
# (this matches how the other input paths in this notebook are built).
path_muts_per_sample.to_csv(data_path + 'dependant\\pathogenic_mutations_per_sample.csv')

In [24]:
# Missense single-nucleotide variants: rows with VariantInfo == 'MISSENSE' and VariantType == 'SNP'.
# START_CODON_SNP may be worth adding later (unclear whether the tools that assess mutations cover start-codon SNPs).
is_missense = mutation_df['VariantInfo'].eq('MISSENSE')
is_snp = mutation_df['VariantType'].eq('SNP')
missense_snp = mutation_df[is_missense & is_snp]

In [81]:
missense_snp

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29670,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,0.976,3.000000,,,,,,,ACH-000856,339451.0
29673,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,0.416,3.000000,,,,,,,ACH-000856,5192.0
29674,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,0.436,1.000000,,,,,,,ACH-000856,55229.0
29675,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,0.071,2.000000,,,,,,,ACH-000856,8764.0
29676,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,0.166928,,,,,,,ACH-000856,100287898.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296633,chrX,153954753,T,C,0.857,0,6,0/1,,SNP,...,0.006,2.000000,,,,,,,ACH-000234,3054.0
1296636,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,,,,,ACH-000234,4508.0
1296637,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,,,,,ACH-000234,4508.0
1296640,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,,,,,ACH-000234,4538.0


In [82]:
# Unique transcript IDs among the missense SNPs (returned as a NumPy array).
# NOTE(review): the name missense_snp_transcripts is assigned again in a later cell as a pandas Series
# named 'TranscriptID'; which object is live depends on cell execution order.
missense_snp_transcripts = missense_snp['Transcript'].unique()
missense_snp_transcripts

array(['ENST00000338591.8', 'ENST00000447513.7', 'ENST00000378466.9', ...,
       'ENST00000398145.6', 'ENST00000610913.2', 'ENST00000216146.9'],
      dtype=object)

In [25]:
missense_snp[missense_snp['Transcript'].isnull()]

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID


In [26]:
# Missense mutations that are di- or tri-nucleotide polymorphisms (DNP / TNP).
# The original version of the program did not assess these.
is_missense_class = mutation_df['VariantInfo'] == 'MISSENSE'
is_multi_nucleotide = mutation_df['VariantType'].isin(['DNP', 'TNP'])
missense_dnp_tnp = mutation_df[is_missense_class & is_multi_nucleotide]
missense_dnp_tnp

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29686,chr1,12777228,GT,AA,0.214,33,9,0|1,12777228.0,DNP,...,,,,,,,,,ACH-000856,390999.0
29687,chr1,12777427,GA,AG,0.155,38,6,0|1,12777421.0,DNP,...,,,,,,,,,ACH-000856,390999.0
29688,chr1,12859754,CC,TG,0.222,20,5,0|1,12859749.0,DNP,...,,,,,,,,,ACH-000856,65122.0
29689,chr1,12861232,AC,TG,0.315,13,6,0/1,,DNP,...,,,,,,,,,ACH-000856,65122.0
29697,chr1,17438418,CC,AT,0.450,10,8,0/1,,DNP,...,,,,,,,,,ACH-000856,55920.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296496,chr12,11091653,GT,TC,0.780,5,19,0|1,11091626.0,DNP,...,,,,,,,,,ACH-000234,259291.0
1296497,chr12,11091770,GC,AT,0.841,4,23,0|1,11091770.0,DNP,...,,,,,,,,,ACH-000234,259291.0
1296498,chr12,11092131,AA,GG,0.934,2,40,1|1,11092122.0,DNP,...,,,,,,,,,ACH-000234,259291.0
1296570,chr17,36212342,CG,GA,0.347,14,7,0|1,36212342.0,DNP,...,,,,,,,,,ACH-000234,9560.0


In [27]:
# Check what's in the chromosomes column
missense_snp['Chrom'].unique()

array(['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
       'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
       'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
       'chrX', 'chrM', 'chrY'], dtype=object)

In [28]:
# Create a VCF-style, tab-separated file with the SNP missense mutation data (input for FATHMM-XF)

# Take the variant-defining columns and rename them to the VCF column names
missense_snp_vcf = missense_snp[['Chrom', 'Pos', 'DbsnpID', 'Ref', 'Alt']].rename(
    columns={'Chrom': '#CHROM', 'Pos': 'POS', 'DbsnpID': 'ID', 'Ref': 'REF', 'Alt': 'ALT'}
)

# Remove the leading 'chr' from the chromosome names - Fathmm-XF won't recognise it.
# The pattern is anchored so only a prefix is stripped, and regex= is explicit because the
# default of str.replace differs between pandas versions.
missense_snp_vcf['#CHROM'] = missense_snp_vcf['#CHROM'].str.replace(r'^chr', '', regex=True)

# Write the tab-separated file. Missing dbSNP IDs are written as '.', the VCF missing-value
# marker, instead of an empty field. data_path already ends with a separator.
# NOTE(review): the file still has no '##fileformat' line or QUAL/FILTER/INFO columns, and a variant
# seen in several cell lines is written once per occurrence - confirm the tool accepts this.
missense_snp_vcf.to_csv(
    data_path + 'dependant\\depmap_mutations_for_fathmm.vcf',
    sep='\t',
    index=False,
    na_rep='.',
)

print(len(missense_snp_vcf))
missense_snp_vcf.head()

12299


Unnamed: 0,#CHROM,POS,ID,REF,ALT
29670,1,963249,rs573683289,G,A
29673,1,2412394,,C,T
29674,1,2512914,,G,T
29675,1,2561696,,C,T
29676,1,2789833,,C,A


In [29]:
# Distinct transcript IDs of the missense SNP mutations, as a Series named 'TranscriptID'
# with a fresh 0..n-1 index (first-occurrence order is kept).
missense_snp_transcripts = (
    missense_snp['Transcript']
    .drop_duplicates()
    .reset_index(drop=True)
    .rename('TranscriptID')
)
missense_snp_transcripts

0       ENST00000338591.8
1       ENST00000447513.7
2       ENST00000378466.9
3       ENST00000355716.4
4       ENST00000401095.8
              ...        
7518    ENST00000252998.2
7519    ENST00000314103.6
7520    ENST00000398145.6
7521    ENST00000610913.2
7522    ENST00000216146.9
Name: TranscriptID, Length: 7523, dtype: object

## AlphaMissense mutation analysis starts here (merge both full AlphaMissense DataFrames with the mutation DataFrame)

### Load and merge the primary data

In [67]:
# Load the AlphaMissense predictions for the primary assembly (hg38) from a tab-separated file.
# skiprows=3 presumably skips comment/licence lines that precede the column header - TODO confirm against the file.
# NOTE(review): the next cell reports about 71.7 million rows, so this load dominates memory use and run time.
alpha_missense_primary = pd.read_csv(data_path + 'alphamissense\\AlphaMissense_hg38.tsv', skiprows=3, sep='\t')

In [68]:
# Report how many rows the primary AlphaMissense table has
alpha_missense_primary_len = len(alpha_missense_primary)
print(alpha_missense_primary_len)

# Keep only the distinct transcript IDs. The list is used solely for set membership later on,
# so de-duplicating here gives the same answer without holding one Python string per table row.
# (A bare head(20) call used to sit in the middle of this cell; it displayed nothing and was removed.)
alpha_missense_primary_transcripts = alpha_missense_primary['transcript_id'].unique().tolist()

71697556


In [32]:
# Rename the VCF-style columns of alpha_missense_primary to match those in missense_snp, so the two can be merged
# NOTE(review): this renames in place and the table is deleted at the end of the cell, so the cell cannot be
# re-run without first re-loading alpha_missense_primary.
alpha_missense_primary.rename(columns={'#CHROM': 'Chrom', 'POS': 'Pos', 'REF': 'Ref', 'ALT': 'Alt'}, inplace=True)

# Add a column so we can later identify the source of the data when merged
alpha_missense_primary['am_source'] = 'primary'

# Left-merge on the genomic variant (Chrom, Pos, Ref, Alt): every mutation row is kept, and the am_* columns
# are NaN where AlphaMissense has no prediction. A variant that matches several AlphaMissense rows is
# repeated once per match.
# NOTE(review): the mutation's own 'Transcript' is not part of the key, so the matched transcript_id can differ from it.
missense_snp_extended = missense_snp.merge(alpha_missense_primary[['Chrom', 'Pos', 'Ref', 'Alt', 'transcript_id', 'am_pathogenicity', 'am_class', 'am_source']], 
                on=['Chrom', 'Pos', 'Ref', 'Alt'], 
                how='left')

# Release the large AlphaMissense table
del alpha_missense_primary

missense_snp_extended

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,transcript_id,am_pathogenicity,am_class,am_source
0,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,,,,,ACH-000856,339451.0,ENST00000338591.8,0.9941,likely_pathogenic,primary
1,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,ACH-000856,5192.0,ENST00000447513.6,0.1815,likely_benign,primary
2,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,,,,,ACH-000856,55229.0,ENST00000378466.9,0.9989,likely_pathogenic,primary
3,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,,,,,ACH-000856,8764.0,ENST00000355716.4,0.1136,likely_benign,primary
4,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,ACH-000856,100287898.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12373,chrX,153954753,T,C,0.857,0,6,0/1,,SNP,...,,,,,ACH-000234,3054.0,ENST00000310441.12,0.0608,likely_benign,primary
12374,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,ACH-000234,4508.0,ENST00000361899.2,0.1314,likely_benign,primary
12375,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,ACH-000234,4508.0,ENST00000361899.2,0.1589,likely_benign,primary
12376,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,ACH-000234,4538.0,ENST00000361381.2,0.1123,likely_benign,primary


In [33]:
# There are more records now after merging. Why?
# A left merge emits one output row per matching right-hand row, so any (Chrom, Pos, Ref, Alt) that occurs
# more than once in the AlphaMissense table (presumably once per overlapping transcript - TODO confirm)
# repeats the corresponding mutation row.
len(missense_snp_extended)

12378

In [34]:
# Drop the duplicate rows created by the merge, keeping the first row for each
# cell line / variant / AlphaMissense class combination.
# ModelID is part of the key: without it, a variant found in several cell lines collapses
# into a single row and the other cell lines silently lose that mutation.
missense_snp_extended = missense_snp_extended.drop_duplicates(
    subset=['ModelID', 'Chrom', 'Pos', 'Ref', 'Alt', 'am_class'],
    keep="first",
)

In [35]:
missense_snp_extended

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,transcript_id,am_pathogenicity,am_class,am_source
0,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,,,,,ACH-000856,339451.0,ENST00000338591.8,0.9941,likely_pathogenic,primary
1,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,ACH-000856,5192.0,ENST00000447513.6,0.1815,likely_benign,primary
2,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,,,,,ACH-000856,55229.0,ENST00000378466.9,0.9989,likely_pathogenic,primary
3,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,,,,,ACH-000856,8764.0,ENST00000355716.4,0.1136,likely_benign,primary
4,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,ACH-000856,100287898.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12370,chr22,39487946,T,G,0.188,30,7,0/1,,SNP,...,,,,,ACH-000234,4248.0,ENST00000341184.7,0.1395,likely_benign,primary
12372,chrX,73214201,T,A,0.972,2,108,1|1,,SNP,...,,,,,ACH-000234,4674.0,ENST00000373517.4,0.1350,likely_benign,primary
12373,chrX,153954753,T,C,0.857,0,6,0/1,,SNP,...,,,,,ACH-000234,3054.0,ENST00000310441.12,0.0608,likely_benign,primary
12374,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,ACH-000234,4508.0,ENST00000361899.2,0.1314,likely_benign,primary


In [36]:
# Set aside the rows that still have no AlphaMissense class after the primary merge;
# these are looked up in the isoform data next.
lacks_prediction = missense_snp_extended['am_class'].isna()
missing_missense_predictions = missense_snp_extended.loc[lacks_prediction]

missing_missense_predictions

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,transcript_id,am_pathogenicity,am_class,am_source
4,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,ACH-000856,100287898.0,,,,
14,chr1,20144618,T,C,0.344,19,9,0/1,,SNP,...,,,,,ACH-000856,64600.0,,,,
37,chr1,153765060,C,T,0.550,18,22,0/1,,SNP,...,,,,,ACH-000856,65123.0,,,,
60,chr2,20253922,C,T,0.629,13,23,0/1,,SNP,...,,,,,ACH-000856,23369.0,,,,
66,chr2,38958128,G,A,0.574,10,14,0/1,,SNP,...,,,,,ACH-000856,100271715.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12279,chr11,64118213,C,T,0.984,0,60,1|1,,SNP,...,,,,,ACH-000234,Unknown,,,,
12289,chr12,2566502,T,G,0.268,7,2,0|1,2566502.0,SNP,...,,,,,ACH-000234,775.0,,,,
12297,chr12,70635797,G,C,0.325,32,15,0/1,,SNP,...,,,,,ACH-000234,5787.0,,,,
12299,chr12,88192668,G,C,0.402,58,40,0/1,,SNP,...,,,,,ACH-000234,160418.0,,,,


### Load and merge the isoform data

In [69]:
# Load the AlphaMissense predictions for the non-canonical isoforms (hg38) from a tab-separated file.
# skiprows=3 presumably skips comment/licence lines that precede the column header - TODO confirm against the file.
# data_path already ends with a separator, so the relative part does not start with another one
# (this matches how the primary AlphaMissense file is loaded).
alpha_missense_isoforms = pd.read_csv(data_path + 'alphamissense\\AlphaMissense_isoforms_hg38.tsv', skiprows=3, sep='\t')

In [70]:
# Report how many rows the isoform AlphaMissense table has
alpha_missense_isoforms_len = len(alpha_missense_isoforms)
print(alpha_missense_isoforms_len)

# Keep only the distinct transcript IDs. The list is used solely for set membership later on,
# so de-duplicating here gives the same answer without holding one Python string per table row.
alpha_missense_isoform_transcripts = alpha_missense_isoforms['transcript_id'].unique().tolist()

alpha_missense_isoforms.head()

144559028


Unnamed: 0,#CHROM,POS,REF,ALT,genome,transcript_id,protein_variant,am_pathogenicity,am_class
0,chr1,65568,A,C,hg38,ENST00000641515.2,K2Q,0.0938,likely_benign
1,chr1,65568,A,G,hg38,ENST00000641515.2,K2E,0.0766,likely_benign
2,chr1,65569,A,G,hg38,ENST00000641515.2,K2R,0.0756,likely_benign
3,chr1,65569,A,T,hg38,ENST00000641515.2,K2M,0.1732,likely_benign
4,chr1,65569,A,C,hg38,ENST00000641515.2,K2T,0.1186,likely_benign


In [39]:
# Rename the VCF-style columns of alpha_missense_isoforms to match those in missense_snp_extended, so the two can be merged
# NOTE(review): the rename is in place, so re-running this cell reuses the already-renamed table.
alpha_missense_isoforms.rename(columns={'#CHROM': 'Chrom', 'POS': 'Pos', 'REF': 'Ref', 'ALT': 'Alt'}, inplace=True)

# Add a column so we can later identify the source of the data when merged
alpha_missense_isoforms['am_source'] = 'isoform'

# These columns were added during the first merge. Drop them so they can be added again; otherwise the merge
# would produce _x / _y suffixed columns
missing_missense_predictions = missing_missense_predictions.drop(columns=['transcript_id', 'am_pathogenicity', 'am_class', 'am_source']) 

# Left-merge on the genomic variant (Chrom, Pos, Ref, Alt). A variant that matches several isoform
# transcripts is repeated once per match, and each match can carry a different am_class.
missing_missense_predictions = missing_missense_predictions.merge(alpha_missense_isoforms[['Chrom', 'Pos', 'Ref', 'Alt', 'transcript_id', 'am_pathogenicity', 'am_class', 'am_source']], 
                on=['Chrom', 'Pos', 'Ref', 'Alt'], 
                how='left')

#del alpha_missense_isoforms

missing_missense_predictions

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,transcript_id,am_pathogenicity,am_class,am_source
0,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,ACH-000856,100287898.0,ENST00000401095.8,0.0870,likely_benign,isoform
1,chr1,20144618,T,C,0.344,19,9,0/1,,SNP,...,,,,,ACH-000856,64600.0,ENST00000375102.3,0.5294,ambiguous,isoform
2,chr1,153765060,C,T,0.550,18,22,0/1,,SNP,...,,,,,ACH-000856,65123.0,ENST00000435409.6,0.3609,ambiguous,isoform
3,chr1,153765060,C,T,0.550,18,22,0/1,,SNP,...,,,,,ACH-000856,65123.0,ENST00000512605.4,0.3640,ambiguous,isoform
4,chr2,20253922,C,T,0.629,13,23,0/1,,SNP,...,,,,,ACH-000856,23369.0,ENST00000403432.5,0.6862,likely_pathogenic,isoform
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1042,chr12,70635797,G,C,0.325,32,15,0/1,,SNP,...,,,,,ACH-000234,5787.0,ENST00000550358.5,0.1129,likely_benign,isoform
1043,chr12,70635797,G,C,0.325,32,15,0/1,,SNP,...,,,,,ACH-000234,5787.0,ENST00000551525.5,0.1061,likely_benign,isoform
1044,chr12,88192668,G,C,0.402,58,40,0/1,,SNP,...,,,,,ACH-000234,160418.0,ENST00000266712.11,0.1018,likely_benign,isoform
1045,chr17,5105919,G,A,0.477,55,49,0/1,,SNP,...,,,,,ACH-000234,7775.0,ENST00000575898.5,0.9779,likely_pathogenic,isoform


In [40]:
# Drop the duplicate rows created by the isoform merge, keeping the first row for each
# cell line / variant / AlphaMissense class combination (so isoforms that agree on the class collapse).
# ModelID is part of the key: without it, a variant found in several cell lines collapses
# into a single row and the other cell lines silently lose that mutation.
missing_missense_predictions = missing_missense_predictions.drop_duplicates(
    subset=['ModelID', 'Chrom', 'Pos', 'Ref', 'Alt', 'am_class'],
    keep="first",
)

In [41]:
# Stack the primary-merge result on top of the isoform-merge result, renumbering the index.
# missense_snp_extended still contains the rows with no primary prediction (null am_class);
# those rows are removed in a later cell.
missense_snp_complete = pd.concat([missense_snp_extended, missing_missense_predictions], axis=0, ignore_index=True)

In [42]:
missense_snp_complete

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,transcript_id,am_pathogenicity,am_class,am_source
0,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,,,,,ACH-000856,339451.0,ENST00000338591.8,0.9941,likely_pathogenic,primary
1,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,ACH-000856,5192.0,ENST00000447513.6,0.1815,likely_benign,primary
2,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,,,,,ACH-000856,55229.0,ENST00000378466.9,0.9989,likely_pathogenic,primary
3,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,,,,,ACH-000856,8764.0,ENST00000355716.4,0.1136,likely_benign,primary
4,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,ACH-000856,100287898.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12229,chr11,64118213,C,T,0.984,0,60,1|1,,SNP,...,,,,,ACH-000234,Unknown,ENST00000246841.3,0.1162,likely_benign,isoform
12230,chr12,2566502,T,G,0.268,7,2,0|1,2566502.0,SNP,...,,,,,ACH-000234,775.0,ENST00000399655.6,0.9342,likely_pathogenic,isoform
12231,chr12,70635797,G,C,0.325,32,15,0/1,,SNP,...,,,,,ACH-000234,5787.0,ENST00000334414.10,0.0939,likely_benign,isoform
12232,chr12,88192668,G,C,0.402,58,40,0/1,,SNP,...,,,,,ACH-000234,160418.0,ENST00000266712.11,0.1018,likely_benign,isoform


In [43]:
# Remove duplicates from the combined table, keeping the first row for each
# cell line / variant / AlphaMissense class combination.
# ModelID is part of the key: without it, a variant found in several cell lines collapses
# into a single row and the other cell lines silently lose that mutation.
missense_snp_complete = missense_snp_complete.drop_duplicates(
    subset=['ModelID', 'Chrom', 'Pos', 'Ref', 'Alt', 'am_class'],
    keep="first",
)

In [62]:
# Remove anything with a null am_class (mutations with no AlphaMissense prediction in either data set)
missense_snp_complete = missense_snp_complete[missense_snp_complete['am_class'].notnull()]

# Collect the mutations that still have more than one row, i.e. conflicting AlphaMissense classes.
# The key includes Chrom, since a position is only unique within a chromosome, and ModelID, so that
# the same variant occurring in different cell lines is not reported as a conflict.
missense_complete_duplicates = missense_snp_complete[
    missense_snp_complete.duplicated(subset=['ModelID', 'Chrom', 'Pos', 'Ref', 'Alt'], keep=False)
]

In [89]:
missense_complete_duplicates

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,transcript_id,am_pathogenicity,am_class,am_source
11783,chr2,20253922,C,T,0.629,13,23,0/1,,SNP,...,,,,,ACH-000856,23369.0,ENST00000403432.5,0.6862,likely_pathogenic,isoform
11784,chr2,20253922,C,T,0.629,13,23,0/1,,SNP,...,,,,,ACH-000856,23369.0,ENST00000319801.9,0.3923,ambiguous,isoform
11797,chr14,77822473,G,C,0.467,15,13,0/1,,SNP,...,,,,,ACH-000856,57143.0,ENST00000238561.10,0.2329,likely_benign,isoform
11798,chr14,77822473,G,C,0.467,15,13,0/1,,SNP,...,,,,,ACH-000856,57143.0,ENST00000556048.5,0.5121,ambiguous,isoform
11819,chr3,65356554,G,C,0.434,12,11,0/1,,SNP,...,,,,,ACH-000097,Unknown,ENST00000621418.4,0.5399,ambiguous,isoform
11820,chr3,65356554,G,C,0.434,12,11,0/1,,SNP,...,,,,,ACH-000097,Unknown,ENST00000402939.6,0.2632,likely_benign,isoform
11844,chr11,65121571,T,C,0.176,41,8,0/1,,SNP,...,,,,,ACH-000725,2197.0,ENST00000527548.5,0.2911,likely_benign,isoform
11845,chr11,65121571,T,C,0.176,41,8,0/1,,SNP,...,,,,,ACH-000725,2197.0,ENST00000279259.7,0.3802,ambiguous,isoform
11847,chr12,80265109,A,T,0.345,19,9,0/1,,SNP,...,,,,,ACH-000725,283310.0,ENST00000646859.1,0.5856,likely_pathogenic,isoform
11848,chr12,80265109,A,T,0.345,19,9,0/1,,SNP,...,,,,,ACH-000725,283310.0,ENST00000547103.6,0.4175,ambiguous,isoform


In [64]:
missense_snp_complete

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,transcript_id,am_pathogenicity,am_class,am_source
0,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,,,,,ACH-000856,339451.0,ENST00000338591.8,0.9941,likely_pathogenic,primary
1,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,ACH-000856,5192.0,ENST00000447513.6,0.1815,likely_benign,primary
2,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,,,,,ACH-000856,55229.0,ENST00000378466.9,0.9989,likely_pathogenic,primary
3,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,,,,,ACH-000856,8764.0,ENST00000355716.4,0.1136,likely_benign,primary
5,chr1,5867853,C,T,0.600,18,26,0/1,,SNP,...,,,,,ACH-000856,261734.0,ENST00000378156.9,0.1576,likely_benign,primary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12229,chr11,64118213,C,T,0.984,0,60,1|1,,SNP,...,,,,,ACH-000234,Unknown,ENST00000246841.3,0.1162,likely_benign,isoform
12230,chr12,2566502,T,G,0.268,7,2,0|1,2566502.0,SNP,...,,,,,ACH-000234,775.0,ENST00000399655.6,0.9342,likely_pathogenic,isoform
12231,chr12,70635797,G,C,0.325,32,15,0/1,,SNP,...,,,,,ACH-000234,5787.0,ENST00000334414.10,0.0939,likely_benign,isoform
12232,chr12,88192668,G,C,0.402,58,40,0/1,,SNP,...,,,,,ACH-000234,160418.0,ENST00000266712.11,0.1018,likely_benign,isoform


In [45]:
# Report the wall-clock time elapsed since start_time was recorded
end_time = datetime.datetime.now()
running_time = end_time - start_time
print('Running time: {}'.format(running_time))

Running time: 0:06:47.242254


In [88]:
# How many of the transcripts with missense SNPs in our cell lines are present in the AlphaMissense data
# (primary or isoform)?
# Each AlphaMissense transcript list is intersected separately: set.intersection accepts any iterable,
# so the two very large lists are never concatenated or copied into a combined set.
# NOTE(review): IDs are compared including their version suffix, so a transcript whose version differs
# between DepMap and AlphaMissense is counted as absent.
mutated_transcripts = set(missense_snp_transcripts)
available_transcripts = (
    mutated_transcripts.intersection(alpha_missense_primary_transcripts)
    | mutated_transcripts.intersection(alpha_missense_isoform_transcripts)
)
print(len(available_transcripts))
missense_snp[missense_snp['Transcript'].isin(available_transcripts)]

6718


Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29670,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,0.976,3.000000,,,,,,,ACH-000856,339451.0
29674,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,0.436,1.000000,,,,,,,ACH-000856,55229.0
29675,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,0.071,2.000000,,,,,,,ACH-000856,8764.0
29676,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,0.166928,,,,,,,ACH-000856,100287898.0
29677,chr1,5867853,C,T,0.600,18,26,0/1,,SNP,...,0.377,3.000000,,,,,,,ACH-000856,261734.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296633,chrX,153954753,T,C,0.857,0,6,0/1,,SNP,...,0.006,2.000000,,,,,,,ACH-000234,3054.0
1296636,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,,,,,ACH-000234,4508.0
1296637,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,,,,,ACH-000234,4508.0
1296640,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,,,,,ACH-000234,4538.0


In [None]:
# Function assigns lof and gof label depending on whether protein is onc/tsg/kinase/other
def lof_gof(x):
    """Return 'lof' or 'gof' for the identifier ``x``.

    Membership in the module-level collections is tested in the order
    ``tsgs``, ``oncs``, ``kinases``:

    * anything in ``tsgs`` is 'lof', even if it is also in ``oncs`` or ``kinases``;
    * otherwise anything in ``oncs`` or ``kinases`` is 'gof';
    * everything else is 'lof'.
    """
    if x not in tsgs and (x in oncs or x in kinases):
        return 'gof'
    return 'lof'

In [None]:
# Add a 'lof_gof' column by mapping each 'Protein stable ID' through lof_gof, so every mutation (row)
# in df3_sm is labelled lof/gof.
# NOTE(review): df3_sm is not defined anywhere in the visible notebook, so this cell raises NameError on a
# fresh kernel. Also confirm that the tsgs / oncs / kinases lists hold the same kind of identifier as
# 'Protein stable ID'; otherwise every row falls through to 'lof'.
df3_sm['lof_gof'] = df3_sm['Protein stable ID'].map(lof_gof)