In [1]:
import pandas as pd
import numpy as np
import sys
import re
import functools
import json
import random
import os
import math
import datetime

import matplotlib.pyplot as plt

In [2]:
# Record the wall-clock time at which this run started.
# NOTE(review): presumably used further down to report total runtime - not visible in this section, confirm.
start_time = datetime.datetime.now()

In [3]:
# Root folder holding every input dataset. The value ends with a path separator
# because later cells build file paths by plain string concatenation.
# Exactly one of the machine-specific definitions below should be active.
#os.getcwd()

# Linux workstation
#data_path = '/home/db600/phd/data/'

# Laptop
data_path = 'C:\\Users\\dan\\Documents\\phd\\data\\'

# List the folder contents as a quick check that the path resolves
os.listdir(data_path)

['alphamissense',
 'biomart',
 'dependant',
 'depmap',
 'missing_gene_names.csv',
 'pathway_commons',
 'reactome',
 'reactome_rows_to_drop.csv',
 'string']

In [4]:
# Input file locations, built on data_path (which already ends with a path separator)
mut_path = f'{data_path}depmap\\OmicsSomaticMutations.csv'  # somatic mutation table
exp_path = f'{data_path}depmap\\OmicsExpressionProteinCodingGenesTPMLogp1.csv'  # expression table
conv_path = f'{data_path}biomart\\ensembl_biomart_plus_fasta.csv'  # ID conversion table

In [5]:
# Set some config variables
# When True, everything from the first "." onwards (the version suffix) is stripped from
# transcript IDs in the mutation and AlphaMissense tables before they are compared;
# when False the IDs are used exactly as they appear in the files.
remove_transcript_versions = False

In [6]:
# Read the multi_gene_converter (Ensembl BioMart ID conversion table) into a DF.
# The first row supplies the column names and the first CSV column becomes the index;
# a further column named 'Unnamed: 0' is then dropped.
# NOTE(review): the extra 'Unnamed: 0' column presumably comes from the CSV having been
# written with its index more than once - confirm.
conv = pd.read_csv(conv_path, header = 0, index_col = 0)
conv = conv.drop(columns='Unnamed: 0')

In [7]:
conv.head()

Unnamed: 0,Gene stable ID,HGNC symbol,Chromosome/scaffold name,Transcript stable ID,Protein stable ID,UniProtKB/TrEMBL ID,UniProtKB/Swiss-Prot ID,Peptide
0,ENSG00000198888,MT-ND1,MT,ENST00000361390,ENSP00000354687,U5Z754,P03886,MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLL...
1,ENSG00000198763,MT-ND2,MT,ENST00000361453,ENSP00000355046,Q7GXY9,P03891,MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTK...
2,ENSG00000198804,MT-CO1,MT,ENST00000361624,ENSP00000354499,U5YWV7,P00395,MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGN...
3,ENSG00000198712,MT-CO2,MT,ENST00000361739,ENSP00000354876,U5Z487,P00403,MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFLTL...
4,ENSG00000228253,MT-ATP8,MT,ENST00000361851,ENSP00000355265,U5YV54,P03928,MPQLNTTVWPTMITPMLLTLFLITQLKMLNTNYHLPPSPKPMKMKN...


In [8]:
def load_list(path):
    """Load and return the decoded contents of a JSON file.

    The file is opened explicitly as UTF-8 (the encoding JSON files are
    expected to use) instead of relying on the platform's locale encoding,
    so the same file decodes identically on Windows and Linux.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever the JSON document decodes to.
        NOTE(review): the callers in this notebook presumably expect a list - confirm.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)

# JSON files for the kinase / onc / tsg lists, kept in the kinase-onc-tsg project's data folder
kinases_path = "C:\\Users\\dan\\PycharmProjects\\kinase-onc-tsg\\data\\kinases.json"
oncs_path = "C:\\Users\\dan\\PycharmProjects\\kinase-onc-tsg\\data\\oncs.json"
tsgs_path = "C:\\Users\\dan\\PycharmProjects\\kinase-onc-tsg\\data\\tsgs.json"

# Load the three files in order and bind each result to its own name
kinases, oncs, tsgs = map(load_list, (kinases_path, oncs_path, tsgs_path))

In [9]:
# The training cell lines used by the original program were saved to CSV by hand earlier.
# Load them and relabel 'cell_line' as 'CCLEName', the column name used in the DepMap model table.
cell_lines = (
    pd.read_csv(data_path + 'dependant\\original_training_cell_lines.csv')
    .rename(columns={'cell_line': 'CCLEName'})
)
cell_lines

Unnamed: 0,CCLEName
0,HCC1428_BREAST
1,HCC1806_BREAST
2,HCC1937_BREAST
3,MDAMB231_BREAST
4,HCC202_BREAST
5,CAL51_BREAST
6,MDAMB468_BREAST
7,KPL1_BREAST
8,MDAMB415_BREAST
9,MDAMB157_BREAST


In [10]:
# Load the DepMap model metadata
depmap_model = pd.read_csv(data_path + 'depmap\\Model.csv')

# Add the DepMap ModelID to the list of original training cell lines, matching on CCLEName.
# Any ModelID column left over from an earlier run of this cell is dropped first, so
# re-running the cell cannot produce ModelID_x / ModelID_y columns and break the
# cell_lines['ModelID'] lookups made further down.
# how='left' keeps every training cell line, including any without a match in Model.csv.
cell_lines = pd.merge(
    cell_lines.drop(columns='ModelID', errors='ignore'),
    depmap_model[['ModelID', 'CCLEName']],
    on='CCLEName',
    how='left',
)

# View the cell line list
cell_lines

Unnamed: 0,CCLEName,ModelID
0,HCC1428_BREAST,ACH-000352
1,HCC1806_BREAST,ACH-000624
2,HCC1937_BREAST,ACH-000223
3,MDAMB231_BREAST,ACH-000768
4,HCC202_BREAST,ACH-000725
5,CAL51_BREAST,ACH-000856
6,MDAMB468_BREAST,ACH-000849
7,KPL1_BREAST,ACH-000028
8,MDAMB415_BREAST,ACH-000876
9,MDAMB157_BREAST,ACH-000621


In [11]:
# Load the expression and mutation data
# low_memory=False makes pandas parse the mutation file in one pass instead of in chunks,
# which avoids columns ending up with mixed inferred dtypes.
mutation_df = pd.read_csv(mut_path, low_memory=False)
expression_df = pd.read_csv(exp_path)

In [12]:
#####################
# Checks for later on
#####################
# Check there are no rows for the 'ACH-000600' cell line - it was used in the original
# algorithm but now appears to be missing from the DepMap data
print('Mutation rows for cell line \'ACH-000600\': ' + str(len(mutation_df[mutation_df['ModelID'] == 'ACH-000600'])))
# The expression table holds its model IDs in the 'Unnamed: 0' column, so that is the column
# to test here (this line previously re-counted mutation_df and so never checked expression data)
print('Expression rows for cell line \'ACH-000600\': ' + str(len(expression_df[expression_df['Unnamed: 0'] == 'ACH-000600'])))

# How many SNP missense mutations are there when you exclude dups across 'Chrom', 'Pos', 'Alt', 'Ref', 'Transcript' cols?
# Make the same selection we have for 'missense_snp' later on and print length to screen
missense_snp_delta = mutation_df[mutation_df['ModelID'].isin(cell_lines['ModelID'])]
missense_snp_delta = missense_snp_delta[(missense_snp_delta['VariantInfo']=='MISSENSE') & (missense_snp_delta['VariantType']=='SNP')]
print(f'Raw missense SNP rows: {len(missense_snp_delta)}')

# Filter duplicates across the variant columns plus 'ModelID' (the same variant in two
# cell lines is therefore not treated as a duplicate) and print new length to screen
missense_snp_delta = missense_snp_delta.drop_duplicates(subset=['Chrom', 'Pos', 'Alt', 'Ref', 'Transcript', 'ModelID'])
print(f'Dups removed: {len(missense_snp_delta)}')

# How many unique transcripts?
print('Unique transcripts: ' + str(len(missense_snp_delta['Transcript'].unique())))

# The frame was only needed for these checks
del missense_snp_delta

Mutation rows for cell line 'ACH-000600': 0
Expression rows for cell line 'ACH-000600': 0
Raw missense SNP rows: 12299
Dups removed: 12299
Unique transcripts: 7523


In [13]:
# Preview expression data
expression_df.head()

Unnamed: 0.1,Unnamed: 0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
0,ACH-001113,4.331992,0.0,7.36466,2.792855,4.471187,0.028569,1.226509,3.044394,6.500005,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.0,0.275007,0.0,0.0,0.0
1,ACH-001289,4.567424,0.584963,7.106641,2.543496,3.50462,0.0,0.189034,3.813525,4.221877,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.0,0.014355,0.0,0.0,0.0
2,ACH-001339,3.15056,0.0,7.379118,2.333424,4.228049,0.056584,1.31034,6.687201,3.682573,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.0,0.084064,0.0,0.0,0.042644
3,ACH-001538,5.08534,0.0,7.154211,2.545968,3.084064,0.0,5.86839,6.165309,4.489928,...,0.214125,0.632268,0.298658,1.641546,0.443607,0.0,0.028569,0.0,0.0,0.0
4,ACH-000242,6.729417,0.0,6.537917,2.456806,3.867896,0.799087,7.208478,5.570159,7.127117,...,1.117695,2.358959,0.084064,1.910733,0.0,0.0,0.464668,0.0,0.0,0.0


In [14]:
# Preview mutation data
mutation_df.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
0,chr1,1242864,GC,CT,0.31,19,8,0/1,,DNP,...,,,,,,,,,ACH-000839,388581.0
1,chr1,10647969,A,G,0.4,29,19,0|1,10647969.0,SNP,...,0.234,3.0,,,,,,,ACH-000839,54897.0
2,chr1,10648097,T,G,0.349,21,10,0/1,,SNP,...,,0.0,,,,,,,ACH-000839,54897.0
3,chr1,13198424,G,A,0.833,0,4,0/1,,SNP,...,0.002,,,,,,,,ACH-000839,400736.0
4,chr1,13225068,A,G,0.396,34,24,0/1,,SNP,...,,0.0,,,,,,,ACH-000839,391003.0


In [15]:
mutation_df.columns

Index(['Chrom', 'Pos', 'Ref', 'Alt', 'AF', 'RefCount', 'AltCount', 'GT', 'PS',
       'VariantType', 'VariantInfo', 'DNAChange', 'ProteinChange',
       'HugoSymbol', 'HgncName', 'HgncFamily', 'Transcript', 'TranscriptExon',
       'TranscriptStrand', 'UniprotID', 'Str', 'DbsnpID', 'DbsnpFilter',
       'Issues', 'GcContent', 'LineageAssociation', 'CancerMolecularGenetics',
       'CCLEDeleterious', 'StructuralRelation', 'CosmicHotspot',
       'CosmicOverlappingMutations', 'AssociatedWith', 'LoF', 'Driver',
       'LikelyDriver', 'TranscriptLikelyLoF', 'CivicID', 'CivicDescription',
       'CivicScore', 'Popaf', 'LikelyGoF', 'LikelyLoF', 'HessDriver',
       'HessSignature', 'CscapeScore', 'DannScore', 'RevelScore',
       'Funseq2Score', 'PharmgkbID', 'DidaID', 'DidaName', 'GwasDisease',
       'GwasPmID', 'GTexGene', 'ModelID', 'EntrezGeneID'],
      dtype='object')

In [16]:
mutation_df['TranscriptStrand'].value_counts()

+    711270
-    696829
Name: TranscriptStrand, dtype: int64

In [17]:
# Remove everything from the first "." onwards in the 'Transcript' column if remove_transcript_versions is set
if remove_transcript_versions:
    # The .str accessor gives the same result as x.split('.')[0] for strings, but passes
    # missing values through unchanged instead of raising on them
    mutation_df['Transcript'] = mutation_df['Transcript'].str.split('.').str[0]

In [18]:
# Take a look at the mutation classes
mutation_df['VariantInfo'].unique()

array(['MISSENSE', 'SILENT', 'IN_FRAME_INS', 'SPLICE_SITE', 'NONSENSE',
       'FRAME_SHIFT_DEL', 'NONSTOP', 'START_CODON_SNP', 'IN_FRAME_DEL',
       'FRAME_SHIFT_INS', 'START_CODON_INS', 'FIVE_PRIME_FLANK', 'INTRON',
       'THREE_PRIME_UTR'], dtype=object)

In [19]:
# Restrict the mutation table to our cell lines of interest:
# keep only rows whose 'ModelID' appears in cell_lines['ModelID']
mutation_df = mutation_df.loc[lambda df: df['ModelID'].isin(cell_lines['ModelID'])]

mutation_df.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29669,chr1,951180,G,A,0.39,26,15,0/1,,SNP,...,,0.0,,,,,,,ACH-000856,26155.0
29670,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,0.976,3.0,,,,,,,ACH-000856,339451.0
29671,chr1,1046671,C,T,0.25,30,8,0/1,,SNP,...,,0.0,,,,,,,ACH-000856,375790.0
29672,chr1,2358684,C,T,0.565,11,14,0/1,,SNP,...,,0.0,,,,,,,ACH-000856,79906.0
29673,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,0.416,3.0,,,,,,,ACH-000856,5192.0


In [20]:
# Count the distinct ModelID values left in the mutation data (should be 39 - it's only 38)
# dropna=False so a missing ID would still be counted as one value
mutation_df['ModelID'].nunique(dropna=False)

38

In [21]:
# One cell line has no rows in the mutation data - find the entry in cell_lines
# whose ModelID never occurs in mutation_df
missing_cell_line = cell_lines.loc[lambda df: ~df['ModelID'].isin(mutation_df['ModelID'])]

missing_cell_line

Unnamed: 0,CCLEName,ModelID
28,SLR26_KIDNEY,ACH-000600


In [22]:
# Apply the same restriction to the expression data so we can check the original cell lines in use.
# The expression table stores its model IDs in the 'Unnamed: 0' column; keep the rows whose
# ID appears in cell_lines['ModelID']
expression_df = expression_df.loc[lambda df: df['Unnamed: 0'].isin(cell_lines['ModelID'])]

expression_df.head()

Unnamed: 0.1,Unnamed: 0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
15,ACH-000792,3.280956,0.0,6.391115,1.752749,3.436961,0.084064,0.422233,6.617798,4.383359,...,0.757023,0.454176,0.214125,1.778209,0.731183,0.028569,0.31034,0.056584,0.028569,0.028569
46,ACH-000138,4.649615,0.0,6.43796,2.831877,3.489286,0.014355,3.725741,5.494736,6.487197,...,1.280956,0.871844,1.480265,1.952334,0.084064,0.028569,1.9855,0.028569,0.028569,0.0
82,ACH-000856,5.705701,0.0,6.413628,3.300124,4.052242,0.124328,3.032101,6.22978,4.637494,...,1.608809,0.0,0.485427,2.646163,0.201634,0.0,0.056584,0.0,0.124328,0.0
97,ACH-000222,5.383704,0.111031,6.452035,2.613532,2.260026,0.070389,0.056584,5.504303,5.592756,...,0.189034,0.389567,0.321928,2.324811,0.238787,0.0,0.0,0.0,0.0,0.0
113,ACH-000223,4.903038,0.0,7.177918,2.744161,4.648465,0.15056,0.070389,5.820179,4.374344,...,2.440952,1.695994,0.367371,1.505891,0.214125,0.137504,0.704872,0.0,0.0,0.0


In [23]:
# Count the distinct model IDs left in the expression data (should be 39 - it's only 38)
# dropna=False so a missing ID would still be counted as one value
expression_df['Unnamed: 0'].nunique(dropna=False)

38

In [24]:
# Take a look at the mutation classes
mutation_df['VariantInfo'].unique()

array(['SILENT', 'MISSENSE', 'FRAME_SHIFT_DEL', 'FRAME_SHIFT_INS',
       'SPLICE_SITE', 'NONSENSE', 'IN_FRAME_DEL', 'NONSTOP',
       'IN_FRAME_INS', 'START_CODON_SNP', 'START_CODON_INS'], dtype=object)

In [25]:
# How many unique transcripts are there?
# Collect the distinct 'Transcript' values from the cell-line-filtered mutation data and display them
mutation_df_transcripts = mutation_df['Transcript'].unique()
mutation_df_transcripts

array(['ENST00000327044.7', 'ENST00000338591.8', 'ENST00000379370.7', ...,
       'ENST00000216146.9', 'ENST00000412172.4', 'ENST00000375722.5'],
      dtype=object)

In [26]:
# Step one: pull out the severely pathogenic mutations, which are assumed to cause loss of function (LOF)
pathogenic = ('FRAME_SHIFT_DEL', 'FRAME_SHIFT_INS', 'NONSENSE', 'NONSTOP', 'START_CODON_INS')

##### NOTE ######
# The original version treated these variant classes as pathogenic:
# pathogenic = ['Frame_Shift_Del', 'Frame_Shift_Ins','Nonsense_Mutation','Nonstop_Mutation','Stop_Codon_Del']
#
# The selection is kept equivalent for now (the new data has no 'Stop_Codon_Del' class -
# presumably because it is the same as NONSTOP), but bear in mind:
#
# IN_FRAME_DEL    - likely damaging, but could be LOF or GOF - choose depending on whether the gene is an onc, tsg or kinase?
# IN_FRAME_INS    - as above
# START_CODON_INS - likely to prevent translation of the protein, so LOF?
# START_CODON_SNP - may prevent translation if the SNP switches the codon from methionine to another amino acid

# Keep only the rows of mutation_df whose variant classification is one of the classes listed above
pathogenic_mutations = mutation_df[mutation_df['VariantInfo'].isin(pathogenic)]

pathogenic_mutations.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29678,chr1,6197724,CT,C,0.534,13,15,0/1,,DEL,...,,,,,,,,,ACH-000856,6146.0
29685,chr1,12244554,A,AG,0.556,16,20,0/1,,INS,...,,,,,,,,,ACH-000856,55187.0
29699,chr1,20633885,GA,G,0.462,13,11,0/1,,DEL,...,,,,,,,,,ACH-000856,65018.0
29704,chr1,26773456,C,T,0.389,21,13,0/1,,SNP,...,,5.0,,,,,,,ACH-000856,8289.0
29705,chr1,26779439,TG,T,0.59,15,23,0/1,,DEL,...,,,,,,,,,ACH-000856,8289.0


In [27]:
# See if DepMap classification agrees that these are all likely to be LOF (it does)
pathogenic_mutations['LikelyGoF'].value_counts()

False    1345
Name: LikelyGoF, dtype: int64

In [28]:
pathogenic_mutations['LikelyLoF'].value_counts()

True    1345
Name: LikelyLoF, dtype: int64

In [29]:
# Collapse the pathogenic mutations to one row per cell line ('ModelID'), with the gene
# symbols of all its highly pathogenic mutations joined into one comma-separated string
path_muts_per_sample = (
    pathogenic_mutations
    .groupby('ModelID')['HugoSymbol']
    .agg(', '.join)
    .reset_index()
)

# Optional check that there are no consecutive commas (which would denote missing values)
#path_muts_per_sample[path_muts_per_sample['HugoSymbol'].str.contains(", , ")]
path_muts_per_sample

Unnamed: 0,ModelID,HugoSymbol
0,ACH-000028,"ADGRL2, SMCP, FAM228B, NECTIN3, TMEM175, CLCN3..."
1,ACH-000060,"PALMD, TGFBR2, PBRM1, ERVW-1, ERVW-1, MUC12, J..."
2,ACH-000097,"ATAD3C, PLCH2, GBP1, GTDC1, VIL1, VIL1, SLC4A7..."
3,ACH-000118,"CCDC27, CROCC, RSRP1, RHCE, COL8A2, TNS1, PER2..."
4,ACH-000127,"GPATCH3, ZMYM4, P3H1, KANK4, UBAP2L, OR10T2, G..."
5,ACH-000138,"PHGDH, STRIP2, TNS2, NCOR2, TSNAXIP1, NLGN3"
6,ACH-000148,"EPHA10, ITPRID2, PARD3B, POLQ, MEF2C, MYL10, H..."
7,ACH-000159,"SDHB, TRIM33, FAM228B, TRIM43, VHL, FLT4, MUC1..."
8,ACH-000178,"ARID1A, FANCG, GRID1, ANGPTL5, SIPA1L1, FMN1, ..."
9,ACH-000222,"CSMD2, COL8A2, POGK, PRRC2C, RD3, ANKRD36C, MY..."


In [30]:
# Write the per-cell-line pathogenic mutation lists to CSV.
# data_path already ends with a path separator, so the relative part must not begin with
# one; the earlier leading '\\' produced a doubled separator, unlike the other paths in
# this notebook.
path_muts_per_sample.to_csv(data_path + 'dependant\\pathogenic_mutations_per_sample.csv')

In [31]:
# Keep the rows of mutation_df that are both VariantInfo == 'MISSENSE' and VariantType == 'SNP'.
# START_CODON_SNP may also belong here later (unclear whether the mutation-assessment tools
# cover SNPs in the start codon - introns only?)
missense_snp = mutation_df.loc[
    lambda df: df['VariantInfo'].eq('MISSENSE') & df['VariantType'].eq('SNP')
]

In [32]:
missense_snp

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29670,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,0.976,3.000000,,,,,,,ACH-000856,339451.0
29673,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,0.416,3.000000,,,,,,,ACH-000856,5192.0
29674,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,0.436,1.000000,,,,,,,ACH-000856,55229.0
29675,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,0.071,2.000000,,,,,,,ACH-000856,8764.0
29676,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,0.166928,,,,,,,ACH-000856,100287898.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296633,chrX,153954753,T,C,0.857,0,6,0/1,,SNP,...,0.006,2.000000,,,,,,,ACH-000234,3054.0
1296636,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,,,,,ACH-000234,4508.0
1296637,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,,,,,ACH-000234,4508.0
1296640,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,,,,,ACH-000234,4538.0


In [33]:
# How many unique missense snp transcripts are there?
# The distinct IDs collected here are reused further down to cut the AlphaMissense tables
# down to transcripts that occur in our mutation data
missense_snp_transcripts = missense_snp['Transcript'].unique()
missense_snp_transcripts

array(['ENST00000338591.8', 'ENST00000447513.7', 'ENST00000378466.9', ...,
       'ENST00000398145.6', 'ENST00000610913.2', 'ENST00000216146.9'],
      dtype=object)

In [34]:
# How many missing transcripts are there?
missense_snp[missense_snp['Transcript'].isnull()]

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID


In [35]:
# Missense mutations whose VariantType is DNP or TNP (the original version did not assess these)
missense_dnp_tnp = mutation_df[
    (mutation_df['VariantInfo'] == 'MISSENSE')
    & mutation_df['VariantType'].isin(['DNP', 'TNP'])
]
missense_dnp_tnp

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29686,chr1,12777228,GT,AA,0.214,33,9,0|1,12777228.0,DNP,...,,,,,,,,,ACH-000856,390999.0
29687,chr1,12777427,GA,AG,0.155,38,6,0|1,12777421.0,DNP,...,,,,,,,,,ACH-000856,390999.0
29688,chr1,12859754,CC,TG,0.222,20,5,0|1,12859749.0,DNP,...,,,,,,,,,ACH-000856,65122.0
29689,chr1,12861232,AC,TG,0.315,13,6,0/1,,DNP,...,,,,,,,,,ACH-000856,65122.0
29697,chr1,17438418,CC,AT,0.450,10,8,0/1,,DNP,...,,,,,,,,,ACH-000856,55920.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296496,chr12,11091653,GT,TC,0.780,5,19,0|1,11091626.0,DNP,...,,,,,,,,,ACH-000234,259291.0
1296497,chr12,11091770,GC,AT,0.841,4,23,0|1,11091770.0,DNP,...,,,,,,,,,ACH-000234,259291.0
1296498,chr12,11092131,AA,GG,0.934,2,40,1|1,11092122.0,DNP,...,,,,,,,,,ACH-000234,259291.0
1296570,chr17,36212342,CG,GA,0.347,14,7,0|1,36212342.0,DNP,...,,,,,,,,,ACH-000234,9560.0


In [36]:
# Check what's in the chromosomes column
missense_snp['Chrom'].unique()

array(['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
       'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
       'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
       'chrX', 'chrM', 'chrY'], dtype=object)

In [37]:
# Create a VCF-style, tab-separated file with the SNP missense mutation data

# Copy the needed columns from the missense_snp dataframe, already in the output column order
missense_snp_vcf = missense_snp[['Chrom', 'Pos', 'DbsnpID', 'Ref', 'Alt']].copy()

# Rename the columns to match VCF format requirements
missense_snp_vcf = missense_snp_vcf.rename(
    columns={'Chrom': '#CHROM', 'Pos': 'POS', 'Ref': 'REF', 'Alt': 'ALT', 'DbsnpID': 'ID'}
)

# Remove the 'chr' string from the chromosome column values - Fathmm-XF won't recognise it.
# regex=False states explicitly that 'chr' is a literal string rather than a pattern.
missense_snp_vcf['#CHROM'] = missense_snp_vcf['#CHROM'].str.replace('chr', '', regex=False)

# Write to file. data_path already ends with a path separator, so the relative part must
# not begin with one (the earlier leading '\\' produced a doubled separator).
# NOTE(review): missing DbsnpID values are written as empty fields; the VCF convention for a
# missing ID is '.' - confirm whether Fathmm-XF needs na_rep='.' here.
missense_snp_vcf.to_csv(data_path + 'dependant\\depmap_mutations_for_fathmm.vcf', sep='\t', index=False)

# Report how many variants were written
print(len(missense_snp_vcf))

# The frame was only needed to produce the file
del missense_snp_vcf

12299


## Alpha Missense mutation analysis starts here

### Load and merge the primary data

In [38]:
# Load the alpha missense data (primary assembly); it is previewed after filtering in the next cell.
# skiprows=3 skips the first three lines of the file, so the fourth line supplies the column names.
# NOTE(review): the skipped lines are presumably licence/comment text - confirm against the file.
alpha_missense_primary = pd.read_csv(data_path + 'alphamissense\\AlphaMissense_hg38.tsv', skiprows=3, sep='\t')

In [39]:
# Report the size of the raw primary-assembly table
print(f'raw length: {len(alpha_missense_primary)}')

# Optionally strip everything from the first "." onwards in the 'transcript_id' column
if remove_transcript_versions:
    alpha_missense_primary['transcript_id'] = [
        transcript.split('.')[0] for transcript in alpha_missense_primary['transcript_id']
    ]

# Keep only predictions for transcripts that occur in the missense SNP mutation data
alpha_missense_primary = alpha_missense_primary.loc[
    alpha_missense_primary['transcript_id'].isin(missense_snp_transcripts)
]

# Remember the filtered size and report it
alpha_missense_primary_len = len(alpha_missense_primary)
print(f'filtered length: {alpha_missense_primary_len}')

# Distinct transcript IDs that remain after filtering
alpha_missense_primary_transcripts = set(alpha_missense_primary['transcript_id'].unique())

alpha_missense_primary.head()

raw length: 71697556
filtered length: 24330476


Unnamed: 0,#CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class
2018,chr1,925945,T,G,hg38,Q96NU1,ENST00000342066.8,S2A,0.6065,likely_pathogenic
2019,chr1,925945,T,C,hg38,Q96NU1,ENST00000342066.8,S2P,0.8999,likely_pathogenic
2020,chr1,925945,T,A,hg38,Q96NU1,ENST00000342066.8,S2T,0.6602,likely_pathogenic
2021,chr1,925946,C,G,hg38,Q96NU1,ENST00000342066.8,S2C,0.8753,likely_pathogenic
2022,chr1,925946,C,T,hg38,Q96NU1,ENST00000342066.8,S2F,0.9876,likely_pathogenic


In [40]:
# AlphaMissense column names -> the names used in missense_snp, so both tables share merge keys
am_to_mutation_cols = {'#CHROM': 'Chrom', 'POS': 'Pos', 'REF': 'Ref', 'ALT': 'Alt', 'transcript_id': 'Transcript'}
alpha_missense_primary.rename(columns=am_to_mutation_cols, inplace=True)

# Tag every prediction with its origin so it can still be identified after merging
alpha_missense_primary['am_source'] = 'primary'

# Left-merge so every missense SNP row is kept; rows without a matching prediction get NaN
# in the three AlphaMissense columns
merge_keys = ['Chrom', 'Pos', 'Ref', 'Alt', 'Transcript']
am_cols = ['am_pathogenicity', 'am_class', 'am_source']
missense_snp_extended = missense_snp.merge(
    alpha_missense_primary[merge_keys + am_cols],
    on=merge_keys,
    how='left',
)

#del alpha_missense_primary

missense_snp_extended

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,am_pathogenicity,am_class,am_source
0,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,,,,,,ACH-000856,339451.0,0.9941,likely_pathogenic,primary
1,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,,ACH-000856,5192.0,,,
2,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,,,,,,ACH-000856,55229.0,0.9989,likely_pathogenic,primary
3,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,,,,,,ACH-000856,8764.0,0.1136,likely_benign,primary
4,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,,ACH-000856,100287898.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12294,chrX,153954753,T,C,0.857,0,6,0/1,,SNP,...,,,,,,ACH-000234,3054.0,,,
12295,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,,ACH-000234,4508.0,0.1314,likely_benign,primary
12296,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,,ACH-000234,4508.0,0.1589,likely_benign,primary
12297,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,,ACH-000234,4538.0,0.1123,likely_benign,primary


In [41]:
# Count rows again after merge
len(missense_snp_extended)

12299

In [42]:
# Set aside the rows that received no alpha missense prediction from the primary table -
# the isoform data will be checked for these
missing_missense_predictions = missense_snp_extended.loc[missense_snp_extended['am_class'].isna()]

missing_missense_predictions

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,am_pathogenicity,am_class,am_source
1,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,,ACH-000856,5192.0,,,
4,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,,ACH-000856,100287898.0,,,
8,chr1,10275492,G,A,0.222,21,6,0/1,,SNP,...,,,,,,ACH-000856,23095.0,,,
14,chr1,20144618,T,C,0.344,19,9,0/1,,SNP,...,,,,,,ACH-000856,64600.0,,,
17,chr1,33159815,C,T,0.514,16,21,0/1,,SNP,...,,,,,,ACH-000856,55223.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12278,chr19,54242117,T,C,0.508,204,211,0/1,,SNP,...,,,,,,ACH-000234,79168.0,,,
12283,chr20,33770234,C,G,0.266,74,26,0/1,,SNP,...,,,,,,ACH-000234,84905.0,,,
12288,chr22,26472373,C,T,0.604,106,154,0/1,,SNP,...,,,,,,ACH-000234,89781.0,,,
12289,chr22,37207205,G,A,0.726,13,38,0/1,,SNP,...,,,,,,ACH-000234,6753.0,,,


In [43]:
# The rows without a prediction now live in a separate df, so keep only the rows that
# did receive an 'am_class' here.
# NOTE(review): the earlier note expected 8198 (12299-4101) or 7664 (12299-4635) rows to remain - confirm which applies.
missense_snp_extended = missense_snp_extended[missense_snp_extended['am_class'].notna()]

missense_snp_extended

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,am_pathogenicity,am_class,am_source
0,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,,,,,,ACH-000856,339451.0,0.9941,likely_pathogenic,primary
2,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,,,,,,ACH-000856,55229.0,0.9989,likely_pathogenic,primary
3,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,,,,,,ACH-000856,8764.0,0.1136,likely_benign,primary
5,chr1,5867853,C,T,0.600,18,26,0/1,,SNP,...,,,,,,ACH-000856,261734.0,0.1576,likely_benign,primary
6,chr1,6580736,G,A,0.500,16,16,0/1,,SNP,...,,,,,,ACH-000856,3104.0,0.1870,likely_benign,primary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12293,chrX,73214201,T,A,0.972,2,108,1|1,,SNP,...,,,,,,ACH-000234,4674.0,0.1350,likely_benign,primary
12295,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,,ACH-000234,4508.0,0.1314,likely_benign,primary
12296,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,,ACH-000234,4508.0,0.1589,likely_benign,primary
12297,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,,ACH-000234,4538.0,0.1123,likely_benign,primary


In [44]:
# Drop duplicates: rows count as duplicates when they agree on the variant, transcript,
# AlphaMissense class and cell line; the first occurrence of each is kept (the default)
dedup_keys = ['Chrom', 'Pos', 'Ref', 'Alt', 'Transcript', 'am_class', 'ModelID']
missense_snp_extended = missense_snp_extended.drop_duplicates(subset=dedup_keys)

In [45]:
missense_snp_extended

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,am_pathogenicity,am_class,am_source
0,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,,,,,,ACH-000856,339451.0,0.9941,likely_pathogenic,primary
2,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,,,,,,ACH-000856,55229.0,0.9989,likely_pathogenic,primary
3,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,,,,,,ACH-000856,8764.0,0.1136,likely_benign,primary
5,chr1,5867853,C,T,0.600,18,26,0/1,,SNP,...,,,,,,ACH-000856,261734.0,0.1576,likely_benign,primary
6,chr1,6580736,G,A,0.500,16,16,0/1,,SNP,...,,,,,,ACH-000856,3104.0,0.1870,likely_benign,primary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12293,chrX,73214201,T,A,0.972,2,108,1|1,,SNP,...,,,,,,ACH-000234,4674.0,0.1350,likely_benign,primary
12295,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,,ACH-000234,4508.0,0.1314,likely_benign,primary
12296,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,,ACH-000234,4508.0,0.1589,likely_benign,primary
12297,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,,ACH-000234,4538.0,0.1123,likely_benign,primary


### Load and merge the isoform data

In [46]:
# Load the alpha missense data (isoforms); it is previewed after filtering in the next cell.
# data_path already ends with a path separator, so the relative part must not begin with one
# (the earlier leading '\\' produced a doubled separator, unlike the primary-assembly load).
# skiprows=3 skips the first three lines of the file, so the fourth line supplies the column names.
alpha_missense_isoforms = pd.read_csv(data_path + 'alphamissense\\AlphaMissense_isoforms_hg38.tsv', skiprows=3, sep='\t')

In [47]:
# Report the size of the raw isoform table
print(f'raw length: {len(alpha_missense_isoforms)}')

# Optionally strip everything from the first "." onwards in the 'transcript_id' column
if remove_transcript_versions:
    alpha_missense_isoforms['transcript_id'] = [
        transcript.split('.')[0] for transcript in alpha_missense_isoforms['transcript_id']
    ]

# Keep only predictions for transcripts that occur in the missense SNP mutation data
alpha_missense_isoforms = alpha_missense_isoforms.loc[
    alpha_missense_isoforms['transcript_id'].isin(missense_snp_transcripts)
]

# Remember the filtered size and report it
alpha_missense_isoforms_len = len(alpha_missense_isoforms)
print(f'filtered length: {alpha_missense_isoforms_len}')

# Distinct transcript IDs that remain after filtering
alpha_missense_isoform_transcripts = set(alpha_missense_isoforms['transcript_id'].unique())

alpha_missense_isoforms.head()

raw length: 144559028
filtered length: 10813707


Unnamed: 0,#CHROM,POS,REF,ALT,genome,transcript_id,protein_variant,am_pathogenicity,am_class
47794,chr1,976176,G,C,hg38,ENST00000341290.6,S676C,0.1238,likely_benign
47795,chr1,976176,G,A,hg38,ENST00000341290.6,S676F,0.2443,likely_benign
47796,chr1,976176,G,T,hg38,ENST00000341290.6,S676Y,0.1965,likely_benign
47797,chr1,976177,A,C,hg38,ENST00000341290.6,S676A,0.113,likely_benign
47798,chr1,976177,A,G,hg38,ENST00000341290.6,S676P,0.1341,likely_benign


In [48]:
# AlphaMissense column names -> the names used in the mutation rows, so both tables share merge keys
am_to_mutation_cols = {'#CHROM': 'Chrom', 'POS': 'Pos', 'REF': 'Ref', 'ALT': 'Alt', 'transcript_id': 'Transcript'}
alpha_missense_isoforms.rename(columns=am_to_mutation_cols, inplace=True)

# Tag every prediction with its origin so it can still be identified after merging
alpha_missense_isoforms['am_source'] = 'isoform'

# The first merge already added these (empty) columns; remove them before merging again,
# otherwise the result would carry _x / _y suffixed duplicates
am_cols = ['am_pathogenicity', 'am_class', 'am_source']
missing_missense_predictions = missing_missense_predictions.drop(columns=am_cols)

# Left-merge so every still-unpredicted row is kept, picking up an isoform prediction where one exists
merge_keys = ['Chrom', 'Pos', 'Ref', 'Alt', 'Transcript']
missing_missense_predictions = missing_missense_predictions.merge(
    alpha_missense_isoforms[merge_keys + am_cols],
    on=merge_keys,
    how='left',
)

#del alpha_missense_isoforms

missing_missense_predictions

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,am_pathogenicity,am_class,am_source
0,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,,ACH-000856,5192.0,,,
1,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,,ACH-000856,100287898.0,0.0870,likely_benign,isoform
2,chr1,10275492,G,A,0.222,21,6,0/1,,SNP,...,,,,,,ACH-000856,23095.0,0.9657,likely_pathogenic,isoform
3,chr1,20144618,T,C,0.344,19,9,0/1,,SNP,...,,,,,,ACH-000856,64600.0,,,
4,chr1,33159815,C,T,0.514,16,21,0/1,,SNP,...,,,,,,ACH-000856,55223.0,0.4059,ambiguous,isoform
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4757,chr19,54242117,T,C,0.508,204,211,0/1,,SNP,...,,,,,,ACH-000234,79168.0,0.0395,likely_benign,isoform
4758,chr20,33770234,C,G,0.266,74,26,0/1,,SNP,...,,,,,,ACH-000234,84905.0,0.3262,likely_benign,isoform
4759,chr22,26472373,C,T,0.604,106,154,0/1,,SNP,...,,,,,,ACH-000234,89781.0,0.0965,likely_benign,isoform
4760,chr22,37207205,G,A,0.726,13,38,0/1,,SNP,...,,,,,,ACH-000234,6753.0,0.0889,likely_benign,isoform


In [49]:
# Show the rows that are still without an AlphaMissense class after the isoform merge
missing_missense_predictions.loc[missing_missense_predictions['am_class'].isna()]

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,am_pathogenicity,am_class,am_source
0,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,,ACH-000856,5192.0,,,
3,chr1,20144618,T,C,0.344,19,9,0/1,,SNP,...,,,,,,ACH-000856,64600.0,,,
5,chr1,43437866,C,T,0.421,21,17,0/1,,SNP,...,,,,,,ACH-000856,23334.0,,,
7,chr1,64839630,T,C,0.406,20,13,0/1,,SNP,...,,,,,,ACH-000856,3716.0,,,
9,chr1,145340826,T,A,0.750,0,2,0/1,,SNP,...,,,,,,ACH-000856,100288142.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4739,chr15,61920539,T,C,0.193,51,11,0/1,,SNP,...,,,,,,ACH-000234,54832.0,,,
4741,chr15,74133133,C,A,0.541,53,70,0/1,,SNP,...,,,,,,ACH-000234,57611.0,,,
4743,chr15,88859328,C,T,0.563,6,8,0/1,,SNP,...,,,,,,ACH-000234,176.0,,,
4748,chr17,28547785,G,A,0.296,225,98,0/1,,SNP,...,,,,,,ACH-000234,9094.0,,,


In [50]:
# Drop any that still don't have a prediction
# missing_missense_predictions = missing_missense_predictions.dropna(subset='am_class')
# missing_missense_predictions

In [51]:
# De-duplicate on variant + transcript + AlphaMissense class + cell line, retaining the first occurrence
missing_missense_predictions = missing_missense_predictions.drop_duplicates(
    keep='first',
    subset=['Chrom', 'Pos', 'Ref', 'Alt', 'Transcript', 'am_class', 'ModelID'],
)
missing_missense_predictions

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,am_pathogenicity,am_class,am_source
0,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,,ACH-000856,5192.0,,,
1,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,,ACH-000856,100287898.0,0.0870,likely_benign,isoform
2,chr1,10275492,G,A,0.222,21,6,0/1,,SNP,...,,,,,,ACH-000856,23095.0,0.9657,likely_pathogenic,isoform
3,chr1,20144618,T,C,0.344,19,9,0/1,,SNP,...,,,,,,ACH-000856,64600.0,,,
4,chr1,33159815,C,T,0.514,16,21,0/1,,SNP,...,,,,,,ACH-000856,55223.0,0.4059,ambiguous,isoform
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4757,chr19,54242117,T,C,0.508,204,211,0/1,,SNP,...,,,,,,ACH-000234,79168.0,0.0395,likely_benign,isoform
4758,chr20,33770234,C,G,0.266,74,26,0/1,,SNP,...,,,,,,ACH-000234,84905.0,0.3262,likely_benign,isoform
4759,chr22,26472373,C,T,0.604,106,154,0/1,,SNP,...,,,,,,ACH-000234,89781.0,0.0965,likely_benign,isoform
4760,chr22,37207205,G,A,0.726,13,38,0/1,,SNP,...,,,,,,ACH-000234,6753.0,0.0889,likely_benign,isoform


In [52]:
# List every row that shares variant, transcript and cell line with another row (am_class ignored)
missing_missense_predictions.loc[
    missing_missense_predictions.duplicated(
        keep=False,
        subset=['Chrom', 'Pos', 'Ref', 'Alt', 'Transcript', 'ModelID'],
    )
]

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,am_pathogenicity,am_class,am_source


In [53]:
# Stack the two frames row-wise and renumber the index from 0
missense_snp_complete = pd.concat(
    [missense_snp_extended, missing_missense_predictions],
    ignore_index=True,
    axis=0,
)

In [54]:
# Display the combined frame: rows of missense_snp_extended followed by rows of missing_missense_predictions
missense_snp_complete

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,am_pathogenicity,am_class,am_source
0,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,,,,,,ACH-000856,339451.0,0.9941,likely_pathogenic,primary
1,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,,,,,,ACH-000856,55229.0,0.9989,likely_pathogenic,primary
2,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,,,,,,ACH-000856,8764.0,0.1136,likely_benign,primary
3,chr1,5867853,C,T,0.600,18,26,0/1,,SNP,...,,,,,,ACH-000856,261734.0,0.1576,likely_benign,primary
4,chr1,6580736,G,A,0.500,16,16,0/1,,SNP,...,,,,,,ACH-000856,3104.0,0.1870,likely_benign,primary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12294,chr19,54242117,T,C,0.508,204,211,0/1,,SNP,...,,,,,,ACH-000234,79168.0,0.0395,likely_benign,isoform
12295,chr20,33770234,C,G,0.266,74,26,0/1,,SNP,...,,,,,,ACH-000234,84905.0,0.3262,likely_benign,isoform
12296,chr22,26472373,C,T,0.604,106,154,0/1,,SNP,...,,,,,,ACH-000234,89781.0,0.0965,likely_benign,isoform
12297,chr22,37207205,G,A,0.726,13,38,0/1,,SNP,...,,,,,,ACH-000234,6753.0,0.0889,likely_benign,isoform


In [55]:
# Remove any rows with null am_class (currently disabled, so rows without an AlphaMissense prediction are kept)
#missense_snp_complete = missense_snp_complete[~missense_snp_complete['am_class'].isnull()]
#missense_snp_complete

In [68]:
# De-duplicate on variant + transcript + AlphaMissense class + cell line, retaining the first occurrence
missense_snp_complete = missense_snp_complete.drop_duplicates(
    keep="first",
    subset=['Chrom', 'Pos', 'Ref', 'Alt', 'Transcript', 'am_class', 'ModelID'],
)
# Show the rows that have no AlphaMissense class
missense_snp_complete.loc[missense_snp_complete['am_class'].isna()]

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,am_pathogenicity,am_class,am_source
7664,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,,ACH-000856,5192.0,,,
7667,chr1,20144618,T,C,0.344,19,9,0/1,,SNP,...,,,,,,ACH-000856,64600.0,,,
7669,chr1,43437866,C,T,0.421,21,17,0/1,,SNP,...,,,,,,ACH-000856,23334.0,,,
7671,chr1,64839630,T,C,0.406,20,13,0/1,,SNP,...,,,,,,ACH-000856,3716.0,,,
7673,chr1,145340826,T,A,0.750,0,2,0/1,,SNP,...,,,,,,ACH-000856,100288142.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12276,chr15,61920539,T,C,0.193,51,11,0/1,,SNP,...,,,,,,ACH-000234,54832.0,,,
12278,chr15,74133133,C,A,0.541,53,70,0/1,,SNP,...,,,,,,ACH-000234,57611.0,,,
12280,chr15,88859328,C,T,0.563,6,8,0/1,,SNP,...,,,,,,ACH-000234,176.0,,,
12285,chr17,28547785,G,A,0.296,225,98,0/1,,SNP,...,,,,,,ACH-000234,9094.0,,,


In [57]:
# Number of distinct transcript values in the combined frame (a missing value, if present, counts once)
missense_snp_complete['Transcript'].nunique(dropna=False)

7523

In [58]:
# Sanity check for the earlier steps:
# how many of the transcripts in the mutations file are covered by the AlphaMissense data (primary or isoform)?
# Earlier note (may be stale — compare with the printed counts):
# 11780 SNP missense mutations in 7823 transcripts in our cell lines of interest

all_am_transcripts = alpha_missense_primary_transcripts.union(alpha_missense_isoform_transcripts)
available_transcripts = set(missense_snp_transcripts).intersection(all_am_transcripts)
print(len(available_transcripts))

# Mutations on a covered transcript, one row per variant / transcript / cell line
check = (
    missense_snp[missense_snp['Transcript'].isin(available_transcripts)]
    .drop_duplicates(
        keep="first",
        subset=['Chrom', 'Pos', 'Ref', 'Alt', 'Transcript', 'ModelID'],
    )
)
print(len(missense_snp))
print(len(check))
check

6718
12299
10876


Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29670,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,0.976,3.000000,,,,,,,ACH-000856,339451.0
29674,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,0.436,1.000000,,,,,,,ACH-000856,55229.0
29675,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,0.071,2.000000,,,,,,,ACH-000856,8764.0
29676,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,0.166928,,,,,,,ACH-000856,100287898.0
29677,chr1,5867853,C,T,0.600,18,26,0/1,,SNP,...,0.377,3.000000,,,,,,,ACH-000856,261734.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296633,chrX,153954753,T,C,0.857,0,6,0/1,,SNP,...,0.006,2.000000,,,,,,,ACH-000234,3054.0
1296636,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,,,,,ACH-000234,4508.0
1296637,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,,,,,ACH-000234,4508.0
1296640,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,,,,,ACH-000234,4538.0


In [59]:
# Transcripts that have AlphaMissense coverage but are absent from the final dataset.
# Historical note (TODO confirm still relevant): ENST00000400841 is on the X chrom,
# but alpha missense has it on Y so it's not being picked up
set(available_transcripts).difference(set(missense_snp_complete['Transcript']))

set()

In [60]:
# Inspect transcript ENST00000400841 in both the mutation data and the AlphaMissense primary predictions.
# Only the final bare expression of a cell is rendered, so the first frame is passed to display()
# explicitly (display is available as a builtin inside the IPython/Jupyter kernel); previously its
# result was silently discarded.
display(missense_snp[missense_snp['Transcript'] == 'ENST00000400841'])
alpha_missense_primary[(alpha_missense_primary['Transcript'] == 'ENST00000400841')]

#missense_snp['Issues'].value_counts()

Unnamed: 0,Chrom,Pos,Ref,Alt,genome,uniprot_id,Transcript,protein_variant,am_pathogenicity,am_class,am_source


In [61]:
# Function assigns lof and gof label depending on whether protein is onc/tsg/kinase/other
def lof_gof(x):
    """Return the label 'lof' or 'gof' for ``x`` based on which gene list contains it.

    Membership is tested against the module-level collections ``tsgs``, ``oncs``
    and ``kinases``, in that order.

    Parameters
    ----------
    x : hashable
        Identifier to classify (presumably a protein/gene ID — verify against caller).

    Returns
    -------
    str
        'lof' when ``x`` is in ``tsgs`` or in none of the lists;
        'gof' when ``x`` is in ``oncs`` or ``kinases`` (and not in ``tsgs``).
    """
    # tsgs membership wins over every other list
    if x in tsgs:
        return 'lof'

    # oncs and kinases share the same label; anything else falls back to 'lof'
    is_gain = x in oncs or x in kinases
    return 'gof' if is_gain else 'lof'

In [62]:
# Add lof_gof column and map to lof_gof function. Each mutation (row) in df3_sm will be labelled lof/gof
# df3_sm['lof_gof'] = df3_sm['Protein stable ID'].map(lof_gof)

In [63]:
# How long did that take?
# Wall-clock duration from start_time (recorded near the top of the notebook) to the moment this cell runs
end_time = datetime.datetime.now()
running_time = end_time - start_time  # datetime.timedelta
print(f'Running time: {running_time}')

Running time: 0:04:43.288684
