In [6]:
import pandas as pd
import numpy as np
import sys
import re
import functools
import json
import random
import os
import math

import matplotlib.pyplot as plt

In [80]:
#os.getcwd()

# Linux workstation
#data_path = '/home/db600/phd/data/'

# Laptop
data_path = 'C:\\Users\\dan\\Documents\\phd\\data'

os.listdir(data_path)

['alphamissense',
 'biomart',
 'dependant',
 'depmap',
 'missing_gene_names.csv',
 'pathway_commons',
 'reactome',
 'reactome_rows_to_drop.csv',
 'string']

In [8]:
mut_path = data_path + 'depmap\\OmicsSomaticMutations.csv'
exp_path = data_path + 'depmap\\OmicsExpressionProteinCodingGenesTPMLogp1.csv' 
conv_path = data_path + 'biomart\\ensembl_biomart_plus_fasta.csv'

In [9]:
# Read the multi_gene_converter into a DF
conv = pd.read_csv(conv_path, header = 0, index_col = 0)
conv = conv.drop(columns='Unnamed: 0')

In [10]:
conv.head()

Unnamed: 0,Gene stable ID,HGNC symbol,Chromosome/scaffold name,Transcript stable ID,Protein stable ID,UniProtKB/TrEMBL ID,UniProtKB/Swiss-Prot ID,Peptide
0,ENSG00000198888,MT-ND1,MT,ENST00000361390,ENSP00000354687,U5Z754,P03886,MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLL...
1,ENSG00000198763,MT-ND2,MT,ENST00000361453,ENSP00000355046,Q7GXY9,P03891,MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTK...
2,ENSG00000198804,MT-CO1,MT,ENST00000361624,ENSP00000354499,U5YWV7,P00395,MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGN...
3,ENSG00000198712,MT-CO2,MT,ENST00000361739,ENSP00000354876,U5Z487,P00403,MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFLTL...
4,ENSG00000228253,MT-ATP8,MT,ENST00000361851,ENSP00000355265,U5YV54,P03928,MPQLNTTVWPTMITPMLLTLFLITQLKMLNTNYHLPPSPKPMKMKN...


In [11]:
def load_list(path):
    with open(path) as f:
        g = json.load(f)
    return g

kinases_path = "C:\\Users\\dan\\PycharmProjects\\kinase-onc-tsg\\data\\kinases.json"
oncs_path = "C:\\Users\\dan\\PycharmProjects\\kinase-onc-tsg\\data\\oncs.json"
tsgs_path = "C:\\Users\\dan\\PycharmProjects\\kinase-onc-tsg\\data\\tsgs.json"

kinases = load_list(kinases_path)
oncs = load_list(oncs_path)
tsgs =load_list(tsgs_path)

In [12]:
# Get list of training cell lines used in original program
# Manually saved to CSV earlier
cell_lines = pd.read_csv(data_path + 'dependant\\original_training_cell_lines.csv')
cell_lines = cell_lines.rename(columns={'cell_line': 'CCLEName'})
cell_lines

Unnamed: 0,CCLEName
0,HCC1428_BREAST
1,HCC1806_BREAST
2,HCC1937_BREAST
3,MDAMB231_BREAST
4,HCC202_BREAST
5,CAL51_BREAST
6,MDAMB468_BREAST
7,KPL1_BREAST
8,MDAMB415_BREAST
9,MDAMB157_BREAST


In [13]:
# Load the DepMap model metadata
depmap_model = pd.read_csv(data_path + 'depmap\\Model.csv')

# Add depmap model ID to the list of 39 original training cell lines
cell_lines = pd.merge(cell_lines, depmap_model[['ModelID', 'CCLEName']], on='CCLEName', how='left')

# View the cell line list
cell_lines

Unnamed: 0,CCLEName,ModelID
0,HCC1428_BREAST,ACH-000352
1,HCC1806_BREAST,ACH-000624
2,HCC1937_BREAST,ACH-000223
3,MDAMB231_BREAST,ACH-000768
4,HCC202_BREAST,ACH-000725
5,CAL51_BREAST,ACH-000856
6,MDAMB468_BREAST,ACH-000849
7,KPL1_BREAST,ACH-000028
8,MDAMB415_BREAST,ACH-000876
9,MDAMB157_BREAST,ACH-000621


In [14]:
# Load the expression and mutation data
mut_df0 = pd.read_csv(mut_path, low_memory=False)
exp_df = pd.read_csv(exp_path)

In [15]:
# Preview expression data
exp_df.head()

Unnamed: 0.1,Unnamed: 0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
0,ACH-001113,4.331992,0.0,7.36466,2.792855,4.471187,0.028569,1.226509,3.044394,6.500005,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.0,0.275007,0.0,0.0,0.0
1,ACH-001289,4.567424,0.584963,7.106641,2.543496,3.50462,0.0,0.189034,3.813525,4.221877,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.0,0.014355,0.0,0.0,0.0
2,ACH-001339,3.15056,0.0,7.379118,2.333424,4.228049,0.056584,1.31034,6.687201,3.682573,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.0,0.084064,0.0,0.0,0.042644
3,ACH-001538,5.08534,0.0,7.154211,2.545968,3.084064,0.0,5.86839,6.165309,4.489928,...,0.214125,0.632268,0.298658,1.641546,0.443607,0.0,0.028569,0.0,0.0,0.0
4,ACH-000242,6.729417,0.0,6.537917,2.456806,3.867896,0.799087,7.208478,5.570159,7.127117,...,1.117695,2.358959,0.084064,1.910733,0.0,0.0,0.464668,0.0,0.0,0.0


In [16]:
# Preview mutation data
mut_df0.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
0,chr1,1242864,GC,CT,0.31,19,8,0/1,,DNP,...,,,,,,,,,ACH-000839,388581.0
1,chr1,10647969,A,G,0.4,29,19,0|1,10647969.0,SNP,...,0.234,3.0,,,,,,,ACH-000839,54897.0
2,chr1,10648097,T,G,0.349,21,10,0/1,,SNP,...,,0.0,,,,,,,ACH-000839,54897.0
3,chr1,13198424,G,A,0.833,0,4,0/1,,SNP,...,0.002,,,,,,,,ACH-000839,400736.0
4,chr1,13225068,A,G,0.396,34,24,0/1,,SNP,...,,0.0,,,,,,,ACH-000839,391003.0


In [68]:
mut_df0.columns

Index(['Chrom', 'Pos', 'Ref', 'Alt', 'AF', 'RefCount', 'AltCount', 'GT', 'PS',
       'VariantType', 'VariantInfo', 'DNAChange', 'ProteinChange',
       'HugoSymbol', 'HgncName', 'HgncFamily', 'Transcript', 'TranscriptExon',
       'TranscriptStrand', 'UniprotID', 'Str', 'DbsnpID', 'DbsnpFilter',
       'Issues', 'GcContent', 'LineageAssociation', 'CancerMolecularGenetics',
       'CCLEDeleterious', 'StructuralRelation', 'CosmicHotspot',
       'CosmicOverlappingMutations', 'AssociatedWith', 'LoF', 'Driver',
       'LikelyDriver', 'TranscriptLikelyLoF', 'CivicID', 'CivicDescription',
       'CivicScore', 'Popaf', 'LikelyGoF', 'LikelyLoF', 'HessDriver',
       'HessSignature', 'CscapeScore', 'DannScore', 'RevelScore',
       'Funseq2Score', 'PharmgkbID', 'DidaID', 'DidaName', 'GwasDisease',
       'GwasPmID', 'GTexGene', 'ModelID', 'EntrezGeneID'],
      dtype='object')

In [70]:
mut_df0['TranscriptStrand'].value_counts()

+    711270
-    696829
Name: TranscriptStrand, dtype: int64

In [18]:
# Take a look at the mutation classes
mut_df0['VariantInfo'].unique()

array(['MISSENSE', 'SILENT', 'IN_FRAME_INS', 'SPLICE_SITE', 'NONSENSE',
       'FRAME_SHIFT_DEL', 'NONSTOP', 'START_CODON_SNP', 'IN_FRAME_DEL',
       'FRAME_SHIFT_INS', 'START_CODON_INS', 'FIVE_PRIME_FLANK', 'INTRON',
       'THREE_PRIME_UTR'], dtype=object)

In [17]:
# Select all mutations for our cell lines of interest only: where 'ModelID' is in cell_lines['ModelID']
#mut_df = mut_df0.loc[mut_df0['Tumor_Sample_Barcode'].map(lambda x: x in cell_lines)]

mut_df = mut_df0[mut_df0['ModelID'].isin(cell_lines['ModelID'])]

mut_df.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29669,chr1,951180,G,A,0.39,26,15,0/1,,SNP,...,,0.0,,,,,,,ACH-000856,26155.0
29670,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,0.976,3.0,,,,,,,ACH-000856,339451.0
29671,chr1,1046671,C,T,0.25,30,8,0/1,,SNP,...,,0.0,,,,,,,ACH-000856,375790.0
29672,chr1,2358684,C,T,0.565,11,14,0/1,,SNP,...,,0.0,,,,,,,ACH-000856,79906.0
29673,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,0.416,3.0,,,,,,,ACH-000856,5192.0


In [19]:
# Check we've got the right number of ModelID values (should be 39)
mut_df['ModelID'].unique()

array(['ACH-000856', 'ACH-000601', 'ACH-000097', 'ACH-000725',
       'ACH-000621', 'ACH-000223', 'ACH-000248', 'ACH-000910',
       'ACH-000159', 'ACH-000792', 'ACH-000060', 'ACH-000859',
       'ACH-000330', 'ACH-000352', 'ACH-000768', 'ACH-000374',
       'ACH-000178', 'ACH-000573', 'ACH-000624', 'ACH-000907',
       'ACH-000495', 'ACH-000127', 'ACH-000849', 'ACH-000262',
       'ACH-000876', 'ACH-000684', 'ACH-000265', 'ACH-000699',
       'ACH-000459', 'ACH-000222', 'ACH-000138', 'ACH-000246',
       'ACH-000028', 'ACH-000118', 'ACH-000320', 'ACH-000281',
       'ACH-000148', 'ACH-000234'], dtype=object)

In [20]:
# Remove anything after "." in the 'Annotation_Transcript' column
mut_df = mut_df.copy()
mut_df['Transcript'] = mut_df['Transcript'].map(lambda x: x.split('.')[0])

mut_df[['HugoSymbol', 'Transcript']].head()

Unnamed: 0,HugoSymbol,Transcript
29669,NOC2L,ENST00000327044
29670,KLHL17,ENST00000338591
29671,AGRN,ENST00000379370
29672,MORN1,ENST00000378529
29673,PEX10,ENST00000447513


In [21]:
# Take a look at the mutation classes
mut_df['VariantInfo'].unique()

array(['SILENT', 'MISSENSE', 'FRAME_SHIFT_DEL', 'FRAME_SHIFT_INS',
       'SPLICE_SITE', 'NONSENSE', 'IN_FRAME_DEL', 'NONSTOP',
       'IN_FRAME_INS', 'START_CODON_SNP', 'START_CODON_INS'], dtype=object)

In [23]:
#first we separate out the badly pathogenic mutations - these are assumed to result in loss of function
pathogenic = ('FRAME_SHIFT_DEL', 'FRAME_SHIFT_INS', 'NONSENSE', 'NONSTOP', 'START_CODON_INS')

##### NOTE ######
# These are the variations that were considered pathogenic on the original version
# pathogenic = ['Frame_Shift_Del', 'Frame_Shift_Ins','Nonsense_Mutation','Nonstop_Mutation','Stop_Codon_Del']

# I will keep this the same for now (note no 'Stop_Codon_Del' class in the new data - presumably because it's the same a NONSTOP) but note:

# IN_FRAME_DEL - this is likely damaging but could be LOF or GOF -  select which depending on whether its onc, tsg or kinase?
# IN_FRAME_INS - as above
# START_CODON_INS - this is likely to prevent the translation of the protein, so LOF?
# START_CODON_SNP - this may prevent translation if the SNP switched the codon from methianine to another amino acid

# Filter mut_df to only include rows where the variant classification is in the pathogenic list defined above
pathogenic_mutations = mut_df.loc[mut_df['VariantInfo'].isin(pathogenic)]

pathogenic_mutations.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29678,chr1,6197724,CT,C,0.534,13,15,0/1,,DEL,...,,,,,,,,,ACH-000856,6146.0
29685,chr1,12244554,A,AG,0.556,16,20,0/1,,INS,...,,,,,,,,,ACH-000856,55187.0
29699,chr1,20633885,GA,G,0.462,13,11,0/1,,DEL,...,,,,,,,,,ACH-000856,65018.0
29704,chr1,26773456,C,T,0.389,21,13,0/1,,SNP,...,,5.0,,,,,,,ACH-000856,8289.0
29705,chr1,26779439,TG,T,0.59,15,23,0/1,,DEL,...,,,,,,,,,ACH-000856,8289.0


In [24]:
# See if DepMap classification agrees that these are all likely to be LOF
pathogenic_mutations['LikelyGoF'].value_counts()

False    1345
Name: LikelyGoF, dtype: int64

In [25]:
pathogenic_mutations['LikelyLoF'].value_counts()

True    1345
Name: LikelyLoF, dtype: int64

In [26]:
# Group by Tumour_Sample_Barcode (cell-line name) so we have 39 rows (one for each cell line), and a column containing comma seperated list
# of all the highly pathogenic mutations in that sample
path_muts_per_sample = pathogenic_mutations.groupby('ModelID')['HugoSymbol'].apply(lambda x: ', '.join(x)).reset_index()

# Check there are no consecutive commas (denoting missing values)
#path_muts_per_sample[path_muts_per_sample['HugoSymbol'].str.contains(", , ")]
path_muts_per_sample

Unnamed: 0,ModelID,HugoSymbol
0,ACH-000028,"ADGRL2, SMCP, FAM228B, NECTIN3, TMEM175, CLCN3..."
1,ACH-000060,"PALMD, TGFBR2, PBRM1, ERVW-1, ERVW-1, MUC12, J..."
2,ACH-000097,"ATAD3C, PLCH2, GBP1, GTDC1, VIL1, VIL1, SLC4A7..."
3,ACH-000118,"CCDC27, CROCC, RSRP1, RHCE, COL8A2, TNS1, PER2..."
4,ACH-000127,"GPATCH3, ZMYM4, P3H1, KANK4, UBAP2L, OR10T2, G..."
5,ACH-000138,"PHGDH, STRIP2, TNS2, NCOR2, TSNAXIP1, NLGN3"
6,ACH-000148,"EPHA10, ITPRID2, PARD3B, POLQ, MEF2C, MYL10, H..."
7,ACH-000159,"SDHB, TRIM33, FAM228B, TRIM43, VHL, FLT4, MUC1..."
8,ACH-000178,"ARID1A, FANCG, GRID1, ANGPTL5, SIPA1L1, FMN1, ..."
9,ACH-000222,"CSMD2, COL8A2, POGK, PRRC2C, RD3, ANKRD36C, MY..."


In [27]:
# Write to csv
path_muts_per_sample.to_csv(data_path + '\\dependant\\pathogenic_mutations_per_sample.csv')

In [28]:
# Select all rows of mutation DF (mut_df) where type = 'MISSENSE'
# May also want to include START_CODON_SNP here later
missense_mutations = mut_df[mut_df['VariantInfo']=='MISSENSE']

In [29]:
missense_mutations

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29670,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,0.976,3.000000,,,,,,,ACH-000856,339451.0
29673,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,0.416,3.000000,,,,,,,ACH-000856,5192.0
29674,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,0.436,1.000000,,,,,,,ACH-000856,55229.0
29675,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,0.071,2.000000,,,,,,,ACH-000856,8764.0
29676,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,0.166928,,,,,,,ACH-000856,100287898.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296633,chrX,153954753,T,C,0.857,0,6,0/1,,SNP,...,0.006,2.000000,,,,,,,ACH-000234,3054.0
1296636,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,,,,,ACH-000234,4508.0
1296637,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,,,,,ACH-000234,4508.0
1296640,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,,,,,ACH-000234,4538.0


In [30]:
# See how many DNP and TNP - currently these are not assessed
missense_mutations[(missense_mutations['VariantType'] == 'DNP') | (missense_mutations['VariantType'] == 'TNP')]

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,RevelScore,Funseq2Score,PharmgkbID,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID
29686,chr1,12777228,GT,AA,0.214,33,9,0|1,12777228.0,DNP,...,,,,,,,,,ACH-000856,390999.0
29687,chr1,12777427,GA,AG,0.155,38,6,0|1,12777421.0,DNP,...,,,,,,,,,ACH-000856,390999.0
29688,chr1,12859754,CC,TG,0.222,20,5,0|1,12859749.0,DNP,...,,,,,,,,,ACH-000856,65122.0
29689,chr1,12861232,AC,TG,0.315,13,6,0/1,,DNP,...,,,,,,,,,ACH-000856,65122.0
29697,chr1,17438418,CC,AT,0.450,10,8,0/1,,DNP,...,,,,,,,,,ACH-000856,55920.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296496,chr12,11091653,GT,TC,0.780,5,19,0|1,11091626.0,DNP,...,,,,,,,,,ACH-000234,259291.0
1296497,chr12,11091770,GC,AT,0.841,4,23,0|1,11091770.0,DNP,...,,,,,,,,,ACH-000234,259291.0
1296498,chr12,11092131,AA,GG,0.934,2,40,1|1,11092122.0,DNP,...,,,,,,,,,ACH-000234,259291.0
1296570,chr17,36212342,CG,GA,0.347,14,7,0|1,36212342.0,DNP,...,,,,,,,,,ACH-000234,9560.0


In [31]:
# Check what's in the chromosomes column
missense_mutations['Chrom'].unique()

array(['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
       'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
       'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
       'chrX', 'chrM', 'chrY'], dtype=object)

In [99]:
# Create a VCF file with the SNP missense mutation data
mut_vcf = mut_df[(mut_df['VariantInfo']=='MISSENSE') & (mut_df['VariantType']=='SNP')]
mut_vcf = mut_vcf[['Chrom', 'Pos', 'DbsnpID',  'Ref', 'Alt']].copy()
mut_vcf.rename(columns={'Chrom' : '#CHROM' , 'Pos': 'POS', 'Ref': 'REF', 'Alt': 'ALT', 'DbsnpID' : 'ID'}, inplace=True)

mut_vcf['#CHROM'] = mut_vcf['#CHROM'].str.replace('chr', '')

mut_vcf.to_csv(data_path + '\\dependant\\depmap_mutations_for_fathmm.vcf', sep='\t', index=False)

print(len(mut_vcf))
mut_vcf.head()

12299


Unnamed: 0,#CHROM,POS,ID,REF,ALT
29670,1,963249,rs573683289,G,A
29673,1,2412394,,C,T
29674,1,2512914,,G,T
29675,1,2561696,,C,T
29676,1,2789833,,C,A


In [32]:
# Get unique transcript IDs from missense mutations dataframe
missense_transcript_ids = pd.Series(missense_mutations['Transcript'].unique(), name='TranscriptID')
missense_transcript_ids

0       ENST00000338591
1       ENST00000447513
2       ENST00000378466
3       ENST00000355716
4       ENST00000401095
             ...       
7641    ENST00000252998
7642    ENST00000314103
7643    ENST00000398145
7644    ENST00000610913
7645    ENST00000216146
Name: TranscriptID, Length: 7646, dtype: object

In [33]:
# Get unique transcript IDs from biomart conv file
biomart_transcript_ids = pd.Series(conv['Transcript stable ID'].unique(), name='TranscriptID')
biomart_transcript_ids

0         ENST00000361390
1         ENST00000361453
2         ENST00000361624
3         ENST00000361739
4         ENST00000361851
               ...       
122609    ENST00000483899
122610    ENST00000345034
122611    ENST00000375799
122612    ENST00000375793
122613    ENST00000642363
Name: TranscriptID, Length: 122614, dtype: object

In [34]:
# See how many of the missense transcript IDs are in the biomart conv file
missense_transcript_ids[missense_transcript_ids.isin(biomart_transcript_ids)]

0       ENST00000338591
1       ENST00000447513
2       ENST00000378466
3       ENST00000355716
4       ENST00000401095
             ...       
7641    ENST00000252998
7642    ENST00000314103
7643    ENST00000398145
7644    ENST00000610913
7645    ENST00000216146
Name: TranscriptID, Length: 7596, dtype: object

In [35]:
# Get the subset of biomart data that corresponds to the missense mutation transcript IDs, and select a subset of columns
mutation_info = conv.loc[conv['Transcript stable ID'].isin(missense_transcript_ids)]

In [36]:
mutation_info

Unnamed: 0,Gene stable ID,HGNC symbol,Chromosome/scaffold name,Transcript stable ID,Protein stable ID,UniProtKB/TrEMBL ID,UniProtKB/Swiss-Prot ID,Peptide
0,ENSG00000198888,MT-ND1,MT,ENST00000361390,ENSP00000354687,U5Z754,P03886,MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLL...
1,ENSG00000198763,MT-ND2,MT,ENST00000361453,ENSP00000355046,Q7GXY9,P03891,MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTK...
2,ENSG00000198804,MT-CO1,MT,ENST00000361624,ENSP00000354499,U5YWV7,P00395,MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGN...
3,ENSG00000198712,MT-CO2,MT,ENST00000361739,ENSP00000354876,U5Z487,P00403,MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFLTL...
4,ENSG00000228253,MT-ATP8,MT,ENST00000361851,ENSP00000355265,U5YV54,P03928,MPQLNTTVWPTMITPMLLTLFLITQLKMLNTNYHLPPSPKPMKMKN...
...,...,...,...,...,...,...,...,...
123087,ENSG00000116990,MYCL,1,ENST00000372816,ENSP00000361903,,P12524,MDYDSYQHYFYDYDCGEDFYRSTAPSEDIWKKFELVPSPPTSPPWG...
123095,ENSG00000132906,CASP9,1,ENST00000333868,ENSP00000330237,,P55211,MDEADRRLLRRCRLRLVEELQVDQLWDALLSRELFRPHMIEDIQRA...
123108,ENSG00000116138,DNAJC16,1,ENST00000375847,ENSP00000365007,,Q9Y2G8,MEVRKLSISWQFLIVLVLILQILSALDFDPYRVLGVSRTASQADIK...
123116,ENSG00000215695,RSC1A1,1,ENST00000345034,ENSP00000341963,,Q92681,MSSLPTSDGFNHPARSSGQSPDVGNPMSLARSVSASVCPIKPSDSD...


In [37]:
# See if there are any duplications (including only the subset of columns we are interested in)
mutation_info[mutation_info.duplicated(subset=['Gene stable ID', 'HGNC symbol','Transcript stable ID'], keep=False)]

Unnamed: 0,Gene stable ID,HGNC symbol,Chromosome/scaffold name,Transcript stable ID,Protein stable ID,UniProtKB/TrEMBL ID,UniProtKB/Swiss-Prot ID,Peptide
17342,ENSG00000167193,CRK,17,ENST00000300574,ENSP00000300574,A0A0S2Z3Q4,P46108,MAGNFDSEERSSWYWGRLSRQEAVALLQGQRHGVFLVRDSSTSPGD...
17343,ENSG00000167193,CRK,17,ENST00000300574,ENSP00000300574,L7RT18,P46108,MAGNFDSEERSSWYWGRLSRQEAVALLQGQRHGVFLVRDSSTSPGD...
60615,ENSG00000204252,HLA-DOA,6,ENST00000229829,ENSP00000229829,A0A1V0E3R0,P06340,MALRAGLVLGFHTLMTLLSPQEAGATKADHMGSYGPAFYQSYGASG...
60616,ENSG00000204252,HLA-DOA,6,ENST00000229829,ENSP00000229829,A0A1V0E3R8,P06340,MALRAGLVLGFHTLMTLLSPQEAGATKADHMGSYGPAFYQSYGASG...
60617,ENSG00000204252,HLA-DOA,6,ENST00000229829,ENSP00000229829,A0A1V0E3N6,P06340,MALRAGLVLGFHTLMTLLSPQEAGATKADHMGSYGPAFYQSYGASG...
60618,ENSG00000204252,HLA-DOA,6,ENST00000229829,ENSP00000229829,A0A1V0E3Q4,P06340,MALRAGLVLGFHTLMTLLSPQEAGATKADHMGSYGPAFYQSYGASG...
60619,ENSG00000204252,HLA-DOA,6,ENST00000229829,ENSP00000229829,A0A1V0E3R4,P06340,MALRAGLVLGFHTLMTLLSPQEAGATKADHMGSYGPAFYQSYGASG...
60620,ENSG00000204252,HLA-DOA,6,ENST00000229829,ENSP00000229829,A0A1V0E3Q7,P06340,MALRAGLVLGFHTLMTLLSPQEAGATKADHMGSYGPAFYQSYGASG...
60621,ENSG00000204252,HLA-DOA,6,ENST00000229829,ENSP00000229829,A0A1V0E3S4,P06340,MALRAGLVLGFHTLMTLLSPQEAGATKADHMGSYGPAFYQSYGASG...
60622,ENSG00000204252,HLA-DOA,6,ENST00000229829,ENSP00000229829,A0A1V0E3Q3,P06340,MALRAGLVLGFHTLMTLLSPQEAGATKADHMGSYGPAFYQSYGASG...


In [38]:
# Drop duplicates based on this subset of cols
mutation_info = mutation_info.drop_duplicates(subset=['Gene stable ID', 'HGNC symbol','Transcript stable ID'], keep='first')

In [39]:
# See if there are still any duplicates with just two columns now
mutation_info[mutation_info.duplicated(subset=['HGNC symbol','Transcript stable ID'], keep=False)]

Unnamed: 0,Gene stable ID,HGNC symbol,Chromosome/scaffold name,Transcript stable ID,Protein stable ID,UniProtKB/TrEMBL ID,UniProtKB/Swiss-Prot ID,Peptide


In [40]:
def get_info(row):
    """using the ready made conv, identify for each mutation the following
    ['Fasta',  'UniProtKB/Swiss-Prot ID',
       'Protein stable ID', 'transcript stable id', 'chromo', 'start', 'end',
       'wild', 'mutant']
    Throw away any rows which don't have this information as it will not be possible to find the missense status
       """
    # Regex pattern that is used to match protein mutations typically represented in a format like 'p.X123Y', where:
    #   'p.' indicates it is a protein mutation.
    #   The first ([A-Z]) is a capture group that matches the one-letter code of the original amino acid.
    #   (\d+) is a capture group that matches one or more digits, representing the position number of the amino acid in the protein sequence.
    #   The second ([A-Z]) is a capture group that matches the one-letter code of the new amino acid after the mutation.
    # Enables extraction of original and new amino acids and their position in the protein sequence from mutation strings.

    code = re.compile('p.([A-Z])(\d+)([A-Z])')

    # Function to check if the amino acid (w) at a specific position (pos) in a sequence (x) matches what's expected.
    #   x is expected to be a string representing an amino acid sequence.
    #   w is the one-letter code of an amino acid.
    #   pos is the position of the amino acid in the sequence (1-indexed).

    def check(x,w,pos):
        """helper function used below to check that our fasta has the right amino acid at the right place"""
        if len(x)>=pos:
            return x[pos-1]==w
        else:
            return False

    # Select relevant columns and assig to 'columns'
    columns = ['Gene stable ID', 'HGNC symbol', 'Peptide','Protein stable ID', 'Transcript stable ID', 'UniProtKB/Swiss-Prot ID']

    # Assign 'Annotation_Transcript'and 'Protein_Change' values fore this row to enst and aa
    enst, aa, position, chrom = row[['Transcript','ProteinChange','Pos','Chrom']]

    # if the enst for this row is in biomart_transcript_ids (conv table)
    # set info = the row where 'Transcript stable ID' == enst, select the columns listed above, and remove dup rows based on the 'Gene stable ID', 'HGNC symbol','Transcript stable ID' cols
    if enst in biomart_transcript_ids.tolist():
        info = conv.loc[conv['Transcript stable ID']==enst][columns].drop_duplicates(subset = ['Gene stable ID', 'HGNC symbol','Transcript stable ID'])
        #info = conv.loc[conv['Transcript stable ID']==enst][columns].drop_duplicates(subset = ['UniProtKB/Swiss-Prot ID'])
        
        # keep count of number of matching conv rows returned
        num_rows.append(len(info))
        
        # if more than one row returned append it to multiple_rows DF for later review
        if len(info) > 1:
            global multiple_rows
            multiple_rows = multiple_rows.append(info)

    # Otherwise info = np.nan
    else:
        info = 'not in transcript IDs'
        
    # print(info)

  # Check that 'info' object is a dataframe and not empty
    if type(info)==pd.DataFrame and info.shape[0]>0:

      # if 'ProteinChange' value (aa) matches the regex,
      # save the original amino acid, position and mutant amino acid as w, pos, m
      # otherwise save as np.nan
        match = code.match(aa)
        if match:
            w = match.group(1)
            pos = int(match.group(2))
            m = match.group(3)
        else:
            w,pos,m = np.nan,np.nan,np.nan

        # Use 'check' helper function to check that Fasta column of current row contains the wild type amino acid (w) in the correct position (pos)
        info0 = info.loc[info['Peptide'].map(lambda x:check(x,w,pos))]

        # If there is at least one row, save it as info1 (there will be one or none)
        if info0.shape[0]>0:
            info1 = info0.iloc[0]
        else:
            info1 = 'w not in pos'
    else:
        info1 = 'dataframe was empty'

    # If info1 is a series (ie. none of the 'else' conditions above were fulfilled)
    # Add columns for 'transcript stable id' , 'wild', 'mutant' and 'pos'
    # Return info1
    if type(info1)==pd.Series:
        info1['transcript stable id']=enst
        info1['wild'],info1['mutant'],info1['pos'] = w,m,pos
        info1['position'], info1['chrom'] = position, chrom
    return info1

# Variables to keep count and store row content when more than one conv row is returned for a missense_mutations ENST
num_rows = []
multiple_rows = pd.DataFrame()

# Apply the 'get_info' function to rows of the missense_mutations dataframe
test_info = missense_mutations.apply(get_info,axis = 1)

In [41]:
test_info

Unnamed: 0,Gene stable ID,HGNC symbol,Peptide,Protein stable ID,Transcript stable ID,UniProtKB/Swiss-Prot ID,transcript stable id,wild,mutant,pos,position,chrom
29670,ENSG00000187961,KLHL17,MQPRSERPAGRTQSPEHGSPGPGPEAPPPPPPQPPAPEAERTRPRQ...,ENSP00000343930,ENST00000338591,Q6TDP4,ENST00000338591,G,S,395,963249,chr1
29673,ENSG00000157911,PEX10,MAPAAASPPEVIRAAQKDEYYRGGLRSAAGGALHSLAGARKWLEWR...,ENSP00000407922,ENST00000447513,O60683,ENST00000447513,A,T,37,2412394,chr1
29674,ENSG00000157881,PANK4,MAECGASGSGSSGDSLDKSITLPPDEIFRNLENAKRFAIDIGGSLT...,ENSP00000367727,ENST00000378466,Q9NVE7,ENST00000378466,F,L,567,2512914,chr1
29675,ENSG00000157873,TNFRSF14,MEPPGDWGPPPWRSTPKTDVLRLVLYLTFLGAPCYAPALPSCKEDE...,ENSP00000347948,ENST00000355716,Q92956,ENST00000355716,A,V,192,2561696,chr1
29676,ENSG00000215912,TTC34,MMSAQELVACLCREGEQHLALGELPLATAFYLAAFSCHAPSALQSV...,ENSP00000383873,ENST00000401095,,ENST00000401095,R,L,433,2789833,chr1
...,...,...,...,...,...,...,...,...,...,...,...,...
1296633,ENSG00000172534,HCFC1,MASAVSPANLPAVLLQPRWKRVVGWSGPVPRPRHGHRAVAIKELIV...,ENSP00000359001,ENST00000369984,,ENST00000369984,S,G,1216,153954753,chrX
1296636,ENSG00000198899,MT-ATP6,MNENLFASFIAPTILGLPAAVLIILFPPLLIPTSKYLINNRLITTQ...,ENSP00000354632,ENST00000361899,P00846,ENST00000361899,A,T,80,8764,chrM
1296637,ENSG00000198899,MT-ATP6,MNENLFASFIAPTILGLPAAVLIILFPPLLIPTSKYLINNRLITTQ...,ENSP00000354632,ENST00000361899,P00846,ENST00000361899,A,T,177,9055,chrM
1296640,ENSG00000198886,MT-ND4,MLKLIVPTIMLLPLTWLSKKHMIWINTTTHSLIISIIPLLFFNQIN...,ENSP00000354961,ENST00000361381,P03905,ENST00000361381,L,P,89,11025,chrM


In [42]:
# Look at the list containing the number of conv rows returned for each missense_mutations ENST
# e.g. how many returned one row, how many returned 2, etc
# Note: dropping duplicates based on my revised subset instead of UniProtKB/Swiss-Prot ID prevents multiple rows being returned
len(num_rows)
num_rows = pd.Series(num_rows, name='Counts')
num_rows.value_counts()

1    13126
Name: Counts, dtype: int64

In [43]:
# This contains all the conv rows that were returned as multiple rows matching a single missense_mutations ENST
# Note: dropping duplicates based on my revised subset instead of UniProtKB/Swiss-Prot ID prevents multiple rows being returned
multiple_rows

In [44]:
# Check for the various conditions that would result from no data in a row
#test_info[test_info['Gene stable ID'] == 'not in transcript IDs']
#test_info[test_info['Gene stable ID'] == 'w not in pos']
#test_info[test_info['Gene stable ID'] == 'dataframe was empty']
#test_info[test_info['Gene stable ID'].isnull()]
test_info[test_info['pos'].isnull()]

Unnamed: 0,Gene stable ID,HGNC symbol,Peptide,Protein stable ID,Transcript stable ID,UniProtKB/Swiss-Prot ID,transcript stable id,wild,mutant,pos,position,chrom


In [45]:
alpha_missense = pd.read_csv(data_path + 'alphamissense\\AlphaMissense_hg38.tsv', skiprows=3, sep='\t')

In [46]:
alpha_missense.head(10)

Unnamed: 0,#CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class
0,chr1,69094,G,T,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign
1,chr1,69094,G,C,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign
2,chr1,69094,G,A,hg38,Q8NH21,ENST00000335137.4,V2M,0.3296,likely_benign
3,chr1,69095,T,C,hg38,Q8NH21,ENST00000335137.4,V2A,0.2609,likely_benign
4,chr1,69095,T,A,hg38,Q8NH21,ENST00000335137.4,V2E,0.2922,likely_benign
5,chr1,69095,T,G,hg38,Q8NH21,ENST00000335137.4,V2G,0.203,likely_benign
6,chr1,69097,A,G,hg38,Q8NH21,ENST00000335137.4,T3A,0.0929,likely_benign
7,chr1,69097,A,C,hg38,Q8NH21,ENST00000335137.4,T3P,0.1264,likely_benign
8,chr1,69097,A,T,hg38,Q8NH21,ENST00000335137.4,T3S,0.0979,likely_benign
9,chr1,69098,C,A,hg38,Q8NH21,ENST00000335137.4,T3N,0.1121,likely_benign


In [47]:
alpha_missense['genome'].value_counts()

hg38    71697556
Name: genome, dtype: int64

In [48]:
alpha_missense['am_class'].value_counts()

likely_benign        40917351
likely_pathogenic    22770557
ambiguous             8009648
Name: am_class, dtype: int64

In [49]:
alpha_missense[alpha_missense['transcript_id'].str.contains('ENST00000335137')]

Unnamed: 0,#CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class
0,chr1,69094,G,T,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign
1,chr1,69094,G,C,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign
2,chr1,69094,G,A,hg38,Q8NH21,ENST00000335137.4,V2M,0.3296,likely_benign
3,chr1,69095,T,C,hg38,Q8NH21,ENST00000335137.4,V2A,0.2609,likely_benign
4,chr1,69095,T,A,hg38,Q8NH21,ENST00000335137.4,V2E,0.2922,likely_benign
...,...,...,...,...,...,...,...,...,...,...
2013,chr1,70004,T,G,hg38,Q8NH21,ENST00000335137.4,F305C,0.0907,likely_benign
2014,chr1,70004,T,C,hg38,Q8NH21,ENST00000335137.4,F305S,0.1053,likely_benign
2015,chr1,70004,T,A,hg38,Q8NH21,ENST00000335137.4,F305Y,0.0795,likely_benign
2016,chr1,70005,T,G,hg38,Q8NH21,ENST00000335137.4,F305L,0.3187,likely_benign


In [50]:
# Rename columns in alpha_missense df to match those in missense_mutations for the merging criteria
alpha_missense.rename(columns={'#CHROM': 'Chrom', 'POS': 'Pos', 'REF': 'Ref', 'ALT': 'Alt'}, inplace=True)

# Now, merge the DataFrames
missense_mutations_extended = missense_mutations.merge(alpha_missense[['Chrom', 'Pos', 'Ref', 'Alt', 'transcript_id', 'am_pathogenicity', 'am_class']], 
                on=['Chrom', 'Pos', 'Ref', 'Alt'], 
                how='left')

missense_mutations_extended

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,transcript_id,am_pathogenicity,am_class
0,chr1,963249,G,A,0.303,22,9,0/1,,SNP,...,,,,,,ACH-000856,339451.0,ENST00000338591.8,0.9941,likely_pathogenic
1,chr1,2412394,C,T,0.391,14,8,0/1,,SNP,...,,,,,,ACH-000856,5192.0,ENST00000447513.6,0.1815,likely_benign
2,chr1,2512914,G,T,0.488,21,19,0/1,,SNP,...,,,,,,ACH-000856,55229.0,ENST00000378466.9,0.9989,likely_pathogenic
3,chr1,2561696,C,T,0.320,24,12,0/1,,SNP,...,,,,,,ACH-000856,8764.0,ENST00000355716.4,0.1136,likely_benign
4,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,,ACH-000856,100287898.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13274,chrX,153954753,T,C,0.857,0,6,0/1,,SNP,...,,,,,,ACH-000234,3054.0,ENST00000310441.12,0.0608,likely_benign
13275,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,,ACH-000234,4508.0,ENST00000361899.2,0.1314,likely_benign
13276,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,,ACH-000234,4508.0,ENST00000361899.2,0.1589,likely_benign
13277,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,,ACH-000234,4538.0,ENST00000361381.2,0.1123,likely_benign


In [56]:
missense_mutations_isoforms = missense_mutations_extended[missense_mutations_extended['am_pathogenicity'].isnull()]

In [None]:
missense_mutations_extended[(missense_mutations_extended['am_class'].isnull()) & (missense_mutations_extended['VariantType'] == 'SNP')]

In [51]:
alpha_missense_isoforms = pd.read_csv(data_path + '\\alphamissense\\AlphaMissense_isoforms_hg38.tsv', skiprows=3, sep='\t')

In [52]:
alpha_missense_isoforms.head()

Unnamed: 0,#CHROM,POS,REF,ALT,genome,transcript_id,protein_variant,am_pathogenicity,am_class
0,chr1,65568,A,C,hg38,ENST00000641515.2,K2Q,0.0938,likely_benign
1,chr1,65568,A,G,hg38,ENST00000641515.2,K2E,0.0766,likely_benign
2,chr1,65569,A,G,hg38,ENST00000641515.2,K2R,0.0756,likely_benign
3,chr1,65569,A,T,hg38,ENST00000641515.2,K2M,0.1732,likely_benign
4,chr1,65569,A,C,hg38,ENST00000641515.2,K2T,0.1186,likely_benign


In [57]:
# Rename columns in alpha_missense_isoforms df to match those in missense_mutations for the merging criteria
alpha_missense_isoforms.rename(columns={'#CHROM': 'Chrom', 'POS': 'Pos', 'REF': 'Ref', 'ALT': 'Alt'}, inplace=True)

missense_mutations_isoforms = missense_mutations_isoforms.drop(columns=['am_pathogenicity', 'am_class'])

# Now, merge the DataFrames
missense_mutations_isoforms_extended = missense_mutations_isoforms.merge(alpha_missense_isoforms[['Chrom', 'Pos', 'Ref', 'Alt', 'am_pathogenicity', 'am_class']], 
                on=['Chrom', 'Pos', 'Ref', 'Alt'], 
                how='left')

missense_mutations_isoforms_extended.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,transcript_id,am_pathogenicity,am_class
0,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,,ACH-000856,100287898.0,,0.087,likely_benign
1,chr1,12777228,GT,AA,0.214,33,9,0|1,12777228.0,DNP,...,,,,,,ACH-000856,390999.0,,,
2,chr1,12777427,GA,AG,0.155,38,6,0|1,12777421.0,DNP,...,,,,,,ACH-000856,390999.0,,,
3,chr1,12859754,CC,TG,0.222,20,5,0|1,12859749.0,DNP,...,,,,,,ACH-000856,65122.0,,,
4,chr1,12861232,AC,TG,0.315,13,6,0/1,,DNP,...,,,,,,ACH-000856,65122.0,,,


In [59]:
missense_complete = pd.concat([missense_mutations_isoforms_extended, missense_mutations_extended])

print(len(missense_mutations_extended))
print(len(missense_mutations_isoforms_extended))
print(len(missense_complete))

13279
2311
15590


In [61]:
missense_complete[~missense_complete['am_pathogenicity'].isnull()]

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,RefCount,AltCount,GT,PS,VariantType,...,DidaID,DidaName,GwasDisease,GwasPmID,GTexGene,ModelID,EntrezGeneID,transcript_id,am_pathogenicity,am_class
0,chr1,2789833,C,A,0.534,6,7,0/1,,SNP,...,,,,,,ACH-000856,100287898.0,,0.0870,likely_benign
6,chr1,20144618,T,C,0.344,19,9,0/1,,SNP,...,,,,,,ACH-000856,64600.0,,0.5294,ambiguous
8,chr1,153765060,C,T,0.550,18,22,0/1,,SNP,...,,,,,,ACH-000856,65123.0,,0.3609,ambiguous
9,chr1,153765060,C,T,0.550,18,22,0/1,,SNP,...,,,,,,ACH-000856,65123.0,,0.3640,ambiguous
11,chr2,20253922,C,T,0.629,13,23,0/1,,SNP,...,,,,,,ACH-000856,23369.0,,0.6862,likely_pathogenic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13274,chrX,153954753,T,C,0.857,0,6,0/1,,SNP,...,,,,,,ACH-000234,3054.0,ENST00000310441.12,0.0608,likely_benign
13275,chrM,8764,G,A,0.923,0,12,1|1,,SNP,...,,,,,,ACH-000234,4508.0,ENST00000361899.2,0.1314,likely_benign
13276,chrM,9055,G,A,0.952,0,20,1|1,,SNP,...,,,,,,ACH-000234,4508.0,ENST00000361899.2,0.1589,likely_benign
13277,chrM,11025,T,C,0.946,0,19,1|1,,SNP,...,,,,,,ACH-000234,4538.0,ENST00000361381.2,0.1123,likely_benign


In [62]:
missense_complete[~missense_complete['am_pathogenicity'].isnull()]['am_class'].value_counts()

likely_benign        8987
likely_pathogenic    2948
ambiguous            1301
Name: am_class, dtype: int64

In [67]:
from pyliftover import LiftOver
lo = LiftOver('hg38', 'hg19')
lo.convert_coordinate('chr1', 1000000, '-')

[('chr1', 935380, '-', 20849626768)]

In [None]:
mut_df['hg19 translation'] = mut_df.map() 

In [108]:
# manually checking some fathmm-xf results against alpha missense
alpha_missense[(alpha_missense['Chrom']=='chr11') & (alpha_missense['Pos']==7313831)]

Unnamed: 0,Chrom,Pos,Ref,Alt,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class
10649352,chr11,7313831,A,G,hg38,Q86SS6,ENST00000318881.11,R312G,0.9982,likely_pathogenic
10649353,chr11,7313831,A,T,hg38,Q86SS6,ENST00000318881.11,R312W,0.9968,likely_pathogenic
