## Get XYZ coordinates of Mutants for Clustering
Here we collect xyz coords and prepare dataframe for clustering

In [1]:
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter
from ipywidgets import interact, IntSlider
import py3Dmol
import pandas as pd
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', 500)
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import interactive
interactive(True)

from IPython.display import display, HTML

display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

In [2]:
from mmtfPyspark.interactions import VariantInteractionExtractor

#### Input parameters

In [48]:
input_file_name = '../analysis/NRF2_pathway/dataframes/step2/mutations_NRF2v2_step2A.csv' # mutations mapped to 3D protein structures
coverage_output = '../dataframes/PDB/DF_PDB_coverage_all_human.csv' #Structural coverage of human proteins

output_file_name = '../analysis/NRF2_pathway/dataframes/step2/mutations_NRF2v2_step2B.csv' # mutations mapped to XYZ and clustered
output_file_name_2 = '../analysis/NRF2_pathway/dataframes/step2/mutations_NRF2v2_step2B_detailed.csv' # mutations mapped to XYZ and clustered (detailed version)
output_file_name_3 = '../analysis/NRF2_pathway/dataframes/step2/mutations_NRF2v2_step2B_PDB_ranks.csv' # mutations mapped to XYZ and clustered (PDB ranks)

In [4]:
df = pd.read_csv(input_file_name)
df['pdbPosition'] = df['pdbPosition'].astype('str')
df['Chromosome'] = df['Chromosome'].astype('str')


  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
print('Unique UniProts    :', len(df.Hugo_Symbol.unique()),df.Hugo_Symbol.unique() )
print('Unique Variants  :', len(df['variationId'].unique()))

Unique UniProts    : 16 ['NTRK2' 'SIRT1' 'KEAP1' 'AKR1C2' 'TXNRD1' 'AKR1B10' 'NQO1' 'CUL3'
 'AKR1C4' 'NFE2L2' 'AKR1C3' 'SRXN1' 'GSTM3' 'UCHL1' 'G6PD' 'RAB6B']
Unique Variants  : 479


### Get XYZ coordinates of variants

In [6]:
pdbids = list(set([i.upper().split('.')[0] for i in df.structureChainId.unique() if pd.notnull(i)]))

#### Get all interacting residues in a sphere around variants

In [7]:
interactions = pd.DataFrame()

for pdb in pdbids:
    structures = mmtfReader.download_full_mmtf_files([pdb])
    structures = structures.filter(lambda s: s[1].num_models == 1)  ## Currently, only structurs with 1 model are supported
    
    interactionFilter = InteractionFilter(distanceCutoff=2,minInteractions=1)

    tmp = VariantInteractionExtractor().get_variant_interactions(structures, interactionFilter, level='atom').toPandas()
    interactions = pd.concat([interactions,tmp])
    
interactions.head()

Unnamed: 0,structureChainId,queryLigandId,queryLigandChainId,queryLigandNumber,queryAtomName,targetGroupId,targetChainId,targetGroupNumber,targetAtomName,distance,sequenceIndex,sequence,query_x,query_y,query_z,target_x,target_y,target_z
0,1S2A.A,PHE,A,21,CA,PHE,A,21,CB,1.523771,20,MDSKQQCVKLNDGHFMPVLGFGTYAPPEVPRSKALEVTKLAIEAGF...,36.955002,-32.937,53.568001,37.449001,-34.375,53.467999
1,1S2A.A,ARG,A,263,O,ARG,A,263,C,1.230753,262,MDSKQQCVKLNDGHFMPVLGFGTYAPPEVPRSKALEVTKLAIEAGF...,41.277,-14.389,51.493999,41.361,-15.179,50.554001
2,1S2A.A,PRO,A,233,N,SER,A,232,C,1.323599,231,MDSKQQCVKLNDGHFMPVLGFGTYAPPEVPRSKALEVTKLAIEAGF...,17.648001,-19.771999,46.93,17.118,-20.722,47.683998
3,1S2A.A,MET,A,16,C,MET,A,16,O,1.233394,15,MDSKQQCVKLNDGHFMPVLGFGTYAPPEVPRSKALEVTKLAIEAGF...,46.43,-26.716,56.780998,46.651001,-25.711,56.101002
4,1S2A.A,ALA,A,160,CA,ALA,A,160,C,1.510904,159,MDSKQQCVKLNDGHFMPVLGFGTYAPPEVPRSKALEVTKLAIEAGF...,44.421001,-31.063999,70.808998,45.700001,-30.33,70.480003


#### merge with original dataframe

In [8]:
mt = df.merge(interactions, left_on=['structureChainId','pdbPosition'], right_on=['structureChainId','targetGroupNumber'], how='inner')

In [9]:
print('Unique UniProts    :', len(mt.Hugo_Symbol.unique()),mt.Hugo_Symbol.unique() )
print('Unique Variants  :', len(mt['variationId'].unique()))

Unique UniProts    : 16 ['NTRK2' 'SIRT1' 'KEAP1' 'AKR1C2' 'TXNRD1' 'AKR1B10' 'NQO1' 'CUL3'
 'AKR1C4' 'NFE2L2' 'AKR1C3' 'SRXN1' 'GSTM3' 'UCHL1' 'G6PD' 'RAB6B']
Unique Variants  : 469


In [78]:
mt[(mt.Hugo_Symbol==gene)&(mt.structureChainId==struct)&(mt.Protein_Change=='p.A122V')]

Unnamed: 0,alignmentId,bitscore,chainId,error,evalue,exception,identity,identityPositive,message,midlineAlign,path,pdbAlign,pdbFrom,pdbId,pdbNo,pdbSeg,pdbTo,refGenome,residueMapping,segStart,seqAlign,seqFrom,seqId,seqTo,status,timestamp,updateDate,variationId,structureId,pdbPosition,pdbAminoAcid,seqIdentity,tax_id,scientific_name,structureChainId,pdbResNum,pdbSeqNum,uniprotId,uniprotNum,Hugo_Symbol,Entrez_Gene_Id,Variant_Classification,Genome_Change,Chromosome,Tumor_Sample_Barcode,Protein_Change,Reference_Allele,Tumor_Seq_Allele1,ID,var_id,queryLigandId,queryLigandChainId,queryLigandNumber,queryAtomName,targetGroupId,targetChainId,targetGroupNumber,targetAtomName,distance,sequenceIndex,sequence,query_x,query_y,query_z,target_x,target_y,target_z
281276,12045497,1014.600,A,,0.0,,494.0,494.0,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,1,3qfb,3qfb_A_1,1,495,hgvs-grch37,"[Row(pdbAminoAcid='A', pdbPosition=160, queryA...",5,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,5,344933,498,,,2017-10-25,chr12:g.104713303C>T,3QFB,160,A,100.00000,9606,Homo sapiens,3QFB.A,160,180,Q16881,310,TXNRD1,7296,Missense_Mutation,g.chr12:104713303C>T,12,HUH1_LIVER,p.A122V,C,T,TXNRD1_p.A122V,chr12:g.104713303C>T,ALA,A,160,C,ALA,A,160,CA,1.525350,179,MGSSHHHHHHSSGLVPRGSHMNGPEDLPKSYDYDLIIIGGGSGGLA...,60.778999,-94.230003,43.841999,59.402000,-94.024002,44.465000
281277,12045497,1014.600,A,,0.0,,494.0,494.0,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,1,3qfb,3qfb_A_1,1,495,hgvs-grch37,"[Row(pdbAminoAcid='A', pdbPosition=160, queryA...",5,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,5,344933,498,,,2017-10-25,chr12:g.104713303C>T,3QFB,160,A,100.00000,9606,Homo sapiens,3QFB.A,160,180,Q16881,310,TXNRD1,7296,Missense_Mutation,g.chr12:104713303C>T,12,HUH1_LIVER,p.A122V,C,T,TXNRD1_p.A122V,chr12:g.104713303C>T,ALA,A,160,CA,ALA,A,160,N,1.466221,179,MGSSHHHHHHSSGLVPRGSHMNGPEDLPKSYDYDLIIIGGGSGGLA...,59.402000,-94.024002,44.465000,59.533001,-93.165001,45.646000
281278,12045497,1014.600,A,,0.0,,494.0,494.0,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,1,3qfb,3qfb_A_1,1,495,hgvs-grch37,"[Row(pdbAminoAcid='A', pdbPosition=160, queryA...",5,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,5,344933,498,,,2017-10-25,chr12:g.104713303C>T,3QFB,160,A,100.00000,9606,Homo sapiens,3QFB.A,160,180,Q16881,310,TXNRD1,7296,Missense_Mutation,g.chr12:104713303C>T,12,HUH1_LIVER,p.A122V,C,T,TXNRD1_p.A122V,chr12:g.104713303C>T,ALA,A,160,N,ALA,A,160,CA,1.466221,179,MGSSHHHHHHSSGLVPRGSHMNGPEDLPKSYDYDLIIIGGGSGGLA...,59.533001,-93.165001,45.646000,59.402000,-94.024002,44.465000
281279,12045497,1014.600,A,,0.0,,494.0,494.0,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,1,3qfb,3qfb_A_1,1,495,hgvs-grch37,"[Row(pdbAminoAcid='A', pdbPosition=160, queryA...",5,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,5,344933,498,,,2017-10-25,chr12:g.104713303C>T,3QFB,160,A,100.00000,9606,Homo sapiens,3QFB.A,160,180,Q16881,310,TXNRD1,7296,Missense_Mutation,g.chr12:104713303C>T,12,HUH1_LIVER,p.A122V,C,T,TXNRD1_p.A122V,chr12:g.104713303C>T,ILE,A,159,C,ALA,A,160,N,1.335989,179,MGSSHHHHHHSSGLVPRGSHMNGPEDLPKSYDYDLIIIGGGSGGLA...,58.673000,-93.193001,46.667999,59.533001,-93.165001,45.646000
281280,12045497,1014.600,A,,0.0,,494.0,494.0,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,1,3qfb,3qfb_A_1,1,495,hgvs-grch37,"[Row(pdbAminoAcid='A', pdbPosition=160, queryA...",5,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,5,344933,498,,,2017-10-25,chr12:g.104713303C>T,3QFB,160,A,100.00000,9606,Homo sapiens,3QFB.A,160,180,Q16881,310,TXNRD1,7296,Missense_Mutation,g.chr12:104713303C>T,12,HUH1_LIVER,p.A122V,C,T,TXNRD1_p.A122V,chr12:g.104713303C>T,ALA,A,160,C,ALA,A,160,O,1.229356,179,MGSSHHHHHHSSGLVPRGSHMNGPEDLPKSYDYDLIIIGGGSGGLA...,60.778999,-94.230003,43.841999,60.952999,-94.120003,42.630001
281281,12045497,1014.600,A,,0.0,,494.0,494.0,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,1,3qfb,3qfb_A_1,1,495,hgvs-grch37,"[Row(pdbAminoAcid='A', pdbPosition=160, queryA...",5,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,5,344933,498,,,2017-10-25,chr12:g.104713303C>T,3QFB,160,A,100.00000,9606,Homo sapiens,3QFB.A,160,180,Q16881,310,TXNRD1,7296,Missense_Mutation,g.chr12:104713303C>T,12,HUH1_LIVER,p.A122V,C,T,TXNRD1_p.A122V,chr12:g.104713303C>T,THR,A,161,N,ALA,A,160,C,1.324539,179,MGSSHHHHHHSSGLVPRGSHMNGPEDLPKSYDYDLIIIGGGSGGLA...,61.750000,-94.544998,44.686001,60.778999,-94.230003,43.841999
281282,12045497,1014.600,A,,0.0,,494.0,494.0,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,1,3qfb,3qfb_A_1,1,495,hgvs-grch37,"[Row(pdbAminoAcid='A', pdbPosition=160, queryA...",5,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,5,344933,498,,,2017-10-25,chr12:g.104713303C>T,3QFB,160,A,100.00000,9606,Homo sapiens,3QFB.A,160,180,Q16881,310,TXNRD1,7296,Missense_Mutation,g.chr12:104713303C>T,12,HUH1_LIVER,p.A122V,C,T,TXNRD1_p.A122V,chr12:g.104713303C>T,ALA,A,160,CA,ALA,A,160,CB,1.533215,179,MGSSHHHHHHSSGLVPRGSHMNGPEDLPKSYDYDLIIIGGGSGGLA...,59.402000,-94.024002,44.465000,58.463001,-93.374001,43.442001
281283,12045497,1014.600,A,,0.0,,494.0,494.0,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,1,3qfb,3qfb_A_1,1,495,hgvs-grch37,"[Row(pdbAminoAcid='A', pdbPosition=160, queryA...",5,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,5,344933,498,,,2017-10-25,chr12:g.104713303C>T,3QFB,160,A,100.00000,9606,Homo sapiens,3QFB.A,160,180,Q16881,310,TXNRD1,7296,Missense_Mutation,g.chr12:104713303C>T,12,HUH1_LIVER,p.A122V,C,T,TXNRD1_p.A122V,chr12:g.104713303C>T,ALA,A,160,CA,ALA,A,160,C,1.525350,179,MGSSHHHHHHSSGLVPRGSHMNGPEDLPKSYDYDLIIIGGGSGGLA...,59.402000,-94.024002,44.465000,60.778999,-94.230003,43.841999
281284,12045497,1014.600,A,,0.0,,494.0,494.0,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,1,3qfb,3qfb_A_1,1,495,hgvs-grch37,"[Row(pdbAminoAcid='A', pdbPosition=160, queryA...",5,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,5,344933,498,,,2017-10-25,chr12:g.104713303C>T,3QFB,160,A,100.00000,9606,Homo sapiens,3QFB.A,160,180,Q16881,310,TXNRD1,7296,Missense_Mutation,g.chr12:104713303C>T,12,HUH1_LIVER,p.A122V,C,T,TXNRD1_p.A122V,chr12:g.104713303C>T,ALA,A,160,CB,ALA,A,160,CA,1.533215,179,MGSSHHHHHHSSGLVPRGSHMNGPEDLPKSYDYDLIIIGGGSGGLA...,58.463001,-93.374001,43.442001,59.402000,-94.024002,44.465000
281285,12045497,1014.600,A,,0.0,,494.0,494.0,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,1,3qfb,3qfb_A_1,1,495,hgvs-grch37,"[Row(pdbAminoAcid='A', pdbPosition=160, queryA...",5,EDLPKSYDYDLIIIGGGSGGLAAAKEAAQYGKKVMVLDFVTPTPLG...,5,344933,498,,,2017-10-25,chr12:g.104713303C>T,3QFB,160,A,100.00000,9606,Homo sapiens,3QFB.A,160,180,Q16881,310,TXNRD1,7296,Missense_Mutation,g.chr12:104713303C>T,12,HUH1_LIVER,p.A122V,C,T,TXNRD1_p.A122V,chr12:g.104713303C>T,ALA,A,160,O,ALA,A,160,C,1.229356,179,MGSSHHHHHHSSGLVPRGSHMNGPEDLPKSYDYDLIIIGGGSGGLA...,60.952999,-94.120003,42.630001,60.778999,-94.230003,43.841999


### Get R group atom coordinates per mutant

list of atom name nomenclature used in PDB to specify atoms in R groups on a per amino acid level

In [11]:
atom_nom = pd.read_csv('../dataframes/PDB/atom_nomenclature_PDB.csv',index_col=None)

## ** TO Do: make into routine **

In [12]:
col1 = ['Hugo_Symbol','Protein_Change','Tumor_Sample_Barcode','variationId','uniprotId','structureChainId','targetGroupId','targetChainId','targetGroupNumber','targetAtomName','target_x','target_y','target_z']

mtx = mt[col1]

#mtx = df_tmp
#dfx = df_trimmed

dfx = pd.DataFrame()

for i in mtx.targetGroupId.unique():
    
    #only selects rows of i_amino acid with n list of atom names
    n = atom_nom[atom_nom.Name==i].Atom.values.tolist()[0].split(',')#+['CA','CB']
    sel = mtx[(mtx.targetGroupId==i)&(mtx.targetAtomName.isin(n))]
    
    # passes the rows with matching atoms to a trimmed dataframe
    dfx = pd.concat([dfx,sel])
    
    # checks/adds variants with missing atoms
    if len(mtx[mtx.targetGroupId==i].variationId.unique()) == len(sel[sel.targetGroupId==i].variationId.unique()):
        pass
    else:
        var = [x for x in mtx[mtx.targetGroupId==i].variationId.unique() if x not in dfx[dfx.targetGroupId==i].variationId.unique()]
        
        for j in var:
            sel = mtx[(mtx.targetGroupId==i)&(mtx.variationId==j)&(mtx.targetAtomName.isin(['CB']))]
            dfx = pd.concat([dfx,sel])
             
    
dfx = dfx.drop_duplicates()
    
print(len(mtx.variationId.unique()),len(dfx.variationId.unique()))
    

469 469


for some amino acids, we compute the midpoint between two important R-group atoms

In [13]:
mpx_list = []
mpy_list = []
mpz_list = []
atom_list = []

for i in dfx.index:
    a = dfx['Hugo_Symbol'].loc[i]
    b = dfx['Protein_Change'].loc[i]
    c = dfx['Tumor_Sample_Barcode'].loc[i]    
    d = dfx['variationId'].loc[i]    
    e = dfx['structureChainId'].loc[i]   
    
    tmp = dfx[(dfx['Hugo_Symbol']==a)&(dfx['Protein_Change']==b)&(dfx['Tumor_Sample_Barcode']==c)&(dfx['variationId']==d)&(dfx['structureChainId']==e)]
    
    if len(tmp) > 1:
        # compute midpoint between two atoms listed for a single residue
        
        x1 = tmp['target_x'].values[0]
        x2 = tmp['target_x'].values[1]
        
        y1 = tmp['target_y'].values[0]
        y2 = tmp['target_y'].values[1]
        
        z1 = tmp['target_z'].values[0]
        z2 = tmp['target_z'].values[1]
        
        mp_x = np.true_divide(np.add(x1,x2),2)
        mp_y = np.true_divide(np.add(y1,y2),2)
        mp_z = np.true_divide(np.add(z1,z2),2)
        
        atom1 = tmp['targetAtomName'].values[0]
        atom2 = tmp['targetAtomName'].values[1]
        
        mpx_list.append(mp_x)
        mpy_list.append(mp_y)
        mpz_list.append(mp_z)
        atom_list.append(atom1+'_'+atom2)
        
    else: 
        # just append the single atom information
        
        mpx_list.append(tmp['target_x'].values[0])
        mpy_list.append(tmp['target_y'].values[0])
        mpz_list.append(tmp['target_z'].values[0])
        atom_list.append(tmp['targetAtomName'].values[0])
        
dfx['cluster_x'] = mpx_list
dfx['cluster_y'] = mpy_list
dfx['cluster_z'] = mpz_list
dfx['atom_nom'] = atom_list
        
dfx.head(2)        
    

Unnamed: 0,Hugo_Symbol,Protein_Change,Tumor_Sample_Barcode,variationId,uniprotId,structureChainId,targetGroupId,targetChainId,targetGroupNumber,targetAtomName,target_x,target_y,target_z,cluster_x,cluster_y,cluster_z,atom_nom
2,NTRK2,p.C789C,A4FUK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,chr9:g.87636250C>T,Q16620,4AT5.A,CYS,A,805,SG,-28.995001,12.055,-20.174999,-28.995001,12.055,-20.174999,SG
20346,SIRT1,p.C501C,BICR18_UPPER_AERODIGESTIVE_TRACT,chr10:g.69672376C>T,Q96EB6,4ZZJ.A,CYS,A,501,SG,17.503,86.165001,16.693001,17.503,86.165001,16.693001,SG


## ** ** 

In [14]:
for i in df.variationId.unique():
    if i not in dfx.variationId.unique() and pd.notnull(i):
        print("variant not included in analysis:",i)

variant not included in analysis: chr20:g.633771T>C
variant not included in analysis: chr9:g.87482291C>T
variant not included in analysis: chr9:g.87482291C>A
variant not included in analysis: chr9:g.87482300T>C
variant not included in analysis: chr9:g.87482282G>C
variant not included in analysis: chr9:g.87482304C>G
variant not included in analysis: chr2:g.178095803C>T
variant not included in analysis: chr2:g.178095784T>G
variant not included in analysis: chr2:g.178095900A>G
variant not included in analysis: chr2:g.178095866G>T


In [15]:
dfx = dfx.drop(columns=['targetAtomName','target_x','target_y','target_z'])
dfx.drop_duplicates(inplace=True)

### Merge with PDB coverage data and rank order

In [16]:
cov = pd.read_csv(coverage_output)
cov.head(2)

Unnamed: 0,PDB_CHAIN_ID,SP_PRIMARY,GENE,coverage
0,10GS.A,P09211,GSTP1,0.990476
1,10GS.B,P09211,GSTP1,0.990476


In [17]:
# example
a = '5NLB.A'
b = 'Q14145'
c = 'KEAP1'
cov[(cov.PDB_CHAIN_ID==a)]

Unnamed: 0,PDB_CHAIN_ID,SP_PRIMARY,GENE,coverage
75271,5NLB.A,Q14145,KEAP1,0.246795


In [18]:
dfx = dfx.merge(cov, right_on=['PDB_CHAIN_ID','SP_PRIMARY'],left_on=['structureChainId','uniprotId'])

In [19]:
dfx = dfx.sort_values(by=['Hugo_Symbol','coverage'],ascending=False)
dfx.head()

Unnamed: 0,Hugo_Symbol,Protein_Change,Tumor_Sample_Barcode,variationId,uniprotId,structureChainId,targetGroupId,targetChainId,targetGroupNumber,cluster_x,cluster_y,cluster_z,atom_nom,PDB_CHAIN_ID,SP_PRIMARY,GENE,coverage
6447,UCHL1,p.P43S,HCT15_LARGE_INTESTINE,chr4:g.41259707C>T,P09936,3KW5.A,PRO,A,43,36.817001,56.400002,95.001999,CG,3KW5.A,P09936,UCHL1,1.0
6448,UCHL1,p.V200V,LP1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,chr4:g.41270018C>A,P09936,3KW5.A,VAL,A,200,43.471001,63.093002,75.249001,CG2_CG1,3KW5.A,P09936,UCHL1,1.0
6449,UCHL1,p.E203*,HCC2998_LARGE_INTESTINE,chr4:g.41270025G>T,P09936,3KW5.A,GLU,A,203,41.978001,65.665497,71.873497,OE2_OE1,3KW5.A,P09936,UCHL1,1.0
6450,UCHL1,p.K83K,SNU1040_LARGE_INTESTINE,chr4:g.41262738G>A,P09936,3KW5.A,LYS,A,83,55.077,48.330002,66.055,NZ,3KW5.A,P09936,UCHL1,1.0
6451,UCHL1,p.S89F,MCC26_SKIN,chr4:g.41262755C>T,P09936,3KW5.A,SER,A,89,45.209,41.261002,72.905998,OG,3KW5.A,P09936,UCHL1,1.0


In [20]:
dfx.to_csv(output_file_name_2)

### Compute size of the set of resolved variants per PDB chain

In [21]:
# for the form: p.A69P, sorts by digit

import re
# def sorted_nicely( l ):
#     """ Sorts the given iterable in the way that is expected.

#     Required arguments:
#     l -- The iterable to be sorted.

#     """
#     convert = lambda text: int(text) if text.isdigit() else text
#     alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)][1]
#     return sorted(l, key = alphanum_key)

def extract_numeric( l ):
    regex = re.compile(r'(\d+|\s+)')
    ll = []
    for k in l:
        ll.append(regex.split(k)[1])
    return ll

In [22]:
PDB_ranks = []

for i in dfx.Hugo_Symbol.unique():
    
    for j in dfx[dfx.Hugo_Symbol==i].structureChainId.unique():
        
        tt = dfx[(dfx.Hugo_Symbol==i)&(dfx.structureChainId==j)][['Hugo_Symbol','Protein_Change','structureChainId','targetGroupNumber','coverage','cluster_x','cluster_y','cluster_z']].drop_duplicates()
        
        name = i+'_'+j
        # sort mut_list
        
        prot_list = tt.Protein_Change.unique().tolist()
        mut_list = list(set(extract_numeric((tt.Protein_Change.unique().tolist()))))
        targetId_list = tt.targetGroupNumber.tolist()
        tot_mut_list = list(set(extract_numeric(dfx[dfx.Hugo_Symbol==i].Protein_Change.unique().tolist())))
        
        
        mut_cov = np.true_divide(len(mut_list),len(tot_mut_list))
        nope = np.sort(list(set(extract_numeric([k for k in dfx[dfx.Hugo_Symbol==i].Protein_Change.unique() if k not in prot_list])))).tolist()
        
        
        PDB_ranks.append({'Hugo_Symbol':tt.Hugo_Symbol.unique()[0],'ID': name,'targetId_list':targetId_list, 'structureChainId':tt.structureChainId.unique()[0] ,'var_in_PDB':mut_list,'var_not_in_PDB':nope,'tot_num_variant_pos':len(tot_mut_list),'tot_var_list':tot_mut_list,'mutation_coverage':mut_cov,'coverage':tt.coverage.unique()[0]})

DF_PDB_ranks = pd.DataFrame(PDB_ranks)

DF_PDB_ranks = DF_PDB_ranks.sort_values(['Hugo_Symbol','tot_num_variant_pos','coverage'], ascending=False)



DF_PDB_ranks.head(2)        

Unnamed: 0,Hugo_Symbol,ID,coverage,mutation_coverage,structureChainId,targetId_list,tot_num_variant_pos,tot_var_list,var_in_PDB,var_not_in_PDB
0,UCHL1,UCHL1_3KW5.A,1.0,1.0,3KW5.A,"[43, 200, 203, 83, 89, 100, 207, 213, 39, 173]",10,"[100, 213, 83, 203, 43, 207, 39, 89, 173, 200]","[100, 213, 83, 203, 43, 207, 39, 89, 173, 200]",[]
1,UCHL1,UCHL1_2ETL.B,1.0,1.0,2ETL.B,"[43, 200, 203, 83, 89, 100, 207, 213, 39, 173]",10,"[100, 213, 83, 203, 43, 207, 39, 89, 173, 200]","[100, 213, 83, 203, 43, 207, 39, 89, 173, 200]",[]


In [23]:
DF_PDB_ranks.to_csv(output_file_name_3)

## ** To Do : write algorithm that finds a set of PDBs that maximize coverage over all mutations

example: for Keap1, i maximize protein coverage, mutation coverage and look for unique subsets of mutations covered by these files

in addition, alert for files containing other useful information, such as ligand, ppi, etc. 

In [24]:
NFE2L2_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='NFE2L2'].structureChainId.unique().tolist()

In [25]:
AKR1C4_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='AKR1C4'].structureChainId.unique().tolist()[0:1]

In [26]:
AKR1B10_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='AKR1B10'].structureChainId.unique().tolist()[0:1]

In [27]:
KEAP1_structs = ['4L7B.B','4CXT.A']

In [28]:
CUL3_structs = ['4EOZ.B','4APF.B']

In [29]:
UCHL1_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='UCHL1'].structureChainId.unique().tolist()[0:1]

In [30]:
TXNRD1_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='TXNRD1'].structureChainId.unique().tolist()[0:1]

In [31]:
SRXN1_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='SRXN1'].structureChainId.unique().tolist()[0:1]

In [32]:
SIRT1_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='SIRT1'].structureChainId.unique().tolist()[0:1]

In [33]:
RAB6B_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='RAB6B'].structureChainId.unique().tolist()[0:1]

In [34]:
NTRK2_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='NTRK2'].structureChainId.unique().tolist()[0:2]

In [35]:
NQO1_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='NQO1'].structureChainId.unique().tolist()[0:1]

In [36]:
GSTM3_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='GSTM3'].structureChainId.unique().tolist()[0:1]

In [37]:
G6PD_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='G6PD'].structureChainId.unique().tolist()[0:1]

In [38]:
NQO1_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='NQO1'].structureChainId.unique().tolist()[0:1]

In [39]:
AKR1C3_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='AKR1C3'].structureChainId.unique().tolist()[0:1]

In [40]:
AKR1C2_structs = DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='AKR1C2'].structureChainId.unique().tolist()[0:1]

In [41]:
list_all = KEAP1_structs+NFE2L2_structs+CUL3_structs+AKR1C4_structs+AKR1B10_structs+UCHL1_structs+TXNRD1_structs+SRXN1_structs+SIRT1_structs+RAB6B_structs+NTRK2_structs+NQO1_structs+GSTM3_structs+G6PD_structs+NQO1_structs+AKR1C3_structs+AKR1C2_structs

In [42]:
tot_variants = []
pdb_variants = []

for i in KEAP1_structs:
    pdb_variants.extend(DF_PDB_ranks[(DF_PDB_ranks.Hugo_Symbol=='KEAP1')&(DF_PDB_ranks.structureChainId==i)].var_in_PDB.values[0])
    
pdb_variants = list(set(pdb_variants))

print("Number of KEAP1 variants captured across structures:", np.true_divide(len(set(pdb_variants)),DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='KEAP1'].tot_num_variant_pos.unique()[0]))    
    
tot_variants = []
pdb_variants = []
    
for i in NFE2L2_structs:
    pdb_variants.extend(DF_PDB_ranks[(DF_PDB_ranks.Hugo_Symbol=='NFE2L2')&(DF_PDB_ranks.structureChainId==i)].var_in_PDB.values[0])
    
pdb_variants = list(set(pdb_variants))

print("Number of NRF2 variants captured across structures:", np.true_divide(len(set(pdb_variants)),DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='NFE2L2'].tot_num_variant_pos.unique()[0]))    

tot_variants = []
pdb_variants = []
    
for i in CUL3_structs:
    pdb_variants.extend(DF_PDB_ranks[(DF_PDB_ranks.Hugo_Symbol=='CUL3')&(DF_PDB_ranks.structureChainId==i)].var_in_PDB.values[0])
    
pdb_variants = list(set(pdb_variants))

print("Number of CUL3 variants captured across structures:", np.true_divide(len(set(pdb_variants)),DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='CUL3'].tot_num_variant_pos.unique()[0]))    

tot_variants = []
pdb_variants = []

for i in AKR1C4_structs:
    pdb_variants.extend(DF_PDB_ranks[(DF_PDB_ranks.Hugo_Symbol=='AKR1C4')&(DF_PDB_ranks.structureChainId==i)].var_in_PDB.values[0])
    
pdb_variants = list(set(pdb_variants))

print("Number of AKR1C4 variants captured across structures:", np.true_divide(len(set(pdb_variants)),DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='AKR1C4'].tot_num_variant_pos.unique()[0]))    


tot_variants = []
pdb_variants = []

for i in AKR1B10_structs:
    pdb_variants.extend(DF_PDB_ranks[(DF_PDB_ranks.Hugo_Symbol=='AKR1B10')&(DF_PDB_ranks.structureChainId==i)].var_in_PDB.values[0])
    
pdb_variants = list(set(pdb_variants))

print("Number of AKR1B10 variants captured across structures:", np.true_divide(len(set(pdb_variants)),DF_PDB_ranks[DF_PDB_ranks.Hugo_Symbol=='AKR1B10'].tot_num_variant_pos.unique()[0]))    
    

Number of KEAP1 variants captured across structures: 0.9420289855072463
Number of NRF2 variants captured across structures: 1.0
Number of CUL3 variants captured across structures: 1.0
Number of AKR1C4 variants captured across structures: 1.0
Number of AKR1B10 variants captured across structures: 1.0


In [43]:
mutations_with_ranked_structures = DF_PDB_ranks[DF_PDB_ranks.structureChainId.isin(list_all)][['Hugo_Symbol','structureChainId','targetId_list']]
mutations_with_ranked_structures

Unnamed: 0,Hugo_Symbol,structureChainId,targetId_list
0,UCHL1,3KW5.A,"[43, 200, 203, 83, 89, 100, 207, 213, 39, 173]"
11,TXNRD1,3QFB.A,"[66, 414, 40, 82, 181, 366, 366, 277, 387, 128..."
39,SRXN1,2RII.X,"[101, 51, 49, 53, 82, 102]"
46,SIRT1,5BTR.A,"[395, 207, 468, 393, 318, 383, 412, 379, 432, ..."
59,RAB6B,2FFQ.A,"[90, 150, 163, 111, 62, 39, 154, 140, 58, 20, ..."
67,NTRK2,4AT5.A,"[805, 810, 628, 661, 700, 725, 624, 622, 640, ..."
68,NTRK2,1WWB.X,"[321, 308, 312, 334, 314, 314, 292, 300]"
72,NQO1,5EA2.A,"[219, 219, 85, 193, 78, 134, 171, 199, 200, 272]"
139,NFE2L2,4IFL.P,"[77, 77, 79, 82, 81, 80, 80, 69, 73]"
140,NFE2L2,2FLU.P,"[77, 77, 79, 82, 81, 80, 80, 69, 73]"


## TO DO: make a final df with hugo, structurechainid, residueID, X, y, Z

In [44]:
mutations_with_ranked_structures_full = dfx[dfx.structureChainId.isin(list_all)]
mutations_with_ranked_structures_full.head()

Unnamed: 0,Hugo_Symbol,Protein_Change,Tumor_Sample_Barcode,variationId,uniprotId,structureChainId,targetGroupId,targetChainId,targetGroupNumber,cluster_x,cluster_y,cluster_z,atom_nom,PDB_CHAIN_ID,SP_PRIMARY,GENE,coverage
6447,UCHL1,p.P43S,HCT15_LARGE_INTESTINE,chr4:g.41259707C>T,P09936,3KW5.A,PRO,A,43,36.817001,56.400002,95.001999,CG,3KW5.A,P09936,UCHL1,1.0
6448,UCHL1,p.V200V,LP1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,chr4:g.41270018C>A,P09936,3KW5.A,VAL,A,200,43.471001,63.093002,75.249001,CG2_CG1,3KW5.A,P09936,UCHL1,1.0
6449,UCHL1,p.E203*,HCC2998_LARGE_INTESTINE,chr4:g.41270025G>T,P09936,3KW5.A,GLU,A,203,41.978001,65.665497,71.873497,OE2_OE1,3KW5.A,P09936,UCHL1,1.0
6450,UCHL1,p.K83K,SNU1040_LARGE_INTESTINE,chr4:g.41262738G>A,P09936,3KW5.A,LYS,A,83,55.077,48.330002,66.055,NZ,3KW5.A,P09936,UCHL1,1.0
6451,UCHL1,p.S89F,MCC26_SKIN,chr4:g.41262755C>T,P09936,3KW5.A,SER,A,89,45.209,41.261002,72.905998,OG,3KW5.A,P09936,UCHL1,1.0


In [45]:
mutations_with_ranked_structures_full.to_csv(output_file_name)

In [47]:
gene = 'TXNRD1'
struct = '3QFB.A'
mutations_with_ranked_structures_full[(mutations_with_ranked_structures_full.Hugo_Symbol==gene)&(mutations_with_ranked_structures_full.structureChainId==struct)&(mutations_with_ranked_structures_full.Protein_Change=='p.A122V')]

Unnamed: 0,Hugo_Symbol,Protein_Change,Tumor_Sample_Barcode,variationId,uniprotId,structureChainId,targetGroupId,targetChainId,targetGroupNumber,cluster_x,cluster_y,cluster_z,atom_nom,PDB_CHAIN_ID,SP_PRIMARY,GENE,coverage
3758,TXNRD1,p.A122V,HUH1_LIVER,chr12:g.104713303C>T,Q16881,3QFB.A,ALA,A,160,58.463001,-93.374001,43.442001,CB,3QFB.A,Q16881,TXNRD1,0.762712
