## Compute Interaction Distances between variants and Ligands, PTMs, Drugs
Creates a dataframe of all available interaction data using PDB, MMTF-pySpark, UniProt, and databases like dbPTM

In [1]:
# Numerics, 3D structure viewer, widgets and tabular data handling.
import numpy as np
import py3Dmol
from ipywidgets import interact, IntSlider
import pandas as pd
# Display wide frames without wrapping and show up to 500 columns.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
from matplotlib import interactive
# Turn on matplotlib's interactive mode.
interactive(True)
import seaborn as sns
sns.set(style="whitegrid")

import ast
# Pairwise distances, used further down for the variant-to-PTM atom distances.
from sklearn.metrics.pairwise import euclidean_distances

from IPython.display import display, HTML

# Inject CSS that sets the widths of the notebook, menubar and toolbar containers.
display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

In [2]:
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter

from mmtfPyspark.datasets import pdbToUniProt
from mmtfPyspark.utils import traverseStructureHierarchy, ColumnarStructure
from mmtfPyspark.datasets import drugBankDataset, pdbjMineDataset


In [3]:
# Executes ExcludedLigandSets.py inside this notebook's namespace. The ligand-interaction cell
# further down reads JUNK, which is presumably defined by this script (TODO confirm).
# NOTE(review): this is an absolute, machine-specific path; the cell fails on any other checkout.
%run /home/ebc/variome/GIT/mmtf-pyspark/mmtfPyspark/interactions/ExcludedLigandSets.py

In [4]:
# Start (or reuse) the Spark session on a local master ("local[4]") for this notebook
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("2C-MapLigandInteractions")
    .getOrCreate()
)

#### input parameters

In [5]:
# Directories used by this step for plots and intermediate dataframes
path_plots = '../analysis/NRF2_pathway/plots/step2/'
path_df = '../analysis/NRF2_pathway/dataframes/step2/'

# Input: mutations and PDB structures
input_file_name = '../analysis/NRF2_pathway/dataframes/step2/mutations_NRF2v2_step2B.csv'

# Output: data on interaction distances
output_file_name = '../analysis/NRF2_pathway/dataframes/step2/mutations_NRF2v2_step2C.csv'

In [6]:
# Read the variant table; the first CSV column is loaded as the index and then
# moved back into a regular column so the frame carries a fresh 0..n-1 index.
df = pd.read_csv(input_file_name, index_col=0).reset_index()
df.head(2)


Unnamed: 0,index,Hugo_Symbol,Protein_Change,Tumor_Sample_Barcode,variationId,uniprotId,structureChainId,targetGroupId,targetChainId,targetGroupNumber,cluster_x,cluster_y,cluster_z,atom_nom,PDB_CHAIN_ID,SP_PRIMARY,GENE,coverage
0,6447,UCHL1,p.P43S,HCT15_LARGE_INTESTINE,chr4:g.41259707C>T,P09936,3KW5.A,PRO,A,43,36.817001,56.400002,95.001999,CG,3KW5.A,P09936,UCHL1,1.0
1,6448,UCHL1,p.V200V,LP1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,chr4:g.41270018C>A,P09936,3KW5.A,VAL,A,200,43.471001,63.093002,75.249001,CG2_CG1,3KW5.A,P09936,UCHL1,1.0


## Compute distance of variants to nearest PTM site

In [7]:
# read in PTM data
def _load_dbptm(path, uniprot_ids):
    """Read one headerless, tab-separated dbPTM table and keep the rows of interest.

    Parameters
    ----------
    path : str
        Location of the dbPTM text file.
    uniprot_ids : list
        UniProt accessions to keep (matched against the file's second column).

    Returns
    -------
    pandas.DataFrame
        Filtered rows with columns gene, uniprotId, residue, type, ?, seq.
    """
    table = pd.read_table(path, header=None)
    table = table[table[1].isin(uniprot_ids)]
    table.columns = ['gene','uniprotId','residue','type','?','seq']
    return table


_variant_uniprot_ids = df.uniprotId.unique().tolist()

ptm_data = _load_dbptm('../data/dbPTM/Phosphorylation.txt', _variant_uniprot_ids)
ubq_data = _load_dbptm('../data/dbPTM/Ubiquitination.txt', _variant_uniprot_ids)

# Both tables carry their own file row numbers as index labels; renumber on concat so
# every PTM row has a unique label (label-based lookups on duplicates return Series).
ptm_data = pd.concat([ptm_data,ubq_data], ignore_index=True)
print("Total number of PTMs for gene set:",len(ptm_data))

Total number of PTMs for gene set: 326


### Get X Y Z coordinates of PTM residue per PDB 

In [8]:
# Residue-level UniProt <-> PDB mapping, keeping only residues numbered in both systems
up = (
    pdbToUniProt.get_cached_residue_mappings()
    .filter("pdbResNum IS NOT NULL")
    .filter("uniprotNum IS NOT NULL")
)
# Restrict to the proteins that have PTM records and collect into pandas
up_map = up.filter(
    up.uniprotId.isin(ptm_data.uniprotId.tolist())
).toPandas()


In [9]:
# Steps:
# 1) Select PTM residues that have PDB data available
# 2) Merge PTM info with UniProt and PDB info

# Per-PTM mapping frames are collected here and concatenated once at the end
# (growing a DataFrame with concat inside the loop copies it on every iteration).
_ptm_frames = []

for u in ptm_data.uniprotId.unique():
    df1 = ptm_data[ptm_data.uniprotId==u]
    df2 = up_map[up_map.uniprotId==u]

    # Iterate over row values rather than index labels so duplicated labels in
    # ptm_data cannot turn a scalar lookup into a Series.
    for ures, change, seq in zip(df1['residue'], df1['type'], df1['seq']):

        # PDB residues mapped to this UniProt position; reset_index() returns a new
        # frame (with the old labels kept in an 'index' column) instead of mutating a slice.
        tmp = df2[df2.uniprotNum==ures].reset_index()

        if len(tmp) == 0:
            continue  # PTM site not covered by any PDB structure

        # insert() broadcasts the scalar to every row of the frame
        tmp.insert(0, 'PtmType', change)
        tmp.insert(0, 'seq', seq)
        _ptm_frames.append(tmp)

ptm_residues = pd.concat(_ptm_frames) if _ptm_frames else pd.DataFrame()



In [10]:
# 3) Routine to retrieve the XYZ coords and other information from PDB

def get_PTM_pdb_data(u,p):
    """Collect atom coordinates of PTM residues for one PDB entry.

    Downloads the full MMTF structure for the entry, walks its groups (first model only)
    and keeps every group whose residue number is listed for this entry in the
    module-level ``ptm_residues`` table. The kept groups are then inner-merged with
    ``ptm_residues`` on uniprotId, pdbResNum and structureChainId.

    Parameters
    ----------
    u : str
        UniProt accession written into the ``uniprotId`` column of the collected groups.
    p : str
        PDB entry ID, optionally followed by ``.<chain>``; the chain suffix is dropped,
        so all chains of the entry are scanned.

    Returns
    -------
    pandas.DataFrame
        One row per matched PTM residue with its atom index range and x/y/z coordinate
        lists, plus the columns of ``ptm_residues``. Empty when no group matches.
    """

    if '.' in p:
        p = p.split('.')[0]

    # PTM rows of this entry and their residue numbers; independent of the group loop,
    # so computed once instead of once per group.
    ptm_in_entry = ptm_residues[ptm_residues.structureChainId.str.startswith(p)]
    ptm_res_nums = set(ptm_in_entry.pdbResNum.unique().tolist())

    pdb_data = []

    #gets PDB file and columnar structure attributes
    pdb = mmtfReader.download_full_mmtf_files([p])
    structure = pdb.values().first()
    arrays = ColumnarStructure(structure, firstModelOnly=True)
    x = arrays.get_x_coords()
    y = arrays.get_y_coords()
    z = arrays.get_z_coords()
    group_names = arrays.get_group_names()
    group_numbers = arrays.get_group_numbers()
    chain_names = arrays.get_chain_names()
    group_indices = arrays.get_group_to_atom_indices()

    for i in range(arrays.get_num_groups()):
        # atoms of group i occupy [start, end) in the per-atom arrays
        start = group_indices[i]
        end = group_indices[i+1]

        # match residue number in PDB with PTM residue
        if group_numbers[start] in ptm_res_nums:
            pdb_data.append({'uniprotId':u,'pdbResName':group_names[start],'structureChainId':p+'.'+chain_names[start],'pdbResNum':str(group_numbers[start]),'atom_idx_start':int(start),'atom_idx_end':int(end),'x_list':x[start:end],'y_list':y[start:end],'z_list':z[start:end]})

    if not pdb_data:
        # A frame built from an empty list has no columns, so the merge below would
        # raise on the missing key columns; report "nothing found" instead.
        return pd.DataFrame()

    tmp = pd.DataFrame(pdb_data)
    ptm_pdb_data = pd.merge(tmp,ptm_in_entry, on=['uniprotId','pdbResNum','structureChainId'])

    return (ptm_pdb_data)

In [11]:
# Coordinates of every PTM residue, gathered per protein and per PDB entry.
_ptm_pdb_frames = []

for u in ptm_residues.uniprotId.unique():
    chain_ids = ptm_residues[ptm_residues.uniprotId==u].structureChainId

    # get_PTM_pdb_data strips the chain suffix and returns the PTM residues of all chains
    # of the entry, so calling it once per chain would download the same structure
    # repeatedly and append the same rows several times. Call it once per entry instead.
    for pdb_id in chain_ids.map(lambda s: s.split('.')[0]).unique():
        _ptm_pdb_frames.append(get_PTM_pdb_data(u, pdb_id))

# Single concat at the end instead of growing the frame inside the loop
DF_PTM_PDB = pd.concat(_ptm_pdb_frames) if _ptm_pdb_frames else pd.DataFrame()
DF_PTM_PDB = DF_PTM_PDB.reset_index()
DF_PTM_PDB.head(2)

Unnamed: 0,level_0,atom_idx_end,atom_idx_start,pdbResName,pdbResNum,structureChainId,uniprotId,x_list,y_list,z_list,seq,PtmType,index,pdbSeqNum,uniprotNum
0,0,203,197,SER,23,4ICC.X,O60218,"[-1.774, -1.918, -0.636, -0.295, -3.129, -4.306]","[35.316, 36.393, 36.513, 35.602, 36.143, 36.179]","[2.715, 3.699, 4.548, 5.303, 4.586, 3.79]",MPIVGLGTWKSPLGKVKEAVK,Phosphorylation,7640,23,23
1,1,331,319,TYR,40,4ICC.X,O60218,"[-17.501, -17.076, -16.955, -16.439, -15.693, ...","[37.745, 36.458, 35.448, 35.772, 36.545, 37.25...","[10.468, 9.909, 11.044, 12.119, 9.275, 7.948, ...",EAVKVAIDAGYRHIDCAYVYQ,Phosphorylation,7657,40,40


In [12]:
# Write the PTM residue/coordinate table into the step-2 dataframe directory (path_df).
# NOTE(review): x_list / y_list / z_list hold one coordinate sequence per row, so they are
# stored as text in the CSV and presumably need parsing when read back (TODO confirm).
DF_PTM_PDB.to_csv(path_df+'PTM_PDB_data.csv')

### Compute nearest-neighbor distances from variants to nearby PTM sites

In [13]:
## Note: may need to consider biological assemblies - use mmtf interaction tools

# For every variant, find the PTM residue (same UniProt accession, same PDB chain) whose
# closest atom is nearest to the variant's cluster_x/y/z point. Variants without any PTM
# on their chain get empty strings and the sentinel distance 1000.

ptm_res = []
ptm_group = []
ptm_desc = []
ptm_dist = []

# accessions already reported as lacking PTM data, so each is printed only once
_reported_without_ptm = set()

for i in df.index:

    # variant position
    X = [df.loc[i,'cluster_x'], df.loc[i,'cluster_y'], df.loc[i,'cluster_z']]

    p = df.loc[i,'structureChainId']
    u = df.loc[i,'uniprotId']

    tmp_ptm_df = DF_PTM_PDB[(DF_PTM_PDB['uniprotId']==u)&(DF_PTM_PDB['structureChainId']==p)]

    # (resName, resNum, PTM type, distance) of the closest PTM seen so far; None until one is found
    nearest_neighbor = None

    # loop over PTMs for that specific PDB/protein
    for j in tmp_ptm_df.index:
        # recombine the per-axis lists into one (n_atoms, 3) coordinate array
        Y = np.column_stack((tmp_ptm_df.loc[j,'x_list'],
                             tmp_ptm_df.loc[j,'y_list'],
                             tmp_ptm_df.loc[j,'z_list']))

        # distance from the variant to the closest atom of this PTM residue
        min_dist = np.min(euclidean_distances([X], Y))

        # strict '<' keeps the first PTM encountered when two are equally close
        if nearest_neighbor is None or min_dist < nearest_neighbor[3]:
            nearest_neighbor = (tmp_ptm_df.loc[j,'pdbResName'],
                                tmp_ptm_df.loc[j,'pdbResNum'],
                                tmp_ptm_df.loc[j,'PtmType'],
                                min_dist)

    if nearest_neighbor is None:
        # Explicit "no PTM on this chain" branch (previously detected through an
        # IndexError raised by indexing an empty string).
        if u not in _reported_without_ptm:
            print("No PTM data for gene %s"%u)
            _reported_without_ptm.add(u)
        nearest_neighbor = ('', '', '', 1000)

    ptm_res.append(nearest_neighbor[0])
    ptm_group.append(nearest_neighbor[1])
    ptm_desc.append(nearest_neighbor[2])
    ptm_dist.append(nearest_neighbor[3])


df['PTM_dist'] = ptm_dist
df['PTM_type'] = ptm_desc
df['PTM_resName'] = ptm_res
df['PTM_resNum'] = ptm_group
           

No PTM data for gene Q16620
No PTM data for gene Q16620
No PTM data for gene Q16620
No PTM data for gene Q16620
No PTM data for gene Q16620
No PTM data for gene Q16620
No PTM data for gene Q16620
No PTM data for gene Q16620
No PTM data for gene Q16620
No PTM data for gene Q16620
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene Q16236
No PTM data for gene

In [14]:
# Variants whose nearest PTM atom is closer than 5 (in the units of PTM_dist); first 10 distinct rows
df.loc[
    df.PTM_dist < 5,
    ['Hugo_Symbol','Tumor_Sample_Barcode','Protein_Change','PTM_dist','PTM_resNum'],
].drop_duplicates().head(10)

Unnamed: 0,Hugo_Symbol,Tumor_Sample_Barcode,Protein_Change,PTM_dist,PTM_resNum
3,UCHL1,SNU1040_LARGE_INTESTINE,p.K83K,0.0,83
4,UCHL1,MCC26_SKIN,p.S89F,0.0,89
11,TXNRD1,VMCUB1_URINARY_TRACT,p.P28A,3.232057,67
13,TXNRD1,KYAE1_OESOPHAGUS,p.V2V,4.710373,127
20,TXNRD1,MDAMB361_BREAST,p.E90Q,4.430407,152
29,TXNRD1,S117_SOFT_TISSUE,p.K112K,4.165546,152
30,TXNRD1,CAL33_UPPER_AERODIGESTIVE_TRACT,p.K57K,0.0,95
31,TXNRD1,NCIH684_LIVER,p.I168T,4.105054,157
33,TXNRD1,DV90_LUNG,p.A116T,2.940409,13
41,TXNRD1,T173_FIBROBLAST,p.L229L,4.766182,284


## Compute Distance of Variants to Ligand Binding Site

Note: we will have to consider PDB-to-PDB mapping, or selecting PDB files that contain ligands beforehand, in step 3.

Note 2: it would be nice to extend this to consider interactions with multiple ligands.

In [15]:
# Several chains in df can belong to the same PDB entry; request each entry only once.
# dict.fromkeys drops the repeats while keeping first-seen order.
pdb_ids = list(dict.fromkeys(i.split('.')[0] for i in df.structureChainId.unique()))
structures = mmtfReader.download_mmtf_files(pdb_ids)
structures = structures.filter(lambda s: s[1].num_models == 1)  ## Currently, only structures with 1 model are supported

In [16]:
# Maximum ligand-polymer distance passed to the interaction filter
distance_cutoff = 17

# Ligand groups ignored as interaction partners: water, heavy water, acetate and the JUNK set
exclude = ["HOH", "DOD","ACT"] + list(JUNK)

interaction_filter = InteractionFilter(distanceCutoff=distance_cutoff)
interaction_filter.set_query_groups(False, exclude)  # exclude junk and water

# Atom-level ligand-polymer interactions for all downloaded structures, collected into pandas
Lig_interactions = InteractionExtractor().get_ligand_polymer_interactions(structures, interaction_filter, level='atom').toPandas()

# targetGroupNumber is converted to int so it can be compared with df.targetGroupNumber.
# A plain astype(int) raises on any value that is not a pure integer (presumably residue
# numbers with an insertion code such as "52A" -- TODO confirm). Such rows cannot match an
# integer residue number anyway, so they are dropped before the conversion.
_target_numbers = pd.to_numeric(Lig_interactions['targetGroupNumber'], errors='coerce')
_is_numeric = _target_numbers.notnull()
Lig_interactions = Lig_interactions[_is_numeric].copy()
Lig_interactions['targetGroupNumber'] = _target_numbers[_is_numeric].astype(int)

# Lig_interactions.to_csv(path_df+'ligand_info.csv') # caution large file size > 100mb

In [17]:
# For every variant residue (PDB chain + residue number) look up the closest ligand atom
# among the extracted ligand-polymer interactions. Variants without any interaction row get
# the sentinel distance 1000 and empty strings.
#
# TO DO: opt for additional ligands, not just nearest neighboring one

_lig_keys = ['structureChainId', 'targetGroupNumber']

# Closest interaction row per (chain, residue): one stable sort followed by keeping the
# first row of each key, instead of filtering and sorting the whole table for every variant.
_nearest_lig = (
    Lig_interactions
    .sort_values('distance', ascending=True, kind='mergesort')
    .drop_duplicates(subset=_lig_keys, keep='first')
    [_lig_keys + ['queryGroupId', 'distance', 'queryGroupNumber']]
)

# The left merge keeps one output row per variant, in df's row order (right keys are unique)
_lig_hits = df[_lig_keys].merge(_nearest_lig, on=_lig_keys, how='left', indicator=True)
assert len(_lig_hits) == len(df), "nearest-ligand lookup must yield one row per variant"

# True where the variant residue has at least one ligand interaction
_has_lig = (_lig_hits['_merge'] == 'both').values

# .values strips the merge result's index so assignment is positional
df['Ligand_dist'] = _lig_hits['distance'].where(_has_lig, 1000).values
df['Ligand_resName'] = _lig_hits['queryGroupId'].where(_has_lig, '').values
df['Ligand_resNum'] = _lig_hits['queryGroupNumber'].where(_has_lig, '').values
           

In [18]:
# Variants with a ligand atom closer than 5 (in the units of Ligand_dist); first 12 distinct rows
df.loc[
    df.Ligand_dist < 5,
    ['Hugo_Symbol','Tumor_Sample_Barcode','Protein_Change','Ligand_dist','Ligand_resName'],
].drop_duplicates().head(12)

Unnamed: 0,Hugo_Symbol,Tumor_Sample_Barcode,Protein_Change,Ligand_dist,Ligand_resName
4,UCHL1,MCC26_SKIN,p.S89F,4.458467,GVE
27,TXNRD1,GRST_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,p.T335T,4.987658,FAD
28,TXNRD1,MOLT16_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,p.T305T,3.126904,FAD
31,TXNRD1,NCIH684_LIVER,p.I168T,3.739374,FAD
34,TXNRD1,HUH1_LIVER,p.A122V,3.230645,FAD
52,TXNRD1,HEC1_ENDOMETRIUM,p.H434R,2.573166,GOL
54,TXNRD1,AGS_STOMACH,p.N331D,2.349676,GOL
59,TXNRD1,SNU1040_LARGE_INTESTINE,p.Y93Y,3.183022,FAD
60,SRXN1,HCC366_LUNG,p.R101H,2.530788,PO4
67,SRXN1,SNU349_KIDNEY,p.Y102Y,4.789872,PO4


## Compute Distances of Variants to Bound Drugs

### Get InChIKey for ligands in PDB with molecular weight >= 250

In [19]:
mw_min = 250  # minimum molecular weight for drug molecules

# SQL passed to pdbjMineDataset.get_dataset: selects, per chemical component, its id
# (as ligand_id), formula weight and InChIKey descriptor (as inchi_key), keeping only
# components whose formula weight is at least mw_min.
# The backslash continuations sit inside the string literal, so the leading spaces of the
# continued lines are part of the query text.
ccQuery = "SELECT c.id as ligand_id, c.formula_weight, d.descriptor as inchi_key FROM pdbj.chem_comp c \
           JOIN cc.pdbx_chem_comp_descriptor d ON d.comp_id = c.id \
           WHERE d.type = 'InChIKey' AND c.formula_weight >= " + str(mw_min)

# Run the query, drop duplicate rows and collect the result as a pandas DataFrame.
ligands = pdbjMineDataset.get_dataset(ccQuery).dropDuplicates().toPandas()
ligands.head()

Unnamed: 0,ligand_id,formula_weight,inchi_key
0,PQQ,330.206,MMXZSJMASHPLLR-UHFFFAOYSA-N
1,RKS,540.563,WXZHDJXCVBLPHY-MOPGFXCFSA-N
2,GR6,630.749,ZQJBDIZBNGQJDT-JZAADHNPSA-N
3,B83,444.522,YMZMFFRGSWCLPY-KDURUIRLSA-N
4,3P1,423.425,NAHSCHKAPXMNFP-NVQRDWNXSA-N


### Join dataset on ligand id to add InchiKeys

In [20]:
# Attach formula weight and InChIKey to each interaction row; interactions whose ligand is
# not in `ligands` are dropped by the inner join, then exact duplicate rows are removed.
mt = (
    Lig_interactions
    .merge(ligands, how='inner', left_on=['queryGroupId'], right_on=['ligand_id'])
    .drop_duplicates()
)

### Download open DrugBank dataset

We use the open DrugBank dataset. One disadvantage of the open DrugBank dataset is that it not only contains approved drugs, but many other compounds in pharmaceutical use such as ethanol, ATP, etc.

In [21]:
# Open DrugBank links, restricted to entries that have both a standard InChIKey and a CAS
# number, collected into pandas
drugs = (
    drugBankDataset.get_open_drug_links()
    .filter("StandardInChIKey IS NOT NULL")
    .filter("CAS IS NOT NULL")
    .toPandas()
)

The password-protected DrugBank datasets contain more information (e.g., approval status). To use these datasets, you need to create a free DrugBank account and supply a username/password to access them.

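The next cell loads the approved-drug subset of the password-protected dataset; its result replaces the open DrugBank table loaded above. Skip that cell to continue with the open dataset (the merge below then needs the open dataset's column names).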

In [22]:
import os
from getpass import getpass

# DrugBank credentials must not be stored in the notebook: read them from the environment
# (DRUGBANK_USERNAME / DRUGBANK_PASSWORD) and fall back to an interactive prompt.
username = os.environ.get("DRUGBANK_USERNAME") or input("DrugBank username: ")
password = os.environ.get("DRUGBANK_PASSWORD") or getpass("DrugBank password: ")

# Approved drugs from the password-protected dataset that have an InChIKey.
# Note: this rebinds `drugs`, replacing the open-dataset table loaded in the previous cell.
drugs = drugBankDataset.get_drug_links("APPROVED", username,password) \
                        .filter("InChIKey IS NOT NULL") \
                        .toPandas()

### Merge Drug Data

In [23]:
# Keep only interactions whose ligand InChIKey matches a DrugBank entry and append the
# DrugBank columns to those rows.
# NOTE(review): `mt` is rebound to its own merge result, so running this cell twice merges
# the DrugBank columns a second time.
mt = mt.merge(
    drugs,
    how='inner',
    left_on=['inchi_key'],
    right_on=['InChIKey'],
)
#mt.to_csv(path_df+'drug_info.csv')
mt.head(2)

Unnamed: 0,structureChainId,queryGroupId,queryChainId,queryGroupNumber,queryAtomName,targetGroupId,targetChainId,targetGroupNumber,targetAtomName,distance,sequenceIndex,sequence,ligand_id,formula_weight,inchi_key,DrugBankID,Name,CASNumber,DrugGroups,InChIKey,InChI,SMILES,Formula,KEGGCompoundID,KEGGDrugID,PubChemCompoundID,PubChemSubstanceID,ChEBIID,ChEMBLID,HETID,ChemSpiderID,BindingDBID
0,5EA2.E,FAD,G,301,C6,GLY,E,135,CA,15.728915,138,GPHMVGRRALIVLAHSERTSFNYAMKEAAAAALKKKGWEVVESDLY...,FAD,785.55,VWWQXMAJTJZDQX-UYBVJOGSSA-N,DB03147,Flavin adenine dinucleotide,146-14-5,approved,VWWQXMAJTJZDQX-UYBVJOGSSA-N,InChI=1S/C27H33N9O15P2/c1-10-3-12-13(4-11(10)2...,CC1=CC2=C(C=C1C)N(C[C@H](O)[C@H](O)[C@H](O)CO[...,C27H33N9O15P2,C00016,D00005,643975.0,46508543.0,16238.0,CHEMBL1232653,FAD,559059.0,
1,5EA2.G,FAD,E,301,O4B,PRO,G,68,CB,12.924247,71,GPHMVGRRALIVLAHSERTSFNYAMKEAAAAALKKKGWEVVESDLY...,FAD,785.55,VWWQXMAJTJZDQX-UYBVJOGSSA-N,DB03147,Flavin adenine dinucleotide,146-14-5,approved,VWWQXMAJTJZDQX-UYBVJOGSSA-N,InChI=1S/C27H33N9O15P2/c1-10-3-12-13(4-11(10)2...,CC1=CC2=C(C=C1C)N(C[C@H](O)[C@H](O)[C@H](O)CO[...,C27H33N9O15P2,C00016,D00005,643975.0,46508543.0,16238.0,CHEMBL1232653,FAD,559059.0,


In [24]:
# For every variant residue (PDB chain + residue number) look up the closest atom of a
# DrugBank-matched ligand in `mt` and copy its identifiers into df. Variants without any
# matching interaction get the sentinel distance 1000 and empty strings.
#
# TO DO: opt for additional ligands, not just nearest neighboring one

_drug_keys = ['structureChainId', 'targetGroupNumber']

# (column written to df, source column in mt, value used when the variant has no drug contact)
_drug_columns = [
    ('Drug_dist',       'distance',         1000),
    ('Drug_resName',    'queryGroupId',     ''),
    ('Drug_resNum',     'queryGroupNumber', ''),
    ('Drug_DrugBankId', 'DrugBankID',       ''),
    ('Drug_Name',       'Name',             ''),
    ('Drug_Inchi',      'InChIKey',         ''),
    ('Drug_CAS',        'CASNumber',        ''),
    ('Drug_ChEMBLID',   'ChEMBLID',         ''),
]

# Closest drug interaction per (chain, residue): one stable sort followed by keeping the
# first row of each key, instead of filtering and sorting `mt` for every variant.
_nearest_drug = (
    mt
    .sort_values('distance', ascending=True, kind='mergesort')
    .drop_duplicates(subset=_drug_keys, keep='first')
    [_drug_keys + [source for _target, source, _default in _drug_columns]]
)

# The left merge keeps one output row per variant, in df's row order (right keys are unique)
_drug_hits = df[_drug_keys].merge(_nearest_drug, on=_drug_keys, how='left', indicator=True)
assert len(_drug_hits) == len(df), "nearest-drug lookup must yield one row per variant"

# True where the variant residue has at least one drug interaction. Defaults are applied
# only to unmatched variants, so missing values inside a matched DrugBank row are kept as-is.
_has_drug = (_drug_hits['_merge'] == 'both').values

for _target, _source, _default in _drug_columns:
    # .values strips the merge result's index so assignment is positional
    df[_target] = _drug_hits[_source].where(_has_drug, _default).values

           

In [25]:
# Variants with a drug atom closer than 10 (in the units of Drug_dist); first 12 distinct rows
df.loc[
    df.Drug_dist < 10,
    ['Hugo_Symbol','Tumor_Sample_Barcode','Protein_Change','Drug_dist','Drug_resName','Drug_Name'],
].drop_duplicates().head(12)

Unnamed: 0,Hugo_Symbol,Tumor_Sample_Barcode,Protein_Change,Drug_dist,Drug_resName,Drug_Name
11,TXNRD1,VMCUB1_URINARY_TRACT,p.P28A,6.795431,FAD,Flavin adenine dinucleotide
13,TXNRD1,KYAE1_OESOPHAGUS,p.V2V,7.047333,FAD,Flavin adenine dinucleotide
15,TXNRD1,LS180_LARGE_INTESTINE,p.D143D,5.376522,FAD,Flavin adenine dinucleotide
20,TXNRD1,MDAMB361_BREAST,p.E90Q,6.518649,FAD,Flavin adenine dinucleotide
24,TXNRD1,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,p.G263D,7.827392,FAD,Flavin adenine dinucleotide
27,TXNRD1,GRST_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,p.T335T,4.987658,FAD,Flavin adenine dinucleotide
28,TXNRD1,MOLT16_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,p.T305T,3.126904,FAD,Flavin adenine dinucleotide
31,TXNRD1,NCIH684_LIVER,p.I168T,3.739374,FAD,Flavin adenine dinucleotide
34,TXNRD1,HUH1_LIVER,p.A122V,3.230645,FAD,Flavin adenine dinucleotide
36,TXNRD1,JHUEM7_ENDOMETRIUM,p.R46Q,8.949179,FAD,Flavin adenine dinucleotide


## Save Final DF

In [26]:
# Write the variant table, now extended with the PTM_*, Ligand_* and Drug_* columns,
# to output_file_name (the frame's index is written as the first CSV column).
df.to_csv(output_file_name)

In [27]:
# Shutdown Spark
# Cells below this point only use the pandas frame `df`.
spark.stop()

In [28]:
# Spot check: show the full annotated row(s) for one example variant
gene = 'TXNRD1'
struct = '3QFB.A'
is_example_variant = (
    (df.Hugo_Symbol == gene)
    & (df.structureChainId == struct)
    & (df.Protein_Change == 'p.A122V')
)
df[is_example_variant]

Unnamed: 0,index,Hugo_Symbol,Protein_Change,Tumor_Sample_Barcode,variationId,uniprotId,structureChainId,targetGroupId,targetChainId,targetGroupNumber,cluster_x,cluster_y,cluster_z,atom_nom,PDB_CHAIN_ID,SP_PRIMARY,GENE,coverage,PTM_dist,PTM_type,PTM_resName,PTM_resNum,Ligand_dist,Ligand_resName,Ligand_resNum,Drug_dist,Drug_resName,Drug_resNum,Drug_DrugBankId,Drug_Name,Drug_Inchi,Drug_CAS,Drug_ChEMBLID
34,3758,TXNRD1,p.A122V,HUH1_LIVER,chr12:g.104713303C>T,Q16881,3QFB.A,ALA,A,160,58.463001,-93.374001,43.442001,CB,3QFB.A,Q16881,TXNRD1,0.762712,8.518215,Ubiquitination,PHE,157,3.230645,FAD,600,3.230645,FAD,600,DB03147,Flavin adenine dinucleotide,VWWQXMAJTJZDQX-UYBVJOGSSA-N,146-14-5,CHEMBL1232653
