# Maps SARS-CoV-2 Mutations to 3D Protein Structures
[Work in progress]

This notebook maps the mutation frequency of SARS-CoV-2 strains onto 3D protein structures in the [Protein Data Bank](https://www.wwpdb.org/).

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import matplotlib.cm as cm
#import matplotlib
from py2neo import Graph
import ipywidgets as widgets
from ipywidgets import interact, IntSlider
import py3Dmol

In [2]:
# Lift the pandas display limits so that tables are rendered in full.
pd.set_option('display.max_rows', None)  # display all rows
pd.set_option('display.max_columns', None)  # display all columns

#### Connect to COVID-19-Community Knowledge Graph

In [3]:
# Open the connection to the knowledge graph over Bolt using the `reader` account.
# NOTE(review): host and credentials are hard-coded; presumably this is a public
# demo account -- confirm before reusing this pattern with private credentials.
graph = Graph("bolt://132.249.238.185:7687", user="reader", password="demo")

### Get list of SARS-CoV-2 proteins

In [4]:
# Identifier of the Strain node used as the starting point of the protein query.
reference_genome = 'ncbiprotein:NC_045512' # Genbank reference sequence

In [5]:
# List the names of proteins that are encoded by a gene of the reference strain
# and linked to at least one chain that is part of a structure. Genes spanning
# 10000 or more positions are filtered out (the query marks these as polyproteins).
query = """
MATCH (r:Strain{id: $reference_genome})-[:HAS]->(g:Gene)-[:ENCODES]->(p:Protein)
      -[:HAS_TERTIARY_STRUCTURE]->(:Chain)-[:IS_PART_OF_STRUCTURE]->(s:Structure)
WHERE (g.end - g.start) < 10000 // exclude polyproteins
RETURN DISTINCT(p.name) AS protein
ORDER BY protein
"""
# Keep only the 'protein' column of the result, as an array of names.
proteins = graph.run(query, reference_genome=reference_genome).to_data_frame()['protein'].values

In [6]:
# Dropdown for choosing the protein to analyze. The preferred default is only
# preselected when the query actually returned it; passing a `value` that is not
# among the options would make the Dropdown constructor fail. Without an explicit
# value the widget falls back to its own default selection.
preferred_protein = 'Spike glycoprotein'
dropdown_kwargs = {'options': list(proteins), 'description': 'Select protein:'}
if preferred_protein in dropdown_kwargs['options']:
    dropdown_kwargs['value'] = preferred_protein
protein_widget = widgets.Dropdown(**dropdown_kwargs)

In [45]:
# Render the dropdown; choose a protein here before running the cells that follow.
display(protein_widget)

Dropdown(description='Select protein:', index=1, options=("2'-O-methyltransferase", '3C-like proteinase', 'Hel…

In [46]:
# Capture the current dropdown selection. The queries further down are
# parameterized by this name, so re-run them after changing the selection.
protein_name = protein_widget.value
print(f'Protein name : {protein_name}')

Protein name : 3C-like proteinase


### Get total number of strains

In [47]:
# Cypher query: count the Strain nodes whose hostTaxonomyId is 'taxonomy:9606'
# (the notebook reports this number as the total of human strains).
query = """
MATCH (s:Strain)
WHERE s.hostTaxonomyId = 'taxonomy:9606'
RETURN count(s)
"""

In [48]:
# Run the count query; `strains` is later used as the normalization base of the
# log mutation frequency.
strains = graph.evaluate(query)

In [49]:
# Report the strain total used for normalization.
print(f'Total number of human strains: {strains}')

Total number of human strains: 72371


### Get variants for selected protein

In [50]:
# Cypher query: for the selected protein, match missense variants that are also
# linked to strains with hostTaxonomyId 'taxonomy:9606', and count the matches
# per protein position. Results are ordered by position.
query = """
MATCH (p:Protein{name: $protein_name})-[:HAS_VARIANT]->(v:Variant{variantConsequence:'missense_variant'})<-[:HAS_VARIANT]-(s:Strain)
WHERE s.hostTaxonomyId = 'taxonomy:9606'
RETURN v.proteinPosition as residue, count(v.proteinPosition) as missenseMutations
ORDER BY residue
"""

In [51]:
# Run the variant query for the selected protein; the resulting frame has the
# columns 'residue' and 'missenseMutations' named in the RETURN clause.
variants = graph.run(query, protein_name=protein_name).to_data_frame()

In [52]:
# Normalized log frequency: log(mutation count) divided by log(total strains).
# A count of 1 maps to 0; a count equal to the strain total maps to 1.
variants['scale'] = (
    variants['missenseMutations']
    .apply(np.log)
    .div(math.log(strains))
)

#### Create a color scale based on the log mutation frequency

In [53]:
# Discrete color table: `n_colors` evenly spaced samples of the `Reds` colormap,
# stored as hex strings in an object array.
# (`cm.coolwarm` is an alternative colormap that can be sampled the same way.)
n_colors = 100
colors = cm.Reds(np.linspace(0.0, 1.0, n_colors))
col = np.array([matplotlib.colors.rgb2hex(rgba) for rgba in colors], dtype=object)

In [54]:
# Look up a hex color for every normalized log frequency.
# `col` holds `n_colors` entries (valid indices 0 .. n_colors - 1), but
# round(scale * n_colors) reaches n_colors once scale >= 0.995, i.e. for a
# mutation present in nearly all strains. The index is therefore clamped to the
# valid range so such positions get the darkest color instead of raising an
# IndexError. The lower clamp stops a negative scale from silently wrapping
# around to the end of the table.
variants['color'] = variants['scale'].apply(
    lambda x: col[min(max(int(round(x * n_colors)), 0), n_colors - 1)]
)

### Get PDB structures for selected protein

In [55]:
# Cypher query: fetch every chain (and the structure it is part of) linked to the
# selected protein, together with the start/end lists used for residue mapping.
# Rows are ordered by resolution (ascending) and then by coverage (descending).
query = """
MATCH (p:Protein{name: $protein_name})-[h:HAS_TERTIARY_STRUCTURE]->(c:Chain)-[:IS_PART_OF_STRUCTURE]->(s:Structure)
RETURN p.name AS name, p.start, p.end, c.name, c.uniprotStart, c.uniprotEnd, c.pdbStart, c.pdbEnd, s.resolution AS resolution, s.description AS description, h.coverage AS coverage
ORDER BY resolution, coverage DESC
"""

In [56]:
# Run the structure query for the selected protein; one row per returned chain.
chains = graph.run(query, protein_name=protein_name).to_data_frame()

In [57]:
# Preview the first rows of the chain table.
chains.head()

Unnamed: 0,name,p.start,p.end,c.name,c.uniprotStart,c.uniprotEnd,c.pdbStart,c.pdbEnd,resolution,description,coverage
0,3C-like proteinase,3264,3569,pdb:6YB7.A,[3264],[3569],[1],[306],1.25,Non-structural polyprotein 1ab,1.0
1,3C-like proteinase,3264,3569,pdb:5R8T.A,[3264],[3567],[1],[304],1.27,SARS-CoV-2 main protease,0.993464
2,3C-like proteinase,3264,3569,pdb:6XKH.A,[3264],[3569],[1],[306],1.28,3C-like proteinase (3.4.22.69),1.0
3,3C-like proteinase,3264,3569,pdb:5R82.A,[3264],[3567],[1],[304],1.31,SARS-CoV-2 main protease,0.993464
4,3C-like proteinase,3264,3569,pdb:5RH4.A,[3264],[3567],[1],[304],1.34,3C-like proteinase (E.C.3.4.22.69),0.993464


In [58]:
# Keep only the first row for each chain name ('c.name'). Because the query
# ordered the rows, the retained row is the first one in that ordering.
# This modifies `chains` in place; re-running the cell changes nothing further.
chains.drop_duplicates(subset=['c.name'], inplace=True)

In [59]:
def uniprot_to_pdb_mapping(row):
    """Build a dictionary from sequence positions to PDB residue numbers.

    `row` must support item access for 'c.uniprotStart', 'c.uniprotEnd',
    'c.pdbStart' and 'c.pdbEnd' (parallel sequences, one entry per aligned
    segment) and for 'c.name' (used only in the mismatch message).

    The PDB bounds are converted with `int`; the UniProt bounds are used as
    given. A segment whose UniProt span and PDB span differ in length is
    reported with a printed message and left out of the result.

    Returns a dict whose keys are the positions start..end (inclusive) of every
    accepted segment and whose values are those positions shifted by the
    segment's constant offset (PDB start minus UniProt start).
    """
    residue_map = {}
    segments = zip(row['c.uniprotStart'], row['c.uniprotEnd'], row['c.pdbStart'], row['c.pdbEnd'])

    for uni_first, uni_last, pdb_first, pdb_last in segments:
        pdb_first, pdb_last = int(pdb_first), int(pdb_last)
        uni_span = uni_last - uni_first
        pdb_span = pdb_last - pdb_first

        # Only segments of equal length can be mapped with a constant shift.
        if uni_span != pdb_span:
            print('length mismatch:', row['c.name'], uni_span, pdb_span)
            continue

        shift = pdb_first - uni_first
        residue_map.update({pos: shift + pos for pos in range(uni_first, uni_last + 1)})

    return residue_map

In [60]:
# Attach to every chain its sequence-position -> PDB-residue-number dictionary.
chains['mapping'] = chains.apply(uniprot_to_pdb_mapping, axis=1)

In [61]:
# Preview the chain table again, now including the 'mapping' column.
chains.head()

Unnamed: 0,name,p.start,p.end,c.name,c.uniprotStart,c.uniprotEnd,c.pdbStart,c.pdbEnd,resolution,description,coverage,mapping
0,3C-like proteinase,3264,3569,pdb:6YB7.A,[3264],[3569],[1],[306],1.25,Non-structural polyprotein 1ab,1.0,"{3264: 1, 3265: 2, 3266: 3, 3267: 4, 3268: 5, ..."
1,3C-like proteinase,3264,3569,pdb:5R8T.A,[3264],[3567],[1],[304],1.27,SARS-CoV-2 main protease,0.993464,"{3264: 1, 3265: 2, 3266: 3, 3267: 4, 3268: 5, ..."
2,3C-like proteinase,3264,3569,pdb:6XKH.A,[3264],[3569],[1],[306],1.28,3C-like proteinase (3.4.22.69),1.0,"{3264: 1, 3265: 2, 3266: 3, 3267: 4, 3268: 5, ..."
3,3C-like proteinase,3264,3569,pdb:5R82.A,[3264],[3567],[1],[304],1.31,SARS-CoV-2 main protease,0.993464,"{3264: 1, 3265: 2, 3266: 3, 3267: 4, 3268: 5, ..."
4,3C-like proteinase,3264,3569,pdb:5RH4.A,[3264],[3567],[1],[304],1.34,3C-like proteinase (E.C.3.4.22.69),0.993464,"{3264: 1, 3265: 2, 3266: 3, 3267: 4, 3268: 5, ..."


### Visualize mutation sites

In [62]:
# Setup viewer
def view_mutations(df, variants, *args):
    """Create an interactive py3Dmol view that highlights mutated residues.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per chain. The columns read here are 'c.name' (of the form
        '<prefix>:<PDB id>.<chain id>'), 'mapping' (dict from sequence position
        to PDB residue number), 'description', 'resolution' and 'coverage'.
    variants : pandas.DataFrame
        One row per mutated position with the columns 'residue', 'scale' and
        'color'.
    *args : str
        Names of additional `df` columns to print for the displayed structure.

    Returns
    -------
    Whatever `ipywidgets.interact` returns for the inner rendering callback.

    Raises
    ------
    ValueError
        If `df` contains no rows.

    Notes
    -----
    The viewer of the most recent rendering is stored in the module-level
    variable `viewer1`.
    """
    chainIds = list(df['c.name'])
    if not chainIds:
        # An empty table would give the structure slider max=-1 and leave
        # nothing to index, so fail early with an explicit message.
        raise ValueError('view_mutations: no structures to display (df is empty)')

    def view3d(show_bio_assembly, show_surface, show_labels, show_annotations, size, freq, i):
        """Render structure number `i`; called by `interact` on every control change.

        `show_labels` and `show_annotations` are accepted (they are part of the
        control set) but are currently not used in the rendering.
        """
        # 'c.name' looks like 'pdb:6YB7.A' -> PDB id '6YB7', chain id 'A'
        pdb_chain_id = chainIds[i].split(':')[1]
        pdb_id, chain_id = pdb_chain_id.split('.')
        global viewer1
        viewer1 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer1.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity':0.8}})

        # non-polymer style
        viewer1.setStyle({'hetflag': True}, {'stick':{'radius': 0.3, 'singleBond': False}})

        # highlight chain of interest in blue
        viewer1.setStyle({'chain': chain_id},{'cartoon': {'color': 'blue'}})

        mapping = df['mapping'].iloc[i]

        for row in variants.itertuples():
            # None (not 0) marks "position not covered by this chain", so that
            # residues numbered 0 or negative in the PDB entry are still shown.
            res_num = mapping.get(row.residue)
            if res_num is not None and row.scale > freq:
                mut_res = {'resi': res_num, 'chain': chain_id}
                viewer1.addStyle(mut_res, {'sphere':{'color': row.color, 'opacity': 1.0}})

        description = df['description'].iloc[i]
        resolution = df['resolution'].iloc[i]
        coverage = df['coverage'].iloc[i]

        print(f"PDB Id:{pdb_id}, chain Id:{chain_id}, resolution:{resolution}, sequence coverage:{coverage:.2f}")
        print(description)

        # print any specified additional columns from the dataframe
        # (formatted, so that non-string values such as numbers work too)
        for a in args:
            print(f"{a}: {df.iloc[i][a]}")

        viewer1.zoomTo({'chain': chain_id})
        viewer1.center({'chain': chain_id})

        if show_surface:
            viewer1.addSurface(py3Dmol.SES,{'opacity':0.8,'color':'lightblue'},{'chain': chain_id})

        return viewer1.show()

    # Slider for stepping through the structures, one position per row of `df`.
    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='Structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_labels=True, show_annotations=False, size=750, freq=0.33, i=s_widget)

def view_image1():
    """Return the result of `png()` for the most recently rendered viewer.

    Reads the module-level `viewer1`, which is only assigned once the
    rendering callback of `view_mutations` has run; calling this earlier
    fails because the name does not exist yet.
    """
    current_viewer = viewer1
    return current_viewer.png()

In [63]:
# Launch the interactive viewer for the selected protein's chains; the trailing
# semicolon keeps the notebook from echoing the call's return value.
view_mutations(chains, variants);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…