# Maps SARS-CoV-2 Mutations to 3D Protein Structures
[Work in progress]

This notebook maps the mutation frequencies of SARS-CoV-2 strains onto 3D protein structures in the [Protein Data Bank](https://www.wwpdb.org/).

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import matplotlib.cm as cm
#import matplotlib
from py2neo import Graph
import ipywidgets as widgets
from ipywidgets import interact, IntSlider
import py3Dmol

In [2]:
# Lift pandas' display limits so DataFrames are rendered in full
pd.set_option('display.max_rows', None)     # display all rows
pd.set_option('display.max_columns', None)  # display all columns

#### Connect to COVID-19-Community Knowledge Graph

In [3]:
# Open a Bolt connection to the knowledge graph; every query below runs through this handle.
# NOTE(review): host and credentials are hardcoded; presumably a public read-only demo account - confirm.
graph = Graph(
    "bolt://132.249.238.185:7687",
    user="reader",
    password="demo",
)

### Get list of SARS-CoV-2 proteins

In [4]:
# Genbank reference sequence; used as the Strain id when listing proteins
reference_genome = "ncbiprotein:NC_045512"

In [5]:
# Names of the reference strain's proteins that are linked to at least one 3D structure
# (genes spanning 10000 or more positions are filtered out as polyproteins).
query = """
MATCH (r:Strain{id: $reference_genome})-[:HAS]->(g:Gene)-[:ENCODES]->(p:Protein)
      -[:HAS_TERTIARY_STRUCTURE]->(:Chain)-[:IS_PART_OF_STRUCTURE]->(s:Structure)
WHERE (g.end - g.start) < 10000 // exclude polyproteins
RETURN DISTINCT(p.name) AS protein
ORDER BY protein
"""
protein_frame = graph.run(query, reference_genome=reference_genome).to_data_frame()
proteins = protein_frame['protein'].values

In [6]:
# Dropdown for choosing the protein to analyze; pre-selects the spike glycoprotein
protein_widget = widgets.Dropdown(
    options=proteins,
    value='Spike glycoprotein',
    description='Select protein:',
)

In [7]:
display(protein_widget)

Dropdown(description='Select protein:', index=14, options=("2'-O-methyltransferase", '3C-like proteinase', 'He…

In [8]:
# Snapshot the dropdown's current selection. The value is read once here, so after
# changing the dropdown this cell and every cell below must be re-run.
protein_name = protein_widget.value
print('Protein name :', protein_name)

Protein name : Spike glycoprotein


### Get total number of strains

In [9]:
# Cypher: count the Strain nodes whose host is taxonomy:9606
# (reported below as the total number of human strains).
query = """
MATCH (s:Strain)
WHERE s.hostTaxonomyId = 'taxonomy:9606'
RETURN count(s)
"""

In [10]:
# Despite its name, `strains` holds the single count returned by the query, not a
# collection of strains; it is later used as the normalizer for the log-frequency scale.
strains = graph.evaluate(query)

In [11]:
print('Total number of human strains:', strains)

Total number of human strains: 72371


### Get variants for selected protein

In [13]:
# Cypher: for the selected protein, gather missense variants observed in strains with
# host taxonomy:9606. Each distinct mutation is formatted as '<variant>(<observations>)',
# ordered by observation count (descending), then collected per residue position together
# with the summed count for that residue. Rows are returned ordered by residue.
query = """
MATCH (p:Protein{name: $protein_name})-[:HAS_VARIANT]->(v:Variant{variantConsequence:'missense_variant'})<-[:HAS_VARIANT]-(s:Strain)
WHERE s.hostTaxonomyId = 'taxonomy:9606'
WITH v.proteinPosition AS residue, count(v.proteinVariant) AS count, 
       split(v.proteinVariant, ':')[1] + '(' + count(v.proteinVariant) + ')' AS mutation ORDER by count DESC
WITH residue, count, mutation
RETURN residue, collect(mutation) AS mutations, sum(count) AS count ORDER BY residue
"""

In [14]:
# Run the per-residue variant query for the selected protein and load the result as a DataFrame
variants = (
    graph.run(query, protein_name=protein_name)
    .to_data_frame()
)

#### Add mutation annotation to each residue

In [15]:
# Flatten each residue's list of mutation strings into one comma-separated label
variants['annotation'] = variants['mutations'].map(', '.join)

In [16]:
# Strip the literal 'p.' prefix from every mutation in the label.
# regex=False is required: with pandas < 2.0 the pattern defaults to a regular expression,
# where '.' matches any character (and a FutureWarning is raised); here a literal match is intended.
variants['annotation'] = variants['annotation'].str.replace('p.', '', regex=False)

#### Create a color scale based on the log mutation frequency

In [18]:
# Normalized log frequency: log(observations at residue) / log(total number of strains)
variants['scale'] = np.log(variants['count']) / math.log(strains)

In [19]:
# Lookup table of n_colors hex color strings sampled evenly along the 'Reds' colormap
n_colors = 100
colors = cm.Reds(np.linspace(0.0, 1.0, n_colors))
col = np.empty(n_colors, dtype=object)
col[:] = [matplotlib.colors.rgb2hex(rgba) for rgba in colors]

In [20]:
# Translate the normalized log frequency into an entry of the color table.
# round(x * n_colors) evaluates to n_colors once x reaches ~0.995, which is one past the
# last valid index of `col` (valid indices are 0 .. n_colors - 1), so the index is clamped.
variants['color'] = variants['scale'].apply(
    lambda x: col[min(int(round(x * n_colors)), n_colors - 1)]
)

In [32]:
variants.head()

Unnamed: 0,residue,mutations,count,annotation,scale,color
0,3,"[p.3V>G(2), p.3V>F(1)]",3,"3V>G(2), 3V>F(1)",0.098182,#fee5d8
1,4,[p.4F>S(1)],1,4F>S(1),0.0,#fff5f0
2,5,"[p.5L>F(326), p.5L>I(2)]",328,"5L>F(326), 5L>I(2)",0.517716,#f96044
3,6,"[p.6V>F(48), p.6V>I(1)]",49,"6V>F(48), 6V>I(1)",0.347808,#fc997a
4,7,[p.7L>V(2)],2,7L>V(2),0.061946,#ffebe2


### Get PDB structures for selected protein

In [21]:
# Cypher: chains (and their parent structures) linked to the selected protein via
# HAS_TERTIARY_STRUCTURE, including the UniProt/PDB segment boundaries used later for
# residue mapping. Ordered by ascending resolution value, then by descending coverage.
query = """
MATCH (p:Protein{name: $protein_name})-[h:HAS_TERTIARY_STRUCTURE]->(c:Chain)-[:IS_PART_OF_STRUCTURE]->(s:Structure)
RETURN p.name AS name, p.start, p.end, c.name, c.uniprotStart, c.uniprotEnd, c.pdbStart, c.pdbEnd, s.resolution AS resolution, s.description AS description, h.coverage AS coverage
ORDER BY resolution, coverage DESC
"""

In [22]:
# Run the chain/structure query for the selected protein and load the result as a DataFrame
chains = (
    graph.run(query, protein_name=protein_name)
    .to_data_frame()
)

In [23]:
chains.head()

Unnamed: 0,name,p.start,p.end,c.name,c.uniprotStart,c.uniprotEnd,c.pdbStart,c.pdbEnd,resolution,description,coverage
0,Spike glycoprotein,13,1273,pdb:6M1V.A,[917],[966],[917],[966],1.5,spike protein,0.039651
1,Spike glycoprotein,1,1273,pdb:6M1V.A,[917],[966],[917],[966],1.5,spike protein,0.039277
2,Spike glycoprotein,13,1273,pdb:7JMP.A,"[338, 366, 371, 392]","[359, 369, 384, 516]","[338, 366, 371, 392]","[359, 369, 384, 516]",1.71,"Spike protein S1, COVA2-39 heavy chain, COVA2-...",0.130849
3,Spike glycoprotein,1,1273,pdb:7JMP.A,"[338, 366, 371, 392]","[359, 369, 384, 516]","[338, 366, 371, 392]","[359, 369, 384, 516]",1.71,"Spike protein S1, COVA2-39 heavy chain, COVA2-...",0.129615
4,Spike glycoprotein,13,1273,pdb:6YZ5.E,[334],[528],[334],[528],1.8,"Spike glycoprotein, Nanobody H11-D4",0.154639


In [24]:
# Keep one row per chain: the first occurrence in the existing row order is retained
chains.drop_duplicates(subset='c.name', keep='first', inplace=True)

#### Map UniProt residue numbers to PDB residue numbers

In [25]:
def uniprot_to_pdb_mapping(row):
    """Build a UniProt-to-PDB residue number lookup for one chain.

    ``row`` must provide the parallel sequences 'c.uniprotStart', 'c.uniprotEnd',
    'c.pdbStart' and 'c.pdbEnd' (one entry per aligned segment) and 'c.name'.
    PDB boundaries are converted with ``int()``; UniProt boundaries are used as given.

    A segment whose UniProt and PDB spans differ in length is reported on stdout
    and left out of the result.

    Returns a dict mapping each UniProt residue number to its PDB residue number.
    """
    residue_map = {}
    segments = zip(row['c.uniprotStart'], row['c.uniprotEnd'], row['c.pdbStart'], row['c.pdbEnd'])

    for uni_first, uni_last, pdb_first, pdb_last in segments:
        pdb_first, pdb_last = int(pdb_first), int(pdb_last)
        uni_span = uni_last - uni_first
        pdb_span = pdb_last - pdb_first

        if uni_span != pdb_span:
            print('length mismatch:', row['c.name'], uni_span, pdb_span)
            continue

        # constant shift between the two numbering schemes within this segment
        shift = pdb_first - uni_first
        residue_map.update({pos: pos + shift for pos in range(uni_first, uni_last + 1)})

    return residue_map

In [26]:
# One UniProt -> PDB residue lookup dict per chain row
chains['mapping'] = chains.apply(uniprot_to_pdb_mapping, axis=1)

In [27]:
chains.head()

Unnamed: 0,name,p.start,p.end,c.name,c.uniprotStart,c.uniprotEnd,c.pdbStart,c.pdbEnd,resolution,description,coverage,mapping
0,Spike glycoprotein,13,1273,pdb:6M1V.A,[917],[966],[917],[966],1.5,spike protein,0.039651,"{917: 917, 918: 918, 919: 919, 920: 920, 921: ..."
2,Spike glycoprotein,13,1273,pdb:7JMP.A,"[338, 366, 371, 392]","[359, 369, 384, 516]","[338, 366, 371, 392]","[359, 369, 384, 516]",1.71,"Spike protein S1, COVA2-39 heavy chain, COVA2-...",0.130849,"{338: 338, 339: 339, 340: 340, 341: 341, 342: ..."
4,Spike glycoprotein,13,1273,pdb:6YZ5.E,[334],[528],[334],[528],1.8,"Spike glycoprotein, Nanobody H11-D4",0.154639,"{334: 334, 335: 335, 336: 336, 337: 337, 338: ..."
6,Spike glycoprotein,13,1273,pdb:7BZ5.A,"[334, 520]","[518, 528]","[334, 520]","[518, 528]",1.84,"Spike protein S1, Heavy chain of B38, Light ch...",0.153846,"{334: 334, 335: 335, 336: 336, 337: 337, 338: ..."
8,Spike glycoprotein,13,1273,pdb:6ZBP.EEE,[334],[528],[334],[528],1.85,"Spike glycoprotein, H11-H4",0.154639,"{334: 334, 335: 335, 336: 336, 337: 337, 338: ..."


### Visualize mutation sites

Mutations are mapped onto protein chains for available 3D protein structures.

Display options:

|||
|:-|:-|
| *show_bio_assembly* | Toggle display of the biologically relevant quaternary structure |
| *show_surface* | Toggle surface for protein chain |
| *show_annotations* | Toggle display of mutation information<br>{PDBId}.{chainId}.{PDBResidue}: {UniProtResidue}{aminoAcid1}>{aminoAcid2}(# observations)<br>Example: 6Z43.A.614: 614D>G(58984), 614D>N(6) |
| *size* | Change size of visualization |
| *font* | Change font size of annotations |
| *logFreq* | Change minimum threshold to display mutations based on normalized log of mutation frequency [0.0 - 1.0]|
| *structure* | Move slider to browse through available structures |

In [41]:
# Setup viewer
def view_mutations(df, variants, *args):
    """Interactively display mutation sites on the 3D structures listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per chain. Columns read: 'c.name' (formatted as
        'pdb:<PDB id>.<chain id>'), 'mapping' (dict of UniProt residue number ->
        PDB residue number), 'description', 'resolution' and 'coverage'.
    variants : pandas.DataFrame
        One row per mutated residue. Columns read: 'residue', 'color', 'scale'
        and 'annotation'.
    *args : str
        Names of additional ``df`` columns to print for the displayed chain.

    Returns
    -------
    The result of ``ipywidgets.interact`` wrapping the inner ``view3d`` callback.

    Side effects
    ------------
    Each redraw rebinds the module-level ``viewer1`` to the newly created viewer
    (``view_image1`` reads it).
    """
    chainIds = list(df['c.name'])

    def view3d(show_bio_assembly, show_surface, show_annotations, size, font, logFreq, i):
        """Render chain number ``i`` of ``df`` with its mutation sites highlighted."""
        # 'pdb:6M1V.A' -> '6M1V.A' -> ('6M1V', 'A')
        pdb_chain_id = chainIds[i].split(':')[1]
        pdb_id, chain_id = pdb_chain_id.split('.')
        global viewer1
        viewer1 = py3Dmol.view(query='pdb:' + pdb_id, options={'doAssembly': show_bio_assembly}, width=size, height=size)

        # polymer style
        viewer1.setStyle({'cartoon': {'colorscheme': 'chain', 'width': 0.6, 'opacity': 0.8}})

        # non-polymer style
        viewer1.setStyle({'hetflag': True}, {'stick': {'radius': 0.3, 'singleBond': False}})

        # highlight chain of interest in blue
        viewer1.setStyle({'chain': chain_id}, {'cartoon': {'color': 'blue'}})

        mapping = df['mapping'].iloc[i]

        for row in variants.itertuples():
            # Get the PDB residue number for this UniProt residue. None (rather than 0)
            # marks "not present in this chain", because PDB residue numbers may
            # themselves be zero or negative.
            res_num = mapping.get(row.residue)
            if res_num is None or not (row.scale > logFreq):
                continue

            viewer1.addStyle({'resi': res_num, 'chain': chain_id},
                             {'sphere': {'color': row.color, 'opacity': 1.0}})

            if show_annotations:
                label = pdb_chain_id + "." + str(res_num) + ": " + row.annotation
                viewer1.addLabel(label, {'fontSize': font, 'fontColor': 'black', 'backgroundColor': 'ivory', 'opacity': 1.0}, {'resi': res_num, 'chain': chain_id})

        description = df['description'].iloc[i]
        resolution = df['resolution'].iloc[i]
        coverage = df['coverage'].iloc[i]

        print(f"PDB Id:{pdb_id}, chain Id:{chain_id}, resolution:{resolution}, sequence coverage:{coverage:.2f}")
        print(description)

        # Print any specified additional columns from the dataframe. Formatting through
        # an f-string keeps this working for non-string columns (numbers, lists, dicts),
        # where string concatenation would raise a TypeError.
        for a in args:
            print(f"{a}: {df.iloc[i][a]}")

        viewer1.zoomTo({'chain': chain_id})
        viewer1.center({'chain': chain_id})

        if show_surface:
            viewer1.addSurface(py3Dmol.SES, {'opacity': 0.8, 'color': 'lightblue'}, {'chain': chain_id})

        return viewer1.show()

    # slider to browse through the available chains by row position
    s_widget = IntSlider(min=0, max=len(chainIds)-1, description='structure', continuous_update=False)

    return interact(view3d, show_bio_assembly=False, show_surface=False, show_annotations=True, size=750, font=9, logFreq=0.33, i=s_widget)

def view_image1():
    """Return the result of ``png()`` on the most recently created viewer.

    Reads the module-level ``viewer1``, which is only bound once the interactive
    view created by ``view_mutations`` has rendered at least once.
    """
    current_viewer = viewer1
    return current_viewer.png()

In [42]:
view_mutations(chains, variants);

interactive(children=(Checkbox(value=False, description='show_bio_assembly'), Checkbox(value=False, descriptio…