# T cell epitopes of SARS-CoV2

## Methods

* Predict MHC-I binders for the SARS-CoV-2 reference sequences (the S and N proteins are the most important)
* Align with SARS-CoV and identify conserved epitopes.
* Find overlap with known epitopes from https://www.viprbrc.org/
* Check MHC restriction of known vs predictions?
* Try a 50 nM cutoff vs the default method
* Best alleles to use?
* Get GISAID sequences and translate to proteins
* Multiple sequence alignment of protein to reference
    - find non-conserved binders or clusters

## References

* S. F. Ahmed, A. A. Quadeer, and M. R. McKay, “Preliminary Identification of Potential Vaccine Targets for the COVID-19 Coronavirus (SARS-CoV-2) Based on SARS-CoV Immunological Studies,” Viruses, vol. 12, no. 3, 2020.
* A. Grifoni et al., “A sequence homology and bioinformatic approach can predict candidate targets for immune responses to SARS-CoV-2,” Cell Host Microbe, pp. 1–10, 2020.
* V. Baruah and S. Bose, “Immunoinformatics-aided identification of T cell and B cell epitopes in the surface glycoprotein of 2019-nCoV,” J. Med. Virol., no. February, pp. 495–500, 2020.

## Epitope Loss in Mutations

* https://www.biorxiv.org/content/10.1101/2020.03.27.012013
* https://www.biorxiv.org/content/10.1101/2020.04.10.029454v1?ct=
* https://www.biorxiv.org/content/10.1101/2020.04.07.030924v1

## Common coronaviruses

* https://www.cdc.gov/coronavirus/types.html


In [78]:
import os, math, time, pickle, subprocess
from importlib import reload
from collections import OrderedDict, defaultdict
import numpy as np
import pandas as pd
pd.set_option('display.width', 150)
import epitopepredict as ep
from epitopepredict import base, sequtils, plotting, peptutils, analysis
from IPython.display import display, HTML, Image
%matplotlib inline
import matplotlib as mpl
import pylab as plt
import pybioviz
from bokeh.io import show, output_notebook
output_notebook()
import pathogenie
from Bio import SeqIO,AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## ref genomes

In [84]:
# Short label -> RefSeq accession for each coronavirus; a GenBank file named
# '<accession>.gb' is read for every entry.
labels = {
    'sars': 'NC_004718.3',
    'scov2': 'NC_045512.2',
    '229E': 'NC_002645.1',
    'NL63': 'NC_005831.2',
    'OC43': 'NC_006213.1',
    'HKU1': 'NC_006577.2',
}
genomes = []
for label, accession in labels.items():
    df = ep.genbank_to_dataframe(accession + '.gb', cds=True)
    # tag every row with the short label so the tables can be told apart
    # once they are concatenated
    df['label'] = label
    genomes.append(df)
genomes = pd.concat(genomes)
print(genomes[genomes.label == 'scov2'])

   type      protein_id   locus_tag    gene          db_xref                      product                                          note  \
3   CDS  YP_009724389.1  GU280_gp01  ORF1ab  GeneID:43740578           ORF1ab polyprotein  pp1ab; translated by -1 ribosomal frameshift   
19  CDS  YP_009725295.1  GU280_gp01  ORF1ab  GeneID:43740578            ORF1a polyprotein                                          pp1a   
34  CDS  YP_009724390.1  GU280_gp02       S  GeneID:43740568         surface glycoprotein             structural protein; spike protein   
36  CDS  YP_009724391.1  GU280_gp03   ORF3a  GeneID:43740569                ORF3a protein                                           NaN   
38  CDS  YP_009724392.1  GU280_gp04       E  GeneID:43740570             envelope protein           ORF4; structural protein; E protein   
40  CDS  YP_009724393.1  GU280_gp05       M  GeneID:43740571        membrane glycoprotein                      ORF5; structural protein   
42  CDS  YP_009724394.1  GU

In [71]:
# Build one SeqRecord per genome from the 'S' gene rows (id = genome label)
# and align them with clustal via pathogenie.
sub = genomes[genomes.gene == 'S']
seqs = [SeqRecord(Seq(row.translation), id=row.label)
        for _, row in sub.iterrows()]
aln = pathogenie.clustal_alignment(seqs=seqs)
print(aln)

SingleLetterAlphabet() alignment with 6 rows and 1475 columns
--MFIFLLFLT----------------LTSGSDLDRCTTFDDVQ...HYT sars
--MFVFLVLLP----------------LVSSQCVN--LTTRTQL...HYT scov2
-MFLILLISLPTAFAVIGD-------LKCTSDNINDKDTGPPPI...D-- OC43
--MLLIIFILPTTLAVIGD-------FNCTNFAINDLNTTVPRI...D-- HKU1
--------------------------------------------...HIQ 229E
MKLFLILLVLPLASCFFTCNSNANLSMLQLGVPDNSSTIVTGLL...HVQ NL63


In [70]:
# Draw the spike alignment with pybioviz and annotate two positions.
# NOTE(review): 690 and 480 are presumably alignment column positions rather
# than residue numbers of one particular sequence — confirm against
# plot_sequence_alignment.
p = pybioviz.plotters.plot_sequence_alignment(aln, annot = {'polybasic cleavage site':690,'RBD contact residues':480})
#output_file('alignment.html')
show(p)

## mutations within scov2 population

In [2]:
# Load the table of mutations (columns: mutation, count, protein) and preview
# the first ten rows.
mutations = pd.read_csv('sarscov2_mutations.csv')
mutations[:10]

Unnamed: 0,mutation,count,protein
0,D614G,792,Spike glycoprotein
1,P214L,769,Replicase polyprotein 1ab
2,Q57H,386,Protein 3a
3,R203K+G204R,288,Nucleoprotein
4,G251V,225,Protein 3a
5,P1327L+Y1364C,222,Replicase polyprotein 1ab
6,T248I,165,Replicase polyprotein 1a
7,L3589F,61,Replicase polyprotein 1a
8,T175M,46,Membrane protein
9,I722V+P748S+L3589F,44,Replicase polyprotein 1a


In [3]:
# CDS features of the SARS-CoV-2 reference genome as a dataframe.
sc2 = ep.genbank_to_dataframe('NC_045512.2.gb',cds=True)
# Keep only the first row per gene name; the ORF1ab and ORF1a polyproteins
# share the gene name 'ORF1ab', so only the first of them is retained.
sc2 = sc2.drop_duplicates('gene')

In [17]:
# Allele lists taken from epitopepredict's named presets.
# m1_alleles is passed to both predictors further down; m2_alleles is not
# used in the cells visible here.
m1_alleles = ep.get_preset_alleles('broad_coverage_mhc1')
m2_alleles = ep.get_preset_alleles('mhc2_supertypes')

In [None]:
# Display the MHC-I allele list used for the predictions.
m1_alleles

In [None]:
# Run 9-mer predictions over every protein in sc2 with two predictors; results
# go to the 'netmhcpan' and 'mhcflurry' folders respectively.
# NOTE(review): with overwrite=False existing results are presumably reused
# rather than recomputed — confirm in the epitopepredict documentation.
P1 = base.get_predictor('netmhcpan') 
P1.predict_sequences(sc2, alleles=m1_alleles,cpus=10,path='netmhcpan',length=9,overwrite=False,verbose=True)
P2 = base.get_predictor('mhcflurry') 
P2.predict_sequences(sc2, alleles=m1_alleles,cpus=10,path='mhcflurry',length=9,overwrite=False)

In [19]:
# Load previously saved prediction results from disk into each predictor.
P1.load(path='netmhcpan')
P2.load(path='mhcflurry')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [20]:
# Promiscuous binders using the rank-based cutoff method.
# NOTE(review): n is presumably the minimum number of alleles a peptide must
# bind — confirm. rb is not referenced again in the cells visible here.
rb = P1.promiscuous_binders(n=3, cutoff_method='rank', cutoff=10)

In [22]:
# Re-import the analysis module so that local edits to it are picked up.
reload(analysis)
# netMHCpan promiscuous binders with the predictor's default cutoff method.
# NOTE(review): cutoff=.95 is presumably a per-allele quantile — confirm.
pb1 = P1.promiscuous_binders(n=5, cutoff=.95)
# The mhcflurry equivalents (pb2 / cl2) are disabled here; cells that reference
# pb2 or cl2 need these lines re-enabled first.
#pb2 = P2.promiscuous_binders(n=5, cutoff=.95)
#pb = P.promiscuous_binders(n=3, cutoff=500, cutoff_method='score')
# Group the binders into clusters along each protein of sc2.
cl1 = analysis.find_clusters(pb1, genome=sc2)
#cl2 = analysis.find_clusters(pb2, genome=sc2)

## find epitopes conserved across an alignment

In [91]:
#c = analysis.epitope_conservation(seqs, alnrows=aln)
# IPython help query: prints the signature and docstring of the function.
?analysis.epitope_conservation

[0;31mSignature:[0m
[0manalysis[0m[0;34m.[0m[0mepitope_conservation[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpeptides[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malnrows[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mproteinseq[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mblastresult[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mblastdb[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mperc_ident[0m[0;34m=[0m[0;36m50[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mequery[0m[0;34m=[0m[0;34m'srcdb_refseq[Properties]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Find and visualise conserved peptides in a set of aligned sequences.
Args:
    peptides: a list of peptides/epitopes
    alnrows: a dataframe of previously aligned sequences e.g. custom strains
    proteinseq: a sequence to blast and get an alignment for
 

In [88]:
# Promiscuous binders of the spike protein (locus tag GU280_gp02). An explicit
# copy is taken so that adding the 'conserved' column below writes to an
# independent frame instead of a slice of pb1, which is what raised pandas'
# SettingWithCopyWarning.
df = pb1[pb1.name == 'GU280_gp02'].copy()
# The CSV is written before 'conserved' is added, so it does not contain it.
df.to_csv('spike_promiscuous.csv')
peptides = df.peptide

# A peptide counts as conserved when it occurs verbatim in spike1.translation.
# NOTE(review): spike1 is not defined in the cells visible here; presumably it
# is the spike record of the genome being compared against — confirm.
out = [pep in spike1.translation for pep in peptides]

df['conserved'] = out
print(df[df.conserved])
df.conserved.value_counts()

          peptide   pos        name  alleles       core   score         mean  median_rank  conserved
1481  IPFAMQMAYRF   895  GU280_gp02       12  IAMQMAYRF   100.3  2208.900000         20.0       True
2138  LQIPFAMQMAY   893  GU280_gp02       10  LQFAMQMAY    19.4  6184.200000         29.0       True
2864  RFPNITNLCPF   327  GU280_gp02        9  RFPNITNLF    62.0  3680.455556         15.0       True
2939  RSFIEDLLFNK   814  GU280_gp02        7  RSFDLLFNK    37.2  1167.485714         12.0       True
2817  RAAEIRASANL  1013  GU280_gp02        7  RAAEIRASL   434.9  3596.657143         26.0       True
1539  ITNLCPFGEVF   331  GU280_gp02        7  ITNLCPFGF   446.5  4617.371429         45.0       True
1278  HGVVFLHVTYV  1057  GU280_gp02        6  HVFLHVTYV   157.4  4100.283333         29.5       True
3170  SNCVADYSVLY   358  GU280_gp02        6  SVADYSVLY   717.6  1985.483333         30.5       True
1483  IPIGAGICASY   663  GU280_gp02        6  IPIGAGASY    61.9  7199.183333         33.5  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['conserved'] = out


False    76
True     17
Name: conserved, dtype: int64

In [None]:
# Print the promiscuous binders of every protein, one locus tag at a time.
# The genome table in this notebook is sc2 (the predictions were run on it);
# 'scov2' only exists as a label string, so using it as a variable fails.
for n in sc2.locus_tag:
    print(n)
    x = pb1[pb1.name == n]
    print(x)

In [None]:
# Convert the cluster and binder tables to coordinates for the overview plot.
# NOTE(review): cl2 and pb2 are only assigned in commented-out lines in the
# cells visible here, so they must be computed before this cell can run.
c1 = ep.binders_to_coords(cl1)
c2 = ep.binders_to_coords(cl2)
a = ep.binders_to_coords(pb1)
b = ep.binders_to_coords(pb2)
# One named track per coordinate set; c2 is computed but not plotted.
f = ep.plot_overview(
    sc2,
    coords={'clusters': c1, 'netmhcpan': a, 'mhcflurry': b},
    cols=2,
    figsize=(14, 6),
)


In [24]:
# Show the binder clusters found in the spike protein (locus tag GU280_gp02).
cl1[cl1.name=='GU280_gp02']

Unnamed: 0,name,start,end,binders,length,gene,peptide
5,GU280_gp02,866,911,9,45,S,DEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGV
18,GU280_gp02,129,176,7,47,S,VCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFL
19,GU280_gp02,358,402,7,44,S,SNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVI
21,GU280_gp02,318,356,6,38,S,RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRK
22,GU280_gp02,255,279,6,24,S,SGWTAGAAAYYVGYLQPRTFLLKY
25,GU280_gp02,443,472,5,29,S,KVGGNYNYLYRLFRKSNLKPFERDISTEI
26,GU280_gp02,684,707,5,23,S,RSVASQSIIAYTMSLGAENSVAY
41,GU280_gp02,1049,1075,4,26,S,MSFPQSAPHGVVFLHVTYVPAQEKNF
42,GU280_gp02,1092,1109,4,17,S,GVFVSNGTHWFVTQRNF
43,GU280_gp02,1204,1220,4,16,S,KYEQYIKWPWYIWLGF


In [None]:
# Plot the per-allele binder tracks of both predictors for the spike protein
# and shade the netMHCpan cluster regions on top.
name = 'GU280_gp02'
#print (pb[pb.name==name])
ax = ep.plot_tracks([P1, P2], name=name, legend=True, figsize=(14, 7), n=5)  #,cutoff_method='score')
r = cl1[cl1.name == name]
# regions are passed as (start, width) pairs
coords = zip(list(r.start), list(r.end - r.start))
ep.plot_regions(coords, ax, color='gray')
plt.savefig('clusters.png')

In [72]:
# Interactive (bokeh) version of the netMHCpan track plot; `name` is the locus
# tag assigned in the preceding plotting cell.
plot=ep.bokeh_plot_tracks([P1],name=name,n=5,height=200)#,cutoff_method='score')
show(plot)

## exp data from IEDB

In [None]:
# Load the MHC ligand table exported from IEDB and list its columns.
exp = pd.read_csv('mhc_ligand_table_export_1591822113.csv')
exp.columns

In [None]:
# Columns of interest in the IEDB export.
cols = ['Sequence','Starting Position','Ending Position','Parent Protein','Antigen Name','Allele Name']
# NOTE: in a notebook only the last expression of a cell is displayed, so this
# bare expression (and subset[cols] below) produces no visible output.
exp[cols]
# Restrict to records whose parent protein is the spike glycoprotein.
subset = exp[exp['Parent Protein']=='Spike glycoprotein']
subset[cols]
# Number of records per distinct peptide sequence.
subset['Sequence'].value_counts()

In [None]:
# Preview the first ten spike records, restricted to the columns of interest.
subset[cols][:10]

## Common human coronaviruses

## view residues on structure

In [None]:
# Expand each cluster row of `r` into an explicit list of residue numbers from
# start up to, but not including, end.
ranges = [list(range(begin, stop)) for begin, stop in zip(r.start, r.end)]

In [1]:
from pymol import cmd

In [2]:
def highlight_residues(residues, chain, color='red'):
    """Color the given residues of one chain in the current PyMOL session.
    Args:
        residues: iterable of residue numbers; each one is formatted into a
            PyMOL 'resi' selection
        chain: identifier of the chain the residues belong to
        color: PyMOL color name to apply, 'red' by default
    """

    for r in residues:
        sel = '(chain %s and resi %s)' %(chain,r)
        cmd.color(color, sel)
    return
    
def find_interacting_residues(residues=range(1,50), chain1='A', chain2='B', cutoff=4):
    """Show as sticks and color red the residues of one chain that lie close
    to donor/acceptor atoms of another chain.

    For every residue number in `residues`, the donor or acceptor atoms of
    that residue in `chain1` are selected, and all atoms of `chain2` within
    `cutoff` of them are collected in the PyMOL selection 'near'. Each residue
    found this way is shown as sticks and colored red. Nothing is returned;
    the 'near' selection is left holding the hits of the last residue.
    Args:
        residues: residue numbers of chain1 to test, 1-49 by default
        chain1: chain whose donor/acceptor atoms are the query
        chain2: chain that is searched for neighbouring atoms
        cutoff: distance passed to PyMOL's 'within' operator
    """

    from pymol import stored
    # (chain, resi) pairs already displayed; cmd.iterate reports one entry per
    # atom, so the same residue usually shows up several times
    shown = set()
    for p in residues:
        sel1 = '(c. %s and (donor or acceptor) and resi %s)' %(chain1,p)
        cmd.select('near','c. %s within %s of %s' %(chain2,cutoff,sel1))
        stored.lst=[]
        cmd.iterate('near',"stored.lst.append((chain,resi,resn,name))")
        for r in stored.lst:
            key = (r[0],r[1])
            if key in shown:
                continue
            shown.add(key)
            # restrict to the chain the atom was found in; a bare 'resi N'
            # would also hit residue N of every other chain
            hit = 'chain %s and resi %s' %key
            cmd.show('stick',hit)
            cmd.color('red',hit)
    return


In [None]:
def draw_spike(pdbfile='model_spike.pdb', chain='C', partner='D'):
    """Load a structure into a fresh PyMOL session and set up a cartoon view
    of one chain on a white background.

    The session is reinitialized, waters are removed, everything is hidden and
    only `chain` is shown as cartoon, colored marine and zoomed on. `partner`
    is colored gray but is not shown by this function.
    Args:
        pdbfile: structure file to load, 'model_spike.pdb' by default
            ('6lzg.pdb' was used as an alternative)
        chain: chain to display
        partner: second chain, only assigned a color here
    """

    cmd.reinitialize()
    cmd.load(pdbfile)
    cmd.orient()
    cmd.remove('resn hoh')
    cmd.hide('all')
    cmd.show('cartoon','chain %s' %chain)
    # rotate the default orientation to the viewing angle used for the figure
    cmd.turn('x', -90)
    cmd.turn('z', 200)
    cmd.bg_color('white')
    cmd.color('marine','chain %s' %chain)
    cmd.color('gray','chain %s' %partner)
    #cmd.select('rbd', '(chain C and resi 455+486+493+494+501+505)')
    cmd.zoom('chain %s' %chain)
    #cmd.show('sticks', 'rbd')
    #cmd.label('rbd and n. c' , 'resn+resi')
    cmd.set('label_position', (1,2,3))
    return

def highlight_ranges():
    """Color every residue list in the module-level `ranges` on chain C."""

    for residue_list in ranges:
        highlight_residues(residue_list, 'C')
        
def save_image(filename):
    """Write the current PyMOL view to a PNG file and return it for display.
    Args:
        filename: path of the PNG file to write
    Returns:
        an IPython Image loaded from `filename`
    """

    cmd.set('ray_trace_mode',1)
    cmd.set('ray_trace_gain',0)
    cmd.png(filename, width=1200,dpi=150)
    #cmd.save('model_spike.pse')
    # load the file that was just written rather than a fixed name, and hand
    # it back so the caller can display it
    return Image(filename=filename)
    
# Set up the spike view, write it to model_spike.png and show the saved file
# inline (cluster highlighting is currently switched off).
draw_spike()
#highlight_ranges()
save_image('model_spike.png')
Image(filename='model_spike.png')