# T cell epitopes of SARS-CoV2

## Methods

* Predict MHC-I binders for sars-cov2 reference sequences (S and N important)
* Align with sars-cov and get conserved epitopes.
* Best alleles to use?
* Multiple sequence alignment of each protein to reference
* find conservation of binders with closest peptide in each HCov sequence and determine identity

## References

* S. F. Ahmed, A. A. Quadeer, and M. R. McKay, “Preliminary Identification of Potential Vaccine Targets for the COVID-19 Coronavirus (SARS-CoV-2) Based on SARS-CoV Immunological Studies.,” Viruses, vol. 12, no. 3, 2020.
* A. Grifoni et al., “A sequence homology and bioinformatic approach can predict candidate targets for immune responses to SARS-CoV-2,” Cell Host Microbe, pp. 1–10, 2020.
* V. Baruah and S. Bose, “Immunoinformatics-aided identification of T cell and B cell epitopes in the surface glycoprotein of 2019-nCoV,” J. Med. Virol., no. February, pp. 495–500, 2020.

## Epitope Loss in Mutations

* https://www.biorxiv.org/content/10.1101/2020.03.27.012013
* https://www.biorxiv.org/content/10.1101/2020.04.10.029454v1?ct=
* https://www.biorxiv.org/content/10.1101/2020.04.07.030924v1

## Common coronoviruses

* https://www.cdc.gov/coronavirus/types.html


In [1]:
import os, math, time, pickle, subprocess
from importlib import reload
from collections import OrderedDict, defaultdict
import numpy as np
import pandas as pd
pd.set_option('display.width', 150)
import epitopepredict as ep
from epitopepredict import base, sequtils, plotting, peptutils, analysis
from IPython.display import display, HTML, Image
%matplotlib inline
import matplotlib as mpl
import pylab as plt
import pybioviz
from bokeh.io import show, output_notebook
output_notebook()
import pathogenie
from Bio import SeqIO,AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## ref genomes

In [2]:
labels = {'sars':'NC_004718.3','scov2':'NC_045512.2','229E':'NC_002645.1','NL63':'NC_005831.2','OC43':'NC_006213.1','HKU1':'NC_006577.2'}
genomes = []
for l in labels:
    df = ep.genbank_to_dataframe(labels[l]+'.gb',cds=True)
    df['label'] = l
    genomes.append(df)
genomes = pd.concat(genomes)
print (genomes[genomes.label=='scov2'])

   type      protein_id   locus_tag    gene          db_xref                      product                                          note  \
3   CDS  YP_009724389.1  GU280_gp01  ORF1ab  GeneID:43740578           ORF1ab polyprotein  pp1ab; translated by -1 ribosomal frameshift   
19  CDS  YP_009725295.1  GU280_gp01  ORF1ab  GeneID:43740578            ORF1a polyprotein                                          pp1a   
34  CDS  YP_009724390.1  GU280_gp02       S  GeneID:43740568         surface glycoprotein             structural protein; spike protein   
36  CDS  YP_009724391.1  GU280_gp03   ORF3a  GeneID:43740569                ORF3a protein                                           NaN   
38  CDS  YP_009724392.1  GU280_gp04       E  GeneID:43740570             envelope protein           ORF4; structural protein; E protein   
40  CDS  YP_009724393.1  GU280_gp05       M  GeneID:43740571        membrane glycoprotein                      ORF5; structural protein   
42  CDS  YP_009724394.1  GU

In [3]:
def get_seqs(gene):
    sub = genomes[genomes['gene']==gene]
    #print (sub)
    seqs = []
    for i,r in sub.iterrows():
        s=SeqRecord(Seq(r.translation),id=r.label)
        seqs.append(s)
        #print (s)
    return seqs

seqs=get_seqs('S')
aln=pathogenie.clustal_alignment(seqs=seqs)
print (aln)

SingleLetterAlphabet() alignment with 6 rows and 1475 columns
--MFIFLLFLT----------------LTSGSDLDRCTTFDDVQ...HYT sars
--MFVFLVLLP----------------LVSSQCVN--LTTRTQL...HYT scov2
-MFLILLISLPTAFAVIGD-------LKCTSDNINDKDTGPPPI...D-- OC43
--MLLIIFILPTTLAVIGD-------FNCTNFAINDLNTTVPRI...D-- HKU1
--------------------------------------------...HIQ 229E
MKLFLILLVLPLASCFFTCNSNANLSMLQLGVPDNSSTIVTGLL...HVQ NL63


In [4]:
spikesars = SeqIO.to_dict(seqs)['sars'].seq
spikesars

Seq('MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEIFRSDTLYL...HYT')

In [5]:
p = pybioviz.plotters.plot_sequence_alignment(aln, annot = {'polybasic cleavage site':690,'RBD contact residues':480})
#output_file('alignment.html')
show(p)

## mutations within scov2 population

In [6]:
mutations = pd.read_csv('sarscov2_mutations.csv')
#mutations[:10]

In [7]:
sc2 = ep.genbank_to_dataframe('NC_045512.2.gb',cds=True)
sc2 = sc2.drop_duplicates('gene')

In [8]:
m1_alleles = ep.get_preset_alleles('broad_coverage_mhc1')
m2_alleles = ep.get_preset_alleles('mhc2_supertypes')

In [9]:
P1 = base.get_predictor('netmhcpan') 
P1.predict_sequences(sc2, alleles=m1_alleles,cpus=10,path='netmhcpan',length=9,overwrite=False,verbose=True)
P1.load(path='netmhcpan')

name                      allele           top peptide        score
predictions done for 11 sequences in 26 alleles
results saved to /home/damien/gitprojects/teaching/sarscov2/netmhcpan


  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
#rb = P1.promiscuous_binders(n=3, cutoff_method='rank', cutoff=10)
pb1 = P1.promiscuous_binders(n=3, cutoff=.95)
#pb = P.promiscuous_binders(n=3, cutoff=500, cutoff_method='score')

In [11]:
P2 = base.get_predictor('netmhciipan') 
P2.predict_sequences(sc2, alleles=m2_alleles,cpus=10,path='netmhciipan',length=15,overwrite=False,verbose=True)

name                      allele           top peptide        score
predictions done for 11 sequences in 8 alleles
results saved to /home/damien/gitprojects/teaching/sarscov2/netmhciipan


In [12]:
P2.load(path='netmhciipan')
pb2 = P2.promiscuous_binders(n=3, cutoff=.95)

## find epitopes conserved across an alignment
### simple yes or no method

In [13]:
seqs = get_seqs('S')
df = pb2[pb2.name=='GU280_gp02']

def find_conservation(x, seqs):

    found=0
    for s in seqs:        
        if x in s.seq:
            found+=1    
    return found

df['conserved'] = df.peptide.apply(lambda x: find_conservation(x, seqs),1)
df.conserved.value_counts()
cons = df[df.conserved>=2]
#cons.to_csv('S_netmhciipan_conserved.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['conserved'] = df.peptide.apply(lambda x: find_conservation(x, seqs),1)


### find identity to closest peptide in each sequence 

In [None]:
import difflib

def find_conservation(x, w):    
    m = difflib.get_close_matches(x, w, n=1, cutoff=.67)
    if len(m)==0:
        return 0
    else:
        m=m[0]
        s = difflib.SequenceMatcher(None, x, m)
        return s.ratio()

seqs = get_seqs('S')
df = pb2[pb2.name=='GU280_gp02']
s=seqs[0]

for s in seqs:
    if s.id == 'scov2': 
        continue
    w,ss = peptutils.create_fragments(seq=str(s.seq), length=11)
    df[s.id] = df.peptide.apply(lambda x: find_conservation(x, w),1)  

df['total'] = df[df.columns[8:]].sum(1)
df = df.sort_values('total',ascending=False)
df = df[df.total>0]
df = df.round(2)
df.to_csv('S_netmhciipan_conserved.csv')

In [None]:
a = ep.binders_to_coords(pb2)
#b = ep.binders_to_coords(pb2)
f = ep.plot_overview(sc2, coords={'netmhciipan':a},
                         cols=2, figsize=(14,6))

In [None]:
name ='GU280_gp02'
#print (pb[pb.name==name])
ax = ep.plot_tracks([P1],name=name,legend=True,figsize=(15,5),n=3)#,cutoff_method='score')
#r = cl1[cl1.name==name]
#coords = (list(r.start),list(r.end-r.start))
coords = (list(spike_cons.pos),[11 for i in range(len(spike_cons))])
#print (coords)
coords = zip(*coords)
#print (list(coords))
ep.plot_regions(coords, ax, color='red')
plt.savefig('spike_conserved_epitopes.png',dpi=150)

In [None]:
plot=ep.bokeh_plot_tracks([P1],name=name,n=3,height=200)#,cutoff_method='score')
show(plot)

## exp data from IEDB

In [None]:
exp = pd.read_csv('mhc_ligand_table_export_1591822113.csv')
exp.columns

In [None]:
cols = ['Sequence','Starting Position','Ending Position','Parent Protein','Antigen Name','Allele Name']
subset = exp[exp['Parent Protein']=='Spike glycoprotein']
subset[cols]
exp_peptides = subset.groupby('Sequence').agg({'Response Frequency':np.sum})

## view residues on structure

In [None]:
ranges = list(zip(r.start,r.end))
ranges = [list(range(r[0],r[1])) for r in ranges]

In [None]:
from pymol import cmd

In [None]:
def highlight_residues(residues, chain):
   
    from pymol import stored
    vals = {}   
    for r in residues:
        sel = '(chain %s and resi %s)' %(chain,r)
        cmd.color('red', sel)
    return
    
def find_interacting_residues():
    """Find set of residues"""
   
    from pymol import stored
    vals = {}
    residues = range(1,50)
    offset=3
    for p in residues:
        sel1 = '(c. A and (donor or acceptor) and resi %s)' %p
        cmd.select('near','c. B within 4 of %s' %sel1)
        #cmd.show('stick','near')
        #cmd.color('green', 'near')
        stored.lst=[]
        cmd.iterate('near',"stored.lst.append((chain,resi,resn,name))")
        #print (stored.lst)       
        for r in stored.lst:         
            cmd.show('stick','resi %s' %r[1])
            cmd.color('red','resi %s' %r[1])
    return


In [None]:
def draw_spike():
    cmd.reinitialize()
    #cmd.load('6lzg.pdb')
    cmd.load('model_spike.pdb')
    cmd.orient()
    cmd.remove('resn hoh')
    cmd.hide('all')
    cmd.show('cartoon','chain C')
    cmd.turn('x', -90)
    cmd.turn('z', 200)
    cmd.bg_color('white')
    cmd.color('marine','chain C')
    cmd.color('gray','chain D')
    #cmd.select('rbd', '(chain C and resi 455+486+493+494+501+505)')
    cmd.zoom('chain C')
    #cmd.show('sticks', 'rbd')
    #cmd.label('rbd and n. c' , 'resn+resi')
    cmd.set('label_position', (1,2,3))
    return

def highlight_ranges():
    for x in ranges:
        highlight_residues(x,'C')
        
def save_image(filename):
    cmd.set('ray_trace_mode',1)
    cmd.set('ray_trace_gain',0)
    cmd.png(filename, width=1200,dpi=150)
    #cmd.save('model_spike.pse')
    Image(filename='model_spike.png')
    
draw_spike()
#highlight_ranges()
save_image('model_spike.png')
Image(filename='model_spike.png')