# Preparing sites under selection data

This notebook validates the sites under selection provided by the DataMonkey effort at https://covid19.datamonkey.org (see also https://observablehq.com/@spond/revised-sars-cov-2-analytics-page). The validation is necessary to ensure that the reported coordinates match the reference genome exactly.


In [1]:
!pip install biopython



In [2]:
import pandas as pd

In [3]:
# Input locations used by the rest of the notebook.

# NCBI RefSeq assembly of SARS-CoV-2, as a gzipped GenBank flat file.
gnm_url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.gbff.gz'

# Local file name of the download: everything after the last '/' in the URL.
gnm_file = gnm_url.rpartition('/')[2]

# Per-site selection annotation (JSON) from the veg/SARS-CoV-2 repository, 2021-01-14 snapshot.
sel_url = 'https://raw.githubusercontent.com/veg/SARS-CoV-2/compact/data/fasta/2021-01-14/comparative-annotation.json'

In [4]:
# Get SARS-CoV-2 RefSeq genomes (in GenBank format) from NCBI
!wget {gnm_url}
!gunzip {gnm_file}

--2021-02-09 22:46:56--  https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.gbff.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.229, 2607:f220:41e:250::13, 2607:f220:41e:250::11, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.229|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24256 (24K) [application/x-gzip]
Saving to: ‘GCF_009858895.2_ASM985889v3_genomic.gbff.gz’


2021-02-09 22:46:57 (403 KB/s) - ‘GCF_009858895.2_ASM985889v3_genomic.gbff.gz’ saved [24256/24256]



In [5]:
from Bio import SeqIO
# The unpacked file carries the download name minus its last three characters (".gz").
gbff_name = gnm_file[:len(gnm_file) - 3]
# Parse it as a single GenBank record.
genome = SeqIO.read(gbff_name, "genbank")

In [6]:
# Validation function for gisaid data

def check_against_genome(seqobject,df,pos_base_list,l):
    """Check that each listed sequence occurs in the genome at its stated coordinate.

    For every row, the slice ``seqobject[pos:pos+l].seq`` is compared with the
    expected sequence. On a mismatch the slice starting one position earlier
    (``pos-1``) is tried; a match there counts as correct and the original
    position is recorded as a "slip" site.

    Rows where the genome slice and the expected sequence differ in length
    (for example a slice cut short by the end of the genome) cannot be
    compared. They are counted in neither tally and are reported on a
    separate "Skipped" line so the totals remain explainable.

    Parameters
    ----------
    seqobject : sliceable record whose slices expose ``.seq``
        (e.g. the record returned by ``SeqIO.read``).
    df : pandas.DataFrame
        Table holding the sites to check.
    pos_base_list : list of two column names
        First the column with the start coordinate (used directly as a
        0-based slice start), then the column with the expected sequence.
    l : int
        Number of bases to read from the genome at each site.

    Returns
    -------
    tuple
        ``(slip_sites, wrong)`` where ``slip_sites`` is a dict mapping every
        position that matched only at ``pos-1`` to ``True`` and ``wrong`` is a
        list of ``[expected, found, position]`` for sites that matched at
        neither offset.
    """
    slip_sites = {}
    wrong = []
    good = 0
    skipped = 0
    sites = df[pos_base_list].to_numpy()
    for pos, expected in sites:
        found = seqobject[pos:pos + l].seq
        if len(found) != len(expected):
            # Not comparable; remember it instead of dropping it silently.
            skipped += 1
            continue
        if found == expected:
            good += 1
            continue
        # Retry one base earlier before declaring the site wrong.
        shifted = seqobject[pos - 1:pos - 1 + l].seq
        if shifted == expected:
            good += 1
            slip_sites[pos] = True
        else:
            wrong.append([expected, found, pos])
    print ('Total = {}, Wrong = {}, Correct = {}'.format(len(sites),len(wrong),good))
    if skipped:
        print('Skipped = {} (genome slice and expected sequence differ in length)'.format(skipped))
    return(slip_sites,wrong)

In [7]:
# Loading data
# Read the selection annotation JSON directly from sel_url. orient="index" makes
# pandas treat each top-level JSON key as a row label and the nested keys as columns.
gisaid = pd.read_json(sel_url,orient="index")

In [8]:
# Flatten the nested per-site records into one table row per site.
# Some rows have no data and cannot be processed. As long as this number is small, it is ok
sel_columns = ['gene','site','fel','meme','freq','REF','ALT','codon']
sel_records = []   # one dict per successfully processed site
sel_labels = []    # matching row labels taken from the gisaid index
n_unprocessed = 0
for label, data in gisaid.iterrows():
    try:
        codon_counts = data['cdn']
        ancestral_codon_count = codon_counts[data['bSC2']]
        sum_of_all_counts = sum(codon_counts.values())
        alt_count = sum_of_all_counts - ancestral_codon_count
        record = {
            'gene': data['G'],
            'site': data['S'],
            'fel': data['FEL']['p'],
            'meme': data['MEME']['p'],
            # Fraction of observed codons that differ from the bSC2 codon.
            'freq': alt_count / sum_of_all_counts,
            'REF': ancestral_codon_count,
            'ALT': alt_count,
            'codon': data['bSC2'],
        }
    except Exception:
        # Rows with missing or malformed fields are counted and skipped.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        n_unprocessed += 1
        continue
    sel_labels.append(label)
    sel_records.append(record)
# Build the frame once instead of enlarging it row by row through .loc.
# dtype=object keeps the values as plain Python objects rather than letting
# pandas infer numeric dtypes, staying close to the previous row-wise construction.
sel = pd.DataFrame(sel_records, index=sel_labels, columns=sel_columns, dtype=object)
print('Could not process {} rows'.format(n_unprocessed))
del(gisaid)

Could not process 83 rows


In [9]:
# Keep only the "significant" sites: p-value at or below 0.05 under FEL or under MEME.
# For information of FEL and MEME methods see http://hyphy.org/methods/selection-methods/
fel_significant = sel['fel'] <= 0.05
meme_significant = sel['meme'] <= 0.05
sel = sel[fel_significant | meme_significant]

In [10]:
# Turn the row labels into an ordinary column and name that column 'pos'.
sel = (
    sel
    .reset_index()
    .rename(columns={'index': 'pos'})
)

In [11]:
sel.head()

Unnamed: 0,pos,gene,site,fel,meme,freq,REF,ALT,codon
0,26522,M,1,0.000691632,0.666667,0.00051564,315949,163,ATG
1,26528,M,3,0.0108724,0.01751,0.00425482,314767,1345,GAT
2,26540,M,7,0.00202402,0.0037384,0.000335324,316006,106,ACT
3,26549,M,10,0.0463498,0.0642653,0.000338488,316005,107,GTT
4,26570,M,17,0.0148766,0.0233506,0.00242636,315345,767,CTT


In [12]:
# Validate every selected codon (3 bases starting at 'pos') against the reference genome.
# A few mismatches are tolerable because the codons come from a consensus;
# what matters is that the majority agree.
slip, wrong = check_against_genome(
    seqobject=genome,
    df=sel,
    pos_base_list=['pos', 'codon'],
    l=3,
)

Total = 1399, Wrong = 0, Correct = 1399


In [13]:
len(slip)

172

In [14]:
# Tag sites where coordinate needs to be reduced by 1 due to slippage at orf1a/orf1ab switch site
# One membership test replaces a full-frame comparison per slipped position, and the
# 'slip' column is always created (all False when nothing slipped), so the cells
# that read sel['slip'] afterwards cannot fail on a missing column.
sel['slip'] = sel['pos'].isin(list(slip))

In [15]:
# Untagged sites may still hold NaN in 'slip'; mark them False.
# Only 'slip' is filled: a blanket fillna(False) would also replace any missing
# value in the other columns (e.g. a p-value) with False.
sel = sel.fillna({'slip': False})

In [16]:
sel

Unnamed: 0,pos,gene,site,fel,meme,freq,REF,ALT,codon,slip
0,26522,M,1,0.000692,0.666667,0.000516,315949,163,ATG,False
1,26528,M,3,0.010872,0.017510,0.004255,314767,1345,GAT,False
2,26540,M,7,0.002024,0.003738,0.000335,316006,106,ACT,False
3,26549,M,10,0.046350,0.064265,0.000338,316005,107,GTT,False
4,26570,M,17,0.014877,0.023351,0.002426,315345,767,CTT,False
...,...,...,...,...,...,...,...,...,...,...
1394,23011,S,484,0.001406,0.002641,0.000999,256934,257,GAA,False
1395,23026,S,489,0.003169,0.666667,0.000684,257021,176,TAC,False
1396,23029,S,490,0.039912,0.056271,0.000163,257133,42,TTT,False
1397,23062,S,501,0.000005,0.000002,0.049682,244417,12778,AAT,False


In [17]:
# Move every slip-tagged site one position back.
# Not idempotent: running this cell again shifts the same sites once more.
is_slipped = sel['slip'] == True
sel.loc[is_slipped, 'pos'] = sel['pos'] - 1

In [18]:
# Spot-check a single coordinate after the shift: show the row(s) now at position 13515.
sel[sel['pos']==13515]

Unnamed: 0,pos,gene,site,fel,meme,freq,REF,ALT,codon,slip
819,13515,ORF1b,17,0.044452,0.061978,0.001042,212916,222,ACT,True


In [19]:
# Save the validated sites as a tab-separated file, without the row index.
sel.to_csv(
    'selection.tsv',
    sep='\t',
    index=False,
)

In [20]:
# Offer the TSV as a browser download when running in Google Colab.
# google.colab can only be imported inside Colab; anywhere else the import fails,
# so fall back to a message instead of crashing the final cell.
try:
    from google.colab import files
except ImportError:
    print('Not running in Colab; selection.tsv is in the working directory')
else:
    files.download('selection.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>