# Preparing sites under selection data

This notebook validates the sites under selection provided by the DataMonkey effort at https://covid19.datamonkey.org (also see https://observablehq.com/@spond/revised-sars-cov-2-analytics-page). The validation is necessary to ensure that the reported site coordinates match the reference genome exactly.


In [1]:
# Biopython provides the SeqIO parser used further down to read the GenBank record.
# The recorded output of this cell shows biopython 1.78 being installed.
!pip install biopython

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/76/02/8b606c4aa92ff61b5eda71d23b499ab1de57d5e818be33f77b01a6f435a8/biopython-1.78-cp36-cp36m-manylinux1_x86_64.whl (2.3MB)
[K     |▏                               | 10kB 15.9MB/s eta 0:00:01[K     |▎                               | 20kB 21.9MB/s eta 0:00:01[K     |▍                               | 30kB 7.6MB/s eta 0:00:01[K     |▋                               | 40kB 2.9MB/s eta 0:00:01[K     |▊                               | 51kB 3.5MB/s eta 0:00:01[K     |▉                               | 61kB 4.1MB/s eta 0:00:01[K     |█                               | 71kB 4.2MB/s eta 0:00:01[K     |█▏                              | 81kB 4.6MB/s eta 0:00:01[K     |█▎                              | 92kB 5.0MB/s eta 0:00:01[K     |█▌                              | 102kB 5.3MB/s eta 0:00:01[K     |█▋                              | 112kB 5.3MB/s eta 0:00:01[K     |█▊                              | 

In [2]:
import pandas as pd

In [3]:
# Remote inputs: the SARS-CoV-2 genome assembly (gzipped GenBank flat file)
# and the selection annotation JSON (snapshot dated 2021-01-14 in its URL)
gnm_url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.gbff.gz'
sel_url = 'https://raw.githubusercontent.com/veg/SARS-CoV-2/compact/data/fasta/2021-01-14/comparative-annotation.json'
# Local file name of the assembly = everything after the last '/' of its URL
gnm_file = gnm_url.rpartition('/')[2]

In [4]:
# Get SARS-CoV-2 RefSeq genomes (in GenBank format) from NCBI
!wget {gnm_url}
!gunzip {gnm_file}

--2021-02-17 14:23:44--  https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.gbff.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 2607:f220:41e:250::13, 2607:f220:41e:250::12, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24256 (24K) [application/x-gzip]
Saving to: ‘GCF_009858895.2_ASM985889v3_genomic.gbff.gz’


2021-02-17 14:23:45 (132 KB/s) - ‘GCF_009858895.2_ASM985889v3_genomic.gbff.gz’ saved [24256/24256]



In [5]:
from Bio import SeqIO
# Drop the last three characters of the archive name (the ".gz" suffix) to get the decompressed file
genome = SeqIO.read(gnm_file[:-3], "genbank")

In [6]:
# Validation function for gisaid data

def check_against_genome(seqobject, df, pos_base_list, l):
    """Check the sequences listed in ``df`` against a reference record.

    For every row, the ``l``-long slice of ``seqobject`` starting at the row's
    position is compared with the row's expected sequence. When they differ,
    the slice starting one position earlier is tried as well; a match there is
    still counted as correct and the original position is remembered as a
    "slip" site.

    Rows whose expected sequence has a different length than the extracted
    slice are counted as neither correct nor wrong.

    Parameters
    ----------
    seqobject : sliceable record whose slices expose a ``.seq`` attribute
    df : pandas.DataFrame holding the sites to check
    pos_base_list : two column names, ``[position_column, sequence_column]``
    l : length of the slice to extract at each position

    Returns
    -------
    (slip_sites, wrong)
        ``slip_sites`` maps each position that only matched after the
        one-position shift to ``True``; ``wrong`` is a list of
        ``[expected, observed, position]`` for rows that matched neither way.

    A one-line summary of the counts is printed as a side effect.
    """
    site_array = df[pos_base_list].to_numpy()
    slip_sites = {}
    wrong = []
    good = 0
    for site in site_array:
        pos, expected = site[0], site[1]
        observed = seqobject[pos:pos + l].seq
        if len(observed) != len(expected):
            # Slice and expected sequence are not comparable: leave uncounted
            continue
        if observed == expected:
            good += 1
            continue
        # No match in place: retry with the window moved back by one position
        shifted = seqobject[(pos - 1):(pos - 1) + l].seq
        if shifted == expected:
            good += 1
            slip_sites[pos] = True
        else:
            wrong.append([expected, observed, pos])
    # Every wrong site was appended exactly once, so the list length is the count
    print('Total = {}, Wrong = {}, Correct = {}'.format(len(site_array), len(wrong), good))
    return (slip_sites, wrong)

In [7]:
# Loading data
gisaid = pd.read_json(sel_url,orient="index")

In [8]:
gisaid.head()

Unnamed: 0,G,S,bCFEL,bFEL,bMEME,bSC2,bSC2-aa,bcdn,baa,evo,cdn,aa,SLAC,FEL,MEME,trend,subs,hla
26522,M,1,"{'p': 1, 'a': 0, 'b-nCOV': 0, 'b': 0}","{'a': 0, 'b': 0, 'p': 1}","{'p': 1, 'a': 0, 'b+': 0, 'w+': 0, 'b-': 0, 'w...",ATG,M,"{'nCOV': {'ATG': 11}, 'others': {'ATG': 58}}","{'nCOV': {'M': 11}, 'others': {'M': 58}}",,"{'ATG': 315949, 'CT-': 7, 'AAG': 155, 'AGA': 1}","{'M': 315949, 'K': 155, 'R': 1}","{'N': 2.5, 'S': 0.5, 'EN': 2.996838544038124, ...","{'a': 417.8222848307964, 'b': 0, 'p': 0.000691...","{'p': 0.6666666666666661, 'a': 412.82941071423...",-2.046384,"{'cdn': {}, 'aa': {}, 'lcdn': {'AAG|ATG': 1, '...",
26525,M,2,"{'p': 0.163687076656009, 'a': 3.60671003067715...","{'a': 3.605986089854317, 'b': 0.86442440798198...","{'p': 0.585979417576455, 'a': 3.60965619836132...",GCA,A,"{'nCOV': {'GCT': 5, 'GGT': 3, 'GCA': 2, 'GTA':...","{'nCOV': {'A': 7, 'G': 3, 'V': 1}, 'others': {...","{'GCT': 0.194136900610685, 'GGT': 0.0183401883...","{'GCA': 314811, 'GTA': 582, 'CT-': 25, 'TCA': ...","{'A': 314973, 'V': 582, 'S': 530, 'D': 1, 'L': 1}","{'N': 17.5, 'S': 2.5, 'EN': 1.997608751216602,...","{'a': 3.421407460468902, 'b': 1.70090336909322...","{'p': 0.6666666666666661, 'a': 3.4125230238633...",9.147443,"{'cdn': {'GTA|GCA': 1, 'TCA|GCA': 3, 'GCT|GCA'...",
26528,M,3,"{'p': 0.020556384746109, 'a': 1.43385583667858...","{'a': 1.433602576251733, 'b': 0, 'p': 0.032009...","{'p': 0.6666666666666661, 'a': 1.4354484090066...",GAT,D,"{'nCOV': {'GAC': 8, 'GAT': 2, '---': 1}, 'othe...","{'nCOV': {'D': 10, '-': 1}, 'others': {'D': 25...","{'GAC': 0.8407633604173491, 'GAT': 0.092845022...","{'GAT': 314767, 'GGT': 1257, 'TAT': 48, 'CT-':...","{'D': 314767, 'G': 1257, 'Y': 48}","{'N': 10, 'S': 0, 'EN': 2.294568320426781, 'ES...","{'a': 0, 'b': 2.356117806449815, 'p': 0.010872...","{'p': 0.017510014120806, 'a': 0, 'b+': 2.37339...",-6.389396,"{'cdn': {'GGT|GAT': 1, 'TAT|GAT': 3}, 'aa': {'...",
26531,M,4,"{'p': 1, 'a': 0, 'b-nCOV': 8.314818677743428, ...","{'a': 0, 'b': 8.315740051354865, 'p': 0.177533...","{'p': 0.20095787090875203, 'a': 0, 'b+': 8.328...",TCC,S,"{'nCOV': {'---': 9, 'TCC': 1, 'AAC': 1}, 'othe...","{'nCOV': {'-': 9, 'S': 1, 'N': 1}, 'others': {...","{'TCC': 0.27949523426495504, 'AAC': 0.38062518...","{'TCC': 315801, 'TTC': 96, 'TCT': 186, 'CT-': ...","{'S': 315987, 'F': 96}","{'N': 2, 'S': 3, 'EN': 2.001452317837964, 'ES'...","{'a': 0.662652827444776, 'b': 0.82751185455001...","{'p': 0.5983302889617891, 'a': 0.6653082200434...",3.255255,"{'cdn': {'TTC|TCC': 2, 'TCT|TCC': 3}, 'aa': {'...",
26534,M,5,"{'p': 0.18760050438280002, 'a': 1.931559529077...","{'a': 1.931608593694144, 'b': 0, 'p': 0.010497...","{'p': 0.6666666666666661, 'a': 1.9335891463403...",AAC,N,"{'nCOV': {'AAC': 11}, 'others': {'AAC': 53, 'A...","{'nCOV': {'N': 11}, 'others': {'N': 56, 'T': 1...","{'AAC': 0.878888017092832, 'AAT': 0.0882367158...","{'AAC': 316108, 'CT-': 4}",{'N': 316108},"{'N': 0, 'S': 0, 'EN': 2.162391094277982, 'ES'...","{'a': 0, 'b': 0, 'p': 1}","{'p': 1, 'a': 0, 'b+': 0, 'w+': 0, 'b-': 0, 'w...",-0.927954,,


In [9]:
# Flatten the nested per-site annotation into one flat record per site.
# Some rows have no data and cannot be processed. As long as this number is small, it is ok.
# Only the failures that missing or incomplete annotation produces are skipped
# (absent key, non-subscriptable placeholder such as NaN, zero total count);
# anything else - including a keyboard interrupt - is allowed to propagate.
sel_columns = ['gene','site','fel_p','fel_a','fel_b','meme_p','meme_a','meme_b_p','meme_b_m','freq','REF','ALT','codon']
records = {}      # row label -> flat record; a repeated label overwrites in place
n_skipped = 0
for index, data in gisaid.iterrows():
    try:
        fel = data['FEL']
        meme = data['MEME']
        codon_counts = data['cdn']
        ancestral_codon = data['bSC2']
        ancestral_codon_count = codon_counts[ancestral_codon]
        sum_of_all_counts = sum(codon_counts.values())
        alt_count = sum_of_all_counts - ancestral_codon_count
        records[index] = {
            'gene': data['G'],
            'site': data['S'],
            'fel_p': fel['p'],
            'fel_a': fel['a'],
            'fel_b': fel['b'],
            'meme_p': meme['p'],
            'meme_a': meme['a'],
            'meme_b_p': meme['b+'],
            'meme_b_m': meme['b-'],
            # Fraction of counted codons that differ from the ancestral codon
            'freq': alt_count / sum_of_all_counts,
            'REF': ancestral_codon_count,
            'ALT': alt_count,
            'codon': ancestral_codon,
        }
    except (KeyError, TypeError, AttributeError, ZeroDivisionError):
        n_skipped += 1

# Build the frame once from the collected records: avoids enlarging a DataFrame
# row by row and lets pandas infer proper numeric dtypes for the columns.
sel = pd.DataFrame(list(records.values()), index=list(records.keys()), columns=sel_columns)
print('Could not process {} rows'.format(n_skipped))
del gisaid

Could not process 83 rows


In [11]:
# Keep only the "significant" sites: p-value at or below 0.05 in FEL or in MEME
# For information on the FEL and MEME methods see http://hyphy.org/methods/selection-methods/
sel = sel.loc[sel['fel_p'].le(0.05) | sel['meme_p'].le(0.05)]

In [12]:
# Turn the row labels into an ordinary column and name that column 'pos'
sel = sel.reset_index().rename(columns={'index':'pos'})

In [13]:
sel.head()

Unnamed: 0,pos,gene,site,fel_p,fel_a,fel_b,meme_p,meme_a,meme_b_p,meme_b_m,freq,REF,ALT,codon
0,26522,M,1,0.000691632,417.822,0.0,0.666667,412.829,619.244,0,0.00051564,315949,163,ATG
1,26528,M,3,0.0108724,0.0,2.35612,0.01751,0.0,2.37339,0,0.00425482,314767,1345,GAT
2,26540,M,7,0.00202402,0.0,2.60246,0.0037384,0.0,2.5982,0,0.000335324,316006,106,ACT
3,26549,M,10,0.0463498,0.0,0.966731,0.0642653,0.0,0.957421,0,0.000338488,316005,107,GTT
4,26570,M,17,0.0148766,0.0,1.44099,0.0233506,0.0,1.44975,0,0.00242636,315345,767,CTT


In [14]:
# Check against genome
# We expect some wrong here simply because we use consensus. As long as the majority is correct we are good
# Arguments: the genome record, the site table, the [position, codon] column names and the slice length (3).
# `slip` is keyed by the positions that matched only after moving the window back by one;
# `wrong` lists [expected, observed, position] for sites that matched neither way.
slip,wrong = check_against_genome(genome,sel,['pos','codon'],3)

Total = 1399, Wrong = 0, Correct = 1399


In [15]:
len(slip)

172

In [16]:
# Tag sites where coordinate needs to be reduced by 1 due to slippage at orf1a/orf1ab switch site.
# One vectorised membership test replaces the per-key loop: the 'slip' column is
# always created (all False when no site slipped), holds plain booleans instead
# of True/NaN, and is recomputed from scratch when the cell is re-run.
sel["slip"] = sel["pos"].isin(list(slip))

In [17]:
# Normalise the 'slip' flag to a strict boolean column.
# Only 'slip' is touched: a whole-frame fillna(False) would also write False into
# any missing value of the numeric/text columns.
sel = sel.infer_objects()  # keep numeric columns numeric without filling anything
if "slip" in sel.columns:
    # True stays True; NaN (untagged) and anything else become False
    sel["slip"] = sel["slip"].eq(True)
else:
    # No site was tagged at all, so the column was never created
    sel["slip"] = False

In [18]:
sel

Unnamed: 0,pos,gene,site,fel_p,fel_a,fel_b,meme_p,meme_a,meme_b_p,meme_b_m,freq,REF,ALT,codon,slip
0,26522,M,1,0.000692,417.822285,0.000000,0.666667,412.829411,619.244116,0.000000,0.000516,315949,163,ATG,False
1,26528,M,3,0.010872,0.000000,2.356118,0.017510,0.000000,2.373394,0.000000,0.004255,314767,1345,GAT,False
2,26540,M,7,0.002024,0.000000,2.602464,0.003738,0.000000,2.598201,0.000000,0.000335,316006,106,ACT,False
3,26549,M,10,0.046350,0.000000,0.966731,0.064265,0.000000,0.957421,0.000000,0.000338,316005,107,GTT,False
4,26570,M,17,0.014877,0.000000,1.440988,0.023351,0.000000,1.449747,0.000000,0.002426,315345,767,CTT,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1394,23011,S,484,0.001406,0.000000,15.519778,0.002641,0.000950,15.559239,0.000034,0.000999,256934,257,GAA,False
1395,23026,S,489,0.003169,2.387321,0.000000,0.666667,2.383996,4.516279,0.000000,0.000684,257021,176,TAC,False
1396,23029,S,490,0.039912,0.000000,1.013725,0.056271,0.000000,1.011087,0.000000,0.000163,257133,42,TTT,False
1397,23062,S,501,0.000005,0.365282,9.767394,0.000002,0.302641,1336.128177,0.000000,0.049682,244417,12778,AAT,False


In [19]:
# Decrease 'pos' by 1 for every site tagged slip == True.
# NOTE: not idempotent - 'slip' stays True afterwards, so re-running this cell subtracts 1 again.
sel.loc[sel['slip']==True, 'pos'] = sel['pos']-1

In [20]:
sel[sel['pos']==13515]

Unnamed: 0,pos,gene,site,fel_p,fel_a,fel_b,meme_p,meme_a,meme_b_p,meme_b_m,freq,REF,ALT,codon,slip
819,13515,ORF1b,17,0.044452,0.0,14.310192,0.061978,0.0,14.456635,0.0,0.001042,212916,222,ACT,True


In [21]:
# Write the table as tab-separated text, without the row index
sel.to_csv('selection.tsv',sep='\t',index=False)

In [22]:
# Hand the written TSV to the browser as a download.
# NOTE(review): relies on the google.colab package, so this cell presumably only works on Colab.
from google.colab import files
files.download('selection.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>