# Preparing variants of concern (VOC) site data

This notebook validates VOC sites from https://cov-lineages.org/global_report.html and https://www.biorxiv.org/content/10.1101/2020.12.31.425021v1 against the SARS-CoV-2 reference genome and writes them to a tab-delimited dataset. The validation is necessary to ensure that the coordinates match the genome exactly.


In [None]:
!pip install biopython

In [None]:
# URL of the gzipped GenBank flat file for the SARS-CoV-2 genome assembly
gnm_url = (
    'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/'
    'GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.gbff.gz'
)
# Local file name: everything after the last '/' of the URL
gnm_file = gnm_url.rpartition('/')[2]

In [None]:
# Sites of interest, grouped by set: {set name: {mutation label: genome site}}.
# For non-deletion entries the validation cell translates genome[site:site+3] and
# compares the result with the first letter of the label, so each site is used as
# the 0-based offset of the first base of a codon.
# Key order is kept as-is because it determines the row order of voc.tsv.
voc = {
    # B.1.351 set from https://cov-lineages.org/global_report_B.1.351.html
    'B1351': {
        'P71L': 26454,
        'T205I': 28885,
        'K1655N': 5227,
        'D80A': 21799,
        'D215G': 22204,
        'K417N': 22810,
        'A701V': 23662,
        'N501Y': 23062,
        'E484K': 23011,
    },
    # P.1 from https://cov-lineages.org/global_report_P.1.html
    'P1': {
        'S1188L': 3826,
        'K1795Q': 5647,
        'del': 11288,
        'L18F': 21613,
        'T20N': 21619,
        'P26S': 21637,
        'D138Y': 21973,
        'R190S': 22129,
        'K417T': 22810,
        'E484K': 23011,
        'N501Y': 23062,
        'H655Y': 23524,
        'T1027I': 24640,
        'G174C': 25911,
        'E92K': 28166,
        'P80R': 28510,
    },
    # B.1.1.7 from https://cov-lineages.org/global_report_B.1.1.7.html
    'B117': {
        'T1001I': 3265,
        'A1708D': 5386,
        'I2230T': 6952,
        'del9': 11288,
        'del6': 21765,
        'del3': 21991,
        'N501Y': 23062,
        'A570D': 23269,
        'P681H': 23602,
        'T716I': 23707,
        'S982A': 24505,
        'D1118H': 24913,
        'Q27stop': 27971,
        'Y73C': 28109,
        'D3L': 28279,
        'S235F': 28975,
    },
    # Evolving sites from https://www.biorxiv.org/content/10.1101/2020.12.31.425021v1
    'BLOOM': {
        'E484': 23011,
        'F456': 22927,
        'F486': 23017,
        'F490': 23029,
        'G446': 22897,
        'G447': 22900,
        'G485': 23014,
        'G496': 23047,
        'I472': 22975,
        'K444': 22891,
        'L455': 22924,
        'N448': 22903,
        'N450': 22909,
        'P384': 22711,
        'S383': 22708,
        'S443': 22888,
        'V445': 22894,
        'Y365': 22654,
        'Y369': 22666,
        'Y449': 22906,
        'Y473': 22978,
    },
    # A.23.1 from https://cov-lineages.org/global_report_A.23.1.html
    'A231': {
        'F157L': 22030,
        'V367F': 22660,
        'Q613H': 23398,
        'P681': 23602,
    },
}

In [None]:
# Get SARS-CoV-2 RefSeq genomes (in GenBank format) from NCBI
!wget {gnm_url}
!gunzip {gnm_file}

In [None]:
from Bio import SeqIO
# Load the single GenBank record from the extracted file, whose name is the
# archive name with its last three characters (".gz") removed.
genome = SeqIO.read(
    gnm_file[:len(gnm_file) - 3],
    "genbank",
)

In [None]:
# Validate the sites above against the SARS-CoV-2 genome.
# For every non-deletion entry the codon genome[site:site+3] is translated and
# compared with the first letter of the mutation label.
# If everything is OK this cell produces no output. Otherwise each mismatch is
# listed under its set name and the cell raises, so the notebook does not go on
# to write voc.tsv from coordinates that failed validation.

mismatches = []
for voc_set in voc:
    header_shown = False  # print the set header once, not once per mismatch
    for key,site in voc[voc_set].items():
        if key.startswith('del'):
            continue  # 'del*' labels carry no residue letter to compare against
        aa = genome[site:site+3].translate()[0]
        if aa != key[0]:
            if not header_shown:
                print(voc_set)
                print('='*10)
                header_shown = True
            # columns: label, site, expected residue, translated residue, match flag
            print('{}\t{}\t{}\t{}\t{}'.format(key,site,key[0],aa,aa==key[0]))
            mismatches.append((voc_set,key,site))

if mismatches:
    raise ValueError('{} site(s) do not match the genome'.format(len(mismatches)))

In [None]:
# Write file voc.tsv with one row per site: [voc_set]\t[mutation]\t[genome position]
# The context manager closes the file even if a write fails part-way through.

with open('voc.tsv','w') as f:
    for voc_set in voc:
        for key,site in voc[voc_set].items():
            f.write('{}\t{}\t{}\n'.format(voc_set,key,site))

In [None]:
# Preview the first lines of the generated voc.tsv as a quick sanity check
!head voc.tsv

In [None]:
# Offer voc.tsv as a browser download when running in Google Colab.
# Outside Colab the google.colab package is unavailable, so the cell reports
# where the file is instead of failing with ImportError.
try:
    from google.colab import files
except ImportError:
    print('Not running in Google Colab; voc.tsv was written to the current working directory.')
else:
    files.download('voc.tsv')