# Load SARS-CoV-2 Strain Variant Data from CNCB
**[Work in progress]**

This notebook downloads and standardizes viral strain variation data from CNCB for ingestion into a Knowledge Graph.

Data source: [China National Center for Bioinformation, 2019 Novel Coronavirus Resource (2019nCoVR)](https://bigd.big.ac.cn/ncov/release_genome)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import shutil
import glob
import ftplib
import re
import dateutil
import pandas as pd
from pathlib import Path

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
# Path will take care of handling operating system differences.
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


In [4]:
# Create a directory to cache variation data that could not be parsed
CACHE_FAILED = Path(NEO4J_IMPORT / 'cache/failed/cncb')
CACHE_FAILED.mkdir(parents=True, exist_ok=True)

In [5]:
# Create a directory to cache raw variation data gff3 files
CACHE_RAW_CNCB = Path(NEO4J_IMPORT / 'cache/raw/cncb')
CACHE_RAW_CNCB.mkdir(parents=True, exist_ok=True)

In [6]:
# Create a directory to cache processed variation data csv files
CACHE_PROCESSED_CNCB = Path(NEO4J_IMPORT / 'cache/processed/cncb')
CACHE_PROCESSED_CNCB.mkdir(parents=True, exist_ok=True)

## Download SARS-CoV-2 Variation Data

Limit currently set to 5000 downloads per day to avoid timeouts

In [7]:
limit = 5000

Variation data are available on this FTP server
ftp://download.big.ac.cn/GVM/Coronavirus/gff3/" 

In [8]:
ftp_server = "download.big.ac.cn"
user = "anonymous"
password = "anonymous"
source = "/GVM/Coronavirus/gff3/"

Create a list of local and remote directories

In [9]:
# directories on the FTP server
remote_dirs = []
# local directories to cache raw data files
raw_dirs = []
# local directories to cache processed data files
proc_dirs = []

# subdirectories are named: a ... n (note, this may change)
subdirs = [chr(x) for x in range(ord('a'), ord('n') + 1)]

for subdir in subdirs:
    remote_dirs.append(source + subdir)
    raw_dirs.append(CACHE_RAW_CNCB / subdir)
    proc_dirs.append(CACHE_PROCESSED_CNCB / subdir)

Setup local cache directories if they don't exit

In [10]:
for subdir in raw_dirs:
    subdir.mkdir(exist_ok=True)
for subdir in proc_dirs:   
    subdir.mkdir(exist_ok=True)

Download and cache data files with variant information

In [11]:
def update_cache(ftp_server, user, password, remote_dir, raw_dir, limit):
    ftp = ftplib.FTP(ftp_server)
    ftp.login(user, password)
    ftp.cwd(remote_dir)
    
    downloads = 0
    for file in ftp.nlst():
        filename = raw_dir / file
        if not Path.exists(filename) and downloads < limit:
            downloads += 1
            with open(filename, 'wb') as f:
                ftp.retrbinary(f'RETR {file}', f.write)

    ftp.quit()
    return downloads

In [12]:
for remote_dir, raw_dir in zip(remote_dirs, raw_dirs):
    downloads = update_cache(ftp_server, user, password, remote_dir, raw_dir, limit)
    print(f'downloaded {downloads}: {remote_dir} -> {raw_dir}')

downloaded 25: /GVM/Coronavirus/gff3/a -> /Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import/cache/raw/cncb/a
downloaded 25: /GVM/Coronavirus/gff3/b -> /Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import/cache/raw/cncb/b
downloaded 0: /GVM/Coronavirus/gff3/c -> /Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import/cache/raw/cncb/c
downloaded 0: /GVM/Coronavirus/gff3/d -> /Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import/cache/raw/cncb/d
downloaded 0: /GVM/Coronavirus/gff3/e -> /Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import/cache/raw/cncb/e
downloaded 0: /GVM/Coronavirus/gff3/f -> /Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf

In [13]:
def parse_strain_id(filename):
    # parse strain id from file name, e.g. ..cache/raw/cncb/a/2019-nCoV_EPI_ISL_484968_variants.gff3 -> EPI_ISL_484968
    return Path(filename).stem[10:-9]

In [14]:
def parse_base_path(filename):
    # parse base path from file name, e.g. ..cache/raw/cncb/n/2019-nCoV_EPI_ISL_484968_variants.gff3 -> /n/2019-nCoV_EPI_ISL_484968_variants.
    return re.split('cncb', filename)[1][1:-5]

Create a dataframe of raw files and strain identifiers

In [15]:
raw_files = glob.glob(f'{CACHE_RAW_CNCB}/*/*.gff3')
raw_files_df = pd.DataFrame(raw_files, columns=['raw_filename'])
raw_files_df['id'] = raw_files_df['raw_filename'].apply(parse_strain_id)
print("Cached raw gff3 files:", raw_files_df.shape[0])

Cached raw gff3 files: 5262


Create a dataframe of processed files and strain identifiers

In [16]:
proc_files = glob.glob(f'{CACHE_PROCESSED_CNCB}/*/*.csv')
proc_files_df = pd.DataFrame(proc_files, columns=['proc_filename'])
proc_files_df['id'] = proc_files_df['proc_filename'].apply(parse_strain_id)
print("Cached processed csv files:", proc_files_df.shape[0])

Cached processed csv files: 5203


In [17]:
unproc_files_df = pd.merge(raw_files_df, proc_files_df, on='id', how='outer', indicator=True).query('_merge=="left_only"')

In [18]:
print("Files to be processed:", unproc_files_df.shape[0])

Files to be processed: 53


In [19]:
names = ['taxon1', 'variantType', 'name', 'start', 'end','x1', 'x2', 'x3','taxon2', 'x4', 'ref', 'alt', 'vepAnnotation']

In [20]:
def split_vep(record):
    # split variant effect predictor record
    items = record.split(',')
    num_items = len(items)
    # example: ['intergenic_variant']
    if num_items == 1:
        return items[0] + ',,,'
    # example: ['missense_variant', 'QHD43415.1:p.1036D>E', 'gene-orf1ab:c.3108gaC>gaA']
    elif num_items == 3:
        return items[0] + ',' + items[1] + ',' + items[2] + ','
    # example: ['upstream_gene_variant', 'DISTANCE=25', 'QHD43415.1', 'gene-orf1ab']     
    elif num_items == 4:
        return items[0] + ',' +  items[2] + ',' + items[3] + ',' + items[1]
    else:
        return ',,,'

In [21]:
def parse_gff3(raw_file):
    filename =  parse_base_path(raw_file)  + '.csv'
    #print('parsing:', raw_file, filename)
    gff3 = pd.read_csv(raw_file, header=None, comment='#', sep='[\t;]', engine='python', names=names)
    try:
        gff3['vepAnnotation'] = gff3['vepAnnotation'].str.replace('VEP=','')
        gff3['vepAnnotation'] = gff3['vepAnnotation'].apply(split_vep)
        gff3[['variantConsequence','proteinVariant','geneVariant', 'distance']] = gff3['vepAnnotation'].str.split(',', expand=True)
        gff3['geneVariant'] = gff3['geneVariant'].str.replace('gene-','')
        gff3['distance'] = gff3['distance'].str.replace('DISTANCE=', '')
        gff3['ref'] = gff3['ref'].str.replace('REF=','')
        gff3['alt'] = gff3['alt'].str.replace('ALT=','')
        gff3 = gff3[['name', 'variantType', 'start', 'end', 'ref', 'alt', 'variantConsequence', 'proteinVariant', 'geneVariant', 'distance']]
        gff3['id'] = parse_strain_id(raw_file)

        gff3.to_csv(CACHE_PROCESSED_CNCB / filename, index=False)
        return True
    except:
        print('Parsing failed for: ', filename)
        # cache files that failed to be parsed for manual inspection
        shutil.copy(raw_file, CACHE_FAILED)
        return False

In [22]:
status = raw_files_df['raw_filename'].apply(parse_gff3)

Parsing failed for:  a/2019-nCoV_EPI_ISL_424491_variants.csv
Parsing failed for:  a/2019-nCoV_EPI_ISL_424471_variants.csv
Parsing failed for:  a/2019-nCoV_EPI_ISL_418635_variants.csv
Parsing failed for:  a/2019-nCoV_EPI_ISL_424538_variants.csv
Parsing failed for:  a/2019-nCoV_EPI_ISL_417602_variants.csv
Parsing failed for:  a/2019-nCoV_EPI_ISL_423003_variants.csv
Parsing failed for:  a/2019-nCoV_EPI_ISL_420580_variants.csv
Parsing failed for:  a/2019-nCoV_EPI_ISL_415583_variants.csv
Parsing failed for:  a/2019-nCoV_EPI_ISL_417591_variants.csv
Parsing failed for:  a/2019-nCoV_EPI_ISL_420534_variants.csv


#### Standardize node property names (CURIEs and URIs)

In [23]:
# https://registry.identifiers.org/registry/insdc
insdc_pattern = re.compile('^([A-Z]\d{5}|[A-Z]{2}\d{6}|[A-Z]{4}\d{8}|[A-J][A-Z]{2}\d{5})(\.\d+)?$')
# https://registry.identifiers.org/registry/refseq
refseq_pattern = re.compile('^(((AC|AP|NC|NG|NM|NP|NR|NT|NW|XM|XP|XR|YP|ZP)_\d+)|(NZ\_[A-Z]{2,4}\d+))(\.\d+)?$')

In [24]:
def assign_curie(id):
    id = id.strip()
    # remove underscore to enable CURIE matching of NCBI reference sequences NC_...
    #id = id.replace('NC_', 'NC') 
    if len(id) > 0:
        if id.startswith('EPI'):
            return 'https://www.gisaid.org/' + id
        elif refseq_pattern.match(id) != None:
            return 'refseq:' + id
        elif insdc_pattern.match(id) != None:
            return 'insdc:' + id
        else:
            # TODO are URIs available for these cases?
            return id
    else:
        return id

In [25]:
# use all processed data files
path = str(CACHE_PROCESSED_CNCB / '*/*.csv')
filenames = glob.glob(path)

variations = pd.concat((pd.read_csv(f, index_col=None, header=0) for f in filenames))
variations.fillna('', inplace=True)

print('Number of cached files loaded:',len(filenames))

Number of cached files loaded: 5252


List of variant types and consequences:

https://uswest.ensembl.org/info/genome/variation/prediction/classification.html

https://uswest.ensembl.org/info/genome/variation/prediction/predicted_data.html#consequences

#### Extract protein position and protein id from proteinVariant string

Example: QHD43415.1:p.5828P>L

proteinPosition: 5828
proteinId: QHD43415

In [26]:
position_pattern = re.compile(':p\.(.*?)[A-Z|\-]+')

In [27]:
def extract_protein_position(s):
    if s == '':
        return s
    else:
        groups = position_pattern.search(s)
        if groups == None:
            return ''
        else:
            return groups.group(1)

In [28]:
variations['proteinPosition'] = variations['proteinVariant'].apply(extract_protein_position)

In [29]:
variations['proteinAccession'] = variations['proteinVariant'].apply(lambda s: s.split('.')[0] if '.' in s else '')

In [30]:
variations['proteinAccession'].unique()

array(['', 'QHD43415', 'QHD43416', 'QHD43417', 'QHI42199', 'QHD43418',
       'QHD43419', 'QHD43423', 'QHD43421', 'QHD43422', 'QHD43420'],
      dtype=object)

#### Assign SARS-CoV-2 taxonomy id

In [31]:
variations['taxonomyId'] = 'taxonomy:2697049'

#### Assign Reference genome

The first SARS-CoV-2 genome sequence is the reference for the variant annotation below.

[Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1](https://www.ncbi.nlm.nih.gov/nuccore/MN908947)

In [32]:
variations['referenceGenome'] = 'insdc:MN908947' # same as NCBI reference sequence NC_045512

In [33]:
variations['proteinAccession'] = variations['proteinAccession'].apply(lambda s: 'ncbiprotein:' + s if s != '' else s)

In [34]:
variations['id'] = variations['id'].apply(assign_curie)

Fix a misspelled terms

In [35]:
variations['variantConsequence'] = variations['variantConsequence'].replace('intergenic_variant||intergenic_variant', 'intergenic_variant')

In [36]:
variations['variantConsequence'] = variations['variantConsequence'].replace('intergenic_variant||intergenic_varia', 'intergenic_variant')

In [37]:
print('variantType:', variations['variantType'].unique())

variantType: ['Deletion' 'SNP' 'Insertion' 'Indel']


In [38]:
print("variantConsequence:", variations['variantConsequence'].unique())

variantConsequence: ['intergenic_variant' 'upstream_gene_variant' 'missense_variant'
 'synonymous_variant' 'inframe_deletion' 'downstream_gene_variant'
 'coding_sequence_variant' 'stop_gained' 'frameshift_variant'
 'protein_altering_variant' 'inframe_insertion' 'stop_lost' 'start_lost']


In [39]:
variations.head()

Unnamed: 0,name,variantType,start,end,ref,alt,variantConsequence,proteinVariant,geneVariant,distance,id,proteinPosition,proteinAccession,taxonomyId,referenceGenome
0,hCoV-19/USA/WA-UW-1446/2020,Deletion,1,7,AATTAAAG,-,intergenic_variant,,,,https://www.gisaid.org/EPI_ISL_423010,,,taxonomy:2697049,insdc:MN908947
1,hCoV-19/USA/WA-UW-1446/2020,SNP,34,34,A,T,intergenic_variant,,,,https://www.gisaid.org/EPI_ISL_423010,,,taxonomy:2697049,insdc:MN908947
2,hCoV-19/USA/WA-UW-1446/2020,SNP,35,35,A,T,intergenic_variant,,,,https://www.gisaid.org/EPI_ISL_423010,,,taxonomy:2697049,insdc:MN908947
3,hCoV-19/USA/WA-UW-1446/2020,SNP,36,36,C,T,intergenic_variant,,,,https://www.gisaid.org/EPI_ISL_423010,,,taxonomy:2697049,insdc:MN908947
4,hCoV-19/USA/WA-UW-1446/2020,SNP,37,37,C,A,intergenic_variant,,,,https://www.gisaid.org/EPI_ISL_423010,,,taxonomy:2697049,insdc:MN908947


In [40]:
print("Number of variants:", variations.shape[0])

Number of variants: 78157


In [41]:
variations.to_csv(NEO4J_IMPORT / "01c-CNCBVariant.csv", index=False)

In [42]:
strains = pd.read_csv(NEO4J_IMPORT / "01c-CNCBStrainPre.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [43]:
strains.shape

(294247, 16)

In [44]:
strains.head()

Unnamed: 0,name,accession,accessions,gisaidId,source,taxonomyId,hostTaxonomyId,lineage,sequenceLength,completeness,gender,age,collectionDate,location,origLocation,originatingLab
0,hCoV-19/Brazil/AP162741-IEC/2020,https://www.gisaid.org/EPI_ISL_458138,https://www.gisaid.org/EPI_ISL_458138;,https://www.gisaid.org/EPI_ISL_458138,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,29903,Complete,Female,37.0,2020-04-03,Brazil / Amapa,"Brazil,Amapa",﻿Evandro Chagas Institute
1,hCoV-19/Brazil/PA-IEC-164747/2020,https://www.gisaid.org/EPI_ISL_524783,https://www.gisaid.org/EPI_ISL_524783;,https://www.gisaid.org/EPI_ISL_524783,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.28,29903,Complete,Female,25.0,2020-04-28,Brazil / Para,"Brazil,Para",﻿Evandro Chagas Institute
2,hCoV-19/Brazil/AC162535-IEC/2020,https://www.gisaid.org/EPI_ISL_458139,https://www.gisaid.org/EPI_ISL_458139;,https://www.gisaid.org/EPI_ISL_458139,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,29903,Complete,Male,81.0,2020-03-18,Brazil / Acre,"Brazil,Acre",﻿Evandro Chagas Institute
3,hCoV-19/Brazil/AP-IEC-165513/2020,https://www.gisaid.org/EPI_ISL_524784,https://www.gisaid.org/EPI_ISL_524784;,https://www.gisaid.org/EPI_ISL_524784,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,29903,Complete,Male,49.0,2020-04-29,Brazil / Amapa,"Brazil,Amapa",﻿Evandro Chagas Institute
4,hCoV-19/Brazil/PA162802-IEC/2020,https://www.gisaid.org/EPI_ISL_458140,https://www.gisaid.org/EPI_ISL_458140;,https://www.gisaid.org/EPI_ISL_458140,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.28,29903,Complete,Male,63.0,2020-04-07,Brazil / Para,"Brazil,Para",﻿Evandro Chagas Institute


In [45]:
strains_acc = strains[['accession', 'accessions']].copy()
strains_acc['id'] = strains_acc['accessions'].str.split(';')
strains_acc = strains_acc.explode('id')
strains_acc['id'] = strains_acc['id'].str.strip()

In [46]:
strains_acc.shape

(587867, 3)

In [47]:
strains_acc.head()

Unnamed: 0,accession,accessions,id
0,https://www.gisaid.org/EPI_ISL_458138,https://www.gisaid.org/EPI_ISL_458138;,https://www.gisaid.org/EPI_ISL_458138
0,https://www.gisaid.org/EPI_ISL_458138,https://www.gisaid.org/EPI_ISL_458138;,
1,https://www.gisaid.org/EPI_ISL_524783,https://www.gisaid.org/EPI_ISL_524783;,https://www.gisaid.org/EPI_ISL_524783
1,https://www.gisaid.org/EPI_ISL_524783,https://www.gisaid.org/EPI_ISL_524783;,
2,https://www.gisaid.org/EPI_ISL_458139,https://www.gisaid.org/EPI_ISL_458139;,https://www.gisaid.org/EPI_ISL_458139


In [48]:
strains_acc.drop_duplicates(inplace=True)
strains_acc.shape

(587867, 3)

### Map variants to strains

In [49]:
var_ids = variations['id'].copy()
var_ids.drop_duplicates(inplace=True)

In [50]:
var_ids.shape

(5246,)

In [51]:
strains_map = strains_acc.merge(var_ids, on='id')

In [52]:
strains_map.head()

Unnamed: 0,accession,accessions,id
0,NMDC60013101-01,NMDC60013101-01;https://www.gisaid.org/EPI_ISL...,https://www.gisaid.org/EPI_ISL_422425
1,https://www.gisaid.org/EPI_ISL_420080,https://www.gisaid.org/EPI_ISL_420080;,https://www.gisaid.org/EPI_ISL_420080
2,https://www.gisaid.org/EPI_ISL_420081,https://www.gisaid.org/EPI_ISL_420081;,https://www.gisaid.org/EPI_ISL_420081
3,https://www.gisaid.org/EPI_ISL_415710,https://www.gisaid.org/EPI_ISL_415710;,https://www.gisaid.org/EPI_ISL_415710
4,https://www.gisaid.org/EPI_ISL_514514,https://www.gisaid.org/EPI_ISL_514514;,https://www.gisaid.org/EPI_ISL_514514


In [53]:
strains_map.shape

(5243, 3)

In [54]:
strains_var = strains.merge(strains_map[['accession', 'id']], on='accession', how='outer')

In [55]:
strains_var['id'].fillna('', inplace=True)

In [56]:
strains_var['id'] = strains_var[['id', 'accession']].apply(lambda x: x[0] if x[0] != '' else x[1], axis=1)

In [57]:
strains_var.shape

(294248, 17)

In [58]:
strains_var.head()

Unnamed: 0,name,accession,accessions,gisaidId,source,taxonomyId,hostTaxonomyId,lineage,sequenceLength,completeness,gender,age,collectionDate,location,origLocation,originatingLab,id
0,hCoV-19/Brazil/AP162741-IEC/2020,https://www.gisaid.org/EPI_ISL_458138,https://www.gisaid.org/EPI_ISL_458138;,https://www.gisaid.org/EPI_ISL_458138,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,29903,Complete,Female,37.0,2020-04-03,Brazil / Amapa,"Brazil,Amapa",﻿Evandro Chagas Institute,https://www.gisaid.org/EPI_ISL_458138
1,hCoV-19/Brazil/PA-IEC-164747/2020,https://www.gisaid.org/EPI_ISL_524783,https://www.gisaid.org/EPI_ISL_524783;,https://www.gisaid.org/EPI_ISL_524783,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.28,29903,Complete,Female,25.0,2020-04-28,Brazil / Para,"Brazil,Para",﻿Evandro Chagas Institute,https://www.gisaid.org/EPI_ISL_524783
2,hCoV-19/Brazil/AC162535-IEC/2020,https://www.gisaid.org/EPI_ISL_458139,https://www.gisaid.org/EPI_ISL_458139;,https://www.gisaid.org/EPI_ISL_458139,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,29903,Complete,Male,81.0,2020-03-18,Brazil / Acre,"Brazil,Acre",﻿Evandro Chagas Institute,https://www.gisaid.org/EPI_ISL_458139
3,hCoV-19/Brazil/AP-IEC-165513/2020,https://www.gisaid.org/EPI_ISL_524784,https://www.gisaid.org/EPI_ISL_524784;,https://www.gisaid.org/EPI_ISL_524784,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,29903,Complete,Male,49.0,2020-04-29,Brazil / Amapa,"Brazil,Amapa",﻿Evandro Chagas Institute,https://www.gisaid.org/EPI_ISL_524784
4,hCoV-19/Brazil/PA162802-IEC/2020,https://www.gisaid.org/EPI_ISL_458140,https://www.gisaid.org/EPI_ISL_458140;,https://www.gisaid.org/EPI_ISL_458140,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.28,29903,Complete,Male,63.0,2020-04-07,Brazil / Para,"Brazil,Para",﻿Evandro Chagas Institute,https://www.gisaid.org/EPI_ISL_458140


In [59]:
strains_var.to_csv(NEO4J_IMPORT / "01c-CNCBStrain.csv", index=False)