# Load SARS-CoV-2 Virus Strain Metadata from CNCB
**[Work in progress]**

This notebook downloads and standardizes viral strain data from CNCB for ingestion into a Knowledge Graph.

Data source: [China National Center for Bioinformation, 2019 Novel Coronavirus Resource (2019nCoVR)](https://bigd.big.ac.cn/ncov/release_genome)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import ftplib
import glob
import json
import os
import re
import shutil
from pathlib import Path

import dateutil
import dateutil.parser
import numpy as np
import pandas as pd
import requests

In [2]:
# Turn off truncation of DataFrame output in this notebook.
pd.set_option('display.max_rows', None)  # display all rows
pd.set_option('display.max_columns', None)  # display all columns

In [3]:
# Label of the data source and of the lineage-assignment software.
# Pangolin lineage (https://github.com/cov-lineages/pangolin)
source, software = 'CNCB', 'pangolin v.2.3.8'

In [4]:
# Download endpoint for the CNCB strain metadata (read below as a tab-separated table).
metadata_url = 'https://bigd.big.ac.cn/ncov/genome/export/meta'

In [5]:
# Directory into which the output CSV is written, taken from the NEO4J_IMPORT
# environment variable.
if 'NEO4J_IMPORT' not in os.environ:
    # os.getenv would return None here and Path(None) fails with an opaque
    # TypeError; stop early with a message that says what to fix.
    raise EnvironmentError(
        "Environment variable NEO4J_IMPORT is not set; "
        "point it at the Neo4j import directory before running this notebook."
    )

# Path will take care of handling operating system differences.
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Download strain metadata

In [6]:
# Read the tab-separated metadata export directly from the URL, keeping every
# column as text.
# Alternative loader kept for reference: pd.read_excel(metadata_url, dtype='str')
df = pd.read_csv(
    filepath_or_buffer=metadata_url,
    delimiter='\t',
    dtype='str',
)

In [7]:
# Replace missing values with empty strings, modifying df in place.
df.fillna(value='', inplace=True)

In [8]:
# Report how many rows (strains) were downloaded.
print("Total number of strains:", len(df))

Total number of strains: 1398222


In [9]:
# Preview the first five rows of the raw metadata.
df.head(n=5)

Unnamed: 0,Virus Strain Name,Accession ID,Data Source,Related ID,Lineage,Nuc.Completeness,Sequence Length,Sequence Quality,Quality Assessment,`Host,Sample Collection Date,Location,Originating Lab,Submit Date,Submission Date,Submitting Lab,Create Time,Last Update Time
0,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01,NMDC,EPI_ISL_402132,B,Complete,29848,High,0/0/-/-/-,Homo sapiens,2019-12-30,China / Hubei,Hubei Provincial Center for Disease Control an...,2020-01-19,Hubei Provincial Center for Disease Control an...,2020-01-20 12:04:48,2021-03-16 10:46:59,
1,hCoV-19/Thailand/74/2020,EPI_ISL_403963,GISAID,,B,Complete,29859,High,0/0/-/-/-,Homo sapiens,2020-01-13,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 12:04:48,2021-03-16 10:46:59,
2,hCoV-19/Thailand/61/2020,EPI_ISL_403962,GISAID,,B,Complete,29848,High,0/0/-/-/-,Homo sapiens,2020-01-08,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 12:04:48,2021-03-16 10:46:59,
3,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01,NMDC,EPI_ISL_402120,B,Complete,29896,High,0/0/-/-/-,Homo sapiens,2020-01-01,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-11,National Institute for Viral Disease Control a...,2020-01-20 12:04:48,2021-03-16 10:46:59,
4,BetaCoV/Wuhan/IVDC-HB-01/2019,NMDC60013084-01,NMDC,EPI_ISL_402119,B,Complete,29891,High,0/0/-/-/-,Homo sapiens,2019-12-30,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-10,National Institute for Viral Disease Control a...,2020-01-20 12:04:48,2021-03-16 10:46:59,


### Assign identifiers, aliases, and compact identifiers (CURIEs)

In [10]:
# Regular expressions used to recognise the type of an accession number.
# Raw string literals keep the regex escapes (\d, \., \_) intact without relying
# on Python passing unknown string escapes through, which triggers a
# DeprecationWarning/SyntaxWarning on recent Python versions.

# https://registry.identifiers.org/registry/insdc
insdc_pattern = re.compile(r'^([A-Z]\d{5}|[A-Z]{2}\d{6}|[A-Z]{4}\d{8}|[A-J][A-Z]{2}\d{5})(\.\d+)?$')
# https://registry.identifiers.org/registry/refseq
refseq_pattern = re.compile(r'^(((AC|AP|NC|NG|NM|NP|NR|NT|NW|XM|XP|XR|YP|ZP)_\d+)|(NZ\_[A-Z]{2,4}\d+))(\.\d+)?$')
# GISAID accession: the literal prefix 'EPI_ISL_' followed by one or more digits
epi_pattern = re.compile(r'^EPI_ISL_\d+$')

In [11]:
def assign_curie(id):
    """Prefix a single accession with the identifier scheme it matches.

    Surrounding whitespace is removed first. The patterns are tried in the
    order GISAID, RefSeq, INSDC; the first match decides the prefix
    ('https://www.gisaid.org/', 'refseq:' or 'insdc:'). An empty string or an
    id that matches no pattern is returned stripped but otherwise unchanged.
    """
    id = id.strip()
    if not id:
        return id

    prefix_by_pattern = (
        (epi_pattern, 'https://www.gisaid.org/'),
        (refseq_pattern, 'refseq:'),
        (insdc_pattern, 'insdc:'),
    )
    for pattern, prefix in prefix_by_pattern:
        if pattern.match(id) is not None:
            return prefix + id

    # TODO are URIs available for these cases?
    return id

In [12]:
def assign_curies(ids):
    """Split a comma-separated id string and convert each piece with assign_curie.

    Returns the converted ids as a list, in their original order.
    """
    return list(map(assign_curie, ids.split(',')))

In [13]:
def get_gisaid_id(ids):
    """Return the first entry of ``ids`` that starts with the GISAID URI prefix.

    An empty string is returned when no entry carries that prefix.
    """
    gisaid_prefix = 'https://www.gisaid.org/'
    return next((entry for entry in ids if entry.startswith(gisaid_prefix)), '')

#### Rename and concatenate fields

In [14]:
# Remove '-' from the related ids. regex=False makes the literal replacement
# explicit instead of depending on the pandas-version-specific default.
# NOTE(review): this drops every hyphen, including hyphens inside real ids
# (e.g. 'NMDC60013088-01'); if '-' is only meant as an "empty" marker, an
# exact-match replacement would be safer - confirm against the data.
df['Related ID'] = df['Related ID'].str.replace('-', '', regex=False)

# combine all ids into an accession column and assign curies
df['accessions'] = df['Accession ID'] + df['Related ID'].apply(lambda s: ',' + s if len(s) > 0 else s)
# comma-separated string -> list of prefixed ids
df['accessions'] = df['accessions'].apply(assign_curies)
# first GISAID entry of the list, or '' if there is none
df['gisaidId'] = df['accessions'].apply(get_gisaid_id)
# list -> ';'-separated string
df['accessions'] = df['accessions'].apply(';'.join)
# primary accession, prefixed the same way as the combined list
df['accession'] = df['Accession ID'].apply(assign_curie)

In [15]:
# Rename the CNCB column headers to the property names used downstream,
# in a single in-place call.
df.rename(
    columns={
        'Virus Strain Name': 'name',
        'Data Source': 'source',
        'Lineage': 'lineage',
        'Sequence Length': 'sequenceLength',
        'Sequence Quality': 'sequenceQuality',
        'Quality Assessment': 'qualityAssessment',
        'Sample Collection Date': 'collectionDate',
        'Location': 'location',
        'Originating Lab': 'originatingLab',
    },
    inplace=True,
)

In [16]:
# Blank out the '-' and 'None' markers in the lineage column. regex=False makes
# both replacements literal regardless of the pandas version's default.
df['lineage'] = (
    df['lineage']
    .str.replace('-', '', regex=False)
    .str.replace('None', '', regex=False)
)

In [17]:
# List the distinct lineage labels in order of first appearance.
print(pd.unique(df['lineage']))

['B' '' 'B.1' ... 'C.37' 'B.1.621' 'B.1.526.3']


Remove invalid collection dates (entries recorded as `2020-00-00`)

In [18]:
# Show the rows whose collection date is the invalid value '2020-00-00'.
df[df['collectionDate'] == '2020-00-00']

Unnamed: 0,name,Accession ID,source,Related ID,lineage,Nuc.Completeness,sequenceLength,sequenceQuality,qualityAssessment,`Host,collectionDate,location,originatingLab,Submit Date,Submission Date,Submitting Lab,Create Time,Last Update Time,accessions,gisaidId,accession


In [19]:
# Replace the invalid date '2020-00-00' with an empty string; all other values are kept.
df.loc[df['collectionDate'] == '2020-00-00', 'collectionDate'] = ''

In [20]:
# Parse every non-empty collection date string into a datetime; empty strings
# are left as ''.
# dateutil.parser is a submodule, so it must be imported explicitly
# (`import dateutil.parser`); a bare `import dateutil` only works when another
# package happens to have loaded the submodule already.
# NOTE(review): dateutil fills fields missing from a partial date (e.g. '2020-03')
# from its `default` argument, which is today's date unless specified - confirm
# whether partial dates occur in this column.
df['collectionDate'] = df['collectionDate'].apply(
    lambda d: dateutil.parser.parse(d) if len(d) > 0 else ''
)

In [21]:
# Replace any missing values with empty strings again, in place.
df.fillna(value='', inplace=True)

In [22]:
# Spot check: rows whose combined accessions contain the RefSeq id NC_045512.
df.loc[df['accessions'].str.contains('refseq:NC_045512')]

Unnamed: 0,name,Accession ID,source,Related ID,lineage,Nuc.Completeness,sequenceLength,sequenceQuality,qualityAssessment,`Host,collectionDate,location,originatingLab,Submit Date,Submission Date,Submitting Lab,Create Time,Last Update Time,accessions,gisaidId,accession
7,Wuhan-Hu-1,MN908947,GenBank,"NC_045512,EPI_ISL_402125",B.1,Complete,29903,High,0/0/-/-/-,Homo sapiens,2019-12-31,China / Hubei / Wuhan,Shanghai Public Health Clinical Center & Schoo...,2020-01-17,Shanghai Public Health Clinical Center & Schoo...,2020-01-20 12:04:48,2021-03-16 10:46:59,,insdc:MN908947;refseq:NC_045512;https://www.gi...,https://www.gisaid.org/EPI_ISL_402125,insdc:MN908947


In [23]:
# Preview the first five rows after renaming and id assignment.
df.head(n=5)

Unnamed: 0,name,Accession ID,source,Related ID,lineage,Nuc.Completeness,sequenceLength,sequenceQuality,qualityAssessment,`Host,collectionDate,location,originatingLab,Submit Date,Submission Date,Submitting Lab,Create Time,Last Update Time,accessions,gisaidId,accession
0,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01,NMDC,EPI_ISL_402132,B,Complete,29848,High,0/0/-/-/-,Homo sapiens,2019-12-30,China / Hubei,Hubei Provincial Center for Disease Control an...,2020-01-19,Hubei Provincial Center for Disease Control an...,2020-01-20 12:04:48,2021-03-16 10:46:59,,NMDC60013088-01;https://www.gisaid.org/EPI_ISL...,https://www.gisaid.org/EPI_ISL_402132,NMDC60013088-01
1,hCoV-19/Thailand/74/2020,EPI_ISL_403963,GISAID,,B,Complete,29859,High,0/0/-/-/-,Homo sapiens,2020-01-13,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 12:04:48,2021-03-16 10:46:59,,https://www.gisaid.org/EPI_ISL_403963,https://www.gisaid.org/EPI_ISL_403963,https://www.gisaid.org/EPI_ISL_403963
2,hCoV-19/Thailand/61/2020,EPI_ISL_403962,GISAID,,B,Complete,29848,High,0/0/-/-/-,Homo sapiens,2020-01-08,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 12:04:48,2021-03-16 10:46:59,,https://www.gisaid.org/EPI_ISL_403962,https://www.gisaid.org/EPI_ISL_403962,https://www.gisaid.org/EPI_ISL_403962
3,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01,NMDC,EPI_ISL_402120,B,Complete,29896,High,0/0/-/-/-,Homo sapiens,2020-01-01,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-11,National Institute for Viral Disease Control a...,2020-01-20 12:04:48,2021-03-16 10:46:59,,NMDC60013085-01;https://www.gisaid.org/EPI_ISL...,https://www.gisaid.org/EPI_ISL_402120,NMDC60013085-01
4,BetaCoV/Wuhan/IVDC-HB-01/2019,NMDC60013084-01,NMDC,EPI_ISL_402119,B,Complete,29891,High,0/0/-/-/-,Homo sapiens,2019-12-30,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-10,National Institute for Viral Disease Control a...,2020-01-20 12:04:48,2021-03-16 10:46:59,,NMDC60013084-01;https://www.gisaid.org/EPI_ISL...,https://www.gisaid.org/EPI_ISL_402119,NMDC60013084-01


#### Assign taxonomy ids

In [24]:
# read Organism reference dictionary
organism_to_id = dict()
data = pd.read_csv("../../reference_data/OrganismDictionary.csv", comment='#')
for index, row in data.iterrows():
    organism_to_id[row['organism']] = row['taxonomyId']

In [25]:
# Print the organism-name -> taxonomy-id mapping for a visual check.
print(organism_to_id)

{'human': 'taxonomy:9606', 'homo sapiens': 'taxonomy:9606', 'gorilla gorilla gorilla': 'taxonomy:9595', 'chlorocebus sabaeus': 'taxonomy:60711', 'mus musculus': 'taxonomy:10090', 'rhinolophus affinis': 'taxonomy:59477 ', 'rhinolophus malayanus': 'taxonomy:608659', 'rhinolophus shameli': 'taxonomy:608708', 'mustela lutreola': 'taxonomy:9666', 'mustela putorius furo': 'taxonomy:9669', 'mink': 'taxonomy:9666', 'panthera tigris jacksoni': 'taxonomy:419130', 'rhinolophus sp. (bat)': 'taxonomy:49442', 'rhinolophus bat': 'taxonomy:49442', 'manis pentadactyla': 'taxonomy:143292', 'rhinolophus sinicus': 'taxonomy:89399', 'rhinolophus stheno': 'taxonomy:430491', 'rhinolophus pusillus': 'taxonomy:159858', 'bat': 'taxonomy:49442', 'manis javanica': 'taxonomy:9974', 'palm civet': 'taxonomy:71116', 'canine': 'taxonomy:9608', 'canis lupus familiaris': 'taxonomy:9615', 'felis catus': 'taxonomy:9685', 'neovison vison': 'taxonomy:452646', 'mesocricetus auratus': 'taxonomy:10036', 'panthera leo': 'taxono

In [26]:
# assign taxonomy id to host
df['host'] = df['`Host'].str.strip()
df['hostTaxonomyId'] = df['host'].apply(lambda s: organism_to_id.get(s.lower(), s))
df['hostTaxonomyId'].unique()

array(['taxonomy:9606', 'taxonomy:59477 ', 'taxonomy:151659',
       'taxonomy:9974', 'taxonomy:9615', 'taxonomy:9685', 'unknown',
       'taxonomy:608659', 'taxonomy:9666', 'Vero cell culture',
       'taxonomy:10090', 'taxonomy:452646', 'taxonomy:9689',
       'taxonomy:9694', 'taxonomy:10036', 'taxonomy:60711',
       'taxonomy:608708', 'taxonomy:9595', 'taxonomy:9669',
       'taxonomy:419130', 'taxonomy:143292', 'taxonomy:49442',
       'taxonomy:300877', 'taxonomy:9608', 'taxonomy:89399',
       'taxonomy:430491', 'taxonomy:159858'], dtype=object)

In [27]:
# Every strain in this dataset gets the same virus taxonomy id (SARS-CoV-2).
df = df.assign(taxonomyId='taxonomy:2697049')

#### Standardize location information

In [28]:
# Split the '/'-separated location into at most four parts (n=3 splits).
# expand=True only creates as many columns as the longest split result, so
# reindex pads to exactly four columns; otherwise the assignment fails when no
# location contains three separators.
df[['loc0', 'loc1', 'loc2', 'loc3']] = (
    df['location'].str.split('/', n=3, expand=True).reindex(columns=range(4))
)
# strip white space
# Only genuine strings are stripped. The .str accessor would replace non-string
# elements of an object column with NaN (e.g. datetime objects in
# collectionDate when some dates are blank), silently discarding data.
df = df.apply(
    lambda column: column.map(lambda value: value.strip() if isinstance(value, str) else value)
    if column.dtype == "object" else column
)

In [None]:
# Re-join the available location parts with ',' and skip parts that are missing.
df['origLocation'] = [
    ','.join(str(part) for part in parts if pd.notna(part))
    for parts in df[['loc0', 'loc1', 'loc2', 'loc3']].itertuples(index=False, name=None)
]

### Save strain metadata

In [None]:
# Select the columns to be saved, as an independent copy of df.
strains = df.loc[:, [
    'name',
    'accession',
    'accessions',
    'gisaidId',
    'source',
    'taxonomyId',
    'hostTaxonomyId',
    'lineage',
    'sequenceLength',
    'sequenceQuality',
    'qualityAssessment',
    'collectionDate',
    'location',
    'origLocation',
    'originatingLab',
]].copy()

In [None]:
# Preview the first five rows of the table that will be saved.
strains.head(n=5)

In [None]:
# Report the number of rows that will be written.
print('Number of strains:', len(strains))

In [None]:
# Write the strain table as CSV into the Neo4j import directory, without the index.
strains.to_csv(NEO4J_IMPORT.joinpath("01c-CNCBStrainPre.csv"), index=False)