# Load SARS-CoV-2 Virus Strain Metadata from CNCB
**[Work in progress]**

This notebook downloads and standardizes viral strain data from CNCB for ingestion into a Knowledge Graph.

Data source: [China National Center for Bioinformation, 2019 Novel Coronavirus Resource (2019nCoVR)](https://bigd.big.ac.cn/ncov/release_genome)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import shutil
import glob
import ftplib
import re
import dateutil
import pandas as pd
from pathlib import Path

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
metadata_url = "https://bigd.big.ac.cn/ncov/genome/export/meta"

In [4]:
# Path will take care of handling operating system differences.
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-328d8379-6ab4-4cc1-a397-2de37909d2e4/installation-4.1.0/import


### Download strain metadata

In [5]:
df = pd.read_excel(metadata_url, dtype='str')
df.fillna('', inplace=True)

In [6]:
print("Total number of strains:", df.shape[0])

Total number of strains: 119277


In [7]:
df = df.query("`Sequence Quality` == 'High'")
df = df.query("`Nuc.Completeness` == 'Complete'")

In [8]:
print("Number of complete high quality strains", df.shape[0])

Number of complete high quality strains 64477


In [9]:
df.head()

Unnamed: 0,Virus Strain Name,Accession ID,Data Source,Related ID,Nuc.Completeness,Sequence Length,Sequence Quality,Quality Assessment,Host,Sample Collection Date,Location,Originating Lab,Submission Date,Submitting Lab,Create Time,Last Update Time
0,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01,NMDC,EPI_ISL_402132,Complete,29848,High,0/0/0/1/NO,Homo sapiens,2019-12-30,China / Hubei,Hubei Provincial Center for Disease Control an...,2020-01-19,Hubei Provincial Center for Disease Control an...,2020-01-20 20:04:48,2020-09-09 11:31:17
1,hCoV-19/Thailand/74/2020,EPI_ISL_403963,GISAID,,Complete,29859,High,0/0/0/0/NO,Homo sapiens,2020-01-13,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 20:04:48,2020-09-09 11:31:17
2,hCoV-19/Thailand/61/2020,EPI_ISL_403962,GISAID,,Complete,29848,High,0/0/0/0/NO,Homo sapiens,2020-01-08,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 20:04:48,2020-09-09 11:31:17
3,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01,NMDC,EPI_ISL_402120,Complete,29896,High,0/0/0/2/NO,Homo sapiens,2020-01-01,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-11,National Institute for Viral Disease Control a...,2020-01-20 20:04:48,2020-09-09 11:31:17
4,BetaCoV/Wuhan/IVDC-HB-01/2019,NMDC60013084-01,NMDC,EPI_ISL_402119,Complete,29891,High,0/0/0/0/NO,Homo sapiens,2019-12-30,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-10,National Institute for Viral Disease Control a...,2020-01-20 20:04:48,2020-09-09 11:31:17


#### Create a separate row for each Accession and Related ID

In [10]:
df['Accession ID'] = df['Accession ID'].str.strip()
df['Related ID'] = df['Related ID'].str.strip()

# combine all ids into a single column
df['alias'] = df['Accession ID'] + df['Related ID'].apply(lambda s: ',' + s if len(s) > 0 else s)
df['alias'] = df['alias'].str.replace(' ', '')

# then "explode" ids into separate rows
df['id'] = df['alias'].apply(lambda s: s.split(','))
df = df.explode('id')
df['id'] = df['id'].str.strip()
df['alias'] = df['alias'].str.replace(',', ';')

In [11]:
df.head()

Unnamed: 0,Virus Strain Name,Accession ID,Data Source,Related ID,Nuc.Completeness,Sequence Length,Sequence Quality,Quality Assessment,Host,Sample Collection Date,Location,Originating Lab,Submission Date,Submitting Lab,Create Time,Last Update Time,alias,id
0,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01,NMDC,EPI_ISL_402132,Complete,29848,High,0/0/0/1/NO,Homo sapiens,2019-12-30,China / Hubei,Hubei Provincial Center for Disease Control an...,2020-01-19,Hubei Provincial Center for Disease Control an...,2020-01-20 20:04:48,2020-09-09 11:31:17,NMDC60013088-01;EPI_ISL_402132,NMDC60013088-01
0,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01,NMDC,EPI_ISL_402132,Complete,29848,High,0/0/0/1/NO,Homo sapiens,2019-12-30,China / Hubei,Hubei Provincial Center for Disease Control an...,2020-01-19,Hubei Provincial Center for Disease Control an...,2020-01-20 20:04:48,2020-09-09 11:31:17,NMDC60013088-01;EPI_ISL_402132,EPI_ISL_402132
1,hCoV-19/Thailand/74/2020,EPI_ISL_403963,GISAID,,Complete,29859,High,0/0/0/0/NO,Homo sapiens,2020-01-13,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 20:04:48,2020-09-09 11:31:17,EPI_ISL_403963,EPI_ISL_403963
2,hCoV-19/Thailand/61/2020,EPI_ISL_403962,GISAID,,Complete,29848,High,0/0/0/0/NO,Homo sapiens,2020-01-08,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 20:04:48,2020-09-09 11:31:17,EPI_ISL_403962,EPI_ISL_403962
3,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01,NMDC,EPI_ISL_402120,Complete,29896,High,0/0/0/2/NO,Homo sapiens,2020-01-01,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-11,National Institute for Viral Disease Control a...,2020-01-20 20:04:48,2020-09-09 11:31:17,NMDC60013085-01;EPI_ISL_402120,NMDC60013085-01


#### Assign taxonomy ids

In [12]:
# read Organism reference dictionary
organism_to_id = dict()
data = pd.read_csv("../../reference_data/OrganismDictionary.csv", comment='#')
for index, row in data.iterrows():
    organism_to_id[row['organism']] = row['taxonomyId']

In [13]:
# assign taxonomy id to host
df['Host'] = df['Host'].str.strip()
df['hostTaxonomyId'] = df['Host'].apply(lambda s: organism_to_id.get(s.lower(), s))
df['hostTaxonomyId'].unique()

array(['taxonomy:9606', 'taxonomy:59477 ', 'Environment', 'taxonomy:9974',
       'taxonomy:608659', 'taxonomy:419130', 'taxonomy:9666',
       'taxonomy:9608', 'taxonomy:10090', 'taxonomy:9685', 'unknown',
       'taxonomy:452646'], dtype=object)

In [14]:
df['taxonomyId'] = 'taxonomy:2697049' # SARS-CoV-2

#### Standardize node property names (CURIEs and URIs)

In [15]:
df.rename(columns={'Virus Strain Name': 'name',
                   'Sample Collection Date':'collectionDate',
                   'Location':'location'}, 
          inplace=True)

In [16]:
# https://registry.identifiers.org/registry/insdc
insdc_pattern = re.compile('^([A-Z]\d{5}|[A-Z]{2}\d{6}|[A-Z]{4}\d{8}|[A-J][A-Z]{2}\d{5})(\.\d+)?$')

In [17]:
df[['loc0', 'loc1', 'loc2', 'loc3']] = df['location'].str.split('/', n=3, expand=True)
# strip white space
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [18]:
df['origLocation'] = df[['loc0', 'loc1', 'loc2', 'loc3']].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)

In [19]:
def assign_curie(id):
    id = id.strip()
    # remove underscore to enable CURIE matching of NCBI reference sequences NC_...
    id = id.replace('NC_', 'NC') 
    if len(id) > 0:
        if id.startswith('EPI'):
            return 'https://www.gisaid.org/' + id
        elif id.startswith('NC_'):
            # NCBI reference sequences resolve with ncbiprotein CURIE
            return 'ncbiprotein:' + id
        elif insdc_pattern.match(id) != None:
            return 'insdc:' + id
        else:
            # TODO are URIs available for these cases?
            return id
    else:
        return id

In [20]:
df['id'] = df['id'].apply(assign_curie)

### Save strain metadata

In [21]:
strains = df[['id', 'name', 'alias', 'taxonomyId', 'hostTaxonomyId','collectionDate', 'location', 'origLocation']].copy()
strains['id'] = strains['id'].apply(assign_curie)

In [22]:
strains.head()

Unnamed: 0,id,name,alias,taxonomyId,hostTaxonomyId,collectionDate,location,origLocation
0,NMDC60013088-01,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China / Hubei,"China,Hubei"
0,https://www.gisaid.org/EPI_ISL_402132,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01;EPI_ISL_402132,taxonomy:2697049,taxonomy:9606,2019-12-30,China / Hubei,"China,Hubei"
1,https://www.gisaid.org/EPI_ISL_403963,hCoV-19/Thailand/74/2020,EPI_ISL_403963,taxonomy:2697049,taxonomy:9606,2020-01-13,Thailand/ Nonthaburi Province,"Thailand,Nonthaburi Province"
2,https://www.gisaid.org/EPI_ISL_403962,hCoV-19/Thailand/61/2020,EPI_ISL_403962,taxonomy:2697049,taxonomy:9606,2020-01-08,Thailand/ Nonthaburi Province,"Thailand,Nonthaburi Province"
3,NMDC60013085-01,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01;EPI_ISL_402120,taxonomy:2697049,taxonomy:9606,2020-01-01,China / Hubei / Wuhan,"China,Hubei,Wuhan"


In [23]:
print('Number of strains:',strains.shape[0])

Number of strains: 78850


In [24]:
strains.to_csv(NEO4J_IMPORT / "01d-CNCBStrain.csv", index=False)