# Load SARS-CoV-2 Virus Strain Metadata from CNCB
**[Work in progress]**

This notebook downloads and standardizes viral strain data from CNCB for ingestion into a Knowledge Graph.

Data source: [China National Center for Bioinformation, 2019 Novel Coronavirus Resource (2019nCoVR)](https://bigd.big.ac.cn/ncov/release_genome)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import shutil
import glob
import ftplib
import re
import requests
import json
import dateutil
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
# Pangolin lineage (https://github.com/cov-lineages/pangolin)
source = 'CNCB'
software = 'pangolin v.2.3.2'

In [4]:
metadata_url = "https://bigd.big.ac.cn/ncov/genome/export/meta"

In [5]:
# Path will take care of handling operating system differences.
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Download strain metadata

In [6]:
df = pd.read_excel(metadata_url, dtype='str')

  warn("Workbook contains no default style, apply openpyxl's default")


In [7]:
df.fillna('', inplace=True)

In [8]:
print("Total number of strains:", df.shape[0])

Total number of strains: 710069


In [9]:
df.head(5)

Unnamed: 0,Virus Strain Name,Accession ID,Data Source,Related ID,Lineage,Nuc.Completeness,Sequence Length,Sequence Quality,Quality Assessment,Host,Sample Collection Date,Location,Originating Lab,Submission Date,Submitting Lab,Create Time,Last Update Time
0,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01,NMDC,EPI_ISL_402132,B,Complete,29848,High,0/0/-/-/-,Homo sapiens,2019-12-30,China / Hubei,Hubei Provincial Center for Disease Control an...,2020-01-19,Hubei Provincial Center for Disease Control an...,2020-01-20 20:04:48,2020-09-09 11:31:17
1,hCoV-19/Thailand/74/2020,EPI_ISL_403963,GISAID,-,B,Complete,29859,High,0/0/-/-/-,Homo sapiens,2020-01-13,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 20:04:48,2020-09-09 11:31:17
2,hCoV-19/Thailand/61/2020,EPI_ISL_403962,GISAID,-,B,Complete,29848,High,0/0/-/-/-,Homo sapiens,2020-01-08,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 20:04:48,2020-09-09 11:31:17
3,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01,NMDC,EPI_ISL_402120,B,Complete,29896,High,0/0/-/-/-,Homo sapiens,2020-01-01,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-11,National Institute for Viral Disease Control a...,2020-01-20 20:04:48,2020-09-09 11:31:17
4,BetaCoV/Wuhan/IVDC-HB-01/2019,NMDC60013084-01,NMDC,EPI_ISL_402119,B,Complete,29891,High,0/0/-/-/-,Homo sapiens,2019-12-30,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-10,National Institute for Viral Disease Control a...,2020-01-20 20:04:48,2020-09-09 11:31:17


### Assign identifiers, aliases, and assign compact identifiers (CURIES)

In [10]:
# https://registry.identifiers.org/registry/insdc
insdc_pattern = re.compile('^([A-Z]\d{5}|[A-Z]{2}\d{6}|[A-Z]{4}\d{8}|[A-J][A-Z]{2}\d{5})(\.\d+)?$')
# https://registry.identifiers.org/registry/refseq
refseq_pattern = re.compile('^(((AC|AP|NC|NG|NM|NP|NR|NT|NW|XM|XP|XR|YP|ZP)_\d+)|(NZ\_[A-Z]{2,4}\d+))(\.\d+)?$')
epi_pattern = re.compile('^EPI_ISL_\d+$')

In [11]:
def assign_curie(id):
    id = id.strip()
    if len(id) > 0:
        if epi_pattern.match(id) != None:
            return 'https://www.gisaid.org/' + id
        elif refseq_pattern.match(id) != None:
            return 'refseq:' + id
        elif insdc_pattern.match(id) != None:
            return 'insdc:' + id
        else:
            # TODO are URIs available for these cases?
            return id
    else:
        return id

In [12]:
def assign_curies(ids):
    return [assign_curie(id) for id in ids.split(',')]

In [13]:
def get_gisaid_id(ids):
    for id in ids:
        if id.startswith('https://www.gisaid.org/'):
            return id
        
    return ''

#### Rename and concatenate fields

In [14]:
df['Related ID'] = df['Related ID'].str.replace('-', '')

# combine all ids into an accession column and assign curies
df['accessions'] = df['Accession ID'] + df['Related ID'].apply(lambda s: ',' + s if len(s) > 0 else s)
df['accessions'] = df['accessions'].apply(assign_curies)
df['gisaidId'] = df['accessions'].apply(get_gisaid_id)
df['accessions'] = df['accessions'].apply(lambda x: ';'.join(x))
df['accession'] = df['Accession ID'].apply(lambda s: assign_curie(s))

In [15]:
df.rename(columns={'Data Source': 'source'}, inplace=True)
df.rename(columns={'Sequence Length': 'sequenceLength'}, inplace=True)
df.rename(columns={'Sequence Quality': 'sequenceQuality'}, inplace=True)
df.rename(columns={'Quality Assessment': 'qualityAssessment'}, inplace=True)
df.rename(columns={'Originating Lab': 'originatingLab'}, inplace=True)
df.rename(columns={'Virus Strain Name': 'name'}, inplace=True)
df.rename(columns={'Sample Collection Date':'collectionDate'},inplace=True)
df.rename(columns={'Location':'location'}, inplace=True)
df.rename(columns={'Lineage': 'lineage'}, inplace=True)

In [16]:
df['lineage'] = df['lineage'].str.replace('-', '')
df['lineage'] = df['lineage'].str.replace('None', '')

In [17]:
print(df['lineage'].unique())

['B' '' 'B.1' 'A' 'A.1' 'B.10' 'A.3' 'B.42' 'B.40' 'B.1.1.29' 'B.5' 'B.4'
 'B.1.1.247' 'B.1.1.162' 'B.1.1' 'B.41' 'B.3' 'B.1.1.1' 'N.4' 'B.11'
 'B.1.383' 'B.1.74' 'A.16' 'B.1.1.166' 'B.1.8' 'B.1.1.257' 'B.43'
 'B.1.1.104' 'B.1.1.256' 'A.5' 'A.2' 'B.1.371' 'B.1.212' 'B.1.277'
 'B.1.91' 'B.29' 'B.1.76' 'B.1.395' 'B.1.414' 'B.1.22' 'B.23' 'B.1.320'
 'B.27' 'B.1.428' 'B.3.1' 'B.1.93' 'B.1.1.28' 'B.48' 'B.1.104' 'B.6'
 'B.1.13' 'B.39' 'B.1.1.5' 'B.1.387' 'B.1.211' 'B.1.147' 'B.1.12' 'B.38'
 'B.1.9' 'B.1.1.211' 'B.31' 'B.1.1.10' 'B.47' 'B.1.1.292' 'B.1.1.122'
 'B.1.110' 'B.1.494' 'B.1.2' 'B.4.4' 'B.1.128' 'B.1.178' 'B.1.448'
 'B.1.419' 'B.18' 'B.1.1.17' 'B.1.356' 'B.1.314' 'B.1.510' 'B.1.98'
 'B.6.6' 'B.1.370' 'B.1.264' 'B.44' 'B.30' 'B.1.313' 'B.1.319' 'B.1.332'
 'B.1.369' 'B.1.354' 'B.1.1.130' 'B.1.1.282' 'B.1.1.137' 'B.1.350'
 'B.1.422' 'B.15' 'B.1.1.33' 'B.1.208' 'B.1.513' 'B.1.158' 'B.1.1.41'
 'B.1.6' 'B.1.153' 'B.1.1.64' 'B.33' 'B.1.391' 'B.1.1.164' 'B.1.1.39'
 'B.1.1.237' 'B.1.166' 'B

Remove invalid collection date

In [18]:
df.query("collectionDate == '2020-00-00'")

Unnamed: 0,name,Accession ID,source,Related ID,lineage,Nuc.Completeness,sequenceLength,sequenceQuality,qualityAssessment,Host,collectionDate,location,originatingLab,Submission Date,Submitting Lab,Create Time,Last Update Time,accessions,gisaidId,accession


In [19]:
df['collectionDate'] = df['collectionDate'].apply(lambda d: '' if d == '2020-00-00' else d)

In [20]:
df['collectionDate'] = df['collectionDate'].apply(lambda d: dateutil.parser.parse(d) if len(d) > 0 else '')

In [21]:
df.fillna('', inplace=True)

In [22]:
df[df['accessions'].str.contains('refseq:NC_045512')]

Unnamed: 0,name,Accession ID,source,Related ID,lineage,Nuc.Completeness,sequenceLength,sequenceQuality,qualityAssessment,Host,collectionDate,location,originatingLab,Submission Date,Submitting Lab,Create Time,Last Update Time,accessions,gisaidId,accession
7,Wuhan-Hu-1,MN908947,GenBank,"NC_045512,EPI_ISL_402125",B.1,Complete,29903,High,0/0/-/-/-,Homo sapiens,2019-12-31,China / Hubei / Wuhan,Shanghai Public Health Clinical Center & Schoo...,2020-01-17,Shanghai Public Health Clinical Center & Schoo...,2020-01-20 20:04:48,2020-05-20 11:14:12,insdc:MN908947;refseq:NC_045512;https://www.gi...,https://www.gisaid.org/EPI_ISL_402125,insdc:MN908947


In [23]:
df.head()

Unnamed: 0,name,Accession ID,source,Related ID,lineage,Nuc.Completeness,sequenceLength,sequenceQuality,qualityAssessment,Host,collectionDate,location,originatingLab,Submission Date,Submitting Lab,Create Time,Last Update Time,accessions,gisaidId,accession
0,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01,NMDC,EPI_ISL_402132,B,Complete,29848,High,0/0/-/-/-,Homo sapiens,2019-12-30,China / Hubei,Hubei Provincial Center for Disease Control an...,2020-01-19,Hubei Provincial Center for Disease Control an...,2020-01-20 20:04:48,2020-09-09 11:31:17,NMDC60013088-01;https://www.gisaid.org/EPI_ISL...,https://www.gisaid.org/EPI_ISL_402132,NMDC60013088-01
1,hCoV-19/Thailand/74/2020,EPI_ISL_403963,GISAID,,B,Complete,29859,High,0/0/-/-/-,Homo sapiens,2020-01-13,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 20:04:48,2020-09-09 11:31:17,https://www.gisaid.org/EPI_ISL_403963,https://www.gisaid.org/EPI_ISL_403963,https://www.gisaid.org/EPI_ISL_403963
2,hCoV-19/Thailand/61/2020,EPI_ISL_403962,GISAID,,B,Complete,29848,High,0/0/-/-/-,Homo sapiens,2020-01-08,Thailand/ Nonthaburi Province,"Department of Medical Sciences, Ministry of Pu...",2020-01-17,"Department of Medical Sciences, Ministry of Pu...",2020-01-20 20:04:48,2020-09-09 11:31:17,https://www.gisaid.org/EPI_ISL_403962,https://www.gisaid.org/EPI_ISL_403962,https://www.gisaid.org/EPI_ISL_403962
3,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01,NMDC,EPI_ISL_402120,B,Complete,29896,High,0/0/-/-/-,Homo sapiens,2020-01-01,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-11,National Institute for Viral Disease Control a...,2020-01-20 20:04:48,2020-09-09 11:31:17,NMDC60013085-01;https://www.gisaid.org/EPI_ISL...,https://www.gisaid.org/EPI_ISL_402120,NMDC60013085-01
4,BetaCoV/Wuhan/IVDC-HB-01/2019,NMDC60013084-01,NMDC,EPI_ISL_402119,B,Complete,29891,High,0/0/-/-/-,Homo sapiens,2019-12-30,China / Hubei / Wuhan,National Institute for Viral Disease Control a...,2020-01-10,National Institute for Viral Disease Control a...,2020-01-20 20:04:48,2020-09-09 11:31:17,NMDC60013084-01;https://www.gisaid.org/EPI_ISL...,https://www.gisaid.org/EPI_ISL_402119,NMDC60013084-01


#### Assign taxonomy ids

In [24]:
# read Organism reference dictionary
organism_to_id = dict()
data = pd.read_csv("../../reference_data/OrganismDictionary.csv", comment='#')
for index, row in data.iterrows():
    organism_to_id[row['organism']] = row['taxonomyId']

In [25]:
print(organism_to_id)

{'human': 'taxonomy:9606', 'homo sapiens': 'taxonomy:9606', 'gorilla gorilla gorilla': 'taxonomy:9595', 'chlorocebus sabaeus': 'taxonomy:60711', 'mus musculus': 'taxonomy:10090', 'rhinolophus affinis': 'taxonomy:59477 ', 'rhinolophus malayanus': 'taxonomy:608659', 'rhinolophus shameli': 'taxonomy:608708', 'mustela lutreola': 'taxonomy:9666', 'mustela putorius furo': 'taxonomy:9669', 'mink': 'taxonomy:9666', 'panthera tigris jacksoni': 'taxonomy:419130', 'rhinolophus sp. (bat)': 'taxonomy:49442', 'rhinolophus bat': 'taxonomy:49442', 'bat': 'taxonomy:49442', 'manis javanica': 'taxonomy:9974', 'manis pentadactyla': 'taxonomy:143292', 'palm civet': 'taxonomy:71116', 'canine': 'taxonomy:9608', 'canis lupus familiaris': 'taxonomy:9615', 'felis catus': 'taxonomy:9685', 'neovison vison': 'taxonomy:452646', 'mesocricetus auratus': 'taxonomy:10036', 'panthera leo': 'taxonomy:9689', 'panthera tigris': 'taxonomy:9694', 'environment': 'taxonomy:151659', 'environmental': 'taxonomy:151659'}


In [26]:
# assign taxonomy id to host
df['host'] = df['Host'].str.strip()
df['hostTaxonomyId'] = df['host'].apply(lambda s: organism_to_id.get(s.lower(), s))
df['hostTaxonomyId'].unique()

array(['taxonomy:9606', 'taxonomy:59477 ', 'taxonomy:151659',
       'taxonomy:9974', 'taxonomy:9615', 'taxonomy:9685', 'unknown',
       'taxonomy:608659', 'taxonomy:9666', 'Vero cell culture',
       'taxonomy:10090', 'taxonomy:452646', 'taxonomy:9689',
       'taxonomy:9694', 'taxonomy:10036', 'taxonomy:60711',
       'taxonomy:608708', 'taxonomy:9595', 'taxonomy:9669',
       'taxonomy:419130', 'taxonomy:143292', 'taxonomy:49442'],
      dtype=object)

In [27]:
df['taxonomyId'] = 'taxonomy:2697049' # SARS-CoV-2

#### Standardize location information

In [28]:
df[['loc0', 'loc1', 'loc2', 'loc3']] = df['location'].str.split('/', n=3, expand=True)
# strip white space
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [29]:
df['origLocation'] = df[['loc0', 'loc1', 'loc2', 'loc3']].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)

### Save strain metadata

In [30]:
strains = df[['name', 'accession', 'accessions', 'gisaidId', 'source', 'taxonomyId', 'hostTaxonomyId', 
              'lineage', 'sequenceLength', 'sequenceQuality', 'qualityAssessment', 'collectionDate',
              'location', 'origLocation', 'originatingLab']].copy()

In [31]:
#strains = df[['name', 'accession', 'accessions', 'gisaidId', 'source', 'taxonomyId', 'hostTaxonomyId', 'lineage',
#              'sequenceLength', 'completeness', 'gender', 'age', 'collectionDate', 'location', 
#              'origLocation', 'originatingLab']].copy()

In [32]:
strains.head()

Unnamed: 0,name,accession,accessions,gisaidId,source,taxonomyId,hostTaxonomyId,lineage,sequenceLength,sequenceQuality,qualityAssessment,collectionDate,location,origLocation,originatingLab
0,BetaCoV/Wuhan/HBCDC-HB-01/2019,NMDC60013088-01,NMDC60013088-01;https://www.gisaid.org/EPI_ISL...,https://www.gisaid.org/EPI_ISL_402132,NMDC,taxonomy:2697049,taxonomy:9606,B,29848,High,0/0/-/-/-,2019-12-30,China / Hubei,"China,Hubei",Hubei Provincial Center for Disease Control an...
1,hCoV-19/Thailand/74/2020,https://www.gisaid.org/EPI_ISL_403963,https://www.gisaid.org/EPI_ISL_403963,https://www.gisaid.org/EPI_ISL_403963,GISAID,taxonomy:2697049,taxonomy:9606,B,29859,High,0/0/-/-/-,2020-01-13,Thailand/ Nonthaburi Province,"Thailand,Nonthaburi Province","Department of Medical Sciences, Ministry of Pu..."
2,hCoV-19/Thailand/61/2020,https://www.gisaid.org/EPI_ISL_403962,https://www.gisaid.org/EPI_ISL_403962,https://www.gisaid.org/EPI_ISL_403962,GISAID,taxonomy:2697049,taxonomy:9606,B,29848,High,0/0/-/-/-,2020-01-08,Thailand/ Nonthaburi Province,"Thailand,Nonthaburi Province","Department of Medical Sciences, Ministry of Pu..."
3,BetaCoV/Wuhan/IVDC-HB-04/2020,NMDC60013085-01,NMDC60013085-01;https://www.gisaid.org/EPI_ISL...,https://www.gisaid.org/EPI_ISL_402120,NMDC,taxonomy:2697049,taxonomy:9606,B,29896,High,0/0/-/-/-,2020-01-01,China / Hubei / Wuhan,"China,Hubei,Wuhan",National Institute for Viral Disease Control a...
4,BetaCoV/Wuhan/IVDC-HB-01/2019,NMDC60013084-01,NMDC60013084-01;https://www.gisaid.org/EPI_ISL...,https://www.gisaid.org/EPI_ISL_402119,NMDC,taxonomy:2697049,taxonomy:9606,B,29891,High,0/0/-/-/-,2019-12-30,China / Hubei / Wuhan,"China,Hubei,Wuhan",National Institute for Viral Disease Control a...


In [33]:
print('Number of strains:',strains.shape[0])

Number of strains: 710069


In [34]:
strains.to_csv(NEO4J_IMPORT / "01c-CNCBStrainPre.csv", index=False)

In [47]:
subset = strains[['lineage','accession']].copy()

In [48]:
lineages = subset.groupby('lineage')['accession'].count().to_frame(name = 'count').reset_index()
lineages.query('lineage != ""', inplace=True)

In [49]:
def split_lineages(row):
    lineage = row.lineage
    lineages =  np.empty(5, dtype=object)
    lineages[0] = lineage
    l1 = lineage.rsplit('.', 1)[0]
    if l1 != lineage:
        lineages[1] = l1
    l2 = lineage.rsplit('.', 2)[0]
    if l2 != l1:
        lineages[2] = l2
    l3 = lineage.rsplit('.', 3)[0]
    if l3 != l2:
        lineages[3] = l3
    l4 = lineage.rsplit('.', 4)[0]
    if l4 != l3:
        lineages[4] = l4
    return lineages

In [50]:
lineages[['l0', 'l1', 'l2', 'l3', 'l4']] = lineages.apply(split_lineages, axis=1, result_type='expand')

In [51]:
lineages.fillna('', inplace=True)

In [52]:
lineages.head(1000)

Unnamed: 0,lineage,count,l0,l1,l2,l3,l4
1,A,1467,A,,,,
2,A.1,3477,A.1,A,,,
3,A.10,3,A.10,A,,,
4,A.11,12,A.11,A,,,
5,A.12,6,A.12,A,,,
6,A.15,62,A.15,A,,,
7,A.16,63,A.16,A,,,
8,A.17,10,A.17,A,,,
9,A.18,14,A.18,A,,,
10,A.19,69,A.19,A,,,


In [53]:
lineages['levels'] = lineages['lineage'].str.count('\.') + 1

In [54]:
lineages['source'] = source
lineages['software'] = software

In [55]:
print("Number of lineages:", lineages.shape[0])
lineages.sample(5)

Number of lineages: 881


Unnamed: 0,lineage,count,l0,l1,l2,l3,l4,levels,source,software
379,B.1.173,36,B.1.173,B.1,B,,,3,CNCB,pangolin v.2.3.2
104,B.1.1.168,31,B.1.1.168,B.1.1,B.1,B,,4,CNCB,pangolin v.2.3.2
349,B.1.139,1513,B.1.139,B.1,B,,,3,CNCB,pangolin v.2.3.2
389,B.1.177.17,1049,B.1.177.17,B.1.177,B.1,B,,4,CNCB,pangolin v.2.3.2
694,B.1.457,116,B.1.457,B.1,B,,,3,CNCB,pangolin v.2.3.2


In [56]:
lineages.to_csv(NEO4J_IMPORT / "01c-CNCBLineage.csv", index=False)