# Load SARS-CoV-2 Virus Strain Metadata from CNCB
**[Work in progress]**

This notebook downloads and standardizes viral strain data from CNCB for ingestion into a Knowledge Graph.

Data source: [China National Center for Bioinformation, 2019 Novel Coronavirus Resource (2019nCoVR)](https://bigd.big.ac.cn/ncov/release_genome)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import shutil
import glob
import ftplib
import re
import requests
import json
import dateutil
import pandas as pd
from pathlib import Path

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
metadata_url = "https://bigd.big.ac.cn/ncov/genome/export/meta"

In [4]:
# Path will take care of handling operating system differences.
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


In [5]:
### Download data on the fly

In [6]:
# this values should be >= then the current number of isolates in CNCB
# max_entries = 1000000

In [7]:
# url = f'https://bigd.big.ac.cn/ncov/genome/query?&columns%5B0%5D.data=accession&columns%5B0%5D.name=&columns%5B0%5D.searchable=false&columns%5B0%5D.orderable=false&columns%5B0%5D.search.value=&columns%5B0%5D.search.regex=false&columns%5B1%5D.data=name&columns%5B1%5D.name=&columns%5B1%5D.searchable=true&columns%5B1%5D.orderable=true&columns%5B1%5D.search.value=&columns%5B1%5D.search.regex=false&columns%5B2%5D.data=accession&columns%5B2%5D.name=&columns%5B2%5D.searchable=true&columns%5B2%5D.orderable=true&columns%5B2%5D.search.value=&columns%5B2%5D.search.regex=false&columns%5B3%5D.data=gender&columns%5B3%5D.name=&columns%5B3%5D.searchable=true&columns%5B3%5D.orderable=true&columns%5B3%5D.search.value=&columns%5B3%5D.search.regex=false&columns%5B4%5D.data=age&columns%5B4%5D.name=&columns%5B4%5D.searchable=true&columns%5B4%5D.orderable=true&columns%5B4%5D.search.value=&columns%5B4%5D.search.regex=false&columns%5B5%5D.data=source&columns%5B5%5D.name=&columns%5B5%5D.searchable=true&columns%5B5%5D.orderable=true&columns%5B5%5D.search.value=&columns%5B5%5D.search.regex=false&columns%5B6%5D.data=relatedAccession&columns%5B6%5D.name=&columns%5B6%5D.searchable=true&columns%5B6%5D.orderable=true&columns%5B6%5D.search.value=&columns%5B6%5D.search.regex=false&columns%5B7%5D.data=genomeLineage.lineage&columns%5B7%5D.name=&columns%5B7%5D.searchable=true&columns%5B7%5D.orderable=true&columns%5B7%5D.search.value=&columns%5B7%5D.search.regex=false&columns%5B8%5D.data=completeness&columns%5B8%5D.name=&columns%5B8%5D.searchable=true&columns%5B8%5D.orderable=true&columns%5B8%5D.search.value=&columns%5B8%5D.search.regex=false&columns%5B9%5D.data=genomeQuality&columns%5B9%5D.name=&columns%5B9%5D.searchable=false&columns%5B9%5D.orderable=true&columns%5B9%5D.search.value=&columns%5B9%5D.search.regex=false&columns%5B10%5D.data=genomeQuality&columns%5B10%5D.name=&columns%5B10%5D.searchable=false&columns%5B10%5D.orderable=true&columns%5B10%5D.search.value=&columns%5B10%5D.search.regex=false&columns%5B11%5D.data=genomeQuality&columns%5B11%5D.name=&columns%5B11%5D.searchable=false&columns%5B11%5D.orderable=true&columns%5B11%5D.search.value=&columns%5B11%5D.search.regex=false&columns%5B12%5D.data=host&columns%5B12%5D.name=&columns%5B12%5D.searchable=true&columns%5B12%5D.orderable=true&columns%5B12%5D.search.value=&columns%5B12%5D.search.regex=false&columns%5B13%5D.data=collectDate&columns%5B13%5D.name=&columns%5B13%5D.searchable=true&columns%5B13%5D.orderable=true&columns%5B13%5D.search.value=&columns%5B13%5D.search.regex=false&columns%5B14%5D.data=location&columns%5B14%5D.name=&columns%5B14%5D.searchable=true&columns%5B14%5D.orderable=true&columns%5B14%5D.search.value=&columns%5B14%5D.search.regex=false&columns%5B15%5D.data=dataProvider&columns%5B15%5D.name=&columns%5B15%5D.searchable=true&columns%5B15%5D.orderable=true&columns%5B15%5D.search.value=&columns%5B15%5D.search.regex=false&columns%5B16%5D.data=submitDate&columns%5B16%5D.name=&columns%5B16%5D.searchable=true&columns%5B16%5D.orderable=true&columns%5B16%5D.search.value=&columns%5B16%5D.search.regex=false&columns%5B17%5D.data=dataSubmitter&columns%5B17%5D.name=&columns%5B17%5D.searchable=true&columns%5B17%5D.orderable=true&columns%5B17%5D.search.value=&columns%5B17%5D.search.regex=false&columns%5B18%5D.data=createDate&columns%5B18%5D.name=&columns%5B18%5D.searchable=true&columns%5B18%5D.orderable=true&columns%5B18%5D.search.value=&columns%5B18%5D.search.regex=false&columns%5B19%5D.data=country&columns%5B19%5D.name=&columns%5B19%5D.searchable=true&columns%5B19%5D.orderable=true&columns%5B19%5D.search.value=&columns%5B19%5D.search.regex=false&columns%5B20%5D.data=province&columns%5B20%5D.name=&columns%5B20%5D.searchable=true&columns%5B20%5D.orderable=true&columns%5B20%5D.search.value=&columns%5B20%5D.search.regex=false&columns%5B21%5D.data=lastModified&columns%5B21%5D.name=&columns%5B21%5D.searchable=true&columns%5B21%5D.orderable=true&columns%5B21%5D.search.value=&columns%5B21%5D.search.regex=false&order%5B0%5D.column=15&order%5B0%5D.dir=desc&start=0&length={max_entries}&search.value=&search.regex=false&'

In [8]:
# response = json.loads(requests.get(url).text)
# data = response['data']

### Use predownloaded data file

In [9]:
STRAIN_DATA = Path(NEO4J_IMPORT / 'cache/cncb_strain/cncbstrain.json')

In [10]:
with open(STRAIN_DATA) as f:
    response = json.load(f)

In [11]:
data = response['data']

In [12]:
# https://stackoverflow.com/questions/52795561/flattening-nested-json-in-pandas-data-frame
def flatten_json(nested_json, exclude=['']):
    """Flatten json object with nested keys into a single level.
        Args:
            nested_json: A nested json object.
            exclude: Keys to exclude from output.
        Returns:
            The flattened json object if successful, None otherwise.
    """
    out = {}

    def flatten(x, name='', exclude=exclude):
        if type(x) is dict:
            for a in x:
                if a not in exclude: flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

In [13]:
df = pd.DataFrame([flatten_json(x, exclude=['variationInfoList']) for x in data])

In [14]:
df.fillna('', inplace=True)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405128 entries, 0 to 405127
Data columns (total 50 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   id                             405128 non-null  int64 
 1   name                           405128 non-null  object
 2   accession                      405128 non-null  object
 3   source                         405128 non-null  object
 4   link                           405128 non-null  object
 5   location                       405128 non-null  object
 6   collectDate                    405128 non-null  object
 7   dataProvider                   405128 non-null  object
 8   submitDate                     405128 non-null  object
 9   dataSubmitter                  405128 non-null  object
 10  host                           405128 non-null  object
 11  country                        405128 non-null  object
 12  province                       405128 non-nu

### Download strain metadata

In [16]:
#df = pd.read_excel(metadata_url, dtype='str')
#df.fillna('', inplace=True)

In [17]:
print("Total number of strains:", df.shape[0])

Total number of strains: 405128


In [18]:
df.head(10)

Unnamed: 0,id,name,accession,source,link,location,collectDate,dataProvider,submitDate,dataSubmitter,host,country,province,city,relatedAccession,completeness,gender,age,lastModified,createDate,genomeQuality_id,genomeQuality_completeness,genomeQuality_virusLength,genomeQuality_nratio,genomeQuality_nbase,genomeQuality_mutation,genomeQuality_degenerateBase,genomeQuality_gap,genomeQuality_maxGapLength,genomeQuality_highDensity,genomeQuality_accession,genomeQuality_virusName,genomeQuality_createTime,genomeQuality_lastModified,genomeLineage_id,genomeLineage_accession,genomeLineage_lineage,genomeLineage_lineagesVersion,genomeLineage_status,genomeLineage_note,genomeLineage_createTime,genomeLineage_lastModified,genomeVariants_id,genomeVariants_accession,genomeVariants_variants,genomeVariants_count,genomeVariants_lastModified,genomeVariants_createDate,genomeVariants,genomeLineage
0,43584,hCoV-19/Brazil/AP162966-IEC/2020,EPI_ISL_458142,GISAID,https://gisaid.org/,Brazil / Amapa,2020-04-05,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Amapa,,,Complete,Female,37,2020-09-09 11:31:17,2020-06-04 12:19:36,41660,Complete,29903,0.00%,0,11,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458142,hCoV-19/Brazil/AP162966-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,109137.0,EPI_ISL_458142,B.1.1.314,2021/1/11,passed_qc,,2020-11-30 15:44:38,2021-01-15 14:12:12,12993.0,EPI_ISL_458142,"241,29148,6433,3037,28883,9286,14408,28882,234...",11.0,2021-01-20 19:03:19,2021-01-20 19:03:19,,
1,100674,hCoV-19/Brazil/MA-IEC-164827/2020,EPI_ISL_524788,GISAID,43,Brazil / Maranhao,2020-04-24,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Maranhao,,,Complete,Male,43,2020-09-09 11:31:17,2020-08-29 18:49:44,100019,Complete,29903,0.00%,0,11,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524788,hCoV-19/Brazil/MA-IEC-164827/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,194626.0,EPI_ISL_524788,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:43,2021-01-15 14:12:19,32683.0,EPI_ISL_524788,"28883,14408,27750,25338,23403,28881,3037,28882...",11.0,2021-01-20 19:03:20,2021-01-20 19:03:20,,
2,100682,hCoV-19/Brazil/AP-IEC-164920/2020,EPI_ISL_524796,GISAID,33,Brazil / Amapa,2020-04-23,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Amapa,,,Complete,Female,33,2020-09-09 11:31:17,2020-08-29 18:49:44,100027,Complete,29903,0.00%,0,9,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524796,hCoV-19/Brazil/AP-IEC-164920/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,41990.0,EPI_ISL_524796,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:32,2021-01-15 14:12:07,32691.0,EPI_ISL_524796,"28883,14408,23403,28881,3037,28882,241,29148,2...",9.0,2021-01-20 19:03:20,2021-01-20 19:03:20,,
3,43585,hCoV-19/Brazil/AP164082-IEC/2020,EPI_ISL_458143,GISAID,https://gisaid.org/,Brazil / Amapa,2020-04-15,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Amapa,,,Complete,Male,44,2020-09-09 11:31:17,2020-06-04 12:19:36,41661,Complete,29903,0.00%,0,9,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458143,hCoV-19/Brazil/AP164082-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,77380.0,EPI_ISL_458143,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:35,2021-01-15 14:12:19,12994.0,EPI_ISL_458143,"28882,23403,28881,27299,241,29148,3037,28883,1...",9.0,2021-01-20 19:03:19,2021-01-20 19:03:19,,
4,100675,hCoV-19/Brazil/MA-IEC-166716/2020,EPI_ISL_524789,GISAID,32,Brazil / Maranhao,2020-05-05,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Maranhao,,,Complete,Male,32,2020-09-09 11:31:17,2020-08-29 18:49:44,100020,Complete,29903,0.00%,0,12,0,0,-1,"27297~27299(3-2-0.67,SNP:27297; SNP:27299) 288...",EPI_ISL_524789,hCoV-19/Brazil/MA-IEC-166716/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,194640.0,EPI_ISL_524789,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:43,2021-01-15 14:12:19,32684.0,EPI_ISL_524789,"241,29148,27299,28883,14408,22224,27297,23403,...",12.0,2021-01-20 19:03:20,2021-01-20 19:03:20,,
5,100683,hCoV-19/Brazil/MA-IEC-162157/2020,EPI_ISL_524797,GISAID,55,Brazil / Maranhao,2020-03-24,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Maranhao,,,Complete,Female,55,2020-09-09 11:31:17,2020-08-29 18:49:44,100028,Complete,29903,0.00%,0,9,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524797,hCoV-19/Brazil/MA-IEC-162157/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,194613.0,EPI_ISL_524797,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:43,2021-01-15 14:12:19,32692.0,EPI_ISL_524797,"241,29148,27299,28883,14408,23403,28881,3037,2...",9.0,2021-01-20 19:03:20,2021-01-20 19:03:20,,
6,43586,hCoV-19/Brazil/AP163972-IEC/2020,EPI_ISL_458144,GISAID,https://gisaid.org/,Brazil / Amapa,2020-04-15,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Amapa,,,Complete,Male,44,2020-09-09 11:31:17,2020-06-04 12:19:36,41662,Complete,29903,0.00%,0,9,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458144,hCoV-19/Brazil/AP163972-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,77367.0,EPI_ISL_458144,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:35,2021-01-15 14:12:19,12995.0,EPI_ISL_458144,"14408,28882,23403,28881,27299,241,29148,3037,2...",9.0,2021-01-20 19:03:19,2021-01-20 19:03:19,,
7,100676,hCoV-19/Brazil/MA-IEC-165398/2020,EPI_ISL_524790,GISAID,37,Brazil / Maranhao,2020-04-28,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Maranhao,,,Complete,Female,37,2020-09-09 11:31:17,2020-08-29 18:49:44,100021,Complete,29903,0.00%,0,11,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524790,hCoV-19/Brazil/MA-IEC-165398/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,42003.0,EPI_ISL_524790,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:32,2021-01-15 14:12:07,32685.0,EPI_ISL_524790,"241,29148,27299,28883,11083,14408,21147,23403,...",11.0,2021-01-20 19:03:20,2021-01-20 19:03:20,,
8,100684,hCoV-19/Brazil/RN-IEC-162277/2020,EPI_ISL_524798,GISAID,49,Brazil / Rio Grande do Norte,2020-03-14,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Rio Grande do Norte,,,Complete,Female,49,2020-09-09 11:31:17,2020-08-29 18:49:44,100029,Complete,29903,0.00%,0,9,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524798,hCoV-19/Brazil/RN-IEC-162277/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,41918.0,EPI_ISL_524798,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:32,2021-01-15 14:12:07,32693.0,EPI_ISL_524798,"241,29148,27299,28883,14408,23403,28881,3037,2...",9.0,2021-01-20 19:03:20,2021-01-20 19:03:20,,
9,43587,hCoV-19/Brazil/AP164346-IEC/2020,EPI_ISL_458145,GISAID,https://gisaid.org/,Brazil / Amapa,2020-04-20,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Amapa,,,Complete,Female,62,2020-09-09 11:31:17,2020-06-04 12:19:36,41663,Complete,29903,0.00%,0,11,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458145,hCoV-19/Brazil/AP164346-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,109110.0,EPI_ISL_458145,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:38,2021-01-15 14:12:06,12996.0,EPI_ISL_458145,"28883,14408,28882,27047,23403,28881,27299,2572...",11.0,2021-01-20 19:03:19,2021-01-20 19:03:19,,


### Asssign unique identifiers

In [19]:
# https://registry.identifiers.org/registry/insdc
insdc_pattern = re.compile('^([A-Z]\d{5}|[A-Z]{2}\d{6}|[A-Z]{4}\d{8}|[A-J][A-Z]{2}\d{5})(\.\d+)?$')
# https://registry.identifiers.org/registry/refseq
refseq_pattern = re.compile('^(((AC|AP|NC|NG|NM|NP|NR|NT|NW|XM|XP|XR|YP|ZP)_\d+)|(NZ\_[A-Z]{2,4}\d+))(\.\d+)?$')
epi_pattern = re.compile('^EPI_ISL_\d+$')

In [20]:
def assign_curie(id):
    id = id.strip()
    if len(id) > 0:
        if id.startswith('EPI'):
            return 'https://www.gisaid.org/' + id
        elif refseq_pattern.match(id) != None:
            return 'refseq:' + id
        elif insdc_pattern.match(id) != None:
            return 'insdc:' + id
        else:
            # TODO are URIs available for these cases?
            return id
    else:
        return id

In [21]:
def assign_curies(ids):
    return [assign_curie(id) for id in ids.split(',')]

In [22]:
def get_gisaid_id(ids):
    for id in ids:
        if id.startswith('https://www.gisaid.org/'):
            return id
        
    return ''

#### Rename and concatenate fields

In [23]:
#df['Accession ID'] = df['Accession ID'].str.strip()
#df['Related ID'] = df['Related ID'].str.strip()

# combine all ids into an accession column and assign curies
df['accessions'] = df['accession'] + df['relatedAccession'].apply(lambda s: ',' + s if len(s) > 0 else s)
df['accessions'] = df['accessions'].apply(assign_curies)
df['gisaidId'] = df['accessions'].apply(get_gisaid_id)
df['accessions'] = df['accessions'].apply(lambda x: ';'.join(x))

df['accession'] = df['accession'].apply(lambda s: assign_curie(s))

In [24]:
df.rename(columns={'genomeQuality_virusLength': 'sequenceLength'}, inplace=True)
#df.rename(columns={'Sequence Quality': 'sequenceQuality'}, inplace=True) # get from original spreadsheet
#df.rename(columns={'Quality Assessment': 'qualityAssessment'}, inplace=True) # get from original spreadsheet
df.rename(columns={'dataProvider': 'originatingLab'}, inplace=True)
#df.rename(columns={'Virus Strain Name': 'name'}, inplace=True)
df.rename(columns={'collectDate':'collectionDate'},inplace=True)
df.rename(columns={'Location':'location'}, inplace=True)
df.rename(columns={'genomeLineage_lineage': 'lineage'}, inplace=True)

Remove invalid collection date

In [25]:
df.query("collectionDate == '2020-00-00'")

Unnamed: 0,id,name,accession,source,link,location,collectionDate,originatingLab,submitDate,dataSubmitter,host,country,province,city,relatedAccession,completeness,gender,age,lastModified,createDate,genomeQuality_id,genomeQuality_completeness,sequenceLength,genomeQuality_nratio,genomeQuality_nbase,genomeQuality_mutation,genomeQuality_degenerateBase,genomeQuality_gap,genomeQuality_maxGapLength,genomeQuality_highDensity,genomeQuality_accession,genomeQuality_virusName,genomeQuality_createTime,genomeQuality_lastModified,genomeLineage_id,genomeLineage_accession,lineage,genomeLineage_lineagesVersion,genomeLineage_status,genomeLineage_note,genomeLineage_createTime,genomeLineage_lastModified,genomeVariants_id,genomeVariants_accession,genomeVariants_variants,genomeVariants_count,genomeVariants_lastModified,genomeVariants_createDate,genomeVariants,genomeLineage,accessions,gisaidId


In [26]:
df['collectionDate'] = df['collectionDate'].apply(lambda d: '' if d == '2020-00-00' else d)

In [27]:
df['collectionDate'] = df['collectionDate'].apply(lambda d: dateutil.parser.parse(d) if len(d) > 0 else '')

In [28]:
df[df['accessions'].str.contains('refseq:NC_045512')]

Unnamed: 0,id,name,accession,source,link,location,collectionDate,originatingLab,submitDate,dataSubmitter,host,country,province,city,relatedAccession,completeness,gender,age,lastModified,createDate,genomeQuality_id,genomeQuality_completeness,sequenceLength,genomeQuality_nratio,genomeQuality_nbase,genomeQuality_mutation,genomeQuality_degenerateBase,genomeQuality_gap,genomeQuality_maxGapLength,genomeQuality_highDensity,genomeQuality_accession,genomeQuality_virusName,genomeQuality_createTime,genomeQuality_lastModified,genomeLineage_id,genomeLineage_accession,lineage,genomeLineage_lineagesVersion,genomeLineage_status,genomeLineage_note,genomeLineage_createTime,genomeLineage_lastModified,genomeVariants_id,genomeVariants_accession,genomeVariants_variants,genomeVariants_count,genomeVariants_lastModified,genomeVariants_createDate,genomeVariants,genomeLineage,accessions,gisaidId
71298,14,Wuhan-Hu-1,insdc:MN908947,GenBank,https://www.ncbi.nlm.nih.gov/nuccore/MN908947,China / Hubei / Wuhan,2019-12-31,Shanghai Public Health Clinical Center & Schoo...,2020-01-17,Shanghai Public Health Clinical Center & Schoo...,Homo sapiens,China,Hubei,Wuhan,"NC_045512,EPI_ISL_402125",Complete,Male,41,2020-05-20 11:14:12,2020-01-20 20:04:48,629,Complete,29903,0,0,0,0,0,-1,NO,MN908947,Wuhan-Hu-1,2020-03-17 14:27:13,2020-03-17 14:27:13,195185.0,MN908947,B,2020/10/30,passed_qc,,2020-11-30 15:44:43,2020-11-30 15:44:43,,,,,,,,,insdc:MN908947;refseq:NC_045512;https://www.gi...,https://www.gisaid.org/EPI_ISL_402125


In [29]:
df.head()

Unnamed: 0,id,name,accession,source,link,location,collectionDate,originatingLab,submitDate,dataSubmitter,host,country,province,city,relatedAccession,completeness,gender,age,lastModified,createDate,genomeQuality_id,genomeQuality_completeness,sequenceLength,genomeQuality_nratio,genomeQuality_nbase,genomeQuality_mutation,genomeQuality_degenerateBase,genomeQuality_gap,genomeQuality_maxGapLength,genomeQuality_highDensity,genomeQuality_accession,genomeQuality_virusName,genomeQuality_createTime,genomeQuality_lastModified,genomeLineage_id,genomeLineage_accession,lineage,genomeLineage_lineagesVersion,genomeLineage_status,genomeLineage_note,genomeLineage_createTime,genomeLineage_lastModified,genomeVariants_id,genomeVariants_accession,genomeVariants_variants,genomeVariants_count,genomeVariants_lastModified,genomeVariants_createDate,genomeVariants,genomeLineage,accessions,gisaidId
0,43584,hCoV-19/Brazil/AP162966-IEC/2020,https://www.gisaid.org/EPI_ISL_458142,GISAID,https://gisaid.org/,Brazil / Amapa,2020-04-05,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Amapa,,,Complete,Female,37,2020-09-09 11:31:17,2020-06-04 12:19:36,41660,Complete,29903,0.00%,0,11,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458142,hCoV-19/Brazil/AP162966-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,109137.0,EPI_ISL_458142,B.1.1.314,2021/1/11,passed_qc,,2020-11-30 15:44:38,2021-01-15 14:12:12,12993.0,EPI_ISL_458142,"241,29148,6433,3037,28883,9286,14408,28882,234...",11.0,2021-01-20 19:03:19,2021-01-20 19:03:19,,,https://www.gisaid.org/EPI_ISL_458142;,https://www.gisaid.org/EPI_ISL_458142
1,100674,hCoV-19/Brazil/MA-IEC-164827/2020,https://www.gisaid.org/EPI_ISL_524788,GISAID,43,Brazil / Maranhao,2020-04-24,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Maranhao,,,Complete,Male,43,2020-09-09 11:31:17,2020-08-29 18:49:44,100019,Complete,29903,0.00%,0,11,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524788,hCoV-19/Brazil/MA-IEC-164827/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,194626.0,EPI_ISL_524788,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:43,2021-01-15 14:12:19,32683.0,EPI_ISL_524788,"28883,14408,27750,25338,23403,28881,3037,28882...",11.0,2021-01-20 19:03:20,2021-01-20 19:03:20,,,https://www.gisaid.org/EPI_ISL_524788;,https://www.gisaid.org/EPI_ISL_524788
2,100682,hCoV-19/Brazil/AP-IEC-164920/2020,https://www.gisaid.org/EPI_ISL_524796,GISAID,33,Brazil / Amapa,2020-04-23,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Amapa,,,Complete,Female,33,2020-09-09 11:31:17,2020-08-29 18:49:44,100027,Complete,29903,0.00%,0,9,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524796,hCoV-19/Brazil/AP-IEC-164920/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,41990.0,EPI_ISL_524796,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:32,2021-01-15 14:12:07,32691.0,EPI_ISL_524796,"28883,14408,23403,28881,3037,28882,241,29148,2...",9.0,2021-01-20 19:03:20,2021-01-20 19:03:20,,,https://www.gisaid.org/EPI_ISL_524796;,https://www.gisaid.org/EPI_ISL_524796
3,43585,hCoV-19/Brazil/AP164082-IEC/2020,https://www.gisaid.org/EPI_ISL_458143,GISAID,https://gisaid.org/,Brazil / Amapa,2020-04-15,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Amapa,,,Complete,Male,44,2020-09-09 11:31:17,2020-06-04 12:19:36,41661,Complete,29903,0.00%,0,9,0,0,-1,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458143,hCoV-19/Brazil/AP164082-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,77380.0,EPI_ISL_458143,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:35,2021-01-15 14:12:19,12994.0,EPI_ISL_458143,"28882,23403,28881,27299,241,29148,3037,28883,1...",9.0,2021-01-20 19:03:19,2021-01-20 19:03:19,,,https://www.gisaid.org/EPI_ISL_458143;,https://www.gisaid.org/EPI_ISL_458143
4,100675,hCoV-19/Brazil/MA-IEC-166716/2020,https://www.gisaid.org/EPI_ISL_524789,GISAID,32,Brazil / Maranhao,2020-05-05,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Maranhao,,,Complete,Male,32,2020-09-09 11:31:17,2020-08-29 18:49:44,100020,Complete,29903,0.00%,0,12,0,0,-1,"27297~27299(3-2-0.67,SNP:27297; SNP:27299) 288...",EPI_ISL_524789,hCoV-19/Brazil/MA-IEC-166716/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,194640.0,EPI_ISL_524789,B.1.1.33,2021/1/11,passed_qc,,2020-11-30 15:44:43,2021-01-15 14:12:19,32684.0,EPI_ISL_524789,"241,29148,27299,28883,14408,22224,27297,23403,...",12.0,2021-01-20 19:03:20,2021-01-20 19:03:20,,,https://www.gisaid.org/EPI_ISL_524789;,https://www.gisaid.org/EPI_ISL_524789


#### Assign taxonomy ids

In [30]:
# read Organism reference dictionary
organism_to_id = dict()
data = pd.read_csv("../../reference_data/OrganismDictionary.csv", comment='#')
for index, row in data.iterrows():
    organism_to_id[row['organism']] = row['taxonomyId']

In [31]:
print(organism_to_id)

{'human': 'taxonomy:9606', 'homo sapiens': 'taxonomy:9606', 'chlorocebus sabaeus': 'taxonomy:60711', 'mus musculus': 'taxonomy:10090', 'rhinolophus affinis': 'taxonomy:59477 ', 'rhinolophus malayanus': 'taxonomy:608659', 'rhinolophus shameli': 'taxonomy:608708', 'mustela lutreola': 'taxonomy:9666', 'mink': 'taxonomy:9666', 'panthera tigris jacksoni': 'taxonomy:419130', 'rhinolophus sp. (bat)': 'taxonomy:49442', 'bat': 'taxonomy:49442', 'manis javanica': 'taxonomy:9974', 'manis pentadactyla': 'taxonomy:143292', 'palm civet': 'taxonomy:71116', 'canine': 'taxonomy:9608', 'canis lupus familiaris': 'taxonomy:9615', 'felis catus': 'taxonomy:9685', 'neovison vison': 'taxonomy:452646', 'mesocricetus auratus': 'taxonomy:10036', 'panthera leo': 'taxonomy:9689', 'panthera tigris': 'taxonomy:9694', 'environment': 'taxonomy:151659', 'environmental': 'taxonomy:151659'}


In [32]:
# assign taxonomy id to host
df['host'] = df['host'].str.strip()
df['hostTaxonomyId'] = df['host'].apply(lambda s: organism_to_id.get(s.lower(), s))
df['hostTaxonomyId'].unique()

array(['taxonomy:9606', 'taxonomy:10090', 'taxonomy:151659',
       'taxonomy:9685', 'taxonomy:9666', 'taxonomy:608708',
       'taxonomy:9615', 'taxonomy:143292', 'taxonomy:9974', 'unknown',
       'taxonomy:608659', 'taxonomy:452646', 'taxonomy:419130',
       'taxonomy:60711', 'taxonomy:59477 ', 'Vero cell culture',
       'taxonomy:9694', 'taxonomy:9689', 'taxonomy:10036'], dtype=object)

In [33]:
df['taxonomyId'] = 'taxonomy:2697049' # SARS-CoV-2

#### Standardize location information

In [34]:
df[['loc0', 'loc1', 'loc2', 'loc3']] = df['location'].str.split('/', n=3, expand=True)
# strip white space
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [35]:
df['origLocation'] = df[['loc0', 'loc1', 'loc2', 'loc3']].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)

### Save strain metadata

In [36]:
# strains = df[['name', 'accession', 'accessions', 'gisaidId', 'source', 'taxonomyId', 'hostTaxonomyId', 
#               'sequenceLength', 'sequenceQuality', 'qualityAssessment', 'collectionDate', 'location', 
#               'origLocation', 'originatingLab']].copy()

In [37]:
strains = df[['name', 'accession', 'accessions', 'gisaidId', 'source', 'taxonomyId', 'hostTaxonomyId', 'lineage',
              'sequenceLength', 'completeness', 'gender', 'age', 'collectionDate', 'location', 
              'origLocation', 'originatingLab']].copy()

In [38]:
strains.head()

Unnamed: 0,name,accession,accessions,gisaidId,source,taxonomyId,hostTaxonomyId,lineage,sequenceLength,completeness,gender,age,collectionDate,location,origLocation,originatingLab
0,hCoV-19/Brazil/AP162966-IEC/2020,https://www.gisaid.org/EPI_ISL_458142,https://www.gisaid.org/EPI_ISL_458142;,https://www.gisaid.org/EPI_ISL_458142,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.314,29903,Complete,Female,37,2020-04-05,Brazil / Amapa,"Brazil,Amapa",﻿Evandro Chagas Institute
1,hCoV-19/Brazil/MA-IEC-164827/2020,https://www.gisaid.org/EPI_ISL_524788,https://www.gisaid.org/EPI_ISL_524788;,https://www.gisaid.org/EPI_ISL_524788,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,29903,Complete,Male,43,2020-04-24,Brazil / Maranhao,"Brazil,Maranhao",﻿Evandro Chagas Institute
2,hCoV-19/Brazil/AP-IEC-164920/2020,https://www.gisaid.org/EPI_ISL_524796,https://www.gisaid.org/EPI_ISL_524796;,https://www.gisaid.org/EPI_ISL_524796,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,29903,Complete,Female,33,2020-04-23,Brazil / Amapa,"Brazil,Amapa",﻿Evandro Chagas Institute
3,hCoV-19/Brazil/AP164082-IEC/2020,https://www.gisaid.org/EPI_ISL_458143,https://www.gisaid.org/EPI_ISL_458143;,https://www.gisaid.org/EPI_ISL_458143,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,29903,Complete,Male,44,2020-04-15,Brazil / Amapa,"Brazil,Amapa",﻿Evandro Chagas Institute
4,hCoV-19/Brazil/MA-IEC-166716/2020,https://www.gisaid.org/EPI_ISL_524789,https://www.gisaid.org/EPI_ISL_524789;,https://www.gisaid.org/EPI_ISL_524789,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,29903,Complete,Male,32,2020-05-05,Brazil / Maranhao,"Brazil,Maranhao",﻿Evandro Chagas Institute


In [39]:
print('Number of strains:',strains.shape[0])

Number of strains: 405128


In [40]:
strains.to_csv(NEO4J_IMPORT / "01c-CNCBStrainPre.csv", index=False)