# Load SARS-CoV-2 Virus Strain Metadata from CNCB
**[Work in progress]**

This notebook downloads and standardizes viral strain data from CNCB for ingestion into a Knowledge Graph.

Data source: [China National Center for Bioinformation, 2019 Novel Coronavirus Resource (2019nCoVR)](https://bigd.big.ac.cn/ncov/release_genome)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import shutil
import glob
import ftplib
import re
import requests
import json
import dateutil
import pandas as pd
from pathlib import Path

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
metadata_url = "https://bigd.big.ac.cn/ncov/genome/export/meta"

In [4]:
# Path will take care of handling operating system differences.
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


In [5]:
# this values should be >= then the current number of isolates in CNCB
max_entries = 1000000

In [6]:
url = f'https://bigd.big.ac.cn/ncov/genome/query?&columns%5B0%5D.data=accession&columns%5B0%5D.name=&columns%5B0%5D.searchable=false&columns%5B0%5D.orderable=false&columns%5B0%5D.search.value=&columns%5B0%5D.search.regex=false&columns%5B1%5D.data=name&columns%5B1%5D.name=&columns%5B1%5D.searchable=true&columns%5B1%5D.orderable=true&columns%5B1%5D.search.value=&columns%5B1%5D.search.regex=false&columns%5B2%5D.data=accession&columns%5B2%5D.name=&columns%5B2%5D.searchable=true&columns%5B2%5D.orderable=true&columns%5B2%5D.search.value=&columns%5B2%5D.search.regex=false&columns%5B3%5D.data=gender&columns%5B3%5D.name=&columns%5B3%5D.searchable=true&columns%5B3%5D.orderable=true&columns%5B3%5D.search.value=&columns%5B3%5D.search.regex=false&columns%5B4%5D.data=age&columns%5B4%5D.name=&columns%5B4%5D.searchable=true&columns%5B4%5D.orderable=true&columns%5B4%5D.search.value=&columns%5B4%5D.search.regex=false&columns%5B5%5D.data=source&columns%5B5%5D.name=&columns%5B5%5D.searchable=true&columns%5B5%5D.orderable=true&columns%5B5%5D.search.value=&columns%5B5%5D.search.regex=false&columns%5B6%5D.data=relatedAccession&columns%5B6%5D.name=&columns%5B6%5D.searchable=true&columns%5B6%5D.orderable=true&columns%5B6%5D.search.value=&columns%5B6%5D.search.regex=false&columns%5B7%5D.data=genomeLineage.lineage&columns%5B7%5D.name=&columns%5B7%5D.searchable=true&columns%5B7%5D.orderable=true&columns%5B7%5D.search.value=&columns%5B7%5D.search.regex=false&columns%5B8%5D.data=completeness&columns%5B8%5D.name=&columns%5B8%5D.searchable=true&columns%5B8%5D.orderable=true&columns%5B8%5D.search.value=&columns%5B8%5D.search.regex=false&columns%5B9%5D.data=genomeQuality&columns%5B9%5D.name=&columns%5B9%5D.searchable=false&columns%5B9%5D.orderable=true&columns%5B9%5D.search.value=&columns%5B9%5D.search.regex=false&columns%5B10%5D.data=genomeQuality&columns%5B10%5D.name=&columns%5B10%5D.searchable=false&columns%5B10%5D.orderable=true&columns%5B10%5D.search.value=&columns%5B10%5D.search.regex=false&columns%5B11%5D.data=genomeQuality&columns%5B11%5D.name=&columns%5B11%5D.searchable=false&columns%5B11%5D.orderable=true&columns%5B11%5D.search.value=&columns%5B11%5D.search.regex=false&columns%5B12%5D.data=host&columns%5B12%5D.name=&columns%5B12%5D.searchable=true&columns%5B12%5D.orderable=true&columns%5B12%5D.search.value=&columns%5B12%5D.search.regex=false&columns%5B13%5D.data=collectDate&columns%5B13%5D.name=&columns%5B13%5D.searchable=true&columns%5B13%5D.orderable=true&columns%5B13%5D.search.value=&columns%5B13%5D.search.regex=false&columns%5B14%5D.data=location&columns%5B14%5D.name=&columns%5B14%5D.searchable=true&columns%5B14%5D.orderable=true&columns%5B14%5D.search.value=&columns%5B14%5D.search.regex=false&columns%5B15%5D.data=dataProvider&columns%5B15%5D.name=&columns%5B15%5D.searchable=true&columns%5B15%5D.orderable=true&columns%5B15%5D.search.value=&columns%5B15%5D.search.regex=false&columns%5B16%5D.data=submitDate&columns%5B16%5D.name=&columns%5B16%5D.searchable=true&columns%5B16%5D.orderable=true&columns%5B16%5D.search.value=&columns%5B16%5D.search.regex=false&columns%5B17%5D.data=dataSubmitter&columns%5B17%5D.name=&columns%5B17%5D.searchable=true&columns%5B17%5D.orderable=true&columns%5B17%5D.search.value=&columns%5B17%5D.search.regex=false&columns%5B18%5D.data=createDate&columns%5B18%5D.name=&columns%5B18%5D.searchable=true&columns%5B18%5D.orderable=true&columns%5B18%5D.search.value=&columns%5B18%5D.search.regex=false&columns%5B19%5D.data=country&columns%5B19%5D.name=&columns%5B19%5D.searchable=true&columns%5B19%5D.orderable=true&columns%5B19%5D.search.value=&columns%5B19%5D.search.regex=false&columns%5B20%5D.data=province&columns%5B20%5D.name=&columns%5B20%5D.searchable=true&columns%5B20%5D.orderable=true&columns%5B20%5D.search.value=&columns%5B20%5D.search.regex=false&columns%5B21%5D.data=lastModified&columns%5B21%5D.name=&columns%5B21%5D.searchable=true&columns%5B21%5D.orderable=true&columns%5B21%5D.search.value=&columns%5B21%5D.search.regex=false&order%5B0%5D.column=15&order%5B0%5D.dir=desc&start=0&length={max_entries}&search.value=&search.regex=false&'

In [7]:
response = json.loads(requests.get(url).text)
data = response['data']

In [9]:
# https://stackoverflow.com/questions/52795561/flattening-nested-json-in-pandas-data-frame
def flatten_json(nested_json, exclude=['']):
    """Flatten json object with nested keys into a single level.
        Args:
            nested_json: A nested json object.
            exclude: Keys to exclude from output.
        Returns:
            The flattened json object if successful, None otherwise.
    """
    out = {}

    def flatten(x, name='', exclude=exclude):
        if type(x) is dict:
            for a in x:
                if a not in exclude: flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

In [10]:
df = pd.DataFrame([flatten_json(x, exclude=['variationInfoList']) for x in data])

In [11]:
df.fillna('', inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297051 entries, 0 to 297050
Data columns (total 44 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   id                             297051 non-null  int64 
 1   name                           297051 non-null  object
 2   accession                      297051 non-null  object
 3   source                         297051 non-null  object
 4   link                           297051 non-null  object
 5   location                       297051 non-null  object
 6   collectDate                    297051 non-null  object
 7   dataProvider                   297051 non-null  object
 8   submitDate                     297051 non-null  object
 9   dataSubmitter                  297051 non-null  object
 10  host                           297051 non-null  object
 11  country                        297051 non-null  object
 12  province                       297051 non-nu

### Download strain metadata

In [13]:
#df = pd.read_excel(metadata_url, dtype='str')
#df.fillna('', inplace=True)

In [14]:
print("Total number of strains:", df.shape[0])

Total number of strains: 297051


In [15]:
df.head(10)

Unnamed: 0,id,name,accession,source,link,location,collectDate,dataProvider,submitDate,dataSubmitter,host,country,province,city,relatedAccession,completeness,gender,age,lastModified,createDate,genomeQuality_id,genomeQuality_completeness,genomeQuality_virusLength,genomeQuality_nratio,genomeQuality_nbase,genomeQuality_mutation,genomeQuality_degenerateBase,genomeQuality_gap,genomeQuality_maxGapLength,genomeQuality_highDensity,genomeQuality_accession,genomeQuality_virusName,genomeQuality_createTime,genomeQuality_lastModified,genomeLineage_id,genomeLineage_accession,genomeLineage_lineage,genomeLineage_lineagesVersion,genomeLineage_status,genomeLineage_note,genomeLineage_createTime,genomeLineage_lastModified,genomeLineage,genomeQuality
0,43581,hCoV-19/Brazil/AC162535-IEC/2020,EPI_ISL_458139,GISAID,https://gisaid.org/,Brazil / Acre,2020-03-18,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Acre,,,Complete,Male,81,2020-09-09 11:31:17,2020-06-04 12:19:36,41657.0,Complete,29903.0,0.00%,0.0,11.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458139,hCoV-19/Brazil/AC162535-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,77378.0,EPI_ISL_458139,B.1.1.33,2020/12/17,passed_qc,,2020-11-30 15:44:35,2020-12-28 17:08:54,,
1,43589,hCoV-19/Brazil/PA164218-IEC/2020,EPI_ISL_458147,GISAID,https://gisaid.org/,Brazil / Para,2020-04-24,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Para,,,Complete,Male,45,2020-09-09 11:31:17,2020-06-04 12:19:37,41665.0,Complete,29903.0,0.00%,0.0,12.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458147,hCoV-19/Brazil/PA164218-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,109126.0,EPI_ISL_458147,B.1.1.248,2020/12/17,passed_qc,,2020-11-30 15:44:38,2020-12-28 17:08:54,,
2,100671,hCoV-19/Brazil/PA-IEC-165313/2020,EPI_ISL_524785,GISAID,34,Brazil / Para,2020-05-05,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Para,,,Complete,Male,34,2020-09-09 11:31:17,2020-08-29 18:49:44,100016.0,Complete,29903.0,0.00%,0.0,13.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524785,hCoV-19/Brazil/PA-IEC-165313/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,194837.0,EPI_ISL_524785,B.1.1.248,2020/12/17,passed_qc,,2020-11-30 15:44:43,2020-12-28 17:08:54,,
3,100679,hCoV-19/Brazil/AP-IEC-165669/2020,EPI_ISL_524793,GISAID,39,Brazil / Amapa,2020-04-29,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Amapa,,,Complete,Male,39,2020-09-09 11:31:17,2020-08-29 18:49:44,100024.0,Complete,29903.0,0.00%,0.0,11.0,0.0,0.0,-1.0,"28881~28887(7-4-0.57,SNP:28881; SNP:28882; SNP...",EPI_ISL_524793,hCoV-19/Brazil/AP-IEC-165669/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,42344.0,EPI_ISL_524793,B.1.1.33,2020/12/17,passed_qc,,2020-11-30 15:44:32,2020-12-28 17:08:54,,
4,100687,hCoV-19/Brazil/PB-IEC-161853/2020,EPI_ISL_524801,GISAID,60,Brazil / Paraiba,2020-03-19,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Paraiba,,,Complete,Male,60,2020-09-09 11:31:17,2020-08-29 18:49:44,100032.0,Complete,29903.0,0.00%,0.0,7.0,0.0,0.0,-1.0,NO,EPI_ISL_524801,hCoV-19/Brazil/PB-IEC-161853/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,41883.0,EPI_ISL_524801,B.1.5,2020/12/17,passed_qc,,2020-11-30 15:44:32,2020-12-28 17:08:54,,
5,43582,hCoV-19/Brazil/PA162802-IEC/2020,EPI_ISL_458140,GISAID,https://gisaid.org/,Brazil / Para,2020-04-07,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Para,,,Complete,Male,63,2020-09-09 11:31:17,2020-06-04 12:19:36,41658.0,Complete,29903.0,0.00%,0.0,10.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458140,hCoV-19/Brazil/PA162802-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,109142.0,EPI_ISL_458140,B.1.1.248,2020/12/17,passed_qc,,2020-11-30 15:44:38,2020-12-28 17:08:54,,
6,43590,hCoV-19/Brazil/PA164684-IEC/2020,EPI_ISL_458148,GISAID,https://gisaid.org/,Brazil / Para,2020-04-27,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Para,,,Complete,Male,38,2020-09-09 11:31:17,2020-06-04 12:19:37,41666.0,Complete,29903.0,0.00%,0.0,13.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458148,hCoV-19/Brazil/PA164684-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,77387.0,EPI_ISL_458148,B.1.1.33,2020/12/17,passed_qc,,2020-11-30 15:44:35,2020-12-28 17:08:54,,
7,100672,hCoV-19/Brazil/PA-IEC-165302/2020,EPI_ISL_524786,GISAID,58,Brazil / Para,2020-05-01,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Para,,,Complete,Male,58,2020-09-09 11:31:17,2020-08-29 18:49:44,100017.0,Complete,29903.0,0.00%,0.0,11.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524786,hCoV-19/Brazil/PA-IEC-165302/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,194784.0,EPI_ISL_524786,B.1.1.248,2020/12/17,passed_qc,,2020-11-30 15:44:43,2020-12-28 17:08:54,,
8,100680,hCoV-19/Brazil/MA-IEC-165425/2020,EPI_ISL_524794,GISAID,75,Brazil / Maranhao,2020-04-25,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Maranhao,,,Complete,Male,75,2020-09-09 11:31:17,2020-08-29 18:49:44,100025.0,Complete,29903.0,0.00%,0.0,10.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524794,hCoV-19/Brazil/MA-IEC-165425/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,194826.0,EPI_ISL_524794,B.1.1.33,2020/12/17,passed_qc,,2020-11-30 15:44:43,2020-12-28 17:08:54,,
9,43583,hCoV-19/Brazil/PA164239-IEC/2020,EPI_ISL_458141,GISAID,https://gisaid.org/,Brazil / Para,2020-04-26,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Para,,,Complete,Male,38,2020-09-09 11:31:17,2020-06-04 12:19:36,41659.0,Complete,29903.0,0.00%,0.0,12.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458141,hCoV-19/Brazil/PA164239-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,77398.0,EPI_ISL_458141,B.1.1.248,2020/12/17,passed_qc,,2020-11-30 15:44:35,2020-12-28 17:08:54,,


### Asssign unique identifiers

In [16]:
# https://registry.identifiers.org/registry/insdc
insdc_pattern = re.compile('^([A-Z]\d{5}|[A-Z]{2}\d{6}|[A-Z]{4}\d{8}|[A-J][A-Z]{2}\d{5})(\.\d+)?$')
# https://registry.identifiers.org/registry/refseq
refseq_pattern = re.compile('^(((AC|AP|NC|NG|NM|NP|NR|NT|NW|XM|XP|XR|YP|ZP)_\d+)|(NZ\_[A-Z]{2,4}\d+))(\.\d+)?$')
epi_pattern = re.compile('^EPI_ISL_\d+$')

In [17]:
def assign_curie(id):
    id = id.strip()
    if len(id) > 0:
        if id.startswith('EPI'):
            return 'https://www.gisaid.org/' + id
        elif refseq_pattern.match(id) != None:
            return 'refseq:' + id
        elif insdc_pattern.match(id) != None:
            return 'insdc:' + id
        else:
            # TODO are URIs available for these cases?
            return id
    else:
        return id

In [18]:
def assign_curies(ids):
    return [assign_curie(id) for id in ids.split(',')]

In [19]:
def get_gisaid_id(ids):
    for id in ids:
        if id.startswith('https://www.gisaid.org/'):
            return id
        
    return ''

#### Rename and concatenate fields

In [20]:
#df['Accession ID'] = df['Accession ID'].str.strip()
#df['Related ID'] = df['Related ID'].str.strip()

# combine all ids into an accession column and assign curies
df['accessions'] = df['accession'] + df['relatedAccession'].apply(lambda s: ',' + s if len(s) > 0 else s)
df['accessions'] = df['accessions'].apply(assign_curies)
df['gisaidId'] = df['accessions'].apply(get_gisaid_id)
df['accessions'] = df['accessions'].apply(lambda x: ';'.join(x))

df['accession'] = df['accession'].apply(lambda s: assign_curie(s))

In [21]:
df.rename(columns={'genomeQuality_virusLength': 'sequenceLength'}, inplace=True)
#df.rename(columns={'Sequence Quality': 'sequenceQuality'}, inplace=True) # get from original spreadsheet
#df.rename(columns={'Quality Assessment': 'qualityAssessment'}, inplace=True) # get from original spreadsheet
df.rename(columns={'dataProvider': 'originatingLab'}, inplace=True)
#df.rename(columns={'Virus Strain Name': 'name'}, inplace=True)
df.rename(columns={'collectDate':'collectionDate'},inplace=True)
df.rename(columns={'Location':'location'}, inplace=True)
df.rename(columns={'genomeLineage_lineage': 'lineage'}, inplace=True)

Remove invalid collection date

In [22]:
df.query("collectionDate == '2020-00-00'")

Unnamed: 0,id,name,accession,source,link,location,collectionDate,originatingLab,submitDate,dataSubmitter,host,country,province,city,relatedAccession,completeness,gender,age,lastModified,createDate,genomeQuality_id,genomeQuality_completeness,sequenceLength,genomeQuality_nratio,genomeQuality_nbase,genomeQuality_mutation,genomeQuality_degenerateBase,genomeQuality_gap,genomeQuality_maxGapLength,genomeQuality_highDensity,genomeQuality_accession,genomeQuality_virusName,genomeQuality_createTime,genomeQuality_lastModified,genomeLineage_id,genomeLineage_accession,lineage,genomeLineage_lineagesVersion,genomeLineage_status,genomeLineage_note,genomeLineage_createTime,genomeLineage_lastModified,genomeLineage,genomeQuality,accessions,gisaidId
795,104895,covid_hub_pl_ibch_0028,insdc:LR877414,GenBank,https://www.ncbi.nlm.nih.gov/nuccore/LR877414,Poland,2020-00-00,WSSE,2020-08-17,"COVID-HUB-PL, Institute of Bioorganic Chemistr...",Homo sapiens,Poland,,,,Partial,Female,,2020-09-14 23:26:05,2020-09-09 18:34:25,104240.0,Partial,29903.0,91.98%,27050.0,-1.0,0.0,-1.0,-1.0,-1,LR877414,covid_hub_pl_ibch_0028,2020-09-09 18:35:26,2020-09-09 18:35:26,11550.0,LR877414,,2020/12/17,fail,N_content:0.92,2020-11-24 15:50:36,2020-12-28 17:08:54,,,insdc:LR877414;,
850,104905,covid_hub_pl_ibch_0044,insdc:LR877424,GenBank,https://www.ncbi.nlm.nih.gov/nuccore/LR877424,Poland,2020-00-00,WSSE,2020-08-17,"COVID-HUB-PL, Institute of Bioorganic Chemistr...",Homo sapiens,Poland,,,,Complete,Female,,2020-09-14 23:26:05,2020-09-09 18:34:25,104250.0,Complete,29903.0,2.22%,654.0,-1.0,0.0,-1.0,-1.0,-1,LR877424,covid_hub_pl_ibch_0044,2020-09-09 18:35:27,2020-09-09 18:35:27,43841.0,LR877424,B,2020/12/17,passed_qc,,2020-11-30 15:44:32,2020-12-28 17:08:54,,,insdc:LR877424;,


In [23]:
df['collectionDate'] = df['collectionDate'].apply(lambda d: '' if d == '2020-00-00' else d)

In [24]:
df['collectionDate'] = df['collectionDate'].apply(lambda d: dateutil.parser.parse(d) if len(d) > 0 else '')

In [25]:
df[df['accessions'].str.contains('refseq:NC_045512')]

Unnamed: 0,id,name,accession,source,link,location,collectionDate,originatingLab,submitDate,dataSubmitter,host,country,province,city,relatedAccession,completeness,gender,age,lastModified,createDate,genomeQuality_id,genomeQuality_completeness,sequenceLength,genomeQuality_nratio,genomeQuality_nbase,genomeQuality_mutation,genomeQuality_degenerateBase,genomeQuality_gap,genomeQuality_maxGapLength,genomeQuality_highDensity,genomeQuality_accession,genomeQuality_virusName,genomeQuality_createTime,genomeQuality_lastModified,genomeLineage_id,genomeLineage_accession,lineage,genomeLineage_lineagesVersion,genomeLineage_status,genomeLineage_note,genomeLineage_createTime,genomeLineage_lastModified,genomeLineage,genomeQuality,accessions,gisaidId
51876,14,Wuhan-Hu-1,insdc:MN908947,GenBank,https://www.ncbi.nlm.nih.gov/nuccore/MN908947,China / Hubei / Wuhan,2019-12-30,Shanghai Public Health Clinical Center & Schoo...,2020-01-17,Shanghai Public Health Clinical Center & Schoo...,Homo sapiens,China,Hubei,Wuhan,"NC_045512,EPI_ISL_402125",Complete,,,2020-05-20 11:14:12,2020-01-20 20:04:48,629.0,Complete,29903.0,0,0.0,0.0,0.0,0.0,-1.0,NO,MN908947,Wuhan-Hu-1,2020-03-17 14:27:13,2020-03-17 14:27:13,195185.0,MN908947,B,2020/10/30,passed_qc,,2020-11-30 15:44:43,2020-11-30 15:44:43,,,insdc:MN908947;refseq:NC_045512;https://www.gi...,https://www.gisaid.org/EPI_ISL_402125


In [26]:
df.head()

Unnamed: 0,id,name,accession,source,link,location,collectionDate,originatingLab,submitDate,dataSubmitter,host,country,province,city,relatedAccession,completeness,gender,age,lastModified,createDate,genomeQuality_id,genomeQuality_completeness,sequenceLength,genomeQuality_nratio,genomeQuality_nbase,genomeQuality_mutation,genomeQuality_degenerateBase,genomeQuality_gap,genomeQuality_maxGapLength,genomeQuality_highDensity,genomeQuality_accession,genomeQuality_virusName,genomeQuality_createTime,genomeQuality_lastModified,genomeLineage_id,genomeLineage_accession,lineage,genomeLineage_lineagesVersion,genomeLineage_status,genomeLineage_note,genomeLineage_createTime,genomeLineage_lastModified,genomeLineage,genomeQuality,accessions,gisaidId
0,43581,hCoV-19/Brazil/AC162535-IEC/2020,https://www.gisaid.org/EPI_ISL_458139,GISAID,https://gisaid.org/,Brazil / Acre,2020-03-18,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Acre,,,Complete,Male,81,2020-09-09 11:31:17,2020-06-04 12:19:36,41657.0,Complete,29903.0,0.00%,0.0,11.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458139,hCoV-19/Brazil/AC162535-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,77378.0,EPI_ISL_458139,B.1.1.33,2020/12/17,passed_qc,,2020-11-30 15:44:35,2020-12-28 17:08:54,,,https://www.gisaid.org/EPI_ISL_458139;,https://www.gisaid.org/EPI_ISL_458139
1,43589,hCoV-19/Brazil/PA164218-IEC/2020,https://www.gisaid.org/EPI_ISL_458147,GISAID,https://gisaid.org/,Brazil / Para,2020-04-24,﻿Evandro Chagas Institute,2020-06-03,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Para,,,Complete,Male,45,2020-09-09 11:31:17,2020-06-04 12:19:37,41665.0,Complete,29903.0,0.00%,0.0,12.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_458147,hCoV-19/Brazil/PA164218-IEC/2020,2020-06-04 12:20:14,2020-06-04 12:20:14,109126.0,EPI_ISL_458147,B.1.1.248,2020/12/17,passed_qc,,2020-11-30 15:44:38,2020-12-28 17:08:54,,,https://www.gisaid.org/EPI_ISL_458147;,https://www.gisaid.org/EPI_ISL_458147
2,100671,hCoV-19/Brazil/PA-IEC-165313/2020,https://www.gisaid.org/EPI_ISL_524785,GISAID,34,Brazil / Para,2020-05-05,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Para,,,Complete,Male,34,2020-09-09 11:31:17,2020-08-29 18:49:44,100016.0,Complete,29903.0,0.00%,0.0,13.0,0.0,0.0,-1.0,"28881~28883(3-3-1.00,SNP:28881; SNP:28882; SNP...",EPI_ISL_524785,hCoV-19/Brazil/PA-IEC-165313/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,194837.0,EPI_ISL_524785,B.1.1.248,2020/12/17,passed_qc,,2020-11-30 15:44:43,2020-12-28 17:08:54,,,https://www.gisaid.org/EPI_ISL_524785;,https://www.gisaid.org/EPI_ISL_524785
3,100679,hCoV-19/Brazil/AP-IEC-165669/2020,https://www.gisaid.org/EPI_ISL_524793,GISAID,39,Brazil / Amapa,2020-04-29,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Amapa,,,Complete,Male,39,2020-09-09 11:31:17,2020-08-29 18:49:44,100024.0,Complete,29903.0,0.00%,0.0,11.0,0.0,0.0,-1.0,"28881~28887(7-4-0.57,SNP:28881; SNP:28882; SNP...",EPI_ISL_524793,hCoV-19/Brazil/AP-IEC-165669/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,42344.0,EPI_ISL_524793,B.1.1.33,2020/12/17,passed_qc,,2020-11-30 15:44:32,2020-12-28 17:08:54,,,https://www.gisaid.org/EPI_ISL_524793;,https://www.gisaid.org/EPI_ISL_524793
4,100687,hCoV-19/Brazil/PB-IEC-161853/2020,https://www.gisaid.org/EPI_ISL_524801,GISAID,60,Brazil / Paraiba,2020-03-19,﻿Evandro Chagas Institute,2020-08-28,﻿Evandro Chagas Institute,Homo Sapiens,Brazil,Paraiba,,,Complete,Male,60,2020-09-09 11:31:17,2020-08-29 18:49:44,100032.0,Complete,29903.0,0.00%,0.0,7.0,0.0,0.0,-1.0,NO,EPI_ISL_524801,hCoV-19/Brazil/PB-IEC-161853/2020,2020-08-29 18:52:11,2020-08-29 18:52:11,41883.0,EPI_ISL_524801,B.1.5,2020/12/17,passed_qc,,2020-11-30 15:44:32,2020-12-28 17:08:54,,,https://www.gisaid.org/EPI_ISL_524801;,https://www.gisaid.org/EPI_ISL_524801


#### Assign taxonomy ids

In [27]:
# read Organism reference dictionary
organism_to_id = dict()
data = pd.read_csv("../../reference_data/OrganismDictionary.csv", comment='#')
for index, row in data.iterrows():
    organism_to_id[row['organism']] = row['taxonomyId']

In [28]:
print(organism_to_id)

{'human': 'taxonomy:9606', 'homo sapiens': 'taxonomy:9606', 'mus musculus': 'taxonomy:10090', 'rhinolophus affinis': 'taxonomy:59477 ', 'rhinolophus malayanus': 'taxonomy:608659', 'mustela lutreola': 'taxonomy:9666', 'panthera tigris jacksoni': 'taxonomy:419130', 'rhinolophus sp. (bat)': 'taxonomy:49442', 'bat': 'taxonomy:49442', 'manis javanica': 'taxonomy:9974', 'manis pentadactyla': 'taxonomy:143292', 'palm civet': 'taxonomy:71116', 'canine': 'taxonomy:9608', 'canis lupus familiaris': 'taxonomy:9615', 'felis catus': 'taxonomy:9685', 'neovison vison': 'taxonomy:452646', 'mesocricetus auratus': 'taxonomy:10036', 'panthera leo': 'taxonomy:9689', 'panthera tigris': 'taxonomy:9694', 'environment': 'taxonomy:151659', 'environmental': 'taxonomy:151659'}


In [29]:
# assign taxonomy id to host
df['host'] = df['host'].str.strip()
df['hostTaxonomyId'] = df['host'].apply(lambda s: organism_to_id.get(s.lower(), s))
df['hostTaxonomyId'].unique()

array(['taxonomy:9606', 'taxonomy:10090', 'taxonomy:151659',
       'taxonomy:9666', 'taxonomy:9615', 'taxonomy:143292',
       'taxonomy:9685', 'taxonomy:9974', 'unknown', 'taxonomy:608659',
       'taxonomy:452646', 'taxonomy:419130', 'taxonomy:59477 ',
       'Vero cell culture', 'taxonomy:9694', 'taxonomy:9689',
       'taxonomy:10036'], dtype=object)

In [30]:
df['taxonomyId'] = 'taxonomy:2697049' # SARS-CoV-2

#### Standardize location information

In [31]:
df[['loc0', 'loc1', 'loc2', 'loc3']] = df['location'].str.split('/', n=3, expand=True)
# strip white space
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [32]:
df['origLocation'] = df[['loc0', 'loc1', 'loc2', 'loc3']].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)

### Save strain metadata

In [33]:
# strains = df[['name', 'accession', 'accessions', 'gisaidId', 'source', 'taxonomyId', 'hostTaxonomyId', 
#               'sequenceLength', 'sequenceQuality', 'qualityAssessment', 'collectionDate', 'location', 
#               'origLocation', 'originatingLab']].copy()

In [34]:
strains = df[['name', 'accession', 'accessions', 'gisaidId', 'source', 'taxonomyId', 'hostTaxonomyId', 'lineage',
              'sequenceLength', 'completeness', 'gender', 'age', 'collectionDate', 'location', 
              'origLocation', 'originatingLab']].copy()

In [35]:
strains.head()

Unnamed: 0,name,accession,accessions,gisaidId,source,taxonomyId,hostTaxonomyId,lineage,sequenceLength,completeness,gender,age,collectionDate,location,origLocation,originatingLab
0,hCoV-19/Brazil/AC162535-IEC/2020,https://www.gisaid.org/EPI_ISL_458139,https://www.gisaid.org/EPI_ISL_458139;,https://www.gisaid.org/EPI_ISL_458139,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,,Complete,Male,81,2020-03-18,Brazil / Acre,"Brazil,Acre",﻿Evandro Chagas Institute
1,hCoV-19/Brazil/PA164218-IEC/2020,https://www.gisaid.org/EPI_ISL_458147,https://www.gisaid.org/EPI_ISL_458147;,https://www.gisaid.org/EPI_ISL_458147,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.248,,Complete,Male,45,2020-04-24,Brazil / Para,"Brazil,Para",﻿Evandro Chagas Institute
2,hCoV-19/Brazil/PA-IEC-165313/2020,https://www.gisaid.org/EPI_ISL_524785,https://www.gisaid.org/EPI_ISL_524785;,https://www.gisaid.org/EPI_ISL_524785,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.248,,Complete,Male,34,2020-05-05,Brazil / Para,"Brazil,Para",﻿Evandro Chagas Institute
3,hCoV-19/Brazil/AP-IEC-165669/2020,https://www.gisaid.org/EPI_ISL_524793,https://www.gisaid.org/EPI_ISL_524793;,https://www.gisaid.org/EPI_ISL_524793,GISAID,taxonomy:2697049,taxonomy:9606,B.1.1.33,,Complete,Male,39,2020-04-29,Brazil / Amapa,"Brazil,Amapa",﻿Evandro Chagas Institute
4,hCoV-19/Brazil/PB-IEC-161853/2020,https://www.gisaid.org/EPI_ISL_524801,https://www.gisaid.org/EPI_ISL_524801;,https://www.gisaid.org/EPI_ISL_524801,GISAID,taxonomy:2697049,taxonomy:9606,B.1.5,,Complete,Male,60,2020-03-19,Brazil / Paraiba,"Brazil,Paraiba",﻿Evandro Chagas Institute


In [36]:
print('Number of strains:',strains.shape[0])

Number of strains: 297051


In [37]:
strains.to_csv(NEO4J_IMPORT / "01c-CNCBStrainPre.csv", index=False)