# Viral Genome Data from NCBI
**[Work in progress]**

This notebook downloads and standardizes viral genome data from NCBI for ingestion into the Knowledge Graph.

Data source: [NCBI](https://www.ncbi.nlm.nih.gov)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
import dateutil
from pathlib import Path
from Bio import SeqIO
from Bio import Entrez
Entrez.email = "covid19@mybinder.org"

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
NEO4J_HOME = Path(os.getenv('NEO4J_HOME'))
print(NEO4J_HOME)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3


### NCBI Data Source
[NCBI Severe acute respiratory syndrome coronavirus 2 data hub](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Severe%20acute%20respiratory%20syndrome%20coronavirus%202,%20taxid:2697049)

In [4]:
# constants used to generate fasta url: 
# "https://www.ncbi.nlm.nih.gov/nuccore/{genbank_accession}/?report=fasta"

ncbi_nuccore_url = "https://www.ncbi.nlm.nih.gov/nuccore/"
fasta_format = "/?report=fasta"

### Download viral genome information

In [5]:
# TODO loop over a list of taxonomy ids. 

# for now SARS-CoV-2 is hardcoded.
tax_id = "2697049" # SARS-CoV-2, NC_045512
#tax_id = "694009" # SARS, NC_004718
#tax_id = "1335626" # MERS-CoV, NC_019843
#tax_id = "333387" # Bat SARS coronavirus HKU3-1
#tax_id = "147711" # Rhinovirus A
#tax_id = "147712" # Rhinovirus B
#tax_id = "186538" # Zaire ebolavirus, NC_002549

#term = f"txid{tax_id}[Organism]+refseq[filter]" # format for reference sequences only

### Download nucleotide accession numbers for the given taxonomy id

In [6]:
term = f"txid{tax_id}[Organism]"
handle = Entrez.esearch(db="nucleotide", retmax=5000, term=term, idtype="acc")
nuc_accessions = Entrez.read(handle)
handle.close()
print("Nucleotides:", nuc_accessions['Count'])

Nucleotides: 1731


In [7]:
# TODO refactor function into a separate .py file
def extract_genbank_features(gb_record, feature_type, qualifier_list) :
    answers = list()
    for (index, feature) in enumerate(gb_record.features) :
        answer = dict()
        if feature.type==feature_type :
            for qualifier in qualifier_list:
                if qualifier in feature.qualifiers :
                    #print(feature, feature.qualifiers)
                    # keep NCBI "1"-based indexing
                    answer['start'] = min(feature.location) + 1
                    answer['end'] = max(feature.location) + 1
                    #There should only be one locus_tag per feature, but there
                    #are usually several db_xref entries
                    for value in feature.qualifiers[qualifier] :
                        if value in answer :
                                print("WARNING - Duplicate key %s for %s features %i and %i" \
                                % (value, feature_type, answer[value], index))
                        else :
                            answer[qualifier] = value
                else:
                    answer[qualifier] = ''
            answers.append(answer)
    return answers

In [8]:
def process_nucleotide_record(accession):
    with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=accession) as handle:
        record = SeqIO.read(handle, "gb")
    # TODO cache genbank files in a temporary directory, so they don't need to be downloaded from scratch
    # every time this notebook is run
    source = extract_genbank_features(record, 'source', ['db_xref','host','isolate','isolation_source','mol_type','collection_date','country'])   
    source_df = pd.DataFrame(source)
    source_df['genbank_accession'] = accession
    source_df['complete'] = record.description.endswith('complete genome')
    
    return source_df

### Concatenate all Genbank dataframes

In [9]:
nuc_ids = nuc_accessions['IdList']
nuc_ids.append('NC_045512') # SARS-CoV-2 refseq is missing from list (why??). Add it back.
df_list = []

for nuc in nuc_ids:
    print(nuc, end=' ')
    df_list.append(process_nucleotide_record(nuc))
    
df1 = pd.concat(df_list)

MT396266.1 MT394531.1 MT394530.1 MT394529.1 MT394528.1 MT394864.1 MT396241.1 MT396248.1 MT396247.1 MT396246.1 MT396245.1 MT396244.1 MT396243.1 MT396242.1 MT385497.1 MT385496.1 MT385495.1 MT385494.1 MT385493.1 MT385492.1 MT385491.1 MT385490.1 MT385489.1 MT385488.1 MT385487.1 MT385486.1 MT385485.1 MT385484.1 MT385483.1 MT385482.1 MT385481.1 MT385480.1 MT385479.1 MT385478.1 MT385477.1 MT385476.1 MT385475.1 MT385474.1 MT385473.1 MT385472.1 MT385471.1 MT385470.1 MT385469.1 MT385468.1 MT385467.1 MT385466.1 MT385465.1 MT385464.1 MT385463.1 MT385462.1 MT385461.1 MT385460.1 MT385459.1 MT385458.1 MT385457.1 MT385456.1 MT385455.1 MT385454.1 MT385453.1 MT385452.1 MT385451.1 MT385450.1 MT385449.1 MT385448.1 MT385447.1 MT385446.1 MT385445.1 MT385444.1 MT385443.1 MT385442.1 MT385441.1 MT385440.1 MT385439.1 MT385438.1 MT385437.1 MT385436.1 MT385435.1 MT385434.1 MT385433.1 MT385432.1 MT385431.1 MT385430.1 MT385429.1 MT385428.1 MT385427.1 MT385426.1 MT385425.1 MT385424.1 MT385423.1 MT385422.1 MT385421.1

Let's work on a copy of the original data

In [29]:
df = df1.copy()
df1.head()

Unnamed: 0,start,end,db_xref,host,isolate,isolation_source,mol_type,collection_date,country,genbank_accession,complete
0,1,29880,taxon:2697049,Mustela lutreola,SARS-CoV-2/mink/NLD/1/2020,lung,genomic RNA,2020-04-24,Netherlands: Milheeze,MT396266.1,True
0,1,29872,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X003/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-31,USA: CA,MT394531.1,True
0,1,29870,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X004/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-31,USA: CA,MT394530.1,True
0,1,29863,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X001/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-24,USA: CA,MT394529.1,True
0,1,29867,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X002/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-24,USA: CA,MT394528.1,True


### Filter out incomplete records

In [30]:
# incomplete genomes
df.query('complete', inplace=True)

In [31]:
# filter out any records that don't match the tax id, 
# e.g., tax_ids from organisms that are lower in the taxonomy tree.
df.query(f"db_xref == 'taxon:{tax_id}'", inplace=True)

In [32]:
df.head()

Unnamed: 0,start,end,db_xref,host,isolate,isolation_source,mol_type,collection_date,country,genbank_accession,complete
0,1,29880,taxon:2697049,Mustela lutreola,SARS-CoV-2/mink/NLD/1/2020,lung,genomic RNA,2020-04-24,Netherlands: Milheeze,MT396266.1,True
0,1,29872,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X003/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-31,USA: CA,MT394531.1,True
0,1,29870,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X004/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-31,USA: CA,MT394530.1,True
0,1,29863,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X001/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-24,USA: CA,MT394529.1,True
0,1,29867,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X002/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-24,USA: CA,MT394528.1,True


### Create unique and interoperable identifiers

**id**: CURIE: [insdc](https://registry.identifiers.org/registry/insdc) (International Nucleotide Sequence Database Collaboration, INSDC)

**id**: CURIE: [ncbiprotein](https://registry.identifiers.org/registry/ncbiprotein) (NCBI Reference Sequences, Refseq)

**taxonomy_id**: CURIE: [taxonomy](https://registry.identifiers.org/registry/taxonomy) (NCBI Taxonomy)

A [CURIE](https://en.wikipedia.org/wiki/CURIE) (Compact URI) is a compact abbreviation for Uniform Resource Identifiers (URIs) that can be resolved by [Identifiers.org](https://identifiers.org/).

In [33]:
# remove version number from genbank accession to enable linking with other nodes
df['id'] = "insdc:" + df['genbank_accession'].apply(lambda s: s.split('.')[0])
# # NCBI reference sequences resolve through the ncbiprotein CURIE
df['id'] = df['id'].str.replace('insdc:NC_', 'ncbiprotein:NC_')

In [34]:
# parse taxonomy_id from db_xref record
df['taxonomy_id'] = 'taxonomy:' + df['db_xref'].str.split(':', expand=True)[1]
df.rename(columns={'isolate': 'name'}, inplace=True)

### Standardize data

In [35]:
# TODO: date standardization introduces artifacts, e.g. Dec 2019 -> 2019-12-01
# Add column that specifies time granularity: Y, M, D
df['collection_date'] = df['collection_date'].apply(lambda d: dateutil.parser.parse(d) if len(d) > 0 else '')

Standardize host organism and demographics

In [36]:
# TODO a find general solution to map host name to NCBI taxonomy**

# some organism specifications are ambiguous, 
# they don't match a specificn NCBI taxonomy

taxonomy_to_id = {'Human': '9606', 
                  'Homo sapiens': '9606',
                  'Rhinolophus affinis': '59477', 
                  'Mustela lutreola': '9666',
                  'Panthera tigris jacksoni': '419130',
                  'Rhinolophus sp. (bat)': '49442', # ambiguous
                  'bat': 'taxonomy:49442', # ambiguous
                  'Manis javanica': '9974',
                  'palm civet': '71116', # ambiguous
                  'Canine': '9608' # ambiguous
                 }

Split host record into host, sex, and age (e.g., Homo sapiens; female; age 40 -> Homo sapiens, female, 40)

In [37]:
# add placeholders in case the host record is empty to enbable 3-way split
df['host'] = df['host'].apply(lambda s: ' ; ; ' if len(s) < 1 else s)
df['host'] = df['host'].str.replace('age','')
df[['host','sex','age']] = df['host'].str.split(';', 2, expand=True)
df['host'] = df['host'].str.strip()

# assign taxonomy id to host
df['host_taxonomy_id'] = 'taxonomy:' + df['host'].apply(lambda s: taxonomy_to_id.get(s, ''))
df['sex'] = df['sex'].str.strip()
df['age'] = df['age'].str.strip()
df = df.fillna('')

### List entries where taxonomy cannot be assigned

In [38]:
df.query("host_taxonomy_id == 'taxonomy:'")

Unnamed: 0,start,end,db_xref,host,name,isolation_source,mol_type,collection_date,country,genbank_accession,complete,id,taxonomy_id,sex,age,host_taxonomy_id
0,1,29900,taxon:2697049,,SARS-CoV-2/human/CHN/YN-0306-466/2020,,genomic RNA,2020-03-06,China,MT396241.1,True,insdc:MT396241,taxonomy:2697049,,,taxonomy:
0,1,29897,taxon:2697049,Panthera tigris jacksoni,SARS-CoV-2/tiger/NY/040420/2020,oral swab; nasal swab; tracheal wash,genomic RNA,2020-04-02,USA: New York,MT365033.1,True,insdc:MT365033,taxonomy:2697049,,,taxonomy:
0,1,29889,taxon:2697049,,SARS-CoV-2/ENV/USA/UF-3/2020,breathing air using a VIVAS air sampler,genomic RNA,2020-03-25,USA,MT324684.1,True,insdc:MT324684,taxonomy:2697049,,,taxonomy:


In [19]:
df.head()

Unnamed: 0,start,end,db_xref,host,name,isolation_source,mol_type,collection_date,country,genbank_accession,complete,id,taxonomy_id,sex,age,host_taxonomy_id
0,1,29880,taxon:2697049,Mustela lutreola,SARS-CoV-2/mink/NLD/1/2020,lung,genomic RNA,2020-04-24,Netherlands: Milheeze,MT396266.1,True,insdc:MT396266,taxonomy:2697049,,,taxonomy:
0,1,29872,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X003/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-31,USA: CA,MT394531.1,True,insdc:MT394531,taxonomy:2697049,,,taxonomy:9606
0,1,29870,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X004/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-31,USA: CA,MT394530.1,True,insdc:MT394530,taxonomy:2697049,,,taxonomy:9606
0,1,29863,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X001/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-24,USA: CA,MT394529.1,True,insdc:MT394529,taxonomy:2697049,,,taxonomy:9606
0,1,29867,taxon:2697049,Homo sapiens,SARS-CoV-2/human/USA/CA-CZB-59X002/2020,nasopharyngeal/oropharyngeal swab,genomic RNA,2020-03-24,USA: CA,MT394528.1,True,insdc:MT394528,taxonomy:2697049,,,taxonomy:9606


### Assign locations
**TODO replace this section with a general geoparsing solution**

Fix inconsistencies and add missing location data

In [20]:
locations = {
    'India: Kerala State': 'India: Kerala',
    'USA: Snohomish County, WA': 'USA: WA, Snohomish County',
    'USA: San Francisco, CA': 'USA: CA, San Francisco',
    'USA: Nashville, TN': 'USA: TN, Nashville',
    'Canada: Toronto': 'Canada: Ontario, Toronto',
    'China: Shenzhen': 'China: Guangdong, Shenzhen',
    'China: Hangzhou': 'China: Zhejiang, Hangzhou',
    'China: Wuhan': 'China: Hubei, Wuhan',
    'Italy: Cagliari': 'Italy: Sardinia, Cagliari',
    'Viet Nam: Ho Chi Minh city': 'Vietnam: Ho Chi Minh City'
}
df['country'] = df['country'].apply(lambda s: locations.get(s, s))

In [21]:
states = {
        # US
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming',
        # Canada
        'AB': 'Alberta',
        'BC': 'British Columbia',
        'MB': 'Manitoba',
        'NB': 'New Brunswick',
        'NL': 'Newfoundland and Labrador',
        'NT': 'Northwest Territories',
        'NS': 'Nova Scotia',
        'NU': 'Nunavut',
        'ON': 'Ontario',
        'PE': 'Prince Edward Island',
        'QC': 'Quebec',
        'SK': 'Saskatchewan',
        'YT': 'Yukon'
}

Split country records into administrative areas

In [22]:
# add placeholders in case the country record is incomplete to enable 3-way split below
df['country'] = df['country'].apply(lambda s: s if ':' in s else s + ': , ')
df[['country','admin1','admin2']] = df['country'].str.split(':|,', 2, expand=True)
df['country'] = df['country'].str.strip()
df['admin2'] = df['admin2'].str.strip()
df['admin1'] = df['admin1'].str.strip()

# # expand states to full name
df['admin1'] = df['admin1'].apply(lambda s: states.get(s, s))
# create a unique ids
# TODO use unique ids from Geonames.org
df['country_id'] = df['country']
df['admin1_id'] = df['country'] + '-' + df['admin1']
df['admin2_id'] = df['country'] + '-' + df['admin1'] + '-' + df['admin2']

df.fillna(value='', inplace = True)

### Save data for Knowledge Graph Import

In [23]:
df = df[['id','name', 'taxonomy_id', 'collection_date', 'host_taxonomy_id',
         'sex', 'age', 'isolation_source']]
df.head()

Unnamed: 0,id,name,taxonomy_id,collection_date,host_taxonomy_id,sex,age,isolation_source
0,insdc:MT396266,SARS-CoV-2/mink/NLD/1/2020,taxonomy:2697049,2020-04-24,taxonomy:,,,lung
0,insdc:MT394531,SARS-CoV-2/human/USA/CA-CZB-59X003/2020,taxonomy:2697049,2020-03-31,taxonomy:9606,,,nasopharyngeal/oropharyngeal swab
0,insdc:MT394530,SARS-CoV-2/human/USA/CA-CZB-59X004/2020,taxonomy:2697049,2020-03-31,taxonomy:9606,,,nasopharyngeal/oropharyngeal swab
0,insdc:MT394529,SARS-CoV-2/human/USA/CA-CZB-59X001/2020,taxonomy:2697049,2020-03-24,taxonomy:9606,,,nasopharyngeal/oropharyngeal swab
0,insdc:MT394528,SARS-CoV-2/human/USA/CA-CZB-59X002/2020,taxonomy:2697049,2020-03-24,taxonomy:9606,,,nasopharyngeal/oropharyngeal swab


Unnamed: 0,id,name,taxonomy_id,collection_date,host_taxonomy_id,sex,age,isolation_source
0,insdc:MT396266,SARS-CoV-2/mink/NLD/1/2020,taxonomy:2697049,2020-04-24,taxonomy:,,,lung
0,insdc:MT396241,SARS-CoV-2/human/CHN/YN-0306-466/2020,taxonomy:2697049,2020-03-06,taxonomy:,,,
0,insdc:MT365033,SARS-CoV-2/tiger/NY/040420/2020,taxonomy:2697049,2020-04-02,taxonomy:,,,oral swab; nasal swab; tracheal wash
0,insdc:MT324684,SARS-CoV-2/ENV/USA/UF-3/2020,taxonomy:2697049,2020-03-25,taxonomy:,,,breathing air using a VIVAS air sampler


In [24]:
df.to_csv(NEO4J_HOME / "import/01a-NCBIStrain.csv", index=False)