# Load Viral Genome Data from NCBI
**[Work in progress]**

This notebook downloads and standardizes viral genome data from NCBI for ingestion into a Knowledge Graph.

Data source: [NCBI](https://www.ncbi.nlm.nih.gov)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import re
import pandas as pd
import dateutil
from pathlib import Path
import hashlib 

from Bio import SeqIO
from Bio import Entrez
Entrez.email = "covid19@mybinder.org"

In [2]:
# Lift pandas' display limits so previews are never truncated.
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columns

In [3]:
# Directory the CSV files are written to, taken from the NEO4J_IMPORT
# environment variable. Path will take care of handling operating system differences.
try:
    NEO4J_IMPORT = Path(os.environ['NEO4J_IMPORT'])
except KeyError:
    # Path(None) would fail with an opaque TypeError; say what is actually missing.
    raise RuntimeError(
        "The NEO4J_IMPORT environment variable is not set; "
        "point it at the Neo4j import directory before running this notebook."
    ) from None
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-328d8379-6ab4-4cc1-a397-2de37909d2e4/installation-4.1.0/import


### Download viral genome information

In [4]:
# TODO loop over a list of taxonomy ids. 

# for now SARS-CoV-2 is hardcoded.
#tax_id = "2697049" # SARS-CoV-2, NC_045512
#tax_id = "227984" # SARS, NC_004718
#tax_id = "1335626" # MERS-CoV, NC_019843

#tax_id = "147711" # Rhinovirus A NC_001617
#tax_id = "147712" # Rhinovirus B (B14: NC_001490)
#https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=12058&host=human

#tax_id = "186538" # Zaire ebolavirus, NC_002549

# https://www.ncbi.nlm.nih.gov/ipg/YP_009724389.1 identical protein groups
#https://www.ebi.ac.uk/uniprot/api/covid-19/uniprotkb/accession/P0DTC1.txt

In [5]:
# Load the organism reference table; the path is relative to this notebook's directory.
organisms = pd.read_csv("../../reference_data/Organism.csv")

In [6]:
# Restrict the organism table to pathogens and collect their genome accessions.
pathogens = organisms[organisms['type'] == 'Pathogen']
refSeq = pathogens.loc[:, 'genomeAccession'].values

### Download reference sequence nucleotide accession numbers for the given taxonomy id

In [7]:
# TODO refactor function into a separate .py file
def extract_genbank_features(gb_record, feature_type, qualifier_list) :
    """Collect the position and selected qualifiers of every feature of one type.

    Parameters
    ----------
    gb_record
        Object exposing a ``features`` iterable. Each feature is read through its
        ``type``, ``location`` and ``qualifiers`` attributes (presumably a
        Biopython ``SeqRecord`` -- confirm against the caller).
    feature_type
        Only features whose ``type`` equals this value are reported.
    qualifier_list
        Names of the qualifiers to copy into each result.

    Returns
    -------
    list of dict
        One dict per matching feature, in record order, holding ``start`` and
        ``end`` plus one entry per name in ``qualifier_list``. A qualifier that
        is absent (or has no values) is reported as ``''``. When a qualifier
        carries several values (db_xref usually does) the last one is kept.
    """
    answers = list()
    for feature in gb_record.features:
        if feature.type != feature_type:
            continue

        # Computed once per feature, independent of which qualifiers exist.
        # keep NCBI "1"-based indexing
        answer = {
            'start': min(feature.location) + 1,
            'end': max(feature.location) + 1,
        }

        for qualifier in qualifier_list:
            if qualifier in feature.qualifiers:
                values = feature.qualifiers[qualifier]
                # There should only be one locus_tag per feature, but there
                # are usually several db_xref entries: the last value wins.
                answer[qualifier] = values[-1] if values else ''
            else:
                answer[qualifier] = ''
        answers.append(answer)
    return answers

In [8]:
def process_protein_record(accession):
    """Download one GenBank nucleotide record and tabulate its proteins.

    Fetches ``accession`` from the NCBI ``nucleotide`` database, prints the
    parsed record, and builds one row per ``CDS`` feature (``fullLength`` =
    ``'True'``) followed by one row per ``mat_peptide`` feature
    (``fullLength`` = ``'False'``).

    Parameters
    ----------
    accession
        Identifier passed to ``Entrez.efetch`` as ``id``.

    Returns
    -------
    pandas.DataFrame
        The feature rows with ``taxonomyId`` (the ``db_xref`` of the first
        ``source`` feature), ``genomeName`` (the record description) and
        ``genbank_accession`` added to every row.
    """
    with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=accession) as handle:
        record = SeqIO.read(handle, "gb")
        print(record)

    qualifiers = ["gene", "locus_tag", "db_xref", "product", "protein_id"]

    source = extract_genbank_features(record, "source", ["db_xref"])
    taxonomy = source[0]['db_xref']

    protein_df = pd.DataFrame(extract_genbank_features(record, "CDS", qualifiers))
    protein_df['fullLength'] = 'True'

    mature_peptide = extract_genbank_features(record, "mat_peptide", qualifiers)

    # some reference sequences are missing the mature peptides (why???)
    if len(mature_peptide) > 0:
        mature_peptide_df = pd.DataFrame(mature_peptide)
        mature_peptide_df['fullLength'] = 'False'

        # assign db_xref to the mature proteins based on the precursor's gene id.
        # The lookup is keyed by gene name (first CDS per gene), so it does not
        # depend on the precursor being row 0, and a peptide whose gene has no
        # named CDS keeps its own db_xref instead of raising.
        if {'gene', 'db_xref'}.issubset(protein_df.columns):
            named_cds = protein_df[protein_df['gene'] != '']
            gene_to_xref = named_cds.drop_duplicates(subset='gene').set_index('gene')['db_xref']
            inherited = mature_peptide_df['gene'].map(gene_to_xref)
            mature_peptide_df['db_xref'] = inherited.fillna(mature_peptide_df['db_xref'])

        # ignore_index avoids repeating the labels 0..n for both feature kinds
        df = pd.concat([protein_df, mature_peptide_df], ignore_index=True)
    else:
        df = protein_df

    df['taxonomyId'] = taxonomy
    df['genomeName'] = record.description
    df['genbank_accession'] = accession

    return df

### Concatenate all Genbank dataframes

In [9]:
# Download every reference genome and stack the per-genome tables.
# ignore_index=True gives each row a unique label instead of restarting at 0 for every genome.
df = pd.concat((process_protein_record(accession) for accession in refSeq), ignore_index=True)

ID: NC_045512.2
Name: NC_045512
Description: Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Database cross-references: BioProject:PRJNA485481
Number of features: 57
/molecule_type=ss-RNA
/topology=linear
/data_file_division=VRL
/date=18-JUL-2020
/accessions=['NC_045512']
/sequence_version=2
/keywords=['RefSeq']
/source=Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
/organism=Severe acute respiratory syndrome coronavirus 2
/taxonomy=['Viruses', 'Riboviria', 'Orthornavirae', 'Pisuviricota', 'Pisoniviricetes', 'Nidovirales', 'Cornidovirineae', 'Coronaviridae', 'Orthocoronavirinae', 'Betacoronavirus', 'Sarbecovirus']
/references=[Reference(title='A new coronavirus associated with human respiratory disease in China', ...), Reference(title='Programmed ribosomal frameshifting in decoding the SARS-CoV genome', ...), Reference(title='The structure of a rigorously conserved RNA element within the SARS virus genome', ...), Reference(title="A phyl

#### Get protein sequence and calculate md5 hash code as a unique id

In [10]:
def get_sequence(accession):
    """Fetch `accession` from the NCBI protein database and return its sequence as a string."""
    with Entrez.efetch(db="protein", rettype="gb", retmode="text", id=accession) as stream:
        protein_record = SeqIO.read(stream, "gb")
    sequence = str(protein_record.seq)
    return sequence

In [11]:
# One NCBI request per row: look up the amino-acid sequence for every protein id.
df['sequence'] = df['protein_id'].map(get_sequence)

In [12]:
# The md5 hex digest of the sequence, prefixed with 'md5:', is the row's unique id.
df['id'] = df['sequence'].map(
    lambda sequence: 'md5:' + hashlib.md5(sequence.encode()).hexdigest()
)

### Create unique and interoperable identifiers

**genbank_id**: CURIE: [refseq](https://registry.identifiers.org/registry/refseq) (NCBI Reference Sequences, RefSeq) or [insdc](https://registry.identifiers.org/registry/insdc) (INSDC nucleotide sequence accessions)

**ncbiprotein_id**: CURIE: [ncbiprotein](https://registry.identifiers.org/registry/ncbiprotein) (NCBI Protein)

**ncbi_gene_id**: CURIE: [ncbigene](https://registry.identifiers.org/registry/ncbigene) (NCBI Gene)

A [CURIE](https://en.wikipedia.org/wiki/CURIE) (Compact URI) is a compact abbreviation for Uniform Resource Identifiers (URIs) that can be resolved by [Identifiers.org](https://identifiers.org/).

In [13]:
# Accession pattern for INSDC nucleotide records, with an optional ".<version>" suffix.
# https://registry.identifiers.org/registry/insdc
# Raw string: "\d" and "\." are invalid escape sequences in a normal string literal.
insdc_pattern = re.compile(r'^([A-Z]\d{5}|[A-Z]{2}\d{6}|[A-Z]{4}\d{8}|[A-J][A-Z]{2}\d{5})(\.\d+)?$')

In [14]:
def assign_curie(id):
    """Return the whitespace-stripped accession `id` with a resolvable prefix where one is known.

    - starts with 'EPI'      -> 'https://www.gisaid.org/' + id
    - starts with 'NC_'      -> 'refseq:' + id
    - matches insdc_pattern  -> 'insdc:' + id
    - anything else, including an empty string, is returned without a prefix
    """
    id = id.strip()
    if not id:
        return id
    if id.startswith('EPI'):
        return 'https://www.gisaid.org/' + id
    if id.startswith('NC_'):
        return 'refseq:' + id
    if insdc_pattern.match(id) is not None:
        return 'insdc:' + id
    # TODO are URIs available for these cases?
    return id

In [15]:
# remove version number from genbank accessions to enable linking with other resources
df['genomeAccession'] = df['genbank_accession'].apply(lambda s: assign_curie(s.split('.')[0]))
df['proteinAccession'] = 'ncbiprotein:' + df['protein_id'].apply(lambda s: s.split('.')[0])

# Keep the part after the first ':' of "<database>:<identifier>" values.
# .str.get(1) yields NaN for a value without ':' (blanked by fillna below), whereas
# split(..., expand=True)[1] raises KeyError when no value in the column has one.
df['geneAccession'] = 'ncbigene:' + df['db_xref'].str.split(':').str.get(1)
df = df.rename(columns={'gene': 'geneName', 'product': 'proteinName', 'start': 'geneStart', 'end': 'geneEnd'})
df['taxonomyId'] = 'taxonomy:' + df['taxonomyId'].str.split(':').str.get(1)
df = df.fillna('')

In [16]:
# Preview up to the first 100 rows of the combined table.
df.head(100)

Unnamed: 0,geneStart,geneEnd,geneName,locus_tag,db_xref,proteinName,protein_id,fullLength,taxonomyId,genomeName,genbank_accession,sequence,id,genomeAccession,proteinAccession,geneAccession
0,266,21555,ORF1ab,GU280_gp01,GeneID:43740578,ORF1ab polyprotein,YP_009724389.1,True,taxonomy:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,md5:e6608b50fcd6e004708a875615ddf2d9,refseq:NC_045512,ncbiprotein:YP_009724389,ncbigene:43740578
1,266,13483,ORF1ab,GU280_gp01,GeneID:43740578,ORF1a polyprotein,YP_009725295.1,True,taxonomy:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,md5:e781b58591b8dbdd15f84dcbdec82105,refseq:NC_045512,ncbiprotein:YP_009725295,ncbigene:43740578
2,21563,25384,S,GU280_gp02,GeneID:43740568,surface glycoprotein,YP_009724390.1,True,taxonomy:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,md5:4c35f09aac2f7be4f3cffd30c6aecac8,refseq:NC_045512,ncbiprotein:YP_009724390,ncbigene:43740568
3,25393,26220,ORF3a,GU280_gp03,GeneID:43740569,ORF3a protein,YP_009724391.1,True,taxonomy:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWL...,md5:f5c8b89ceac3f14e456577557df1ef40,refseq:NC_045512,ncbiprotein:YP_009724391,ncbigene:43740569
4,26245,26472,E,GU280_gp04,GeneID:43740570,envelope protein,YP_009724392.1,True,taxonomy:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...,md5:375e0f905c315e06a99c80b736c125d2,refseq:NC_045512,ncbiprotein:YP_009724392,ncbigene:43740570
5,26523,27191,M,GU280_gp05,GeneID:43740571,membrane glycoprotein,YP_009724393.1,True,taxonomy:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFL...,md5:1cd6abff79ad3633e17582eb0e576539,refseq:NC_045512,ncbiprotein:YP_009724393,ncbigene:43740571
6,27202,27387,ORF6,GU280_gp06,GeneID:43740572,ORF6 protein,YP_009724394.1,True,taxonomy:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSLTE...,md5:90b50e0be9abd893bd32b163d6933f0c,refseq:NC_045512,ncbiprotein:YP_009724394,ncbigene:43740572
7,27394,27759,ORF7a,GU280_gp07,GeneID:43740573,ORF7a protein,YP_009724395.1,True,taxonomy:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNSPF...,md5:f65213344e2e68de1cae4feb9c5e07b1,refseq:NC_045512,ncbiprotein:YP_009724395,ncbigene:43740573
8,27756,27887,ORF7b,GU280_gp08,GeneID:43740574,ORF7b,YP_009725318.1,True,taxonomy:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MIELSLIDFYLCFLAFLLFLVLIMLIIFWFSLELQDHNETCHA,md5:c7f0179da4ca26456ee905081f485cc8,refseq:NC_045512,ncbiprotein:YP_009725318,ncbigene:43740574
9,27894,28259,ORF8,GU280_gp09,GeneID:43740577,ORF8 protein,YP_009724396.1,True,taxonomy:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MKFLVFLGIITTVAAFHQECSLQSCTQHQPYVVDDPCPIHFYSKWY...,md5:3a77ad9207d29beacfbf6f18b0da9e26,refseq:NC_045512,ncbiprotein:YP_009724396,ncbigene:43740577


### Save data for Knowledge Graph Import

In [17]:
# Keep only the columns that are exported, in export order.
df = df.loc[:, [
    'id',
    'genomeAccession',
    'genomeName',
    'geneName',
    'geneStart',
    'geneEnd',
    'geneAccession',
    'proteinAccession',
    'proteinName',
    'sequence',
    'fullLength',
    'taxonomyId',
]]
df.head(1000)

Unnamed: 0,id,genomeAccession,genomeName,geneName,geneStart,geneEnd,geneAccession,proteinAccession,proteinName,sequence,fullLength,taxonomyId
0,md5:e6608b50fcd6e004708a875615ddf2d9,refseq:NC_045512,Severe acute respiratory syndrome coronavirus ...,ORF1ab,266,21555,ncbigene:43740578,ncbiprotein:YP_009724389,ORF1ab polyprotein,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,True,taxonomy:2697049
1,md5:e781b58591b8dbdd15f84dcbdec82105,refseq:NC_045512,Severe acute respiratory syndrome coronavirus ...,ORF1ab,266,13483,ncbigene:43740578,ncbiprotein:YP_009725295,ORF1a polyprotein,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,True,taxonomy:2697049
2,md5:4c35f09aac2f7be4f3cffd30c6aecac8,refseq:NC_045512,Severe acute respiratory syndrome coronavirus ...,S,21563,25384,ncbigene:43740568,ncbiprotein:YP_009724390,surface glycoprotein,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,True,taxonomy:2697049
3,md5:f5c8b89ceac3f14e456577557df1ef40,refseq:NC_045512,Severe acute respiratory syndrome coronavirus ...,ORF3a,25393,26220,ncbigene:43740569,ncbiprotein:YP_009724391,ORF3a protein,MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWL...,True,taxonomy:2697049
4,md5:375e0f905c315e06a99c80b736c125d2,refseq:NC_045512,Severe acute respiratory syndrome coronavirus ...,E,26245,26472,ncbigene:43740570,ncbiprotein:YP_009724392,envelope protein,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...,True,taxonomy:2697049
5,md5:1cd6abff79ad3633e17582eb0e576539,refseq:NC_045512,Severe acute respiratory syndrome coronavirus ...,M,26523,27191,ncbigene:43740571,ncbiprotein:YP_009724393,membrane glycoprotein,MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFL...,True,taxonomy:2697049
6,md5:90b50e0be9abd893bd32b163d6933f0c,refseq:NC_045512,Severe acute respiratory syndrome coronavirus ...,ORF6,27202,27387,ncbigene:43740572,ncbiprotein:YP_009724394,ORF6 protein,MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSLTE...,True,taxonomy:2697049
7,md5:f65213344e2e68de1cae4feb9c5e07b1,refseq:NC_045512,Severe acute respiratory syndrome coronavirus ...,ORF7a,27394,27759,ncbigene:43740573,ncbiprotein:YP_009724395,ORF7a protein,MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNSPF...,True,taxonomy:2697049
8,md5:c7f0179da4ca26456ee905081f485cc8,refseq:NC_045512,Severe acute respiratory syndrome coronavirus ...,ORF7b,27756,27887,ncbigene:43740574,ncbiprotein:YP_009725318,ORF7b,MIELSLIDFYLCFLAFLLFLVLIMLIIFWFSLELQDHNETCHA,True,taxonomy:2697049
9,md5:3a77ad9207d29beacfbf6f18b0da9e26,refseq:NC_045512,Severe acute respiratory syndrome coronavirus ...,ORF8,27894,28259,ncbigene:43740577,ncbiprotein:YP_009724396,ORF8 protein,MKFLVFLGIITTVAAFHQECSLQSCTQHQPYVVDDPCPIHFYSKWY...,True,taxonomy:2697049


In [18]:
# One row per distinct (genomeAccession, genomeName, taxonomyId) combination.
genome_columns = ['genomeAccession', 'genomeName', 'taxonomyId']
df_genome = df[genome_columns].drop_duplicates()

In [19]:
# Write the genome table to the Neo4j import directory, without the index column.
df_genome.to_csv(NEO4J_IMPORT / "01c-NCBIGenome.csv", index=False)

In [20]:
# Write the gene/protein table to the Neo4j import directory, without the index column.
df.to_csv(NEO4J_IMPORT / "01c-NCBIGeneProtein.csv", index=False)