# Load Viral Genome Data from NCBI
**[Work in progress]**

This notebook downloads and standardizes viral genome data from NCBI for ingestion into a Knowledge Graph.

Data source: [NCBI](https://www.ncbi.nlm.nih.gov)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import re
import pandas as pd
import dateutil
import time
from pathlib import Path
import hashlib 

from Bio import SeqIO
from Bio import Entrez
Entrez.email = "covid19@mybinder.org"

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
# Path will take care of handling operating system differences.
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Download viral genome information

In [4]:
# TODO loop over a list of taxonomy ids. 

# for now SARS-CoV-2 is hardcoded.
#tax_id = "2697049" # SARS-CoV-2, NC_045512
#tax_id = "227984" # SARS, NC_004718
#tax_id = "1335626" # MERS-CoV, NC_019843, NC_038294(UniProt)

#tax_id = "147711" # Rhinovirus A NC_001617
#tax_id = "147712" # Rhinovirus B (B14: NC_001490)
#https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=12058&host=human

#tax_id = "186538" # Zaire ebolavirus, NC_002549

# https://www.ncbi.nlm.nih.gov/ipg/YP_009724389.1 identical protein groups
#https://www.ebi.ac.uk/uniprot/api/covid-19/uniprotkb/accession/P0DTC1.txt

In [5]:
organisms = pd.read_csv("../../reference_data/Organism.csv")

In [6]:
pathogens = organisms.query("type == 'Pathogen'")
#refSeq = pathogens['genomeAccession'].values

In [7]:
genomes = pd.read_csv("../../reference_data/Genome.csv", dtype=str)
# restrict list to viral genomes
genomes = genomes.query("chromosome == 'Viral Chromosome'")

In [8]:
refSeq = genomes['refSeq'].str.split(':', expand=True)[1]

In [9]:
refSeq

0    NC_045512
1    NC_038294
2    NC_004718
3    NC_006577
4    NC_006213
5    NC_002645
6    NC_005831
Name: 1, dtype: object

### Download reference sequence nucleotide accession numbers for the given taxonomy id

In [10]:
# TODO refactor function into a separate .py file
def extract_genbank_features(gb_record, feature_type, qualifier_list) :
    answers = list()
    for (index, feature) in enumerate(gb_record.features) :
        answer = dict()
        if feature.type==feature_type :
            for qualifier in qualifier_list:
                if qualifier in feature.qualifiers :
                    #print(feature, feature.qualifiers)
                    # keep NCBI "1"-based indexing
                    answer['start'] = min(feature.location) + 1
                    answer['end'] = max(feature.location) + 1
                    #There should only be one locus_tag per feature, but there
                    #are usually several db_xref entries
                    for value in feature.qualifiers[qualifier] :
                        if value in answer :
                                print("WARNING - Duplicate key %s for %s features %i and %i" \
                                % (value, feature_type, answer[value], index))
                        else :
                            answer[qualifier] = value
                else:
                    answer[qualifier] = ''
            answers.append(answer)
    return answers

In [11]:
def process_protein_record(accession):
    with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=accession) as handle:
        record = SeqIO.read(handle, "gb")
        print(record)
    
    source = extract_genbank_features(record, "source", ["db_xref"])
    taxonomy = source[0]['db_xref']
    protein = extract_genbank_features(record, "CDS", ["gene","locus_tag","db_xref","product","protein_id"])
    protein_df = pd.DataFrame(protein)
    protein_df['fullLength'] = 'True'

    mature_peptide = extract_genbank_features(record, "mat_peptide", ["gene","locus_tag","db_xref","product","protein_id"])
    
    # some reference sequences are missing the mature peptides (why???)
    if len(mature_peptide) > 0:
        mature_peptide_df = pd.DataFrame(mature_peptide)
        mature_peptide_df['fullLength'] = 'False'

        # assign db_xref to the mature proteins based on the precursor's gene id
        mature_peptide_df['db_xref'] = mature_peptide_df['gene'].apply(lambda g: protein_df.query(f"gene=='{g}'")['db_xref'][0])
        df = pd.concat([protein_df, mature_peptide_df])
    else:
        df = protein_df
        
    df['taxonomyId'] = taxonomy
    df['genomeName'] = record.description
    df['genbank_accession'] = accession
    
    return df

### Concatenate all Genbank dataframes

In [12]:
df = pd.concat((process_protein_record(accession)) for accession in refSeq)

ID: NC_045512.2
Name: NC_045512
Description: Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Database cross-references: BioProject:PRJNA485481
Number of features: 57
/molecule_type=ss-RNA
/topology=linear
/data_file_division=VRL
/date=18-JUL-2020
/accessions=['NC_045512']
/sequence_version=2
/keywords=['RefSeq']
/source=Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
/organism=Severe acute respiratory syndrome coronavirus 2
/taxonomy=['Viruses', 'Riboviria', 'Orthornavirae', 'Pisuviricota', 'Pisoniviricetes', 'Nidovirales', 'Cornidovirineae', 'Coronaviridae', 'Orthocoronavirinae', 'Betacoronavirus', 'Sarbecovirus']
/references=[Reference(title='A new coronavirus associated with human respiratory disease in China', ...), Reference(title='Programmed ribosomal frameshifting in decoding the SARS-CoV genome', ...), Reference(title='The structure of a rigorously conserved RNA element within the SARS virus genome', ...), Reference(title="A phyl

#### Get protein sequence and calculate md5 hash code as a unique id

In [13]:
def get_sequence(accession):
    with Entrez.efetch(db="protein", rettype="gb", retmode="text", id=accession) as handle:
        record = SeqIO.read(handle, "gb")
        time.sleep(2)
        return str(record.seq)

In [14]:
df['sequence'] = df['protein_id'].apply(get_sequence)

In [15]:
df['id'] = df['sequence'].apply(lambda seq: 'md5:' + hashlib.md5(seq.encode()).hexdigest())

### Create unique and interoperable identifiers

**proteinAccession**: CURIE: [ncbiprotein](https://registry.identifiers.org/registry/ncbiprotein) (NCBI Reference Sequences, Refseq)

**geneAccession**: CURIE: [ncbigene](https://registry.identifiers.org/registry/ncbigene) (NCBI Gene)

A [CURIE](https://en.wikipedia.org/wiki/CURIE) (Compact URI) is a compact abbreviation for Uniform Resource Identifiers (URIs) that can be resolved by [Identifiers.org](https://identifiers.org/).

In [16]:
df['proteinAccession'] = 'ncbiprotein:' + df['protein_id'].apply(lambda s: s.split('.')[0])
df['geneAccession'] = 'ncbigene:' + df['db_xref'].str.split(':', expand=True)[1]
df.rename(columns={'gene': 'geneName', 'product': 'proteinName', 'start': 'geneStart', 'end': 'geneEnd'}, inplace=True)
df.fillna('', inplace=True)

In [17]:
df.head()

Unnamed: 0,geneStart,geneEnd,geneName,locus_tag,db_xref,proteinName,protein_id,fullLength,taxonomyId,genomeName,genbank_accession,sequence,id,proteinAccession,geneAccession
0,266,21555,ORF1ab,GU280_gp01,GeneID:43740578,ORF1ab polyprotein,YP_009724389.1,True,taxon:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,md5:e6608b50fcd6e004708a875615ddf2d9,ncbiprotein:YP_009724389,ncbigene:43740578
1,266,13483,ORF1ab,GU280_gp01,GeneID:43740578,ORF1a polyprotein,YP_009725295.1,True,taxon:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,md5:e781b58591b8dbdd15f84dcbdec82105,ncbiprotein:YP_009725295,ncbigene:43740578
2,21563,25384,S,GU280_gp02,GeneID:43740568,surface glycoprotein,YP_009724390.1,True,taxon:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,md5:4c35f09aac2f7be4f3cffd30c6aecac8,ncbiprotein:YP_009724390,ncbigene:43740568
3,25393,26220,ORF3a,GU280_gp03,GeneID:43740569,ORF3a protein,YP_009724391.1,True,taxon:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWL...,md5:f5c8b89ceac3f14e456577557df1ef40,ncbiprotein:YP_009724391,ncbigene:43740569
4,26245,26472,E,GU280_gp04,GeneID:43740570,envelope protein,YP_009724392.1,True,taxon:2697049,Severe acute respiratory syndrome coronavirus ...,NC_045512,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...,md5:375e0f905c315e06a99c80b736c125d2,ncbiprotein:YP_009724392,ncbigene:43740570


### Save data for Knowledge Graph Import

In [18]:
df = df[['id', 'geneName', 'geneStart', 'geneEnd', 'geneAccession', 'proteinAccession', 'proteinName']]
df.tail(100)

Unnamed: 0,id,geneName,geneStart,geneEnd,geneAccession,proteinAccession,proteinName
11,md5:5f2b40651f7ae2660e7725b15c8ece99,ORF1ab,16167,17969,ncbigene:1489680,ncbiprotein:NP_828870,helicase/NTPase
12,md5:e96a69b1d6c91894c63fbdedb89da497,ORF1ab,17970,19550,ncbigene:1489680,ncbiprotein:NP_828871,3' to 5' exonuclease
13,md5:b0313af7694e9de5e34907f66710fc17,ORF1ab,19551,20588,ncbigene:1489680,ncbiprotein:NP_828872,endoribonuclease
14,md5:c43ab52e8f4cda7022ea09173315ec93,ORF1ab,20589,21482,ncbigene:1489680,ncbiprotein:NP_828873,2'-O-MTase
15,md5:5712d85232cb34e8518b438372519997,ORF1ab,265,804,ncbigene:1489680,ncbiprotein:YP_009944366,nsp1
16,md5:e7359462c5a1e7c2e107e13fab749830,ORF1ab,805,2718,ncbigene:1489680,ncbiprotein:YP_009944367,nsp2
17,md5:2bdaeed8def43a14b2f67907c499d2fd,ORF1ab,2719,8484,ncbigene:1489680,ncbiprotein:YP_009944368,nsp3
18,md5:9ed50d7af20a66840862dc670544d8f2,ORF1ab,8485,9984,ncbigene:1489680,ncbiprotein:YP_009944369,nsp4
19,md5:48c186dc0892d2ea89bfa77dfc7c53d5,ORF1ab,9985,10902,ncbigene:1489680,ncbiprotein:YP_009944370,3C-like protease
20,md5:27daeca4e98f21be747bb7ee854b33df,ORF1ab,10903,11772,ncbigene:1489680,ncbiprotein:YP_009944371,nsp6


### TODO find a premanent fix for missing data in GenBank records

In [19]:
df.query("geneAccession == ''")

Unnamed: 0,id,geneName,geneStart,geneEnd,geneAccession,proteinAccession,proteinName
6,md5:6fc05e513ec08d8b439f73b2851bfba0,,28108,28362,,ncbiprotein:YP_009555243,envelope protein


In [20]:
index = df.query("proteinAccession == 'ncbiprotein:YP_009555243'").index.values[0]
df.at[index, 'geneName'] = 'E'
df.at[index, 'geneAccession'] = 'ncbigene:39105217'

In [21]:
df.query("proteinAccession == 'ncbiprotein:YP_009555243'")

Unnamed: 0,id,geneName,geneStart,geneEnd,geneAccession,proteinAccession,proteinName
6,md5:6fc05e513ec08d8b439f73b2851bfba0,E,28108,28362,ncbigene:39105217,ncbiprotein:YP_009555243,envelope protein


In [22]:
df.query("proteinAccession == 'ncbiprotein:YP_009825051'") # protein name not connected to protein

Unnamed: 0,id,geneName,geneStart,geneEnd,geneAccession,proteinAccession,proteinName
2,md5:5b39d604648221117b0614889d3d6067,S,21492,25259,ncbigene:1489668,ncbiprotein:YP_009825051,spike glycoprotein


In [23]:
df.query("proteinAccession == 'ncbiprotein:YP_009555241'") # protein name not connected to protein

Unnamed: 0,id,geneName,geneStart,geneEnd,geneAccession,proteinAccession,proteinName
4,md5:a9fc2c01d20938618b58f690ae776309,S,23643,27704,ncbigene:39105218,ncbiprotein:YP_009555241,spike surface glycoprotein


In [24]:
df.to_csv(NEO4J_IMPORT / "01b-NCBIGeneProtein.csv", index=False)