# Load Viral Gene and Protein Data from NCBI
**[Work in progress]**

This notebook downloads and standardizes viral gene and protein data from NCBI for ingestion into a Knowledge Graph.

Data source: [NCBI](https://www.ncbi.nlm.nih.gov)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
import dateutil
from pathlib import Path
from Bio import SeqIO
from Bio import Entrez
Entrez.email = "covid19@mybinder.org"

In [2]:
# Lift pandas' display truncation so dataframes are rendered in full.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns

In [3]:
# Resolve the Neo4j installation directory from the NEO4J_HOME environment variable.
# Path will take care of handling operating system differences.
neo4j_home_env = os.getenv('NEO4J_HOME')
if not neo4j_home_env:
    # Path(None) would raise an opaque TypeError; fail early with an actionable message.
    raise RuntimeError(
        "The NEO4J_HOME environment variable is not set; "
        "point it at the Neo4j installation directory before running this notebook."
    )
NEO4J_HOME = Path(neo4j_home_env)
print(NEO4J_HOME)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3


### Download viral genome information

In [4]:
# TODO loop over a list of taxonomy ids.

# NCBI taxonomy id of the virus to load. For now SARS-CoV-2 is hardcoded.
# Other ids that can be substituted below:
#   "694009"   SARS, NC_004718
#   "1335626"  MERS-CoV, NC_019843
#   "147711"   Rhinovirus A
#   "147712"   Rhinovirus B
#   "186538"   Zaire ebolavirus, NC_002549
tax_id = "2697049"  # SARS-CoV-2, NC_045512

### Download reference sequence nucleotide accession numbers for the given taxonomy id

In [5]:
# Search the NCBI nucleotide database for reference sequences of the selected taxonomy id.
term = f"txid{tax_id}[Organism]+refseq[filter]"
# The context manager closes the handle even when the request or the parsing fails
# (same pattern as the efetch call used when downloading the GenBank records).
with Entrez.esearch(db="nucleotide", retmax=5000, term=term, idtype="acc") as handle:
    nuc_accessions = Entrez.read(handle)
print("Nucleotides:", nuc_accessions['Count'])

Nucleotides: 1


In [6]:
#nuc_accessions['IdList'].append('MN908947') # used as reference genome (same as NC_045512)

In [7]:
# TODO refactor function into a separate .py file
def extract_genbank_features(gb_record, feature_type, qualifier_list) :
    """Collect coordinates and selected qualifiers for all features of one type.

    Parameters
    ----------
    gb_record
        Parsed GenBank record; only its ``features`` attribute is read.
    feature_type : str
        Feature type to keep, e.g. "CDS" or "mat_peptide".
    qualifier_list : list of str
        Qualifier names to copy from each matching feature.

    Returns
    -------
    list of dict
        One dict per matching feature, in record order. Each dict holds
        'start' and 'end' (smallest and largest position of the feature
        location, shifted by +1 to keep NCBI "1"-based indexing) and one key
        per requested qualifier. A qualifier that is absent from the feature
        is stored as ''. When a qualifier carries several values (db_xref
        usually does), the last value is kept.
    """
    answers = []
    for feature in gb_record.features:
        if feature.type != feature_type:
            continue

        # Coordinates are recorded for every matching feature, including
        # features that carry none of the requested qualifiers.
        answer = {
            'start': min(feature.location) + 1,
            'end': max(feature.location) + 1,
        }

        for qualifier in qualifier_list:
            values = feature.qualifiers[qualifier] if qualifier in feature.qualifiers else []
            # Last value wins; a missing or empty qualifier becomes ''.
            answer[qualifier] = values[-1] if values else ''

        answers.append(answer)
    return answers

In [8]:
def process_protein_record(accession):
    """Download one GenBank nucleotide record and tabulate its protein features.

    Parameters
    ----------
    accession : str
        Nucleotide accession passed to Entrez efetch.

    Returns
    -------
    pandas.DataFrame
        One row per CDS feature followed by one row per mat_peptide feature
        (when the record has any), with the columns produced by
        ``extract_genbank_features`` plus 'genbank_accession'.
    """
    qualifiers = ["gene", "locus_tag", "db_xref", "product", "protein_id"]

    with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=accession) as handle:
        record = SeqIO.read(handle, "gb")

    protein_df = pd.DataFrame(extract_genbank_features(record, "CDS", qualifiers))
    mature_peptide = extract_genbank_features(record, "mat_peptide", qualifiers)

    # some reference sequences are missing the mature peptides
    if len(mature_peptide) > 0:
        mature_peptide_df = pd.DataFrame(mature_peptide)

        if not protein_df.empty:
            # Assign db_xref to the mature proteins based on the precursor's gene id.
            # The lookup keeps the first CDS row of each gene, so it does not depend
            # on that row carrying index label 0, and a gene without a matching CDS
            # keeps the peptide's own db_xref instead of raising.
            gene_to_db_xref = protein_df.drop_duplicates('gene').set_index('gene')['db_xref']
            mature_peptide_df['db_xref'] = (
                mature_peptide_df['gene']
                .map(gene_to_db_xref)
                .fillna(mature_peptide_df['db_xref'])
            )
        df = pd.concat([protein_df, mature_peptide_df])
    else:
        df = protein_df

    df['genbank_accession'] = accession

    return df

### Concatenate all Genbank dataframes

In [9]:
# Download and parse every RefSeq accession in turn, echoing each accession as
# progress output, then stack the per-accession tables into one dataframe.
df_list = []

for nuc in nuc_accessions['IdList']:
    print(nuc, end=' ')
    df_list += [process_protein_record(nuc)]

df = pd.concat(df_list)

NC_045512.2 

### Create unique and interoperable identifiers

**genbank_id**: CURIE: [ncbiprotein](https://registry.identifiers.org/registry/ncbiprotein) (NCBI Reference Sequences, RefSeq)

**ncbiprotein_id**: CURIE: [ncbiprotein](https://registry.identifiers.org/registry/ncbiprotein) (NCBI Reference Sequences, RefSeq)

**ncbi_gene_id**: CURIE: [ncbigene](https://registry.identifiers.org/registry/ncbigene) (NCBI Gene)

A [CURIE](https://en.wikipedia.org/wiki/CURIE) (Compact URI) is a compact abbreviation for Uniform Resource Identifiers (URIs) that can be resolved by [Identifiers.org](https://identifiers.org/).

In [10]:
# Build the CURIE identifier columns. The ".<version>" suffix is removed from the
# accessions to enable linking with other nodes; NCBI reference sequences resolve
# through ncbiprotein CURIEs. Finally the 'product' column is renamed to 'name'.
df = df.assign(
    id=lambda frame: "ncbiprotein:" + frame['genbank_accession'].map(lambda acc: acc.split('.')[0]),
    ncbiproteinId=lambda frame: 'ncbiprotein:' + frame['protein_id'].map(lambda acc: acc.split('.')[0]),
    ncbigeneId=lambda frame: 'ncbigene:' + frame['db_xref'].str.split(':', expand=True)[1],
).rename(columns={'product': 'name'})

In [11]:
# Preview at most the first 100 rows of the table with the new identifier columns.
df.head(n=100)

Unnamed: 0,start,end,gene,locus_tag,db_xref,name,protein_id,genbank_accession,id,ncbiproteinId,ncbigeneId
0,266,21555,ORF1ab,GU280_gp01,GeneID:43740578,ORF1ab polyprotein,YP_009724389.1,NC_045512.2,ncbiprotein:NC_045512,ncbiprotein:YP_009724389,ncbigene:43740578
1,266,13483,ORF1ab,GU280_gp01,GeneID:43740578,ORF1a polyprotein,YP_009725295.1,NC_045512.2,ncbiprotein:NC_045512,ncbiprotein:YP_009725295,ncbigene:43740578
2,21563,25384,S,GU280_gp02,GeneID:43740568,surface glycoprotein,YP_009724390.1,NC_045512.2,ncbiprotein:NC_045512,ncbiprotein:YP_009724390,ncbigene:43740568
3,25393,26220,ORF3a,GU280_gp03,GeneID:43740569,ORF3a protein,YP_009724391.1,NC_045512.2,ncbiprotein:NC_045512,ncbiprotein:YP_009724391,ncbigene:43740569
4,26245,26472,E,GU280_gp04,GeneID:43740570,envelope protein,YP_009724392.1,NC_045512.2,ncbiprotein:NC_045512,ncbiprotein:YP_009724392,ncbigene:43740570
5,26523,27191,M,GU280_gp05,GeneID:43740571,membrane glycoprotein,YP_009724393.1,NC_045512.2,ncbiprotein:NC_045512,ncbiprotein:YP_009724393,ncbigene:43740571
6,27202,27387,ORF6,GU280_gp06,GeneID:43740572,ORF6 protein,YP_009724394.1,NC_045512.2,ncbiprotein:NC_045512,ncbiprotein:YP_009724394,ncbigene:43740572
7,27394,27759,ORF7a,GU280_gp07,GeneID:43740573,ORF7a protein,YP_009724395.1,NC_045512.2,ncbiprotein:NC_045512,ncbiprotein:YP_009724395,ncbigene:43740573
8,27756,27887,ORF7b,GU280_gp08,GeneID:43740574,ORF7b,YP_009725318.1,NC_045512.2,ncbiprotein:NC_045512,ncbiprotein:YP_009725318,ncbigene:43740574
9,27894,28259,ORF8,GU280_gp09,GeneID:43740577,ORF8 protein,YP_009724396.1,NC_045512.2,ncbiprotein:NC_045512,ncbiprotein:YP_009724396,ncbigene:43740577


### Save data for Knowledge Graph Import

In [12]:
# TODO SARS-CoV-2 contains redundant protein products because ORF1a is part of
# the ORF1ab gene. For now, we keep only the first row for each product name.
# Duplicates are identified per genome ('id' plus 'name') so that proteins that
# share a name across different reference sequences are not dropped.
df = df.drop_duplicates(['id', 'name'])
df = df[['id','gene', 'start', 'end', 'ncbigeneId', 'ncbiproteinId', 'name']]
df.head(100)

Unnamed: 0,id,gene,start,end,ncbigeneId,ncbiproteinId,name
0,ncbiprotein:NC_045512,ORF1ab,266,21555,ncbigene:43740578,ncbiprotein:YP_009724389,ORF1ab polyprotein
1,ncbiprotein:NC_045512,ORF1ab,266,13483,ncbigene:43740578,ncbiprotein:YP_009725295,ORF1a polyprotein
2,ncbiprotein:NC_045512,S,21563,25384,ncbigene:43740568,ncbiprotein:YP_009724390,surface glycoprotein
3,ncbiprotein:NC_045512,ORF3a,25393,26220,ncbigene:43740569,ncbiprotein:YP_009724391,ORF3a protein
4,ncbiprotein:NC_045512,E,26245,26472,ncbigene:43740570,ncbiprotein:YP_009724392,envelope protein
5,ncbiprotein:NC_045512,M,26523,27191,ncbigene:43740571,ncbiprotein:YP_009724393,membrane glycoprotein
6,ncbiprotein:NC_045512,ORF6,27202,27387,ncbigene:43740572,ncbiprotein:YP_009724394,ORF6 protein
7,ncbiprotein:NC_045512,ORF7a,27394,27759,ncbigene:43740573,ncbiprotein:YP_009724395,ORF7a protein
8,ncbiprotein:NC_045512,ORF7b,27756,27887,ncbigene:43740574,ncbiprotein:YP_009725318,ORF7b
9,ncbiprotein:NC_045512,ORF8,27894,28259,ncbigene:43740577,ncbiprotein:YP_009724396,ORF8 protein


In [13]:
# Write the table, without the dataframe index, into the Neo4j import directory.
df.to_csv(path_or_buf=NEO4J_HOME / "import/01c-NCBIRefSeq.csv", index=False)