# Load Viral Gene and Protein Data from NCBI
**[Work in progress]**

This notebook downloads and standardizes viral gene and protein data from NCBI for ingestion into a Knowledge Graph.

Data source: [NCBI](https://www.ncbi.nlm.nih.gov)

Author: Peter Rose (pwrose@ucsd.edu)

In [4]:
import os
import pandas as pd
import dateutil
from pathlib import Path
from Bio import SeqIO
from Bio import Entrez
Entrez.email = "covid19@mybinder.org"

In [5]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [6]:
# Path will take care of handling operating system differences.
NEO4J_HOME = Path(os.getenv('NEO4J_HOME'))
print(NEO4J_HOME)

TypeError: expected str, bytes or os.PathLike object, not NoneType

### Download viral genome information

In [7]:
# TODO loop over a list of taxonomy ids. 

# for now SARS-CoV-2 is hardcoded.
tax_id = "2697049" # SARS-CoV-2, NC_045512
#tax_id = "694009" # SARS, NC_004718
#tax_id = "1335626" # MERS-CoV, NC_019843
#tax_id = "147711" # Rhinovirus A
#tax_id = "147712" # Rhinovirus B
#tax_id = "186538" # Zaire ebolavirus, NC_002549

### Download reference sequence nucleotide accession numbers for the given taxonomy id

In [8]:
term = f"txid{tax_id}[Organism]+refseq[filter]"
handle = Entrez.esearch(db="nucleotide", retmax=5000, term=term, idtype="acc")
nuc_accessions = Entrez.read(handle)
handle.close()
print("Nucleotides:", nuc_accessions['Count'])

Nucleotides: 1


In [9]:
# TODO refactor function into a separate .py file
def extract_genbank_features(gb_record, feature_type, qualifier_list) :
    answers = list()
    for (index, feature) in enumerate(gb_record.features) :
        answer = dict()
        if feature.type==feature_type :
            for qualifier in qualifier_list:
                if qualifier in feature.qualifiers :
                    #print(feature, feature.qualifiers)
                    # keep NCBI "1"-based indexing
                    answer['start'] = min(feature.location) + 1
                    answer['end'] = max(feature.location) + 1
                    #There should only be one locus_tag per feature, but there
                    #are usually several db_xref entries
                    for value in feature.qualifiers[qualifier] :
                        if value in answer :
                                print("WARNING - Duplicate key %s for %s features %i and %i" \
                                % (value, feature_type, answer[value], index))
                        else :
                            answer[qualifier] = value
                else:
                    answer[qualifier] = ''
            answers.append(answer)
    return answers

In [10]:
def process_protein_record(accession):
    with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=accession) as handle:
        record = SeqIO.read(handle, "gb")
        
    protein = extract_genbank_features(record, "CDS", ["gene","locus_tag","db_xref","product","protein_id"])
    protein_df = pd.DataFrame(protein)

    mature_peptide = extract_genbank_features(record, "mat_peptide", ["gene","locus_tag","db_xref","product","protein_id"])
    
    # some reference sequences are missing the mature peptides (why???)
    if len(mature_peptide) > 0:
        mature_peptide_df = pd.DataFrame(mature_peptide)

        # assign db_xref to the mature proteins based on the precursor's gene id
        mature_peptide_df['db_xref'] = mature_peptide_df['gene'].apply(lambda g: protein_df.query(f"gene=='{g}'")['db_xref'][0])
        df = pd.concat([protein_df, mature_peptide_df])
    else:
        df = protein_df
        
    df['genbank_accession'] = accession
    
    return df

### Concatenate all Genbank dataframes

In [11]:
df_list = []

for nuc in nuc_accessions['IdList']:
    print(nuc, end=' ')
    df_list.append(process_protein_record(nuc))
    
df = pd.concat(df_list)

NC_045512.2 

In [21]:
def get_protein_fastas(df):
    
    #Iterating through only unique fastas to ensure minimal API requests
    unique_proteins = df.protein_id.unique()

    #Using Entrez's efetch command to get unique fastas
    file = Entrez.efetch(db = 'protein', rettype="fasta", retmode="fasta", id=list(unique_proteins))

    #This is a batch read so as to minimize API calls
    with file as handle:
        all_fastas = file.read()
        fastas = all_fastas.split('\t')    

    list_sequences = []
    fasta = ''

    #getting the fastas from entrez output file
    for element in all_fastas.split('\n'):
        if '>' in element:
            list_sequences += [fasta]
            fasta = ''
            continue
        else:
            fasta += element

    #A dummy out.txt file which will be deleted upon completion of the function
    with open ('out.txt', 'w') as f:
        f.write(all_fastas)

    #Parse the multiple fastas using Biopython
    fasta_sequences = SeqIO.parse('out.txt','fasta')

    list_sequences = []

    for fasta in fasta_sequences:
        list_sequences += [str(fasta.seq)]

    #Removing the temporary file
    os.remove('out.txt')
    
    #Creating a dataframe of sequences
    sequences = pd.DataFrame(list(zip(unique_proteins, list_sequences)), columns=['protein_id', 'protein_sequence'])
    
    #Merging only those sequences that exist in the NCBIRefSeq data.
    return df.merge(sequences)

In [13]:
df_with_sequences = get_protein_fastas(df)

### Create unique and interoperable identifiers

**genbank_id**: CURIE: [ncbiprotein](https://registry.identifiers.org/registry/ncbiprotein) (NCBI Reference Sequences, Refseq)

**ncbiprotein_id**: CURIE: [ncbiprotein](https://registry.identifiers.org/registry/ncbiprotein) (NCBI Reference Sequences, Refseq)

**ncbi_gene_id**: CURIE: [ncbigene](https://registry.identifiers.org/registry/ncbigene (NCBI Gene)

A [CURIE](https://en.wikipedia.org/wiki/CURIE) (Compact URI) is a compact abbreviation for Uniform Resource Identifiers (URIs) that can be resolved by [Identifiers.org](https://identifiers.org/).

In [14]:
# remove version number from genbank accessions to enable linking with other nodes
# NCBI reference sequences resolve through ncbiprotein CURIES
df['genbank_id'] = "ncbiprotein:" + df['genbank_accession'].apply(lambda s: s.split('.')[0])
df['ncbiprotein_id'] = 'ncbiprotein:' + df['protein_id'].apply(lambda s: s.split('.')[0])
df['ncbigene_id'] = 'ncbigene:' + df['db_xref'].str.split(':', expand=True)[1]

### Save data for Knowledge Graph Import

In [20]:
# TODO for SARS-CoV-2 contains redundant protein products due to ORF1a, 
# which is part of the ORF1ab gene.
# For now, we are dropping the duplicates
df = df.drop_duplicates(['product']) 
df = df[['genbank_id','gene', 'start', 'end', 'ncbigene_id', 'ncbiprotein_id', 'product']]
df['sequence'] = df_with_sequences['protein_sequence']
df.head(100)

Unnamed: 0,genbank_id,gene,start,end,ncbigene_id,ncbiprotein_id,product,sequence
0,ncbiprotein:NC_045512,ORF1ab,266,21555,ncbigene:43740578,ncbiprotein:YP_009724389,ORF1ab polyprotein,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...
1,ncbiprotein:NC_045512,ORF1ab,266,13483,ncbigene:43740578,ncbiprotein:YP_009725295,ORF1a polyprotein,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...
2,ncbiprotein:NC_045512,S,21563,25384,ncbigene:43740568,ncbiprotein:YP_009724390,surface glycoprotein,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
3,ncbiprotein:NC_045512,ORF3a,25393,26220,ncbigene:43740569,ncbiprotein:YP_009724391,ORF3a protein,MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWL...
4,ncbiprotein:NC_045512,E,26245,26472,ncbigene:43740570,ncbiprotein:YP_009724392,envelope protein,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...
5,ncbiprotein:NC_045512,M,26523,27191,ncbigene:43740571,ncbiprotein:YP_009724393,membrane glycoprotein,MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFL...
6,ncbiprotein:NC_045512,ORF6,27202,27387,ncbigene:43740572,ncbiprotein:YP_009724394,ORF6 protein,MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSLTE...
7,ncbiprotein:NC_045512,ORF7a,27394,27759,ncbigene:43740573,ncbiprotein:YP_009724395,ORF7a protein,MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNSPF...
8,ncbiprotein:NC_045512,ORF7b,27756,27887,ncbigene:43740574,ncbiprotein:YP_009725318,ORF7b,MIELSLIDFYLCFLAFLLFLVLIMLIIFWFSLELQDHNETCHA
9,ncbiprotein:NC_045512,ORF8,27894,28259,ncbigene:43740577,ncbiprotein:YP_009724396,ORF8 protein,MKFLVFLGIITTVAAFHQECSLQSCTQHQPYVVDDPCPIHFYSKWY...


In [22]:
#df.to_csv(NEO4J_HOME / 'import/01c-NCBIRefSeq.csv', index = False)