# UniProt Viral and Host Protein Data
**[Work in progress]**

This notebook downloads and standardizes viral and host protein data from UniProt for ingestion into the Knowledge Graph.

Data source: [UniProt](https://www.uniprot.org/)

Authors: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import re
import hashlib 
import urllib

import pandas as pd
import numpy as np

from pathlib import Path
from Bio import SeqIO

In [2]:
# Never truncate DataFrames when they are displayed.
pd.set_option('display.max_rows', None)  # display all rows
pd.set_option('display.max_columns', None)  # display all columns

In [3]:
# Output directory for the CSV files, taken from the NEO4J_IMPORT environment variable.
if os.getenv('NEO4J_IMPORT') is None:
    # os.getenv returns None for an unset variable and Path(None) would fail with
    # an unhelpful TypeError, so report what is actually missing.
    raise RuntimeError(
        "Environment variable NEO4J_IMPORT is not set; "
        "set it to the Neo4j import directory before running this notebook."
    )
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Get list of organisms to include in the Knowledge Graph

In [4]:
# Load the genome reference table; dtype=str keeps every column as text.
genomes = pd.read_csv("../../reference_data/Genome.csv", dtype=str)

In [5]:
# Keep the second ':'-separated field of each taxonomyId value (the part after the prefix).
genomes['taxonomy'] = genomes['taxonomyId'].map(lambda curie: curie.split(':')[1])

In [6]:
# Fields requested from UniProt, joined into the comma-separated `columns` query parameter.
# Spaces are already URL-encoded as %20.
columns = ','.join([
    'id',
    'entry%20name',
    # NOTE(review): 11 names are requested but the downloaded table below has only
    # 10 columns, and none of them matches 'p' -- looks like a stray typo; confirm before removing.
    'p',
    'sequence',
    'length',
    'protein%20names',
    'reviewed',
    'organism-id',
    'feature(CHAIN)',
    'feature(PEPTIDE)',
    'go(biological%20process)',
])

### Download data from UniProt

In [7]:
# One query URL per distinct taxonomy id: reviewed entries only, the selected
# columns, tab-separated output.
# NOTE(review): this targets the www.uniprot.org/uniprot query endpoint -- confirm it is
# still served (and still uses these column names) before re-running the download.
urls = [f'https://www.uniprot.org/uniprot/?query=organism:{taxon}+reviewed:yes&columns={columns}&format=tab'
        for taxon in genomes['taxonomy'].unique()]

In [8]:
# Fetch one tab-separated table per URL (all values read as strings) and stack
# them into a single DataFrame.
unp = pd.concat((pd.read_csv(url, sep='\t', dtype='str') for url in urls))

In [9]:
# Renumber the rows of the concatenated table 0..n-1, discarding the old index.
unp = unp.reset_index(drop=True)

In [10]:
# Represent missing values as empty strings, then report the table size.
unp = unp.fillna('')
print(unp.shape)

(20431, 10)


In [11]:
unp.head()

Unnamed: 0,Entry,Entry name,Sequence,Length,Protein names,Status,Organism ID,Chain,Peptide,Gene ontology (biological process)
0,P0DTD1,R1AB_SARS2,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,7096,Replicase polyprotein 1ab (pp1ab) (ORF1ab poly...,reviewed,2697049,"CHAIN 1..7096; /note=""Replicase polyprotein 1...",,induction by virus of catabolism of host mRNA ...
1,P0DTC7,NS7A_SARS2,MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNSPF...,121,ORF7a protein (ORF7a) (Accessory protein 7a) (...,reviewed,2697049,"CHAIN 16..121; /note=""ORF7a protein""; /evide...",,modulation by virus of host G0/G1 transition c...
2,P0DTD2,ORF9B_SARS2,MDPKISEMHPALRLVDPQIQLAVTRMENAVGRDQNNVGPKVYPIIL...,97,ORF9b protein (ORF9b) (Accessory protein 9b) (...,reviewed,2697049,"CHAIN 1..97; /note=""ORF9b protein""; /evidenc...",,suppression by virus of host MAVS activity [GO...
3,P0DTC9,NCAP_SARS2,MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...,419,Nucleoprotein (N) (Nucleocapsid protein) (NC) ...,reviewed,2697049,"CHAIN 1..419; /note=""Nucleoprotein""; /id=""PR...",,
4,P0DTC3,AP3A_SARS2,MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWL...,275,ORF3a protein (ORF3a) (Accessory protein 3a) (...,reviewed,2697049,"CHAIN 1..275; /note=""ORF3a protein""; /id=""PR...",,pore formation by virus in membrane of host ce...


Convert the protein name synonyms to a semicolon-separated list so that this one-to-many relationship can be represented in a CSV file.

In [12]:
# Turn "Name (synonym 1) (synonym 2)" into "Name;synonym 1;synonym 2".
# regex=False makes every pattern a literal string: '(' and ')' are regex
# metacharacters, and the default of `regex` differs between pandas versions,
# so relying on the default is fragile.
unp['synonymes'] = (
    unp['Protein names']
    .str.replace('(', ';', regex=False)   # each synonym starts at an opening parenthesis
    .str.replace(' ;', ';', regex=False)  # drop the blank that preceded the parenthesis
    .str.replace(')', '', regex=False)    # closing parentheses carry no information
)

In [13]:
unp.head()

Unnamed: 0,Entry,Entry name,Sequence,Length,Protein names,Status,Organism ID,Chain,Peptide,Gene ontology (biological process),synonymes
0,P0DTD1,R1AB_SARS2,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,7096,Replicase polyprotein 1ab (pp1ab) (ORF1ab poly...,reviewed,2697049,"CHAIN 1..7096; /note=""Replicase polyprotein 1...",,induction by virus of catabolism of host mRNA ...,Replicase polyprotein 1ab;pp1ab;ORF1ab polypro...
1,P0DTC7,NS7A_SARS2,MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNSPF...,121,ORF7a protein (ORF7a) (Accessory protein 7a) (...,reviewed,2697049,"CHAIN 16..121; /note=""ORF7a protein""; /evide...",,modulation by virus of host G0/G1 transition c...,ORF7a protein;ORF7a;Accessory protein 7a;Prote...
2,P0DTD2,ORF9B_SARS2,MDPKISEMHPALRLVDPQIQLAVTRMENAVGRDQNNVGPKVYPIIL...,97,ORF9b protein (ORF9b) (Accessory protein 9b) (...,reviewed,2697049,"CHAIN 1..97; /note=""ORF9b protein""; /evidenc...",,suppression by virus of host MAVS activity [GO...,ORF9b protein;ORF9b;Accessory protein 9b;ORF-9...
3,P0DTC9,NCAP_SARS2,MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...,419,Nucleoprotein (N) (Nucleocapsid protein) (NC) ...,reviewed,2697049,"CHAIN 1..419; /note=""Nucleoprotein""; /id=""PR...",,,Nucleoprotein;N;Nucleocapsid protein;NC;Protein N
4,P0DTC3,AP3A_SARS2,MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWL...,275,ORF3a protein (ORF3a) (Accessory protein 3a) (...,reviewed,2697049,"CHAIN 1..275; /note=""ORF3a protein""; /id=""PR...",,pore formation by virus in membrane of host ce...,ORF3a protein;ORF3a;Accessory protein 3a;Prote...


In [14]:
unp.query("Entry == 'P01042'")['Chain'].values

array(['CHAIN 19..644;  /note="Kininogen-1";  /id="PRO_0000006685"; CHAIN 19..380;  /note="Kininogen-1 heavy chain";  /id="PRO_0000006686"; CHAIN 390..644;  /note="Kininogen-1 light chain";  /id="PRO_0000006689"'],
      dtype=object)

In [15]:
unp.query("Entry == 'P01042'")['Peptide'].values

array(['PEPTIDE 376..389;  /note="T-kinin";  /id="PRO_0000372485"; PEPTIDE 380..389;  /note="Lysyl-bradykinin";  /id="PRO_0000006687"; PEPTIDE 381..389;  /note="Bradykinin";  /id="PRO_0000006688"; PEPTIDE 431..434;  /note="Low molecular weight growth-promoting factor";  /id="PRO_0000006690"'],
      dtype=object)

In [16]:
def parse_feature_record(record, feature_type):
    """Parse a single UniProt feature record into a 5-element object array.

    Parameters
    ----------
    record : str
        Text of one feature with the leading feature keyword removed, e.g.
        ' 19..644;  /note="Kininogen-1";  /id="PRO_0000006685"; '.
    feature_type : str
        Label stored in the first slot (e.g. 'CHAIN' or 'PEPTIDE').

    Returns
    -------
    numpy.ndarray
        Object array ``[type, start, end, name, proId]``. ``start`` and ``end``
        are digit strings, or '' when the position is not a plain number
        (e.g. '?'). ``proId`` is the /id value prefixed with 'uniprot.chain:'.
        Slots for which the record has no item stay ``None``.
    """
    feature = np.array([feature_type, None, None, None, None], dtype=object)

    for item in record.split(';'):
        item = item.strip()
        # Qualifiers are matched before the position range so that a qualifier
        # value that happens to contain '..' is never mistaken for a range.
        if item.startswith("/note="):
            feature[3] = item[6:].replace('"', '')
        elif item.startswith("/id="):
            feature[4] = 'uniprot.chain:' + item[4:].replace('"', '')
        elif item.startswith('/'):
            # any other qualifier (e.g. /evidence=...) is not used
            continue
        elif '..' in item:
            start, end = item.split('..')[:2]
            # in a few cases a '?' is used to represent an unknown start or end
            feature[1] = start if start.isdigit() else ''
            feature[2] = end if end.isdigit() else ''

    return feature

In [17]:
def _split_feature_records(text, keyword):
    """Split a UniProt feature column into its records, without the leading keyword."""
    if keyword not in text:
        return []
    # Split only where the keyword opens a record (start of the text or right
    # after a ';'), so the same word inside a note does not cut a record in two.
    pieces = re.split(r'(?:^|;)\s*' + keyword + r'\s+', text)
    return [piece for piece in pieces if piece.strip()]


def parse_features(row):
    """Collect the protein, chain and peptide features of one UniProt entry.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the string fields 'Chain', 'Peptide', 'Length' and
        'Protein names'.

    Returns
    -------
    list of numpy.ndarray
        5-element object arrays ``[type, start, end, name, proId]``: an optional
        synthetic 'PROTEIN' feature, then all 'CHAIN' features, then all
        'PEPTIDE' features.
    """
    chain_features = [parse_feature_record(record, 'CHAIN')
                      for record in _split_feature_records(row['Chain'], 'CHAIN')]
    peptide_features = [parse_feature_record(record, 'PEPTIDE')
                        for record in _split_feature_records(row['Peptide'], 'PEPTIDE')]

    # Full-length (coding sequence) proteins are inconsistently handled in UniProt.
    # For some entries, the full-length protein is included
    # in the chain features (e.g. P0DTD1), for others it's not (e.g., P01042).
    full_length = any(f[1] == '1' and f[2] == row['Length'] for f in chain_features)

    protein_features = []
    # Add an entry covering 1..Length if no chain already does
    if not full_length:
        # The primary name is the text before the first '('; strip the blank that precedes it.
        protein_name = row['Protein names'].split('(')[0].strip()
        protein_features = [np.array(['PROTEIN', '1', row['Length'], protein_name, ''], dtype=object)]

    return protein_features + chain_features + peptide_features

In [18]:
# Parse every entry into a list of feature arrays (one list per row).
unp['Features'] = unp.apply(parse_features, axis=1)

In [19]:
# One row per feature: each entry's row is repeated for every element of its
# Features list, and the repeated rows keep the entry's index label.
unp = unp.explode('Features')

In [20]:
# Spread the five slots of each feature array over five separate columns.
unp[['type', 'start', 'end', 'name', 'proId']] = unp.apply(lambda row: row['Features'], axis=1, result_type="expand")

Handle missing values

In [21]:
# Replace any remaining missing values with empty strings.
unp = unp.fillna('')

#### Cleave sequences into peptides

In [22]:
def get_subsequence(row):
    """Return the part of row['Sequence'] covered by row['start']..row['end'].

    Positions are 1-based and inclusive. An empty string is returned when
    either position is not a plain number.
    """
    if not row['start'].isdigit() or not row['end'].isdigit():
        return ''
    # 1-based inclusive range -> 0-based half-open slice
    first = int(row['start']) - 1
    last = int(row['end'])
    return row['Sequence'][first:last]

In [23]:
# Cut the feature's residues out of the full entry sequence, row by row.
unp['sequence'] = unp.apply(get_subsequence, axis=1)

Set flag if protein chain is full length

In [24]:
# True when the feature spans positions 1..Length; all three columns hold
# strings, so this is a string comparison.
unp['fullLength'] = (unp['start'] == '1') & (unp['end'] == unp['Length'])

In [25]:
# Remove leading and trailing whitespace from the feature names.
unp['name'] = unp['name'].str.strip()

In [26]:
# Switch from UniProt's column headers to the property names used in the exported CSV.
unp = unp.rename(columns={
    'Organism ID': 'taxonomyId',
    'Entry': 'accession',
    'Entry name': 'entryName',
})

##### Assign unique identifiers

Use the MD5 hash of each protein sequence as its identifier, and convert accession and taxonomyId to CURIEs.

In [27]:
# id = 'md5:' + hex MD5 digest of the (sub)sequence, so identical sequences get identical ids.
# NOTE(review): features with an unknown start or end have an empty sequence and therefore
# all share the MD5 of '' as id -- confirm that this collision is acceptable downstream.
unp['id'] = unp['sequence'].apply(lambda seq: 'md5:' + hashlib.md5(seq.encode()).hexdigest())

In [28]:
# Prefix the raw identifiers to form CURIEs.
# Running this cell a second time would add the prefix again.
unp['accession'] = 'uniprot:' + unp['accession']
unp['taxonomyId'] = 'taxonomy:' + unp['taxonomyId']

In [29]:
unp.query("accession == 'uniprot:P01042'")

Unnamed: 0,accession,entryName,Sequence,Length,Protein names,Status,taxonomyId,Chain,Peptide,Gene ontology (biological process),synonymes,Features,type,start,end,name,proId,sequence,fullLength,id
8188,uniprot:P01042,KNG1_HUMAN,MKLITILFLCSRLLLSLTQESQSEEIDCNDKDLFKAVDAALKKYNS...,644,Kininogen-1 (Alpha-2-thiol proteinase inhibito...,reviewed,taxonomy:9606,"CHAIN 19..644; /note=""Kininogen-1""; /id=""PRO...","PEPTIDE 376..389; /note=""T-kinin""; /id=""PRO_...",antimicrobial humoral immune response mediated...,Kininogen-1;Alpha-2-thiol proteinase inhibitor...,"[PROTEIN, 1, 644, Kininogen-1 , ]",PROTEIN,1,644,Kininogen-1,,MKLITILFLCSRLLLSLTQESQSEEIDCNDKDLFKAVDAALKKYNS...,True,md5:693c7762bf152c58e00ff05e19347899
8188,uniprot:P01042,KNG1_HUMAN,MKLITILFLCSRLLLSLTQESQSEEIDCNDKDLFKAVDAALKKYNS...,644,Kininogen-1 (Alpha-2-thiol proteinase inhibito...,reviewed,taxonomy:9606,"CHAIN 19..644; /note=""Kininogen-1""; /id=""PRO...","PEPTIDE 376..389; /note=""T-kinin""; /id=""PRO_...",antimicrobial humoral immune response mediated...,Kininogen-1;Alpha-2-thiol proteinase inhibitor...,"[CHAIN, 19, 644, Kininogen-1, uniprot.chain:PR...",CHAIN,19,644,Kininogen-1,uniprot.chain:PRO_0000006685,QESQSEEIDCNDKDLFKAVDAALKKYNSQNQSNNQFVLYRITEATK...,False,md5:7fce5e096d222db791e61728783862ef
8188,uniprot:P01042,KNG1_HUMAN,MKLITILFLCSRLLLSLTQESQSEEIDCNDKDLFKAVDAALKKYNS...,644,Kininogen-1 (Alpha-2-thiol proteinase inhibito...,reviewed,taxonomy:9606,"CHAIN 19..644; /note=""Kininogen-1""; /id=""PRO...","PEPTIDE 376..389; /note=""T-kinin""; /id=""PRO_...",antimicrobial humoral immune response mediated...,Kininogen-1;Alpha-2-thiol proteinase inhibitor...,"[CHAIN, 19, 380, Kininogen-1 heavy chain, unip...",CHAIN,19,380,Kininogen-1 heavy chain,uniprot.chain:PRO_0000006686,QESQSEEIDCNDKDLFKAVDAALKKYNSQNQSNNQFVLYRITEATK...,False,md5:65df10fd1e958993a8df7cbc9eeaef49
8188,uniprot:P01042,KNG1_HUMAN,MKLITILFLCSRLLLSLTQESQSEEIDCNDKDLFKAVDAALKKYNS...,644,Kininogen-1 (Alpha-2-thiol proteinase inhibito...,reviewed,taxonomy:9606,"CHAIN 19..644; /note=""Kininogen-1""; /id=""PRO...","PEPTIDE 376..389; /note=""T-kinin""; /id=""PRO_...",antimicrobial humoral immune response mediated...,Kininogen-1;Alpha-2-thiol proteinase inhibitor...,"[CHAIN, 390, 644, Kininogen-1 light chain, uni...",CHAIN,390,644,Kininogen-1 light chain,uniprot.chain:PRO_0000006689,SSRIGEIKEETTVSPPHTSMAPAQDEERDSGKEQGHTRRHDWGHEK...,False,md5:918354eb803d70a0af51f56240aa6918
8188,uniprot:P01042,KNG1_HUMAN,MKLITILFLCSRLLLSLTQESQSEEIDCNDKDLFKAVDAALKKYNS...,644,Kininogen-1 (Alpha-2-thiol proteinase inhibito...,reviewed,taxonomy:9606,"CHAIN 19..644; /note=""Kininogen-1""; /id=""PRO...","PEPTIDE 376..389; /note=""T-kinin""; /id=""PRO_...",antimicrobial humoral immune response mediated...,Kininogen-1;Alpha-2-thiol proteinase inhibitor...,"[PEPTIDE, 376, 389, T-kinin, uniprot.chain:PRO...",PEPTIDE,376,389,T-kinin,uniprot.chain:PRO_0000372485,ISLMKRPPGFSPFR,False,md5:7c3750d37b3f3520024958fdba8c0d46
8188,uniprot:P01042,KNG1_HUMAN,MKLITILFLCSRLLLSLTQESQSEEIDCNDKDLFKAVDAALKKYNS...,644,Kininogen-1 (Alpha-2-thiol proteinase inhibito...,reviewed,taxonomy:9606,"CHAIN 19..644; /note=""Kininogen-1""; /id=""PRO...","PEPTIDE 376..389; /note=""T-kinin""; /id=""PRO_...",antimicrobial humoral immune response mediated...,Kininogen-1;Alpha-2-thiol proteinase inhibitor...,"[PEPTIDE, 380, 389, Lysyl-bradykinin, uniprot....",PEPTIDE,380,389,Lysyl-bradykinin,uniprot.chain:PRO_0000006687,KRPPGFSPFR,False,md5:33b2d4498a0558b6ae786d4a6d4620cd
8188,uniprot:P01042,KNG1_HUMAN,MKLITILFLCSRLLLSLTQESQSEEIDCNDKDLFKAVDAALKKYNS...,644,Kininogen-1 (Alpha-2-thiol proteinase inhibito...,reviewed,taxonomy:9606,"CHAIN 19..644; /note=""Kininogen-1""; /id=""PRO...","PEPTIDE 376..389; /note=""T-kinin""; /id=""PRO_...",antimicrobial humoral immune response mediated...,Kininogen-1;Alpha-2-thiol proteinase inhibitor...,"[PEPTIDE, 381, 389, Bradykinin, uniprot.chain:...",PEPTIDE,381,389,Bradykinin,uniprot.chain:PRO_0000006688,RPPGFSPFR,False,md5:c5a9e54cc23314d0f69ea9cca09ee617
8188,uniprot:P01042,KNG1_HUMAN,MKLITILFLCSRLLLSLTQESQSEEIDCNDKDLFKAVDAALKKYNS...,644,Kininogen-1 (Alpha-2-thiol proteinase inhibito...,reviewed,taxonomy:9606,"CHAIN 19..644; /note=""Kininogen-1""; /id=""PRO...","PEPTIDE 376..389; /note=""T-kinin""; /id=""PRO_...",antimicrobial humoral immune response mediated...,Kininogen-1;Alpha-2-thiol proteinase inhibitor...,"[PEPTIDE, 431, 434, Low molecular weight growt...",PEPTIDE,431,434,Low molecular weight growth-promoting factor,uniprot.chain:PRO_0000006690,WGHE,False,md5:fd9ee00f93cbe5c7daef3a211f113b63


### Save proteins

In [30]:
# Select the columns to export and write them, without the DataFrame index,
# to 01a-UniProtProtein.csv in the NEO4J_IMPORT directory.
proteins = unp[['id', 'name', 'synonymes', 'accession', 'entryName', 'proId', 'sequence', 'start', 'end', 
                'fullLength', 'taxonomyId']].copy()
proteins.to_csv(NEO4J_IMPORT / '01a-UniProtProtein.csv', index = False)

In [31]:
proteins.head()

Unnamed: 0,id,name,synonymes,accession,entryName,proId,sequence,start,end,fullLength,taxonomyId
0,md5:e6608b50fcd6e004708a875615ddf2d9,Replicase polyprotein 1ab,Replicase polyprotein 1ab;pp1ab;ORF1ab polypro...,uniprot:P0DTD1,R1AB_SARS2,uniprot.chain:PRO_0000449618,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,1,7096,True,taxonomy:2697049
0,md5:5c2c364f44079728c451280435c4236a,Host translation inhibitor nsp1,Replicase polyprotein 1ab;pp1ab;ORF1ab polypro...,uniprot:P0DTD1,R1AB_SARS2,uniprot.chain:PRO_0000449619,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,1,180,False,taxonomy:2697049
0,md5:073edb2349ddcd9a72ecd9f5c1dccdc4,Non-structural protein 2,Replicase polyprotein 1ab;pp1ab;ORF1ab polypro...,uniprot:P0DTD1,R1AB_SARS2,uniprot.chain:PRO_0000449620,AYTRYVDNNFCGPDGYPLECIKDLLARAGKASCTLSEQLDFIDTKR...,181,818,False,taxonomy:2697049
0,md5:73935ca55d0ab6130627210ef6743c39,Non-structural protein 3,Replicase polyprotein 1ab;pp1ab;ORF1ab polypro...,uniprot:P0DTD1,R1AB_SARS2,uniprot.chain:PRO_0000449621,APTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVEL...,819,2763,False,taxonomy:2697049
0,md5:6890e7e8e12f73c2dfdae6805ebc9c4f,Non-structural protein 4,Replicase polyprotein 1ab;pp1ab;ORF1ab polypro...,uniprot:P0DTD1,R1AB_SARS2,uniprot.chain:PRO_0000449622,KIVNNWLKQLIKVTLVFLFVAAIFYLITPVHVMSKHTDFSSEIIGY...,2764,3263,False,taxonomy:2697049
