In [9]:
import re
import pandas as pd

from Bio import SeqIO
from anytree import Node, RenderTree

In [10]:
def add_nodes(nodes, parent, child):
    """Attach ``child`` beneath ``parent`` in the registry ``nodes``.

    ``nodes`` is a dict that maps a name to its ``Node``. Any of the two
    names that has no entry yet gets a fresh ``Node`` stored under it; the
    child's node is then re-parented onto the parent's node. The dict is
    modified in place and nothing is returned.
    """
    # Create missing entries lazily, parent first, then child.
    for name in (parent, child):
        if name not in nodes:
            nodes[name] = Node(name)
    parent_node, child_node = nodes[parent], nodes[child]
    child_node.parent = parent_node

In [3]:
# Read every record of the full FASTA file into memory; later cells iterate it.
proteins = [record for record in SeqIO.parse('9606_all.fasta', 'fasta')]

# Keep the second '|'-separated field of each record id from the "gp" FASTA file.
# NOTE(review): presumably these are the accessions of the canonical proteins - confirm.
gp_ids = [
    str(record.id.split('|')[1])
    for record in SeqIO.parse('9606_gp.fasta', 'fasta')
]

In [4]:
# Build one table row per FASTA record: [db, gene, id, gp, seq].

# Gene name = the run of non-space characters that follows 'GN=' in the record
# description (i.e. up to the next space or the end of the description).
gene_pattern = re.compile(r'GN=([^ ]*)')

# Set copy of gp_ids so each membership test below is O(1) rather than a scan
# of the whole list for every record.
gp_id_set = set(gp_ids)

data = []
for protein in proteins:
    # The record id is '|'-separated: field 0 is stored as 'db', field 1 as 'id'.
    id_fields = protein.id.split('|')
    uniprot_id = id_fields[1]
    gene_match = gene_pattern.search(protein.description)
    # Records whose description has no GN= field get the placeholder gene ''.
    gene = gene_match.group(1) if gene_match else ''
    # gp flag: 1 when this id is listed in gp_ids, else 0.
    gp = 1 if uniprot_id in gp_id_set else 0
    data.append([id_fields[0], gene, uniprot_id, gp, str(protein.seq)])

df = pd.DataFrame(data, columns=['db', 'gene', 'id', 'gp', 'seq'])

In [6]:
# Register one gene -> protein-id edge per table row.
nodes = {}
for parent, child in zip(df['gene'], df['id']):
    add_nodes(nodes, parent, child)

# Roots are the gene names that never occur as a protein id. Computed before
# the output file is opened so a failure here does not leave an empty file.
# NOTE(review): the placeholder gene '' (records without a GN= field) is a root
# too, so all gene-less records are rendered under one blank-named tree -
# confirm that is intended.
roots = list(df[~df['gene'].isin(df['id'])]['gene'].unique())

# Set copy of gp_ids: O(1) lookup per rendered node instead of a list scan.
gp_id_set = set(gp_ids)

with open('protein_tree.txt', 'w') as f:
    for root in roots:
        for pre, _, node in RenderTree(nodes[root]):
            # A trailing '*' marks names that are listed in gp_ids.
            line_format = "%s%s*" if node.name in gp_id_set else "%s%s"
            f.write(line_format % (pre, node.name))
            f.write('\n')

In [13]:
# Number of records per gene name, most frequent first. The blank label in the
# output is the '' placeholder given to records with no GN= field.
df['gene'].value_counts()

            39069
HLA-B       10360
HLA-A        8870
HLA-C        8646
HLA-DRB1     4888
            ...  
DUX4L6          1
DUX4L7          1
GOLGA8H         1
GAGE12G         1
SYT15B          1
Name: gene, Length: 27515, dtype: int64

In [8]:
# Spot check: is the gene name 'PPAP2A' among the tree roots computed earlier?
'PPAP2A' in roots

True

In [35]:
# Save the protein table as CSV, without the row-index column.
df.to_csv('human_proteome.csv', index=False)

In [59]:
# Map each protein id to the id of the gp-flagged protein that shares its gene.
# Genes with no flagged protein contribute no entries.
gp_mapping = {}
for gene, group in df.groupby('gene'):
    # Flagged ids of this gene, in table order; computed once per group
    # instead of once per member.
    canonical_ids = group.loc[group['gp'] == 1, 'id']
    if canonical_ids.empty:
        continue
    if gene == '':
        # '' is the placeholder for records without a gene name, so this group
        # holds unrelated proteins. Mapping them all to one id would be wrong:
        # flagged ones map to themselves, the rest stay unmapped.
        for protein_id in canonical_ids:
            gp_mapping[protein_id] = str(protein_id)
        continue
    # When a gene has several flagged proteins the first one in table order wins.
    canonical_id = str(canonical_ids.iloc[0])
    # 'protein_id' rather than 'id': at notebook scope the loop variable would
    # otherwise replace the builtin id() for every later cell.
    for protein_id in group['id']:
        gp_mapping[protein_id] = canonical_id

In [62]:
import pickle

# Serialise the gp_mapping dict to disk in binary pickle format.
with open('canonical_protein_mapping.pickle', 'wb') as handle:
    pickle.dump(gp_mapping, handle)