In [24]:
import re

import pandas as pd

from Bio import SeqIO

In [25]:
from anytree import Node, RenderTree

def add_nodes(nodes, parent, child):
    """Register ``parent`` and ``child`` in ``nodes`` and link them.

    ``nodes`` is a dict mapping a name to its ``Node``.  A ``Node`` is created
    for ``parent`` and then for ``child`` if either name is not yet a key.
    Afterwards the child's node has its ``parent`` attribute set to the
    parent's node, replacing any parent it had before.

    The dict is mutated in place; nothing is returned.
    """
    # Parent first, then child, so insertion order into `nodes` is stable.
    for name in (parent, child):
        if name not in nodes:
            nodes[name] = Node(name)
    child_node = nodes[child]
    child_node.parent = nodes[parent]

In [26]:
# Read every FASTA record of '9606_all.fasta' into a list so it can be
# iterated more than once.
proteins = [record for record in SeqIO.parse('9606_all.fasta', 'fasta')]

# For each record in '9606_gp.fasta', keep the second '|'-separated field of
# its id as a string.
gp_ids = [
    str(entry.id.split('|')[1])
    for entry in SeqIO.parse('9606_gp.fasta', 'fasta')
]

In [27]:
# Build one row per protein record: [db, gene, id, gp, seq].
#   db   - first '|'-separated field of the record id
#   gene - value following 'GN=' in the description, '' when absent
#   id   - second '|'-separated field of the record id
#   gp   - 1 if that id is listed in gp_ids, else 0
#   seq  - the sequence as a plain string

# Membership is tested once per protein; a set makes each test O(1) instead
# of scanning the whole gp_ids list every time.
gp_id_set = set(gp_ids)

data = []
for protein in proteins:
    id_fields = protein.id.split('|')
    uniprot_id = id_fields[1]

    description = protein.description
    # Prefer a gene name terminated by a space; fall back to one that runs to
    # the end of the description (GN= is the last field).
    gene_match = (re.search('GN=(.*?) ', description)
                  or re.search('GN=(.*?)$', description))
    gene = gene_match.group(1) if gene_match else ''

    gp = 1 if uniprot_id in gp_id_set else 0
    data.append([id_fields[0], gene, uniprot_id, gp, str(protein.seq)])

In [31]:
# Turn the accumulated rows into a table; the column order matches the order
# of the values inside each row of `data`.
df = pd.DataFrame(
    data=data,
    columns=['db', 'gene', 'id', 'gp', 'seq'],
)

In [22]:
# Build a forest in which every gene name is a parent node and every protein
# id is attached beneath the gene it was listed with.
nodes = {}
for parent, child in zip(df['gene'], df['id']):
    add_nodes(nodes, parent, child)

# Roots are the gene names that never occur as a protein id.
roots = list(df[~df['gene'].isin(df['id'])]['gene'].unique())

# Every rendered node is tested for membership; use a set so the test does
# not rescan the whole gp_ids list for each node.
gp_id_set = set(gp_ids)

# RenderTree's default prefixes are non-ASCII box-drawing characters, so the
# encoding is fixed to UTF-8 rather than left to the platform default (which
# fails with UnicodeEncodeError on e.g. cp1252 locales).
with open('protein_tree.txt', 'w', encoding='utf-8') as f:
    for root in roots:
        for pre, _fill, node in RenderTree(nodes[root]):
            # Names listed in gp_ids are flagged with a trailing '*'.
            if node.name in gp_id_set:
                line = "%s%s*" % (pre, node.name)
            else:
                line = "%s%s" % (pre, node.name)
            f.write(line)
            f.write('\n')

In [35]:
# Save the table as CSV; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='human_proteome.csv', index=False)

In [36]:
# Bare expression: as the last statement of the cell it renders the table
# (columns db, gene, id, gp, seq) as the cell output.
df

Unnamed: 0,db,gene,id,gp,seq
0,tr,IFT20,A0A024QZ08,0,MAKDILGEAGLHFDELNKLRVLDPEVTQQTIELKEECKDFVDKIGQ...
1,tr,HDAC6,A0A024QZ26,0,MTSTGQDSTTTRQRRSRQNPQSPPQDSSVTSKRNIKKGAVPRSIPN...
2,tr,TBX2,A0A024QZ86,0,MREPALAASAMAYHPFHAPRPADFPMSAFLAAAQPSFFPALALPPG...
3,tr,EPHA2,A0A024QZA8,0,MELQAARACFALLWGCALAAAAAAQGKEVVLLDFAAAGGELGWLTH...
4,tr,CLN3,A0A024QZB8,0,MGGCAGSRRRFSDSEGEETVPEPRLPLLDHQGAHWKNAVGFWLLGL...
...,...,...,...,...,...
229381,tr,CACNA2D4,X6RLU5,0,XELVREVLFDAVVTAPMEAYWTALALNMSEESEHVVDMAFLGTRAG...
229382,tr,DDX5,X6RLV5,0,MSGYSSDRDRGRDRGFGAPRFGGSRAGPLSGKKFGNPGEKLVKKKW...
229383,tr,CACNA2D4,X6RLY7,0,MKLEFLQRKFWAATRQCSTVDGPCTQSCEDSDLDCFVIDNNGFILI...
229384,tr,ERC1,X6RM00,0,MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...


In [59]:
# Map each protein id to the id of the first gp-flagged protein (gp == 1)
# that shares its gene name.  Genes without any gp-flagged protein contribute
# no entries.
gp_mapping = {}
for gene, group in df.groupby('gene'):
    # Computed once per gene rather than once per protein in the group.
    gp_members = group.loc[group['gp'] == 1, 'id']
    if gp_members.empty:
        continue

    if gene == '':
        # '' is the placeholder used when no gene name was found, so these
        # proteins are unrelated to one another.  Do not collapse them onto
        # a single arbitrary representative: each gp-flagged one maps to
        # itself and the rest stay unmapped.
        for protein_id in gp_members:
            gp_mapping[protein_id] = str(protein_id)
        continue

    representative = str(gp_members.iloc[0])
    for protein_id in group['id']:
        gp_mapping[protein_id] = representative

In [62]:
import pickle
# Serialise the id -> gp id mapping to disk in binary pickle format.
with open('gp_mapping.pickle', 'wb') as pickle_file:
    pickler = pickle.Pickler(pickle_file)
    pickler.dump(gp_mapping)