In [19]:
import re

import pandas as pd

from Bio import SeqIO

In [20]:
from anytree import Node, RenderTree

def add_nodes(nodes, parent, child):
    """Attach ``child`` beneath ``parent`` in the ``nodes`` registry.

    Args:
        nodes: dict mapping a name to its tree node; mutated in place.
        parent: name of the parent node.
        child: name of the child node.

    A ``Node`` is created and stored for either name that is not yet a key
    of ``nodes``. The child's node then has its ``parent`` attribute set to
    the parent's node. Nothing is returned.
    """
    # Parent is registered before child, matching lookup order below.
    for label in (parent, child):
        if label not in nodes:
            nodes[label] = Node(label)
    parent_node = nodes[parent]
    nodes[child].parent = parent_node

In [21]:
# Read every record of '9606_all.fasta' into memory as a list.
proteins = [record for record in SeqIO.parse('9606_all.fasta', 'fasta')]

# Collect the second '|'-separated field of each record id in '9606_gp.fasta'.
# NOTE(review): presumably these are the accessions of the "gp" subset that
# later cells flag -- confirm what "gp" stands for.
gp_ids = [
    str(record.id.split('|')[1])
    for record in SeqIO.parse('9606_gp.fasta', 'fasta')
]

In [11]:
# Build one row per protein record: [db, gene, id, gp, seq].
#   db   - first '|'-separated field of the record id
#   gene - text following 'GN=' in the description, up to the next space or
#          the end of the description; '' when there is no 'GN=' tag
#   id   - second '|'-separated field of the record id
#   gp   - 1 if that id is listed in gp_ids, otherwise 0
#   seq  - the record's sequence as a plain string
# NOTE(review): presumably ids follow the UniProt 'db|accession|name' layout;
# a record id without a '|' raises IndexError here.

# One pattern covers both "GN=XYZ PE=1" and a trailing "GN=XYZ".
gene_pattern = re.compile(r'GN=([^ ]*)')

# Set copy of gp_ids so the membership test inside the loop is O(1) instead
# of a linear scan of the list for every protein.
gp_id_set = set(gp_ids)

data = []
for protein in proteins:
    id_fields = protein.id.split('|')
    uniprot_id = id_fields[1]
    gene_match = gene_pattern.search(protein.description)
    gene = gene_match.group(1) if gene_match else ''
    gp = 1 if uniprot_id in gp_id_set else 0
    data.append([id_fields[0], gene, uniprot_id, gp, str(protein.seq)])

In [12]:
# One column name per field of each row in `data` (five fields per row).
# A missing comma previously fused 'gp' and 'seq' into a single 'gpseq' name,
# leaving four names for five fields.
df = pd.DataFrame(data, columns=['db', 'gene', 'id', 'gp', 'seq'])

In [22]:
# Register one gene -> protein-id edge per DataFrame row. Gene names and
# protein ids share the same name -> node dictionary.
nodes = {}
for parent, child in zip(df['gene'], df['id']):
    add_nodes(nodes, parent, child)

# Roots are the distinct gene values that never occur in the 'id' column.
# Computed before the file is opened so a failure here does not leave a
# truncated output file behind.
roots = list(df[~df['gene'].isin(df['id'])]['gene'].unique())

# Set copy of gp_ids so each per-node membership test is O(1) rather than a
# scan of the whole list.
gp_id_set = set(gp_ids)

# Write one rendered tree per root; names listed in gp_ids get a trailing '*'.
with open('protein_tree.txt', 'w') as f:
    for root in roots:
        for pre, _, node in RenderTree(nodes[root]):
            marker = '*' if node.name in gp_id_set else ''
            f.write("%s%s%s\n" % (pre, node.name, marker))