In [74]:
import re

import pandas as pd

from Bio import SeqIO

In [75]:
# Read every record of the FASTA file and materialise them in a list.
fasta_path = 'UP000005640-with-isoforms.fasta'
proteins = list(SeqIO.parse(fasta_path, 'fasta'))

In [77]:
def _gene_name(description):
    """Return the value of the ``GN=`` field of a FASTA description line.

    The value runs from ``GN=`` up to the next space, or up to the end of the
    description when ``GN=`` is the last field. Returns ``''`` when the
    description contains no ``GN=`` field.
    """
    # A single search covers both the "followed by a space" and the
    # "last field on the line" cases, so no exception handling is needed
    # to fall back from one pattern to the other.
    match = re.search(r'GN=(.*?)(?: |$)', description)
    return match.group(1) if match else ''


# One row per record: [db, gene, id, seq].
data = []
for protein in proteins:
    gene = _gene_name(protein.description)
    # protein.id is pipe-delimited; only its first two fields are kept.
    id_fields = protein.id.split('|')
    data.append([id_fields[0], gene, id_fields[1], str(protein.seq)])

sp|O00370|LORF2_HUMAN
sp|Q9N2K0|ENH1_HUMAN
tr|A0A494C116|A0A494C116_HUMAN
tr|E7ENX8|E7ENX8_HUMAN
tr|E9PSI1|E9PSI1_HUMAN
sp|Q9N2J8|ENH3_HUMAN
tr|A0A087WUJ7|A0A087WUJ7_HUMAN
tr|A0A087WZY1|A0A087WZY1_HUMAN
tr|A0A0B4J203|A0A0B4J203_HUMAN
tr|A0A0B4J269|A0A0B4J269_HUMAN
tr|A0A0J9YYC4|A0A0J9YYC4_HUMAN
tr|A0A3B3IRV1|A0A3B3IRV1_HUMAN
tr|A0A3B3ISS9|A0A3B3ISS9_HUMAN
tr|A0A3B3ITA1|A0A3B3ITA1_HUMAN
tr|A0A494C100|A0A494C100_HUMAN
tr|A0A7P0T816|A0A7P0T816_HUMAN
tr|A0A7P0T9J3|A0A7P0T9J3_HUMAN
tr|A0A7P0T9P9|A0A7P0T9P9_HUMAN
sp|A6NIZ1|RP1BL_HUMAN
sp|C0HLV8|MP31_HUMAN
sp|C9JQL5|DSA2D_HUMAN
tr|E7EVH7|E7EVH7_HUMAN
tr|H0Y858|H0Y858_HUMAN
tr|H3BM21|H3BM21_HUMAN
tr|H3BNH8|H3BNH8_HUMAN
tr|I3L3B4|I3L3B4_HUMAN
tr|K7N7A8|K7N7A8_HUMAN
tr|M0QYV0|M0QYV0_HUMAN
tr|M0R2C6|M0R2C6_HUMAN
tr|U3KQV3|U3KQV3_HUMAN
tr|A0A087WSV2|A0A087WSV2_HUMAN
tr|A0A087WT57|A0A087WT57_HUMAN
tr|A0A087WV05|A0A087WV05_HUMAN
tr|A0A087WV58|A0A087WV58_HUMAN
tr|A0A087WVE0|A0A087WVE0_HUMAN
tr|A0A087WW49|A0A087WW49_HUMAN
tr|A0A087WZ62|A0A087WZ62_HUMA

sp|P0C879|YJ018_HUMAN
sp|P0C880|YT014_HUMAN
sp|P0DMU3|F231L_HUMAN
sp|Q0VFX4|YL016_HUMAN
sp|Q1RN00|YC018_HUMAN
sp|Q499Y3|YJ016_HUMAN
sp|Q56UQ5|TPT1L_HUMAN
sp|Q5VSD8|YI029_HUMAN
sp|Q5VV11|U633B_HUMAN
sp|Q6AWC8|YK026_HUMAN
tr|Q6JHZ5|Q6JHZ5_HUMAN
sp|Q6P435|SMG1L_HUMAN
tr|Q6XYB5|Q6XYB5_HUMAN
tr|Q6YL49|Q6YL49_HUMAN
sp|Q6ZPA2|YS039_HUMAN
tr|Q6ZPB1|Q6ZPB1_HUMAN
sp|Q6ZQT7|YJ013_HUMAN
sp|Q6ZQY7|YO026_HUMAN
sp|Q6ZR03|YU004_HUMAN
sp|Q6ZR54|YN009_HUMAN
sp|Q6ZRG5|YQ015_HUMAN
sp|Q6ZRM9|YG024_HUMAN
sp|Q6ZRN7|YP029_HUMAN
sp|Q6ZRP5|YD019_HUMAN
sp|Q6ZRU5|YQ032_HUMAN
sp|Q6ZRX8|YL004_HUMAN
sp|Q6ZS46|YF009_HUMAN
sp|Q6ZS49|YQ050_HUMAN
sp|Q6ZS52|YF013_HUMAN
sp|Q6ZS92|YD022_HUMAN
sp|Q6ZSA8|YS025_HUMAN
sp|Q6ZSN1|YI023_HUMAN
sp|Q6ZSR3|YO027_HUMAN
sp|Q6ZSR6|YP007_HUMAN
sp|Q6ZSV7|YF010_HUMAN
sp|Q6ZTC4|YT009_HUMAN
sp|Q6ZTI0|YK032_HUMAN
sp|Q6ZUG5|YC006_HUMAN
sp|Q6ZUT4|YL014_HUMAN
sp|Q6ZVH6|YK004_HUMAN
sp|Q6ZVL8|YP033_HUMAN
sp|Q6ZVQ6|YS045_HUMAN
sp|Q6ZVU0|YK022_HUMAN
sp|Q6ZWC4|YS043_HUMAN
tr|Q71RC1|Q71RC1_HUMAN
sp|Q7

In [43]:
# Label the four positional fields of each row collected in `data`.
column_names = ['db', 'gene', 'id', 'seq']
df = pd.DataFrame(data, columns=column_names)

In [63]:
# Blank out the gene wherever it is identical to the row's own id.
# NOTE(review): presumably this keeps a record from being linked to itself
# when the gene/id pairs are later used as parent/child -- confirm.
gene_equals_id = df['gene'].eq(df['id'])
df.loc[gene_equals_id, 'gene'] = ''

In [64]:
from anytree import Node, RenderTree

def add_nodes(nodes, parent, child):
    """Link ``child`` beneath ``parent`` in the ``nodes`` registry.

    ``nodes`` maps a name to its ``Node``. A ``Node`` is created for either
    name that is not registered yet, and the child's node is then attached to
    the parent's node, replacing whatever parent it had before. ``nodes`` is
    modified in place and nothing is returned.
    """
    # Register the parent first, then the child, creating nodes only on demand.
    for name in (parent, child):
        if name not in nodes:
            nodes[name] = Node(name)
    nodes[child].parent = nodes[parent]

In [69]:
# Build one node per gene and per id, attaching each id beneath its gene.
nodes = {}
for parent, child in zip(df['gene'], df['id']):
    add_nodes(nodes, parent, child)

# Roots are the distinct gene values that never occur in the 'id' column.
# Computed before the file is opened so a failure here does not truncate it.
roots = list(df[~df['gene'].isin(df['id'])]['gene'].unique())

# RenderTree's default style draws branches with non-ASCII box characters, so
# the encoding is pinned instead of relying on the platform default.
with open('tree.txt', 'w', encoding='utf-8') as f:
    # With a single tree instead of a forest, rendering roots[0] alone would do.
    for root in roots:
        # The middle item of each rendered row is unused; it is not bound to
        # `_` because IPython uses that name for the last cell output.
        for pre, _fill, node in RenderTree(nodes[root]):
            f.write("%s%s\n" % (pre, node.name))

In [71]:
# Show the records whose gene field is empty.
df.loc[df['gene'].eq('')]

Unnamed: 0,db,gene,id,seq
14992,tr,,A0A494C116,MLTFFLVSGGSLWLFVEFVLSLLEKMQTQEILRILRLPELGDLGQF...
15117,tr,,E7ENX8,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...
15145,tr,,E9PSI1,MSGLGRLFGKGKKEKGPTPEEAIQKLKETEKILIKKQEFLEQKIQQ...
17829,tr,,A0A087WUJ7,MTSSRLWFSLLLAAAFAGRATALWPWPQNFQTSDQRYVLYPNNFQF...
17862,tr,,A0A087WZY1,MRVGVSLCYPGWSSVVGSWLTAVWNSWAQTILPPQPSAAEEYFACW...
...,...,...,...,...
103531,tr,,V9GYU9,MALQGISVVELSGLAPGPFCAMVLADFGARVVRVDRPGSRYDVSRL...
103555,tr,,V9GYY9,YFGMDHNVDQTGKAVIINKTSNTRIPEQRFSEHIKDEKNTEFQQRF...
103586,tr,,V9GZ38,GRYVPPSSTDRSPYEKSGSRRTRYEESLRTRAVAEDAPSTHYFLPV...
103591,tr,,V9GZ45,MALQGISVVELSGLAPGPFCAMVLADFGARVVRVDRPGSRYDVSRL...


In [72]:
# Display the assembled table (one row per parsed record).
df

Unnamed: 0,db,gene,id,seq
0,tr,DMD,A0A075B6G3,MLWWEEVEDCYEREDVQKKTFTKWVNAQFSKFGKQHIENLFSDLQD...
1,tr,DGKI,A0A087WV00,MDAAGRGCHLLPLPAARGPARAPAAAAAAAASPPGPCSGAACAPSA...
2,tr,BOLA2-SMG1P6,A0A087WZT3,MELSAEYLREKLQRDLEAEHVLPSPGGVGQVRGETAASETQLGS
3,sp,CYP2D7,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...
4,tr,PTGS1,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...
...,...,...,...,...
103825,tr,CACNA2D4,X6RLU5,XELVREVLFDAVVTAPMEAYWTALALNMSEESEHVVDMAFLGTRAG...
103826,tr,DDX5,X6RLV5,MSGYSSDRDRGRDRGFGAPRFGGSRAGPLSGKKFGNPGEKLVKKKW...
103827,tr,CACNA2D4,X6RLY7,MKLEFLQRKFWAATRQCSTVDGPCTQSCEDSDLDCFVIDNNGFILI...
103828,tr,ERC1,X6RM00,MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...
