In [19]:
import re

import pandas as pd

from Bio import SeqIO

In [20]:
from anytree import Node, RenderTree

def add_nodes(nodes, parent, child):
    """Record a parent -> child edge in the ``nodes`` registry.

    Args:
        nodes: dict mapping a name to its node object; modified in place.
        parent: name of the parent node.
        child: name of the child node.

    Returns:
        None. A ``Node`` is created for each of the two names that has no
        entry in ``nodes`` yet (parent first, then child). The child's node
        then gets the parent's node assigned as its ``parent``, overwriting
        any parent that was set earlier.
    """
    for name in (parent, child):
        if name not in nodes:
            nodes[name] = Node(name)
    child_node = nodes[child]
    child_node.parent = nodes[parent]

In [21]:
# Read every record of the full FASTA file into a list.
proteins = [*SeqIO.parse('9606_all.fasta', 'fasta')]

# For the second FASTA file keep only the second '|'-separated field of each
# record id, as a list of strings.
gp_ids = [
    str(entry.id.split('|')[1])
    for entry in SeqIO.parse('9606_gp.fasta', 'fasta')
]

In [11]:
# Build one [db, gene, id, sequence] record per FASTA entry.
# The gene is the text following "GN=" in the description: up to the next
# space when there is one, otherwise up to the end of the description.
# Entries with no "GN=" field get an empty gene string.
data = []
for protein in proteins:
    header = protein.description
    gene_match = re.search('GN=(.*?) ', header) or re.search('GN=(.*?)$', header)
    gene = gene_match.group(1) if gene_match is not None else ''
    id_fields = protein.id.split('|')
    data.append([id_fields[0], gene, id_fields[1], str(protein.seq)])

In [12]:
# Tabulate the parsed records: one row per entry, four named columns.
df = pd.DataFrame(
    data=data,
    columns=['db', 'gene', 'id', 'seq'],
)

In [17]:
# Build the gene -> protein forest: every row of `df` contributes one edge
# from its 'gene' value (parent) to its 'id' value (child).
nodes = {}
for parent, child in zip(df['gene'], df['id']):
    add_nodes(nodes, parent, child)

# Membership is tested once per rendered node, so look names up in a set
# instead of scanning the `gp_ids` list every time.
gp_id_lookup = set(gp_ids)

# Roots are the 'gene' values that never occur in the 'id' column.
# Computed before the file is opened so a failure here does not leave a
# truncated output file behind.
roots = list(df[~df['gene'].isin(df['id'])]['gene'].unique())

# RenderTree's default style draws branches with non-ASCII box characters,
# so the encoding is pinned instead of relying on the platform default.
with open('protein_tree.txt', 'w', encoding='utf-8') as f:
    for root in roots:  # one rendered tree per root; with a single tree, roots[0] alone would do
        for pre, _, node in RenderTree(nodes[root]):
            # A trailing '*' marks names that are listed in gp_ids.
            marker = '*' if node.name in gp_id_lookup else ''
            f.write("%s%s%s\n" % (pre, node.name, marker))

IFT20
HDAC6
TBX2
EPHA2
CLN3
SIRT1
PPAP2A
GSK3A
NR4A1
LTB4DH
NF2
MYH9
SOX10
BECN1
NTRK2
SYK
SMAD4
BCL2
CCR2
CTNNB1
RHOA
PRKCD
DRD2
ATM
BACE1
PARP1
BAD
TCIRG1
ADAM10
SMAD3
CORO1A
ZFP36L1
PSEN1
MMP2
CFTR
NAPE-PLD
LDLR
RAB3A
PRKACA
SLC34A1
RET
PENK
SULF1
EYA1
ENG
ABL1
TRAF2
AGPAT2
hCG_1987119
DDR2
ATP2B4
AQP1
CDC42
ALPL
KITLG
ATP2B1
SYT1
TBX3
MTM1
CDH2
EHMT2
PPARD
MAPK14
SRF
ELOVL5
HSP90AB1
KATNAL1
RB1
LIG4
GBA

DMD
DGKI
BOLA2-SMG1P6
CYP2D7
PTGS1
CDK5
EZH2
HNF1A
PTPN22
PIGBOS1
SIK1B
ZBTB32
MT-RNR1
HEL-S-172mP
CD300H
ACVRL1
ACADM
ADA
NKX2-5
BAK1
BBS4
BCL2L1
APOE
CRK
HEXA
ATP1A2
NPM1
PPP3CA
STK11
RNF6
AXIN1
TP63
HGS
TLR2
SDC3
RIPK2
AP3B1
DTNBP1
ZNF688
ZGPAT
FASN
SHANK3
NBDY
MECP2
CIROP
CCDC196
LPL
FSHR
ATAT1
APP
SPAST
PIK3R1
CTNNA1
PTEN
AGK
TGFB1
TRBC2
GLI2
HDAC4
PTGS2
FGF5A
FGF4B
GRIK2
SLC12A8
RBM47
TTC26
TSNARE1
TMEM129
E2F8
UBA6
ESYT2
ESYT3
MED19
BLTP3B
POTEB3
IGLC7
SHTN1
CYP1A1
SLC5A10
FEZF1
TMEM120B
CLRN2
ARHGAP10
IRGM
FAM170A
ANO9
SLC22A23
ODAM
FAM168B
PXDNL
ILVBL
UBE2QL1
SYCE3
PLEKHG3

HLA-DPB1
PRM1
PRM2
ERBB2
WNT1
NTRK1
CAPNS1
MT1A
MT1E
MT1F
AMY2A
HSPB1
TYMS
CYBB
RPN1
RPN2
GNAI2
H2AC8
SLC4A2
GYPC
IFNW1
IFNA6
IFNA4
IGF1
ATP1A1
ATP1B1
CHGB
ALDOB
ARG1
APOD
ALDH2
CYP17A1
ITGB3
ITGB2
CYP11A1
S100A8
INHA
IL4
IL5
HMGN1
SERPINB2
SERPINE1
PRKCG
SLC25A5
SERPINA5
SERPING1
CFI
F13B
ISG15
LGALS2
MPO
PCCA
PCCB
CYP1A2
CYP2E1
ALPP
EIF2S1
HMGN2
FGF1
IL6
EDN1
ICAM1
RPLP1
RPLP2
RPLP0
SCG5
JUN
FABP3
POLR3D
REG1A
CLEC3B
SSB
ATP5MC1
HLA-DQB2
SERPINA7
SERPIND1
TFAP2A
ITGB1
PRKCB
KRT18
KRT8
CRYBA1
CSN2
CALB1
MYL1
HPN
COL5A2
GYPB
CD1A
CD5
UROD
UGT2B4
INSR
LCK
FYN
BCHE
GLA
CCK
IGKV4-1
IGHV4-34
HLA-DOA
GSN
PGR
PTMA
CDK1
ATP5F1B
C2
S100A9
S100A6
APOA4
CD2
EIF4E
CEACAM5
CKM
ENO1
FCER2
PYGL
GPI
POLB
TPM3
ITGAV
CRH
KLK1
H2BC11
SERPINE2
LIPF
EPHX1
TH
DBI
FABP1
LDHB
NEFM
CENPB
TPO
GPX1
THBD
PGK2
PROS1
P4HB
KLK3
H1-0
ASGR1
ASGR2
CRYGC
CRYGD
ADH1A
FES
CSF1R
CTSD
ANXA2
C8A
C8B
GP1BA
C8G
CAPN1
TUBB
MT1B
CA3
IVL
PRSS1
PRSS2
GRP
CSN3
CHRNG
ADRB2
DCN
PSAP
HEXB
CTSL
PFN1
BPGM
APRT
CD3E
EPRS1
CTSB
LDHC
HSP9

PAK5
BCCIP
STK26
C5AR2
ABI3
PTGFRN
CTTNBP2NL
IBTK
CHD7
HEATR5B
DNAH1
KIF17
ZNFX1
CHPF2
MARCHF4
RRBP1
ARHGAP20
ZNF319
ANKIB1
KLHL14
MAP10
KLHL8
CEP126
IFT80
CPSF2
IGSF9
KLHL9
LARS1
CC2D2A
TXNDC16
RCOR3
MYEF2
KLHL42
EIF2AK4
DISP3
WDR35
LRP2BP
TBC1D14
CGN
ADAMTS9
KANSL3
KLHL13
HECW2
STARD9
FRMD4A
ANKFY1
RERE
SUCLA2
WRAP73
GMPR2
SLC17A7
SLC17A6
PSMC3IP
GNG13
B3GAT1
STX18
DPM3
IMPACT
ZNF219
UVRAG
ATXN10
MBD2
NCDN
TFIP11
ONECUT1
NFKBIL1
EPS15L1
DNMT3B
MYO1A
GALP
SPRR3
HSFX2
XCL2
ORC3
RHCG
CLCF1
SAE1
NLK
MAGEC2
COPG2
RNF7
PI4KB
MYOT
MRC2
CRNN
IL36RN
XPR1
COMMD3
STOML1
GNG12
ABCD2
PPARGC1A
HCST
MTRR
UXT
ASH2L
CPNE7
P2RX2
OPTC
DHCR7
MGAT4C
CACNG4
TRPC4
TNFRSF10D
DKK3
HEY2
METTL1
GULP1
VPS29
EIF3K
EXTL2
GRHPR
UPB1
CTSZ
LHX3
CKLF
RPS6KB2
DNAJB9
DNAJB11
GABBR1
RNF14
SUCO
UBA2
DKK4
POLK
CTNNAL1
GHRL
DBF4
MORF4L1
NXF1
SEL1L
WNT16
B4GALT7
PEF1
BIN2
ZMYM2
COPS7A
HESX1
CTSF
DUX4
SLC25A10
FBLN5
KLK11
B4GALT6
SLC9A2
LPAR3
CLN8
HSPB7
APEX2
REV1
ZNF69
PPP1R1B
AASS
CLIP2
ZNF212
ZNF282
UQCR10
ZMAT5
SEC14L4
T

EKLF
ADIH
ADIF2
ADIQ
ADIR
ADIO
ADID
ADIN
ADIF1
ADIP
ADIM
DQB2
LOC112267897
LOC647264
C1orf185
LCE7A
MSANTD5
KIF1BP
DIVA
GAEC1
DRB3
HLA-DQ3
hba2
TMEM173
SCPPPQ1
RhD
HLA-DRB1*11
HLA-C*04
C11orf98
BLACAT1
LOC114841035
PWWP4
EPPIN-WFDC6
FAM246B
FAM246A
LOC112577516
LOC728392
TAX1BP2
RPS23RG1
KMT2A/MLLT6
SEPT12
20
keratin
TGF-beta1
PFP
IL1beta
IL-12RB2
NF-kB
cJUN
Bcl6
p38
TNF-alpha
Eomes
BCR-ABL
KIP75
TCRBV13S2
TCRBV6S5A2
TCRBV13S5
TCRBV22S1A2N1T
TCRBV12S2A1T
HLA-A-6601
HLA-A30JS
HLA-A30
HLA-A24
HLA-A33
HLA-A68
HLA-A*7402
B-4901
HLA_B
HLA-B*50IM
HLA-B*8201
HLA-B*45ZJ
HLA-Cw*0702
HLA-Cw*1204
HLA-Cw*1604
TCRBV13S9/13S2A1T
LOC122526780
LOC122319696
SMIM45
TCRBV6S5A1N1
TCRBV6S1A1N1
TCRBV6S8A2T
TCRBV5S6A3N2T
TCRBV13S6A2T
TCRBV6S3A1N1T
TCRBV6S2A1N1T
TCRBV6S4A1
TCRBV23S1A2T
TCRBV8S2A1T
TCRBV8S3
TCRBV24S1A3T
TCRBV18S1
RHCe
PAI-1
SMIM42
LOC112267881
CFAP298-TCP10L
TRGJ2
DRB1*0101V1
HLA-DRB1*14JW
HLA-DRB1*07ROS
C2orf66
SUMO6
CXNT
GSDMB-1
HLA-DRB4*
Glycam1
C8orf44-SGK3
HLA-DPB1*
HLA-DRB5*
HLA-DRB3*
FA