In [None]:
# (Timing: ~ 1600 s)

import tarfile, re, time

keggdir = '..' # the folder of the downloaded KEGG database

# organism info
# Read the tab-separated KEGG taxonomy table and build two lookups, both keyed
# by the second column of every data row:
#   org2kingdom -> 'E', 'B' or 'A', taken from the most recent
#                  '# Eukaryotes' / '## Bacteria' / '## Archaea' header line
#   org2species -> the fourth column with trailing whitespace removed
org2kingdom = dict()
org2species = dict()

# `with` releases the handle even when a malformed row raises.
with open(keggdir + '/kegg/genes/misc/taxonomy') as fhand:
    for line in fhand:
        if line.startswith('# Eukaryotes'):
            kingdom = 'E'
        elif line.startswith('## Bacteria'):
            kingdom = 'B'
        elif line.startswith('## Archaea'):
            kingdom = 'A'
        elif line.strip() and not line.startswith('#'):
            # Data row. Blank lines are skipped by the guard above (they
            # would otherwise fail on the column lookup); every other '#'
            # line is a header that does not change the kingdom.
            fields = line.split('\t')
            org2kingdom[fields[1]] = kingdom
            org2species[fields[1]] = fields[3].rstrip()

# Archive holding the reaction list files; it stays open after this section
# because the reaction-EC section further down reads from it and closes it.
tar = tarfile.open(keggdir + '/kegg/ligand/reaction.tar.gz', 'r:gz')

# reaction-compound relationship
rxn2cmpd = dict() # reaction to compound
# metabolites to be removed
rmMets1 = ['C00001','C00007','C00012','C00017','C00039','C00046','C00080']
rmMets2 = ['C00002','C00003','C00004','C00005','C00006','C00008','C00009',
           'C00010','C00013','C00016','C00019','C00020','C00021','C00035',
           'C00044','C00061','C00144','C00390','C00399','C01352','C01847']
# Set copies of the two lists for constant-time membership tests in the loop.
rm_set1 = frozenset(rmMets1)
rm_set2 = frozenset(rmMets2)
# Raw string: '\d' in a plain literal is an invalid escape sequence.
cmpd_pattern = re.compile(r'[A-Z]\d{5}')

with tar.extractfile('reaction/reaction.lst') as f_rc:
    for raw_line in f_rc:
        # Each line is "<reaction id>:<equation>". partition() keeps the whole
        # equation even if it contains another ':' and yields an empty
        # equation (hence no entry) for a line without any ':'.
        r_id, _, equation = raw_line.decode().partition(':')
        cmpd_tmp = cmpd_pattern.findall(equation)
        cmpd_tmp1 = [x for x in cmpd_tmp if x not in rm_set1]
        if not cmpd_tmp1:
            # nothing left after the first filter: reaction is not recorded
            continue
        cmpd_tmp2 = [x for x in cmpd_tmp1 if x not in rm_set2]
        # Prefer the doubly filtered list; fall back to the first filter
        # when the second one would remove every compound.
        rxn2cmpd[r_id] = cmpd_tmp2 if cmpd_tmp2 else cmpd_tmp1

# reaction-EC relationship
# Walk the 'reaction/reaction' member line by line. An ENTRY line starts a new
# reaction id; an ENZYME line (plus its continuation lines) contributes the
# dot-separated four-part tokens found on it to that reaction's list.
rxn2ec = dict() # reaction to ec
# Raw strings: '\d' and '\S' in plain literals are invalid escape sequences.
rxn_pattern = re.compile(r'R\d{5}')
ec_pattern = re.compile(r'\S+\.\S+\.\S+\.\S+')
flag = False # used for EC numbers in multiple lines

with tar.extractfile('reaction/reaction') as f_rec:
    for raw_line in f_rec:
        line = raw_line.decode()

        if line.startswith('ENTRY'):
            r_id = rxn_pattern.findall(line)[0]
            rxn2ec[r_id] = []
            flag = False
        elif line.startswith('ENZYME'):
            rxn2ec[r_id] += ec_pattern.findall(line)
            # A trailing space before the newline is treated as "continued
            # on the next indented line".
            flag = line.endswith(' \n')
        elif flag and line.startswith(' '):
            # for EC numbers in multiple lines
            rxn2ec[r_id] += ec_pattern.findall(line)
            flag = line.endswith(' \n')
        else:
            # Any other line ends the ENZYME field. Without this reset a
            # stale True flag would make indented lines of later fields be
            # scanned as EC numbers.
            flag = False

tar.close()


# KO archive: source of the KO-enzyme, KO-reaction and KO-gene list files
# (rebinds `tar`, which previously referred to the reaction archive)
tar = tarfile.open(name=keggdir + '/kegg/genes/ko.tar.gz', mode='r:gz')

# dict for KO-EC and KO-reaction
def generate_KOdict(file_name, archive=None):
    """Build a mapping from KO id to a list of values from a list file.

    Each non-blank line of the member is expected to hold two tab-separated
    columns. The first three characters of each column are dropped, and the
    second column is stripped of trailing whitespace before the cut.

    Args:
        file_name: name of the member inside the tar archive.
        archive: open tar archive to read from. When omitted, the
            module-level ``tar`` object is used, as before.

    Returns:
        dict mapping each KO id to the list of its values, in file order.
    """
    if archive is None:
        archive = tar  # module-level archive opened by the script
    KOdict = dict()
    # `with` closes the member even if a malformed line raises.
    with archive.extractfile(file_name) as f:
        for raw_line in f:
            line = raw_line.decode()
            if not line.strip():
                # blank lines carry no pair and would fail the column lookup
                continue
            fields = line.split('\t')
            ko = fields[0][3:]
            ko_vals = fields[1].rstrip()[3:]
            KOdict.setdefault(ko, []).append(ko_vals)
    return KOdict
ko2ec = generate_KOdict('ko/ko_enzyme.list') # KO to EC (not used in this project as there exists reaction to EC relationship)
ko2rxn = generate_KOdict('ko/ko_reaction.list') # KO to reaction


# write file for each organism based on KO-genes relationship
#
# For every gene whose KO has at least one reaction and whose organism prefix
# is a key of org2kingdom, append one tab-separated row per
# (reaction, compound) pair to 'output/<org_id>.txt':
#   gene_id, org_id, kingdom, KO, reaction, ';'-joined EC list, compound
# NOTE: files are opened in append mode, so the 'output' directory must already
# exist and running this section again adds the same rows a second time.

org_list = []
seen_orgs = set() # same content as org_list, for constant-time membership tests
time_start = time.time()

with tar.extractfile('ko/ko_genes.list') as f_kg:
    for raw_line in f_kg:
        fields = raw_line.decode().split('\t')
        if len(fields) < 2:
            # blank or malformed line: no gene column to read
            continue
        ko = fields[0][3:]
        if ko not in ko2rxn: # only include genes with reactions
            continue
        gene_id = fields[1].rstrip()
        org_id = gene_id.split(':')[0]
        if org_id not in org2kingdom:
            continue

        row_prefix = gene_id+'\t'+org_id+'\t'+org2kingdom[org_id]+'\t'+ko+'\t'
        rows = []
        for rxn in ko2rxn[ko]:
            # .get(): a reaction may be absent from rxn2ec, or from rxn2cmpd
            # (which only holds reactions that kept at least one compound);
            # such a reaction contributes no rows instead of a KeyError.
            ec_list = ';'.join(rxn2ec.get(rxn, [])) # (multiple ec numbers are grouped in one cell but could be split in future)
            for cmpd in rxn2cmpd.get(rxn, []):
                rows.append(row_prefix+rxn+'\t'+ec_list+'\t'+cmpd+'\n')

        # Open per gene line rather than keeping one handle per organism,
        # which bounds the number of simultaneously open files.
        with open('output/' + org_id + '.txt', 'a') as fout:
            fout.writelines(rows)

        if org_id not in seen_orgs:
            seen_orgs.add(org_id)
            org_list.append(org_id)
            if len(org_list)%100 == 0:
                time_end = time.time()
                print('Done for', len(org_list), '/', len(org2kingdom), 'organisms', '[time cost:', round(time_end-time_start), 's]')

tar.close()