# KEGG Data

## Table of Contents <a class="anchor" id="toc"></a>
#### Data
* [Globals](#globals)
* [KEGG API](#api)
* [Reactions](#reactions)
* [Enzymes](#enzymes)
* [Compounds](#compounds)
* [Data Files](#files)

## <a class="anchor" id="globals"></a>Globals [$\Uparrow$](#toc)
#### Directories Used in the Workflow

In [1]:
import yaml
# Load the workflow configuration; the path is relative to the current working directory.
with open("config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)

from os import makedirs
DATASET = 'KEGG'
# Output directory for this dataset, read from the 'datadir' section of the configuration.
DATA = config['datadir'][DATASET]
# Create the directory (and missing parents); an existing directory is not an error.
makedirs(DATA, exist_ok=True)

## <a class="anchor" id="api"></a>KEGG API [$\Uparrow$](#toc)
KEGG provides access to data through a REST interface at _http://rest.kegg.jp_. It returns database entries in text format, which can be parsed with the function `kegg_lookup` defined below. The parser is not complete, and some properties must be skipped to keep it from raising an error. However, for the purpose of collecting data for _enviLink_ it is sufficient.

In [2]:
def kegg_lookup(client, identifier, skipped=None):
    """Fetch a KEGG entry and yield its content as flat property records.

    The text returned by ``http://rest.kegg.jp/get/{identifier}`` is split on
    the ``///`` entry terminator and parsed line by line. A line starting with
    a keyword (indented by at most two spaces) opens a property; a keyword
    indented by one or two spaces is treated as a sub-property of the last
    non-indented keyword. Lines that do not start with a keyword are
    continuation lines and are attributed to the current property.

    This is a generator: the HTTP request is issued when iteration starts.

    Parameters
    ----------
    client
        Object with a ``get(url)`` method whose result exposes the response
        body as bytes in ``.content`` (e.g. a ``requests`` session).
    identifier : str
        KEGG identifier; also used as the ``subject`` of every record.
    skipped : collection of str, optional
        Property names to leave out. Continuation lines are dropped when
        either their property or its super-property is listed.

    Yields
    ------
    dict
        ``{'subject', 'property', 'super', 'object'}``; ``super`` is ``None``
        for top-level properties.

    Raises
    ------
    Exception
        ``('strange line', identifier, line)`` when a continuation line does
        not start at the column where the value of the last emitted property
        began.
    """
    import re

    # Raw strings keep the regex escapes (\S, \s) away from Python's own
    # string-escape handling, which warns about them in recent versions.
    keyed_line = re.compile(r'( {0,2})(\S+)(\s+)(\S.*)?$')
    bare_key_line = re.compile(r'( {0,2})(\S+)$')

    page = client.get(f'http://rest.kegg.jp/get/{identifier}').content.decode()

    def mapentry(entry):
        prop = None
        superprop = None
        indentation = None
        for line in entry.strip().split('\n'):
            m = keyed_line.match(line) or bare_key_line.match(line)
            if m:
                if m.group(1):
                    # Indented keyword: remember the enclosing top-level
                    # property once, keep it for subsequent siblings.
                    if superprop is None:
                        superprop = prop
                else:
                    superprop = None
                prop = m.group(2)

                # Only the first pattern has a value group; a bare keyword
                # carries no object.
                obj = m.group(4) if m.re.groups >= 4 else None

                if obj and (not skipped or prop not in skipped):
                    # Column at which the value starts; continuation lines
                    # must be indented by exactly this many spaces.
                    indentation = sum(len(m.group(i)) for i in (1, 2, 3))
                    yield {'subject':identifier, 'property':prop, 'super':superprop, 'object':obj}

            elif not skipped or (prop not in skipped and superprop not in skipped):
                m = re.match(rf' {{{indentation}}}(\S.*)$', line)
                if m:
                    yield {'subject':identifier, 'property':prop, 'super':superprop, 'object':m.group(1)}
                else:
                    raise Exception('strange line', identifier, line)

    for entry in page.split('///'):
        if entry.strip():
            yield from mapentry(entry)

#### Example

In [3]:
from requests.sessions import Session
# One HTTP session, kept as a notebook-level global and reused by the lookup helpers further down.
client = Session()
reaction = 'R00004'
# Show the raw text of one entry as returned by the REST interface.
print(client.get(f'http://rest.kegg.jp/get/{reaction}').content.decode())

ENTRY       R00004                      Reaction
NAME        diphosphate phosphohydrolase;
            pyrophosphate phosphohydrolase
DEFINITION  Diphosphate + H2O <=> 2 Orthophosphate
EQUATION    C00013 + C00001 <=> 2 C00009
ENZYME      3.6.1.1
DBLINKS     RHEA: 24579
///



In [4]:
from pandas import DataFrame
# The same entry after parsing: one row per property line.
DataFrame(list(kegg_lookup(client, reaction)))

Unnamed: 0,object,property,subject,super
0,R00004 Reaction,ENTRY,R00004,
1,diphosphate phosphohydrolase;,NAME,R00004,
2,pyrophosphate phosphohydrolase,NAME,R00004,
3,Diphosphate + H2O <=> 2 Orthophosphate,DEFINITION,R00004,
4,C00013 + C00001 <=> 2 C00009,EQUATION,R00004,
5,3.6.1.1,ENZYME,R00004,
6,RHEA: 24579,DBLINKS,R00004,


## <a class="anchor" id="reactions"></a>Reactions [$\Uparrow$](#toc)

In [5]:
from pandas import read_csv
# Read the reaction list directly from the REST interface: tab-separated, no header row.
kegg_reactions = read_csv('http://rest.kegg.jp/list/reaction', header=None, sep="\t", names=['identifier', 'description'])

In [6]:
# Derive the bare reaction id by removing a leading 'rn:' database prefix.
# Anchored prefix removal leaves identifiers that arrive without the prefix
# intact, whereas a fixed three-character slice would truncate them.
kegg_reactions['rid'] = kegg_reactions.identifier.str.replace(r'^rn:', '', regex=True)
kegg_reactions.tail()

Unnamed: 0,identifier,description,rid
11470,rn:R12649,"3-[(R)-glyceryl]-diphospho-5'-guanosine:7,8-di...",R12649
11471,rn:R12650,"L-isoleucine, 2-oxoglutarate:oxygen oxidoreduc...",R12650
11472,rn:R12651,"31-hydroxy-L-isoleucine, 2-oxoglutarate:oxygen...",R12651
11473,rn:R12652,"abieta-8,11,13-triene,[reduced NADPH---hemopro...",R12652
11474,rn:R12653,(+)-pinoresinol:azurin oxidoreductase; (+)-Pin...,R12653


In [7]:
# Spot check: parse the entry of the last reaction shown in the listing above.
DataFrame(kegg_lookup(client, 'R12653', None))

Unnamed: 0,object,property,subject,super
0,R12653 Reaction,ENTRY,R12653,
1,(+)-pinoresinol:azurin oxidoreductase,NAME,R12653,
2,(+)-Pinoresinol + 2 Oxidized azurin + H2O <=> ...,DEFINITION,R12653,
3,C05366 + 2 C05357 + C00001 <=> C22302 + 2 C053...,EQUATION,R12653,
4,1.17.9.2,ENZYME,R12653,


In [8]:
from tqdm import tqdm
# Registers the progress_apply method on pandas objects.
tqdm.pandas()

def extend_entries(lst, entry, skipped=None):
    """Append all records that kegg_lookup yields for ``entry`` to ``lst`` (in place).

    Uses the notebook-level ``client`` session; ``skipped`` is passed through to kegg_lookup.
    """
    lst.extend(kegg_lookup(client, entry, skipped))

reaction_entries = []
# One HTTP request per listed reaction; apply is used only for its side effect on reaction_entries.
kegg_reactions.progress_apply(lambda row: extend_entries(reaction_entries, row.rid), axis=1)
krdf = DataFrame(reaction_entries)

100%|██████████| 11475/11475 [38:46<00:00,  4.93it/s]


In [9]:
# Persist the downloaded records as a tab-separated file without the index column.
krdf.to_csv(f'{DATA}/kegg_reaction_entries.tsv', index=None, sep="\t")

In [10]:
# Reload the records from disk; from here on krdf is the file-backed copy.
krdf = read_csv(f'{DATA}/kegg_reaction_entries.tsv', sep="\t")
# Example of an ENZYME record listing more than one EC number.
krdf[(krdf.property == 'ENZYME') & (krdf.subject == 'R00714')]

Unnamed: 0,object,property,subject,super
6956,1.2.1.16 1.2.1.79,ENZYME,R00714,


In [11]:
# All ENZYME records whose value contains a space, i.e. several space-separated EC numbers.
krdf[(krdf.property == 'ENZYME') & (krdf['object'].str.contains(' '))]

Unnamed: 0,object,property,subject,super
67,1.11.1.6 1.11.1.21,ENZYME,R00009,
91,1.11.1.13 1.11.1.16,ENZYME,R00011,
121,1.2.4.1 2.2.1.6 4.1.1.1,ENZYME,R00014,
166,1.12.7.2 1.12.99.-,ENZYME,R00019,
285,1.10.3.1 1.14.18.1,ENZYME,R00031,
384,1.10.3.1 1.14.18.1,ENZYME,R00045,
468,1.10.3.1 1.14.18.1,ENZYME,R00058,
544,1.14.13.- 1.17.3.2,ENZYME,R00069,
655,1.1.3.14 1.10.3.1,ENZYME,R00080,
662,1.7.2.1 7.1.1.9,ENZYME,R00081,


## <a class="anchor" id="enzymes"></a>Enzymes [$\Uparrow$](#toc)

In [12]:
def extend_enzymes(ers, reaction, enzymelist):
    """Append one ``[reaction, enzyme]`` pair to ``ers`` for every whitespace-separated item of ``enzymelist``.

    ``ers`` is modified in place; nothing is returned.
    """
    ers.extend([reaction, ec_number] for ec_number in enzymelist.split())

ers = []
# Expand every ENZYME record into one (reaction, EC number) row; apply is used only for its side effect on ers.
krdf[krdf.property == 'ENZYME'].apply(
    lambda row: extend_enzymes(ers, row.subject, row.object),
axis=1)
reaction_enzymes = DataFrame(ers, columns=['reaction', 'enzyme'])

# (number of distinct EC numbers, number of reaction-enzyme links)
reaction_enzymes.enzyme.drop_duplicates().shape[0], reaction_enzymes.enzyme.shape[0]

(5733, 11359)

In [13]:
def enzyme_name(ecn):
    """Look up the first NAME line of the KEGG entry for EC number ``ecn``.

    Returns a dict ``{'enzyme': ecn, 'ename': name}`` with surrounding ';'
    stripped from the name. Any exception raised during lookup or extraction
    is reported on stderr and results in ``'ename': None`` (best effort).
    Uses the notebook-level ``client`` session.
    """
    from sys import stderr
    try:
        entry = DataFrame(list(kegg_lookup(client, ecn)))
        name_rows = entry[entry.property=='NAME']
        first_name = name_rows.object.values[0]
        result = {'enzyme':ecn, 'ename':first_name.strip(';')}
    except Exception as error:
        print(f"{error} for {ecn}", file=stderr)
        result = {'enzyme':ecn, 'ename':None}
    return result
# One lookup per distinct EC number; entries ending in '.-' are not looked up.
enzyme_names = DataFrame([enzyme_name(ecn) for ecn in reaction_enzymes.enzyme.drop_duplicates() if not ecn.endswith('.-')])
enzyme_names.tail()

Unnamed: 0,ename,enzyme
5604,3-phospho-D-glycerate guanylyltransferase,2.7.7.106
5605,L-isoleucine 31-dioxygenase,1.14.11.74
5606,31-hydroxy-L-isoleucine 4-dioxygenase,1.14.11.75
5607,ferruginol synthase,1.14.14.175
5608,(+)-pinoresinol hydroxylase,1.17.9.2


In [14]:
# Left merge keeps links whose EC number has no looked-up name (ename is then missing).
# NOTE: this overwrites reaction_enzymes, so the cell is not idempotent when re-run on its own.
reaction_enzymes = reaction_enzymes.merge(enzyme_names, on='enzyme', how='left')
reaction_enzymes.head()

Unnamed: 0,reaction,enzyme,ename
0,R00001,3.6.1.10,endopolyphosphatase
1,R00002,1.18.6.1,nitrogenase
2,R00004,3.6.1.1,inorganic diphosphatase
3,R00005,3.5.1.54,allophanate hydrolase
4,R00006,2.2.1.6,acetolactate synthase


#### Filtering out of reactions with ambiguous enzyme annotation
Reactions to which multiple enzymes are annotated at the 3<sup>rd</sup> level were removed from the reaction list because in these cases the relation between enzymes and reactions remains ambiguous, e.g. [R02260](https://www.genome.jp/dbget-bin/www_bget?rn:R02260).

In [15]:
# First three dot-separated fields of each EC number, e.g. '3.6.1.1' -> '3.6.1'.
reaction_enzymes['3rd_level'] = ['.'.join(enzyme.split('.')[:3]) for enzyme in reaction_enzymes.enzyme]
# Number of distinct 3rd-level classes annotated per reaction.
enzyme_count = reaction_enzymes.loc[:,['reaction', '3rd_level']].groupby('reaction').agg({'3rd_level': 'nunique'})
# Reactions whose annotated enzymes all share one 3rd-level class.
unambiguous_enzyme_reactions = enzyme_count[enzyme_count['3rd_level'] == 1].index

In [16]:
# Summary counts of the enzyme annotation, displayed as the cell output.
infos = {
    'number of annotated enzymes':reaction_enzymes.enzyme.drop_duplicates().shape[0],
    'number of annotated reactions': reaction_enzymes.reaction.drop_duplicates().shape[0],
    'number of unambiguously annotated reactions': unambiguous_enzyme_reactions.shape[0],
}
infos

{'number of annotated enzymes': 5733,
 'number of annotated reactions': 10139,
 'number of unambiguously annotated reactions': 9959}

## <a class="anchor" id="compounds"></a>Compounds [$\Uparrow$](#toc)
The list of compounds is collected by parsing reaction equations of those reactions that are related to an enzyme.

In [17]:
import re
def rlayout(rid, unwell=None):
    """Yield one record per compound and reaction side for reaction ``rid``.

    The EQUATION and RCLASS records of ``rid`` are read from the
    notebook-level frame ``krdf``. This is a generator, so the lookups,
    assertions and the 'not well defined' report only run on iteration.

    Parameters
    ----------
    rid : str
        Reaction identifier, matched against ``krdf.subject``.
    unwell : list, optional
        If given, ``rid`` is appended when a compound occurs on both sides
        of the equation; otherwise a message is printed instead.

    Yields
    ------
    dict
        Keys: 'reaction', 'compound', 'stoichn' (coefficient as written, a
        string), 'stoichn_corr' (coefficient after netting out compounds that
        occur on both sides, where a correction could be derived),
        'isproduct', 'rclass' (compound occurs in an RCLASS pair) and
        'auto_transient' (compound occurs on both sides).

    Raises
    ------
    AssertionError
        If ``rid`` does not have exactly one EQUATION record, or a term of
        the equation matches none of the supported forms.
    """
    equations = krdf[(krdf.subject == rid) & (krdf.property == 'EQUATION')].object.values
    assert len(equations) == 1
    
    # remove '(side x)' remarks from equation
    equation = re.sub(r'\(side \d+\)', '', equations[0])
    
    def stoich(p):
        # Parse one term of the equation into (compound id, coefficient string).
        nc = p.split()
        if len(nc) == 1:
            # Bare id: implicit coefficient 1.
            if re.match(r'[CG]\d{5}$', nc[0]):
                return (nc[0], '1')
            # Id followed by a parenthesised suffix: the suffix is taken as the coefficient.
            m = re.match(r'([CG]\d{5})\((.+)\)$', nc[0])
            if m:
                return (m.group(1), m.group(2))
        if len(nc) == 2:
            # Leading coefficient followed by the id.
            if re.match(r'[CG]\d{5}$', nc[1]):
                return (nc[1], nc[0])
            # Leading coefficient, id carries an '(in)'/'(out)' suffix that is dropped.
            m = re.match(r'([CG]\d{5})\((in|out)\)$', nc[1])
            if m:
                return (m.group(1), nc[0])
        assert False, rid
    # Both sides become {compound: coefficient}; a compound repeated on one side keeps its last coefficient.
    substrates, products = [dict([stoich(p) for p in side.split(' + ')]) for side in equation.split(' <=> ')]
    # Compounds that appear as substrate and as product.
    autotransients = set(substrates.keys()).intersection(set(products.keys()))
    # compound -> {isproduct flag: corrected coefficient}
    stoichn_corr = dict()
    if len(autotransients):
        if unwell is None:
            print(f'not well defined: {rid}')
        else:
            unwell.append(rid)
        for at in autotransients:
            try:
                # Numeric coefficients: subtract the smaller one from both sides.
                sn = int(substrates[at])
                pn = int(products[at])
                stoichn_corr[at] = {False: str(sn - min(sn,pn)), True: str(pn - min(sn,pn))}
            except ValueError:
                # Symbolic coefficients: only the forms 'x' vs. 'x+...' and 'x' vs. 'x-...'
                # (x being n or m) are corrected; anything else is left uncorrected.
                for nm in ['n', 'm']:
                    if substrates[at] == nm and nm in products[at].split('+'):
                        stoichn_corr[at] = {False: '0', True: '+'.join([p for p in products[at].split('+') if p != nm])}
                    elif products[at] == nm and nm in substrates[at].split('+'):
                        stoichn_corr[at] = {True: '0', False: '+'.join([p for p in substrates[at].split('+') if p != nm])}
                    elif substrates[at] == nm and products[at].startswith(nm+'-'):
                        stoichn_corr[at] = {True: '0', False: products[at].split('-')[1]}
                    elif products[at] == nm and substrates[at].startswith(nm+'-'):
                        stoichn_corr[at] = {False: '0', True: substrates[at].split('-')[1]}
    
    rclasses = krdf[(krdf.subject == rid) & (krdf.property == 'RCLASS')].object.values
    essentials = set()
    # Second whitespace-separated token of each RCLASS record, split on '_' into compound ids.
    # NOTE(review): only that one token is read per record; if a record can carry
    # further tokens they are ignored here - confirm against the data.
    for e in [r.split()[1].split('_') for r in rclasses]:
        essentials = essentials.union(e)
    for isproduct, nc in zip([False, True], [substrates, products]):
        for c, n in nc.items():
            yield {
                'reaction': rid,
                'compound': c,
                'stoichn': n,
                # Falls back to the written coefficient when no correction was derived.
                'stoichn_corr': stoichn_corr.get(c,dict()).get(isproduct,n),
                'isproduct': isproduct,
                'rclass': c in essentials,
                'auto_transient': c in autotransients
            }
# Example with symbolic coefficients on both sides.
DataFrame(rlayout('R11747'))

not well defined: R11747


Unnamed: 0,auto_transient,compound,isproduct,rclass,reaction,stoichn,stoichn_corr
0,True,C00760,False,False,R11747,n+m,n
1,False,C00030,False,False,R11747,1,1
2,False,C00007,False,False,R11747,1,1
3,False,C21628,True,False,R11747,1,1
4,True,C00760,True,False,R11747,m,0
5,False,C00028,True,False,R11747,1,1
6,False,C00001,True,False,R11747,1,1


In [18]:
from tqdm import tqdm_notebook

def collect_rlayout(reactions, unwell):
    """Yield the rlayout records of every reaction in ``reactions``, with a progress bar.

    ``unwell`` is handed to rlayout, which appends the ids of reactions that
    have a compound on both sides of the equation.
    """
    for rid in tqdm_notebook(reactions):
        yield from rlayout(rid, unwell)
unwell = []
# Compound roles for every reaction with an unambiguous enzyme annotation.
rpdf = DataFrame(collect_rlayout(reaction_enzymes[
    reaction_enzymes.reaction.isin(unambiguous_enzyme_reactions)
].reaction.drop_duplicates(), unwell))
rpdf.to_csv(f'{DATA}/kegg-rrole.tsv', sep="\t", index=None)
# (shape of the role table, number of reactions with a compound on both sides)
rpdf.shape, len(unwell)

HBox(children=(IntProgress(value=0, max=9959), HTML(value='')))




((44814, 7), 177)

#### Filtering out of reactions with rclass stoichiometric numbers greater than 1
The matching algorithm does not consider the stoichiometry of reactions. In pre-experiments, we realized that this might lead to fortuitous matches. We therefore restricted the reactions analyzed with the matching algorithm to those that have no stoichiometric numbers different from 1, except for cases where the compounds with stoichiometric numbers different from 1 were not part of KEGG's reaction class annotation for that reaction.

In [19]:
# Reactions in which a compound flagged as part of an RCLASS pair has a corrected coefficient other than '1'.
stoichiometric_problematic_reactions = rpdf[(rpdf.stoichn_corr != '1') & (rpdf.rclass)].reaction.drop_duplicates()
stoichiometric_problematic_reactions.shape

(504,)

In [20]:
# Drop all rows of those reactions; shapes after and before filtering are shown for comparison.
rpdf_r = rpdf[~rpdf.reaction.isin(stoichiometric_problematic_reactions)]
rpdf_r.shape, rpdf.shape

((42299, 7), (44814, 7))

In [21]:
# From here on rpdf refers to the filtered table.
rpdf = rpdf_r

#### SMILES from the _Indigo_ Toolkit
Although the compound entries from the REST interface provide sufficient information about the molecular structure of compounds, the information is not written in a readily readable format. Therefore the download feature from KEGG's main page was used to retrieve molecule data as _Mol_ files, which were converted to SMILES strings by use of _EPAM_'s _Indigo_ toolkit. Note that some of the compounds do not have a defined molecular structure and that for some of the compounds conversion to a SMILES string with the _Indigo_ toolkit failed.

In [22]:
from indigo import Indigo
def getSmilesFromCid(cid, idg):
    """Download the Mol file of compound ``cid`` and return its SMILES string.

    The file is fetched with the notebook-level ``client`` session and parsed
    with the Indigo instance ``idg``; errors from either step propagate.
    """
    molfile_url = f'https://www.genome.jp/dbget-bin/www_bget?-f+m+compound+{cid}'
    molfile = client.get(molfile_url).content.decode()
    molecule = idg.loadMolecule(molfile)
    return molecule.smiles()
getSmilesFromCid('C00001', Indigo())

'O([H])[H]'

In [23]:
# Conversion failures are collected here as [compound id, exception class name, message].
faultycompounds = []
idg = Indigo()
def compound_maps(clist):
    """Yield ``{'compound', 'smiles'}`` for every id in ``clist`` that converts.

    Ids whose download or conversion raises are recorded in the notebook-level
    list ``faultycompounds`` and skipped.
    """
    for cid in tqdm_notebook(clist):
        try:
            smiles = getSmilesFromCid(cid, idg)
        except Exception as e:
            faultycompounds.append([cid, e.__class__.__name__, str(e)])
        else:
            yield {'compound':cid, 'smiles':smiles}
cpdf = DataFrame(compound_maps(rpdf['compound'].drop_duplicates().values))
cpdf.shape[0], 'successfully converted, ', len(faultycompounds), 'failed'

HBox(children=(IntProgress(value=0, max=8336), HTML(value='')))




(8291, 'successfully converted, ', 45, 'failed')

#### Special Compounds: Cofactors
Complete pairs of cofactors are considered non-essential and will be removed from the reaction equation in the <i>[match](./match.ipynb#cofactors)</i> finding step of the workflow.<br>
Here they are standardized and dumped to a file in the data directory.

In [24]:
# Select the cofactor compounds by KEGG id; ids absent from cpdf simply do not show up in the result.
cofactors = cpdf[cpdf['compound'].isin(
    ['C00003', 'C00004', 'C00005', 'C00006', 'C00016', 'C01352', 'C00390', 'C00399', 'C00002', 'C00008', 'C00044', 'C00035', 'C00075', 'C00015']
)]
cofactors

Unnamed: 0,compound,smiles
38,C00002,N1(C=NC2C(N)=NC=NC1=2)[C@H]1[C@H](O)[C@@H]([C@...
40,C00008,N1(C=NC2C(N)=NC=NC1=2)[C@H]1[C@H](O)[C@@H]([C@...
44,C00004,N1(C=NC2C(N)=NC=NC1=2)[C@H]1[C@H](O)[C@@H]([C@...
45,C00003,[C@H]1(O)[C@H]([N+]2C=CC=C(C(=O)N)C=2)O[C@H](C...
52,C00006,[C@H]1(O[C@@H]([C@@H](O)[C@H]1OP(=O)(O)O)COP(=...
53,C00005,[C@H]1(O[C@@H]([C@@H](O)[C@H]1OP(=O)(O)O)COP(=...
78,C00390,C1(OC)C(O)=C(C)C(C{-}/C=C(/C{+n}C/C=C(\C)/C)\C...
80,C00399,C1(OC)C(=O)C(C)=C(C{-}/C=C(/C{+n}C/C=C(\C)/C)\...
85,C00015,[C@@H]1([C@H](O)[C@@H]([C@@H](COP(=O)(O)OP(=O)...
87,C00075,[C@@H]1([C@H](O)[C@@H]([C@@H](COP(=O)(O)OP(=O)...


In [25]:
cofactor_entries = []
# Fetch the KEGG entry of every cofactor, leaving out the BRITE property.
cofactors.apply(lambda row: extend_entries(cofactor_entries, row['compound'], skipped=['BRITE']), axis=1)
cofactor_df = DataFrame(cofactor_entries)

# 'sum' on strings concatenates all NAME lines of a compound, without adding a separator.
cofactors.merge(
    cofactor_df[cofactor_df.property == 'NAME'].groupby('subject').agg({'object':'sum'}),
    left_on='compound', right_on='subject'
).rename(columns={'object':'name'})\
 .to_csv(f'{DATA}/cofactors.tsv', sep='\t', index=None)

In [26]:
# Copy the cofactor table, without header row, into the parent of the data directory.
read_csv(f'{DATA}/cofactors.tsv', sep='\t').to_csv(f'{DATA}/../cofactors.tsv', sep='\t', index=None, header=None)
import subprocess
# Build the argument vector explicitly: splitting a formatted command string on
# whitespace would break the call if the configured data directory contains spaces.
command = [
    '../bin/standardize',
    '-f', '2',
    '-o', f'{DATA}/../cofactors.standardized.tsv',
    f'{DATA}/../cofactors.tsv',
]
sr = subprocess.run(command, capture_output=True)
# Displays the exit status only when it is non-zero; stdout/stderr remain available on sr.
sr.returncode or None

## <a class="anchor" id="files"></a>Data Files [$\Uparrow$](#toc)
#### Compounds
Only compounds with valid SMILES strings were taken into account.

In [27]:
# Compounds without a usable SMILES string: failed conversions plus conversions that produced an empty string.
nosmiles = list(DataFrame(faultycompounds, columns=['compound', 'exception', 'message'])['compound'].values)\
         + list(cpdf[cpdf.smiles == '']['compound'].values)

Abstract SMILES containing undefined residues (e.g., `[*]`) or repetitive substructures (e.g., `{-}..{+n}`) are not suitable for rule application and were excluded.

In [28]:
# Compounds whose SMILES contains '*' or '{'. The pattern is a raw string so that
# the regex escapes are not interpreted (and warned about) as Python string escapes.
abstractsmiles = list(cpdf[cpdf.smiles.str.contains(r'[\*\{]')]['compound'].values)

In [29]:
RAWCOMPOUNDS = f'{DATA}/{DATASET}_compounds.tsv'
# Headerless three-column file; the compound id is written twice, followed by the SMILES string.
cpdf[~cpdf['compound'].isin(nosmiles + abstractsmiles)]\
    .loc[:,['compound','compound','smiles']]\
    .to_csv(RAWCOMPOUNDS, sep='\t', header=None, index=None)

In [30]:
# Read the file back to confirm the number of rows and columns written.
read_csv(RAWCOMPOUNDS, sep='\t', header=None).shape

(6508, 3)

#### Reactions
Compounds present on both sides of the reaction equation, i.e., as substrates and products, were removed from the side with the lower stoichiometric number or even from both sides if their stoichiometric numbers were equal. Reactions with abstract compounds or compounds without SMILES string were excluded altogether.

In [31]:
# Reactions involving at least one compound without SMILES or with an abstract SMILES.
excluded_reactions = rpdf[
    rpdf['compound'].isin(nosmiles + abstractsmiles)
].reaction.drop_duplicates().values
len(excluded_reactions)

2871

In [32]:
# Keep roles of the remaining reactions, drop compounds whose corrected coefficient is '0',
# and attach the SMILES string of each compound.
rrole_with_smiles = rpdf[
    ~rpdf.reaction.isin(excluded_reactions)
  & (rpdf.stoichn_corr != '0')
].merge(cpdf, on='compound')
rrole_with_smiles.shape

(28499, 8)

In [33]:
def _side_smiles(isproduct, column):
    """Join the SMILES of one reaction side with '.', one row per reaction, in a column named ``column``."""
    side = rrole_with_smiles[rrole_with_smiles.isproduct == isproduct]
    joined = side.groupby(['reaction']).agg({'smiles': lambda smiles: "%s" % '.'.join(smiles)})
    joined.columns = [column]
    return joined

substrates = _side_smiles(0, 'substrates')
products = _side_smiles(1, 'products')

# Join on the reaction index (reactions are taken from the substrate table),
# then expose the reaction id as a regular column between the two sides.
rs = (
    substrates.join(products)
    .loc[:,['substrates','products']]
    .assign(reaction=lambda frame: frame.index)
    .reindex(['substrates','reaction','products'], axis=1)
    .reset_index(drop=True)
)
print(rs.shape)
rs.head()

(6584, 3)


Unnamed: 0,substrates,reaction,products
0,P(=O)(O)(O)OP(=O)(O)O.O([H])[H],R00004,P(=O)(O)(O)O
1,[Mn+2].OO.[H+],R00011,O([H])[H].[Mn+3]
2,C(/C)=[N+](\[O-])/[O-].O=O.N1C(=O)C2NC3C=C(C)C...,R00025,O([H])[H].CC=O.N(=O)O.N1C(=O)C2=NC3=CC(C)=C(C)...
3,O=O.[N+]1(=CSC(CCO)=C1C)CC1C=NC(C)=NC=1N,R00033,OO.C([N+]1=CSC(CC=O)=C1C)C1C=NC(C)=NC=1N
4,O([H])[H].O=O.C([N+]1=CSC(CC=O)=C1C)C1C=NC(C)=...,R00072,OO.N1C(N)=C(C[N+]2=CSC(CC(=O)O)=C2C)C=NC=1C


In [34]:
RAWREACTIONS = f'{DATA}/{DATASET}_reactions.tsv'
# Headerless file with the columns substrates, reaction, products.
rs.to_csv(RAWREACTIONS, sep='\t', header=None, index=None)

#### Reaction Enzyme Links

In [35]:
REACTIONENZYMES = f'{DATA}/{DATASET}_reaction-enzymes.tsv'
# Headerless link table for all reactions not in excluded_reactions.
# NOTE(review): only excluded_reactions is filtered here; the ambiguity and stoichiometry
# filters applied earlier are not re-applied to this file - confirm that this is intended.
reaction_enzymes[~reaction_enzymes.reaction.isin(excluded_reactions)]\
    .to_csv(REACTIONENZYMES, sep="\t", index=None, header=None)
read_csv(REACTIONENZYMES, sep="\t", header=None, names=['reaction', 'enzyme', 'ename', '3rd_level']).head()

Unnamed: 0,reaction,enzyme,ename,3rd_level
0,R00002,1.18.6.1,nitrogenase,1.18.6
1,R00004,3.6.1.1,inorganic diphosphatase,3.6.1
2,R00005,3.5.1.54,allophanate hydrolase,3.5.1
3,R00006,2.2.1.6,acetolactate synthase,2.2.1
4,R00008,4.1.3.17,4-hydroxy-4-methyl-2-oxoglutarate aldolase,4.1.3
