In [16]:
import pandas as pd
from tqdm import tqdm
import json
import re

def extract_elements(df, column_name):
    '''
    Function to extract the chemical elements that exist in the compounds
    '''
    # define the regular expression pattern to match the chemical formula
    pattern = r'[A-Z][a-z]?'
    # initialize a set to store the element symbols
    elements = set()
    # loop over the values in the specified column of the DataFrame
    for value in df[column_name].values:
        # find all matches of the pattern in the value string
        matches = re.findall(pattern, value)
        # add the matches to the set of elements
        elements.update(matches)
    return elements

def extract_stoichiometry(formula):
    '''
    Exctracts the stoichiometry 
    '''
    # define the regular expression pattern to match the chemical formula
    pattern = r'([A-Z][a-z]?)(\d*)'
    # initialize the dictionary to store the element symbol and its stoichiometry
    stoichiometry = {}
    # loop over the matches of the pattern in the formula string
    for match in re.findall(pattern, formula):
        symbol, count = match
        # if the count is empty, set it to 1
        count = int(count) if count else 1
        # add the symbol and count to the stoichiometry dictionary
        stoichiometry[symbol] = count
    return stoichiometry

In [17]:
_df = pd.read_excel('data/original/KEGG_Pathway_Search_Ori.xlsx', sheet_name='Compound')

df=_df.copy()

# example usage
element_names = extract_elements(df, 'Formula')
print('The chemical elements that could be found in the given metabolites are:\n', element_names)

# Create a col for every element
for elm in element_names: df[elm]=0

df['polymer'] = 0
for row in range(len(df)):
    formula = df['Formula'].iloc[row]
    stoichiometry = extract_stoichiometry(formula)
    for key, value in stoichiometry.items():
        df.loc[df.index[row], key] = value
    if 'n' in df['Formula'].iloc[row]: 
        df.loc[df.index[row], 'polymer'] = 1

The chemical elements that could be found in the given metabolites are:
 {'W', 'R', 'Cl', 'Hg', 'Fe', 'Mn', 'Se', 'C', 'X', 'Na', 'Mo', 'P', 'B', 'H', 'I', 'As', 'O', 'Mg', 'N', 'Ni', 'S', 'Br', 'F', 'Co'}


In [9]:
# dict of chemical elements and their molecular weight
elements = {
    'Co': 58.93,
    'Se': 78.96,
    'Cl': 35.45,
    'Ni': 58.69,
    'N': 14.01,
    'Hg': 200.6,
    'B': 10.81,
    'F': 19.00,
    'Fe': 55.85,
    'Br': 79.90,
    'W': 183.8,
    'Mo': 95.94,
    'Mn': 54.94,
    'I': 126.9,
    'C': 12.01,
    'Na': 22.99,
    'H': 1.008,
    'O': 16.00,
    'S': 32.07,
    'As': 74.92,
    'P': 30.97,
    'Mg': 24.31
}

# calculate the molecular weights of every compound
mw = []
for row in tqdm(range(len(df))):
    weight = 0
    for col in elements.keys():
        weight = weight + elements[col] * df.iloc[row][col]
    if (df.iloc[row]['R'] + df.iloc[row]['polymer']) != 0:
        mw.append(weight * (df.iloc[row]['R'] + df.iloc[row]['polymer'])/2)
    else:
        mw.append(weight)
        
df['mol_weight'] = mw

# Col that contains the info if the compound is a polymer or not
df['polymer'] = 0
for row in range(len(df)):
    if 'n' in df['Formula'].iloc[row]: 
        df.loc[df.index[row], 'polymer'] = 1

df.to_csv('data/compounds_final.csv')

100%|██████████| 8591/8591 [00:23<00:00, 370.75it/s]


In [10]:
''' *********** Create pairs dataset ************** '''
# Load the Excel file into a pandas DataFrame
rxns = pd.read_excel('data/original/KEGG_Pathway_Search_Ori.xlsx', sheet_name='Reaction')

# drop unusefull columns
rxns.drop(columns=['Compound Pair (0.1)', 'Compound Pair (0.2)', 
                    'Compound Pair (0.3)', 'Compound Pair (0.4)', 
                    'Compound Pair (0.5)', 'Compound Pair (0.6)', 
                    'Compound Pair (0.7)', 'Compound Pair (0.8)',
                    'Compound Pair (0.9)',], inplace=True)
        
rxns.rename({'Compound Pair (1.0)':('Reaction_pair')}, axis=1, inplace=True)
rxns['Reaction_pair'] = rxns['Reaction_pair'].apply(lambda x: json.loads(x))

pairs = []
reactions = []
for i, row in enumerate(range(len(rxns))):
    for d in (rxns['Reaction_pair'][row]):
        for key, values in d.items():
            for value in values:
                pairs.append(f"{key}_{value}")
                reactions.append(rxns.iloc[i]['Entry'])


pairs = pd.DataFrame(pairs, columns=['Reactant_pair'])
pairs['KEGG_reactions'] = reactions
pairs['source'] = pairs['Reactant_pair'].apply(lambda x: x.split('_')[0])
pairs['target'] = pairs['Reactant_pair'].apply(lambda x: x.split('_')[1])
# group by Reactant_pair and combine KEGG_reactions values with comma-separated values
grouped = pairs.groupby('Reactant_pair')['KEGG_reactions'].apply(lambda x: ','.join(x)).reset_index()
# select the first row of each group as the row to include in the final dataframe
pairs = pairs.groupby('Reactant_pair').first().reset_index()
# add the combined KEGG_reactions column to the final dataframe
pairs['KEGG_reactions'] = grouped['KEGG_reactions']
# sort the final dataframe by Reactant_pair and reset the index
pairs = pairs.sort_values('Reactant_pair').reset_index(drop=True)
idx_drop = pairs[pairs['source'] == pairs['target']].index # Drop pairs where source==target
pairs.drop(idx_drop, inplace=True)
print('Pairs shape:', pairs.shape)

Pairs shape: (22680, 4)


In [11]:
kegg_pairs = pd.read_csv('data/original/kegg_pairs.csv', sep='\t')
kegg_pairs['source'] = kegg_pairs['Reactant_pair'].apply(lambda x: x.split('_')[0])
kegg_pairs['target'] = kegg_pairs['Reactant_pair'].apply(lambda x: x.split('_')[1])
kegg_pairs.drop(columns=['CAR', 'RPAIR_main'], inplace=True)

# concatenate the dataframes vertically
concatenated = pd.concat([pairs, kegg_pairs], ignore_index=True)
source_mw = df.set_index('Entry')['mol_weight']
target_mw = source_mw.reindex(concatenated['target']).values
concatenated['MW'] = abs(source_mw.reindex(concatenated['source']).values - target_mw) / (source_mw.reindex(concatenated['source']).values + target_mw + 1e-6)
concatenated['num_reactions'] = concatenated['KEGG_reactions'].apply(lambda x: len(x.split(',')))
concatenated.to_csv('data/pairs_final.csv')

## Reaction dataset

In [1]:
import pandas as pd

df = pd.read_excel('data/original/KEGG_Pathway_Search_Ori.xlsx', sheet_name='Reaction')
df.to_csv('data/reactions_final.csv')


In [4]:
df.head(3)

Unnamed: 0,Entry,Names,Definition,Equation,Direction,Coefficient,Compound,SMILES,Compound Pair (0.1),Compound Pair (0.2),...,Compound Pair (0.6),Compound Pair (0.7),Compound Pair (0.8),Compound Pair (0.9),Compound Pair (1.0),Status,Comment,EC Number,Rhea,Reference
0,R00004,"[""diphosphate phosphohydrolase"", ""pyrophosphat...",Diphosphate + H2O <=> 2 Orthophosphate,C00013 + C00001 <=> 2 C00009,"""<=>""","[[1, 1], [2]]","[[C00013, C00001], [C00009]]","[[""O=P(O)(O)OP(=O)(O)O"", ""O""], [""O=P(O)(O)O""]]","[{}, {}]","[{}, {}]",...,"[{}, {}]","[{}, {}]","[{}, {}]","[{}, {}]","[{}, {}]",Balanced,"""""","[""3.6.1.1""]","[""24579""]",""""""
1,R00005,"[""urea-1-carboxylate amidohydrolase""]",Urea-1-carboxylate + H2O <=> 2 CO2 + 2 Ammonia,C01010 + C00001 <=> 2 C00011 + 2 C00014,"""<=>""","[[1, 1], [2, 2]]","[[C01010, C00001], [C00011, C00014]]","[[""NC(=O)NC(=O)O"", ""O""], [""O=C=O"", ""N""]]","[{}, {}]","[{}, {}]",...,"[{}, {}]","[{}, {}]","[{}, {}]","[{}, {}]","[{}, {}]",Balanced,The yeast enzyme (but not that from green alga...,"[""3.5.1.54""]","[""19032""]",""""""
2,R00006,"[""pyruvate:pyruvate acetaldehydetransferase (d...",2-Acetolactate + CO2 <=> 2 Pyruvate,C00900 + C00011 <=> 2 C00022,"""<=>""","[[1, 1], [2]]","[[C00900, C00011], [C00022]]","[[""CC(=O)C(C)(O)C(=O)O"", ""O=C=O""], [""CC(=O)C(=...","[{""C00900"": [""C00022""]}, {""C00022"": [""C00900""]}]","[{""C00900"": [""C00022""]}, {""C00022"": [""C00900""]}]",...,"[{""C00900"": [""C00022""]}, {""C00022"": [""C00900""]}]","[{""C00900"": [""C00022""]}, {""C00022"": [""C00900""]}]","[{""C00900"": [""C00022""]}, {""C00022"": [""C00900""]}]","[{""C00900"": [""C00022""]}, {""C00022"": [""C00900""]}]","[{""C00900"": [""C00022""]}, {""C00022"": [""C00900""]}]",Balanced,TPP-dependent enzymatic reaction (R00014+R03050),"[""2.2.1.6""]",[],""""""


In [10]:
import pandas as pd
from itertools import product
import json 

df['Compound']  = df['Compound'].apply(lambda x: json.loads(x))

pairs = pd.DataFrame()
reacs = []
for i, data in enumerate(df['Compound']):
    combinations = list(product(data[0], data[1]))
    reaction = df['Entry'].iloc[i]
    # Create a list of dictionaries representing each row with the pairs
    rows = [{'Pairs': '_'.join(pair)} for pair in combinations]
    
    for j in range(len(combinations)):reacs.append(reaction)

    # Append each row to the pairs DataFrame
    pairs = pairs.append(rows, ignore_index=True)

In [13]:
pairs['source'] = pairs['Pairs'].apply(lambda x: x.split('_')[0])
pairs['target'] = pairs['Pairs'].apply(lambda x: x.split('_')[1])
pairs['Reaction'] = reacs
pairs.to_csv('data/pairs_final.csv')
pairs

Unnamed: 0,Pairs,source,target,Reaction
0,C00013_C00009,C00013,C00009,R00004
1,C00001_C00009,C00001,C00009,R00004
2,C01010_C00011,C01010,C00011,R00005
3,C01010_C00014,C01010,C00014,R00005
4,C00001_C00011,C00001,C00011,R00005
...,...,...,...,...
55975,C00048_C00001,C00048,C00001,R12693
55976,C03227_C05645,C03227,C05645,R12694
55977,C03227_C00037,C03227,C00037,R12694
55978,C00048_C05645,C00048,C05645,R12694
