In [1]:
import pandas as pd
import re
import html
import numpy as np

### Load and clean original data

In [2]:
df_act = pd.read_excel('Allosteric_interactions_BRENDA.xlsx', sheet_name='Activators', names=['EC', 'Enz', 'Met', 'Org', 'Mode'], usecols='A,B,C,E,I')
df_inh = pd.read_excel('Allosteric_interactions_BRENDA.xlsx', sheet_name='Inhibitors', names=['EC', 'Enz', 'Met', 'Org', 'Mode'], usecols='A,B,C,E,I')

#Remove metabolites named "additional information" and data from viruses, and limit species name to two names
df_act = df_act[~df_act['Org'].str.contains('virus')]
df_act = df_act[~df_act['Met'].str.contains('additional information')]
df_act['Org'] = df_act['Org'].apply(lambda x: ' '.join(x.split()[:2]))

df_inh.Org = df_inh.Org.fillna('')
df_inh = df_inh[~df_inh['Org'].str.contains('virus')]
df_inh = df_inh[~df_inh['Met'].str.contains('additional information')]
df_inh['Org'] = df_inh['Org'].apply(lambda x: ' '.join(x.split()[:2]))

In [None]:
#Save the cleaned dataframes to csv
df_act.to_csv('activators_clean.csv')
df_inh.to_csv('inhibitors_clean.csv')

### Integrate ChEBI and InChI Ids

In [3]:
#Load chebi and inchi IDs
df_chebi_BRENDA = pd.read_csv('brenda_compounds.tsv', sep='\t', names=['Met', 'Inchi', 'ChEBI'], header=0)

#Integrate chebi and inchi IDs into BRENDA data
df_act_chebi = pd.merge(df_act, df_chebi_BRENDA[['Met', 'ChEBI', 'Inchi']], how="left", on='Met')
df_inh_chebi = pd.merge(df_inh, df_chebi_BRENDA[['Met', 'ChEBI', 'Inchi']], how="left", on='Met')

### Isolate intracellular compounds

#### Load and clean BiGG data

In [4]:
df_bigg = pd.read_excel('bigg_metabolites.xlsx')

# Get one database link per row
df_bigg_split = df_bigg.assign(database_links=df_bigg['database_links'].str.split(';')).explode('database_links')

In [5]:
# Isolate the chebi id rows from the BiGG dataframe
df_bigg_chebis = df_bigg_split[df_bigg_split['database_links'].str.contains('chebi','CHEBI')]

In [6]:
# Extract the ChEBI id (ChEBI:12345) from the database links
def extract_chebi(link):
    for i in range(1,7):
        if link[-i].isdigit():
            continue
        else:
            index = i+5
            return link[-index:]

In [7]:
# Make list of the ChEBI ids
bigg_chebi_list = df_bigg_chebis['database_links'].apply(lambda x: extract_chebi(x)).tolist()

#### Integrate with BRENDA data

In [8]:
df_act_intracellular = df_act_chebi[df_act_chebi['ChEBI'].isin(bigg_chebi_list)]
df_inh_intracellular = df_inh_chebi[df_inh_chebi['ChEBI'].isin(bigg_chebi_list)]

#Those with NaN or '-' (without CHEBI id) were not included in the intracellular group

### Remove inorganic compounds from the intracellular interactions

In [9]:
#Get chemical formula from Inchi id
df_act_intracellular['Inchi'] = df_act_intracellular['Inchi'].str.split('/').str[1]
df_inh_intracellular['Inchi'] = df_inh_intracellular['Inchi'].str.split('/').str[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_act_intracellular['Inchi'] = df_act_intracellular['Inchi'].str.split('/').str[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inh_intracellular['Inchi'] = df_inh_intracellular['Inchi'].str.split('/').str[1]


In [10]:
#Function for parsing the chemical formula to make dictionary of 'atom: number of atoms'
from warnings import warn


ELEMENT_RE = re.compile(r'(?P<atom>[A-Z][a-z]?)(?P<coeff>\d*)')


def parse_formula(formula):
    """ Convert compound formula from string to dictionary.

    For example, C6H12O6 (glucose) becomes {C:6, H:12, O:6}.

    Args:
        formula (str): compound formula

    Returns:
        dict: formula as a dictionary
    """
    return {atom: (int(coeff) if coeff else 1)
            for atom, coeff in re.findall(ELEMENT_RE, formula)}



In [11]:
#Parse formulas in dataframes
df_act_intracellular['formula'] = df_act_intracellular['Inchi'].apply(lambda x: parse_formula(x))
df_inh_intracellular['formula'] = df_inh_intracellular['Inchi'].apply(lambda x: parse_formula(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_act_intracellular['formula'] = df_act_intracellular['Inchi'].apply(lambda x: parse_formula(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inh_intracellular['formula'] = df_inh_intracellular['Inchi'].apply(lambda x: parse_formula(x))


In [12]:
#Define function for determining whether a compound is organic

def is_organic(formula):
    if all(key in formula for key in ('C', 'H', 'O')):
        return 1
    return 0

In [13]:
#Apply function to data
df_act_intracellular['organic'] = df_act_intracellular['formula'].apply(lambda x: is_organic(x))
df_inh_intracellular['organic'] = df_inh_intracellular['formula'].apply(lambda x: is_organic(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_act_intracellular['organic'] = df_act_intracellular['formula'].apply(lambda x: is_organic(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inh_intracellular['organic'] = df_inh_intracellular['formula'].apply(lambda x: is_organic(x))


In [14]:
#Filter out the inorganic compounds and drop duplicates
df_act_intra_org = df_act_intracellular[df_act_intracellular.organic != 0]
df_inh_intra_org = df_inh_intracellular[df_inh_intracellular.organic != 0]

In [15]:
#Drop unnecessary columns and duplicates from data
df_act_intra_org_final = df_act_intra_org.drop(['Inchi', 'formula', 'organic'], axis=1).drop_duplicates()
df_inh_intra_org_final = df_inh_intra_org.drop(['Inchi', 'formula', 'organic'], axis=1).drop_duplicates()

In [52]:
#Save the files to csv
df_act_intra_org_final.to_csv('activators_intracellular.csv')
df_inh_intra_org_final.to_csv('inhibitors_intracellular.csv')

In [21]:
df_inh_intra_org_final

Unnamed: 0,EC,Enz,Met,Org,Mode,ChEBI
56,1.1.1.1,alcohol dehydrogenase,2-propanol,Sulfolobus acidocaldarius,-,CHEBI:17824
58,1.1.1.1,alcohol dehydrogenase,2-thioacetate,Equus caballus,-,CHEBI:30066
112,1.1.1.1,alcohol dehydrogenase,acetaldehyde,Saccharomyces cerevisiae,-,CHEBI:15343
113,1.1.1.1,alcohol dehydrogenase,acetaldehyde,Meyerozyma guilliermondii,-,CHEBI:15343
114,1.1.1.1,alcohol dehydrogenase,acetaldehyde,Crocus sativus,-,CHEBI:15343
...,...,...,...,...,...,...
260698,7.6.2.9,ABC-type quaternary amine transporter,Betaine aldehyde,Aphanothece halophytica,-,CHEBI:15710
260701,7.6.2.9,ABC-type quaternary amine transporter,carnitine,Lactococcus lactis,-,CHEBI:3424
260702,7.6.2.9,ABC-type quaternary amine transporter,choline,Aphanothece halophytica,-,CHEBI:15354
260706,7.6.2.9,ABC-type quaternary amine transporter,proline,Lactococcus lactis,-,CHEBI:60039


In [28]:
#Make one big file of both activators and inhibitors and save to csv
df_BRENDA_intracellular = pd.concat([df_act_intra_org_final, df_inh_intra_org_final]).reset_index(drop=True)
df_BRENDA_intracellular.to_csv('BRENDA_interactions_intracellular.txt')