In [1]:
import pandas as pd
from collections import Counter
import ast

### Load and organize data

In [2]:
#Load data from BRENDA
df = pd.read_csv('BRENDA_interactions.txt', header=0, sep='\t', index_col=0).drop_duplicates().reset_index(drop=True)

In [6]:
#Create a dataframe of the top hundred organisms
brenda_top_hundred = pd.read_csv('top_hundred_organisms.txt', index_col=0, header=0, sep='\t')

#Create dataframe with interactions from only top hundred organisms
df_top_organisms_brenda = df[df['Org'].isin(list(brenda_top_hundred['Org']))]

In [8]:
#One of the top hundred organisms in the data from BRENDA is "mammal"
#In the phylogenetic tree I created, these are the organisms on the "Mammal" branch:
mammals = ['Cavia_porcellus',
 'Rattus_norvegicus',
 'Rattus_sp.',
 'Mus_musculus',
 'Cricetulus_griseus',
 'Mesocricetus_auratus',
 'Oryctolagus_cuniculus',
 'Homo_sapiens',
 'Ovis_aries',
 'Bos_taurus',
 'Sus_scrofa',
 'Equus_caballus',
 'Canis_lupus']


#Replace the underscore with space
mammals_new = []
for mammal in mammals:
    mammals_new.append(mammal.replace('_', ' '))

In [9]:
#Set every organism listed as "Mammalia" to the list of mammals
for ind, row in df_top_organisms_brenda.iterrows():
    if row['Org'] == 'Mammalia':
        df_top_organisms_brenda['Org'].at[ind] = mammals_new

In [10]:
#Explode dataframe on Org to get one organism on each row, and remove duplicates
df_top_organisms_brenda = df_top_organisms_brenda.explode('Org').drop_duplicates(['EC', 'Org', 'Mode', 'ChEBI'])

### Annotate the phylogenetic tree with interactions

#### Find top EC numbers and their respective top metabolite

In [17]:
#Group the organisms by EC number to get a set of organisms for each EC number
df_groups_top_ECs = df_top_organisms_brenda.groupby('EC')['Org'].apply(set).reset_index()

In [18]:
#Create column with the number of organisms every EC number is documented for
#This is done to find the top ten documented EC numbers
df_groups_top_ECs['Count'] = df_groups_top_ECs.agg({'Org':len})

#Take the top ten EC numbers
df_top_ten_ECs = df_groups_top_ECs.sort_values('Count', ascending=False).head(10)

In [13]:
#Add an "interaction" column (CHEBI:mode) to the interaction data 
df_top_organisms_brenda['Interaction'] = df_top_organisms_brenda['ChEBI'] + ':' + df_top_organisms_brenda['Mode']

#Group the interactions by EC number to get a set of interactions for each EC number
df_groups_top_ECs_with_mets = df_top_organisms_brenda.groupby('EC')['Interaction'].apply(list).reset_index()

In [14]:
#Create column with the count of each interaction for each EC number
#This is done to get the top interaction (metabolite+mode) for each EC number
df_groups_top_ECs_with_mets['Counts'] = df_groups_top_ECs_with_mets['Interaction'].apply(lambda x: dict(Counter(x)))

In [20]:
#Make a dataframe of the top ten EC numbers and their interaction counts
df_top_ten_ECs_interactions = pd.merge(df_top_ten_ECs, df_groups_top_ECs_with_mets[['EC', 'Counts']], how='left', on='EC')

In [22]:
#Make column with the top interaction for each EC number
df_top_ten_ECs_interactions['Top interaction'] = df_top_ten_ECs_interactions['Counts'].apply(lambda x: max(x, key=x.get))

In [26]:
#Group organisms in BRENDA by EC number and interaction 
#This is done to get the organisms in which the interactions (EC number:ChEBI:mode) are documented
df_groups_ECs_interactions = df_top_organisms_brenda.groupby(['EC', 'Interaction'])['Org'].apply(set).reset_index() 

In [28]:
#Make a column of full interaction (EC number+interaction)
df_groups_ECs_interactions['EC+Interaction'] = df_groups_ECs_interactions['EC'] + ':' + df_groups_ECs_interactions['Interaction']

In [31]:
#Make a column of full interaction (EC number+interaction)
df_top_ten_ECs_interactions['EC+Interaction'] = df_top_ten_ECs_interactions['EC'] + ':' + df_top_ten_ECs_interactions['Top interaction']

In [34]:
#Drop every row from the BRENDA data that does not represent the top ten interactions (EC number:ChEBI:mode)
#This is done to get a dataframe of the top ten EC numbers, their top metabolic regulator + mode, and the organisms in which these are documented

for ind, row in df_groups_ECs_interactions.iterrows():
    if row['EC+Interaction'] not in list(df_top_ten_ECs_interactions['EC+Interaction']):
        df_groups_ECs_interactions.drop(ind, inplace=True)

In [42]:
#Make column of the ChEBI id of the top metabolic regulator
df_groups_ECs_interactions['ChEBI'] = df_groups_ECs_interactions['Interaction'].apply(lambda x: x[:-2])

#Make column of the mode of the top metabolic regulation
df_groups_ECs_interactions['Mode'] = df_groups_ECs_interactions['Interaction'].apply(lambda x: x[-1])

In [43]:
#Make a dictionary of one synonym for each metabolite
df_mets_grouped = df[['Met', 'ChEBI']].drop_duplicates().groupby(['ChEBI'])['Met'].apply(list).apply(lambda x: x[0]).reset_index()
mets_dict = pd.Series(df_mets_grouped.Met.values, index=df_mets_grouped.ChEBI).to_dict()

#Map the ChEBI ids to metabolite name
df_groups_ECs_interactions['Met'] = df_groups_ECs_interactions['ChEBI'].map(mets_dict)

#### Make binary annotation file for iTOL

In [127]:
#Create a dataframe of the top hundred organisms, minus mammalia
#Results in 99 organisms
df_itol_binary = brenda_top_hundred
df_itol_binary = df_itol_binary[~df_itol_binary['Org'].isin(['Mammalia'])]

In [47]:
#Create the dataframe of binary values for plotting interactions

#Make a copy of the binary dataframe
df_itol_binary2 = df_itol_binary

#Iterate through every organism in the tree
for ind1, row1 in df_itol_binary2.iterrows():
    #Iterate through every interaction we are interested in plotting 
    for ind2, row2 in df_groups_ECs_interactions.iterrows():
        #Make the interaction label
        interaction = f"{row2['EC']} {row2['Met']} {row2['Mode']}"
        
        #Check if the interaction is documented for the organism in question
        if row1['Org'] in row2['Org']:
            #Set the binary value to 1 if interaction is documented
            df_itol_binary2.loc[ind1, interaction] = 1
        else:
            #Set the binary value to 0 if interaction is not documented
            df_itol_binary2.loc[ind1, interaction] = -1
            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [48]:
#Replace ' ' with underscore in organism names
df_itol_binary2['Org'] = df_itol_binary2['Org'].replace(' ', '_', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_itol_binary2['Org'] = df_itol_binary2['Org'].replace(' ', '_', regex=True)


In [None]:
#Copy the binary dataframe to clipboard for creating the file that is uploaded to iTOL
df_itol_binary2.to_clipboard(index=False)

### Map predicted interactions to tree

#### Load an organize data

In [52]:
#Load the predicted interactions from enrichment analysis
df_predicted = pd.read_csv('predicted_interactions.txt', header=0, index_col=0)

In [63]:
#Make a list of the interactions (ChEBI+mode) that are mapped to the tree
interactions_in_tree = list(df_groups_ECs_interactions['Interaction'])

In [65]:
#Filter the dataframe of predicted interactions for those that are in the tree
df_predicted_tree = df_predicted[df_predicted['interaction'].apply(lambda x: x in interactions_in_tree)]

In [66]:
#Make a list of the EC numbers that are mapped to the tree
interactions_in_tree_EC = [row['EC'] for i,row in df_groups_ECs_interactions.iterrows()]

In [77]:
#Load the file of features downloaded for every EC number + organism pair that is in the tree
df_features = pd.read_csv('features_for_ECs_in_tree.txt', index_col=0, header=0)

In [83]:
#When downloading features for the EC number + organism pairs, entries for other organisms are also returned.
#These entries are removed by only keeping those that are from the top hundred organisms
df_features_tree = df_features[df_features['Org'].apply(lambda x: x in set(brenda_top_hundred['Org']))]

In [85]:
#Set every organism listed as "Mammalia" to the list of mammals
for ind, row in df_features_tree.iterrows():
    if row['Org'] == 'Mammalia':
        df_features_tree['Org'].at[ind] = mammals_new

In [90]:
#Explode dataframe on organism and drop duplicates
df_features_tree_exploded_org = df_features_tree.explode('Org').drop_duplicates(['EC numbers', 'Org', 'Interpro_ids'])

In [97]:
#Make sure that the InterPro ids are evaluated as strings
df_features_tree_exploded_org['Interpro_ids'] = df_features_tree_exploded_org['Interpro_ids'].apply(lambda string: ast.literal_eval(string))

In [98]:
#Explode dataframe on interpro ids to get one each row
df_features_tree_exploded_interpro = df_features_tree_exploded_org.explode('Interpro_ids')

In [100]:
#Remove the "InterPro" part of the InterPro ids
df_features_tree_exploded_interpro['Interpro_ids'] = df_features_tree_exploded_interpro['Interpro_ids'].str.replace('InterPro:', '')

In [101]:
#Remove duplicates and reset dataframe index
df_features_tree_exploded_interpro.drop_duplicates(inplace=True)
df_features_tree_exploded_interpro.reset_index(drop=True, inplace=True)

#### Find organisms for which interactions are predicted

In [104]:
#Make groups of features that are associated with each interaction in the tree
feature_groups = df_predicted_tree.groupby('interaction').agg({'features':set}).reset_index()

In [107]:
#Make a dictionary with each interaction in the tree as keys and an empty list as values
#This dictionary is to be used for finding the organisms for which these interactions are predicted
predicted_organisms = {key: [] for key in list(feature_groups['interaction'])}

In [111]:
#This block of code iterates through all the features that were downloaded for the EC number and organism combinations,
#and checks whether this feature predicts an interaction. 

#Iterate through every row of features retrieved for the EC number + organism combo
for ind1, row1 in df_features_tree_exploded_interpro.iterrows():
    
    #Iterate through every row of features associated with interactions
    for ind2, row2 in feature_groups.iterrows():
        
        #Check if the feature retrieved for the EC+org combo is associated with the interaction and not already noted 
        if (row1['Interpro_ids'] in row2['features']) and (row1['Org'] not in predicted_organisms[row2['interaction']]):
            
            #Add the organism to the list of organisms for which the interaction is predicted
            predicted_organisms[row2['interaction']].append(row1['Org'])   

In [113]:
#Add a column to the interactions dataframe for the organisms for which the interaction is predicted
df_groups_ECs_interactions['Org_predicted'] = ''

In [115]:
#This block of code iterates through the dictionary of interactions and organisms for which they are predicted to be present,
#and adds those organisms to the column of predicted organisms in the interactions dataframe.

for interaction, organisms in predicted_organisms.items():
    for ind, row in df_groups_ECs_interactions.iterrows():
        if '{}:{}'.format(row['ChEBI'], row['Mode']) == interaction:
            df_groups_ECs_interactions['Org_predicted'][ind] = organisms

In [132]:
#Create the dataframe of binary values for plotting documented and predicted interactions

#Make a copy of the binary dataframe
df_itol_binary3 = df_itol_binary

#Iterate through every organism in the tree
for ind1, row1 in df_itol_binary3.iterrows():
    #Iterate through every interaction we are interested in plotting 
    for ind2, row2 in df_groups_ECs_interactions.iterrows():
        #Make the interaction label
        interaction = f"{row2['EC']} {row2['Met']} {row2['Mode']}"
        
        #Check if the interaction is documented for the organism in question
        if row1['Org'] in row2['Org']:
            #Set the binary value to 1 if interaction is documented
            df_itol_binary3.loc[ind1, interaction] = 1
        #Check if the interaction is predicted for the organism in question
        elif row1['Org'] in row2['Org_predicted']:
            #Set the binary value to 0 if interaction is predicted
            df_itol_binary3.loc[ind1, interaction] = 0
        else:
            #Set the binary value to 0 if interaction is not documented or predicted
            df_itol_binary3.loc[ind1, interaction] = -1
            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [133]:
#Replace ' ' in organism name with underscore
df_itol_binary3['Org'] = df_itol_binary3['Org'].replace(' ', '_', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_itol_binary3['Org'] = df_itol_binary3['Org'].replace(' ', '_', regex=True)


In [135]:
#Check number of documented and predicted interactions
#The number of documented interactions is given by the number of 1s --> 214 documented interactions
#The number of predicted interactions is given by the number of 0s --> 230 predicted interactions
df_itol_binary3.stack().value_counts()

-1.0                             546
0.0                              230
1.0                              214
Cricetulus_griseus                 1
Bacillus_subtilis                  1
                                ... 
Lactiplantibacillus_plantarum      1
Mesocricetus_auratus               1
Pyrococcus_furiosus                1
Staphylococcus_aureus              1
Salmonella_enterica                1
Length: 102, dtype: int64

In [134]:
df_itol_binary3.to_clipboard(index=False)

Unnamed: 0,Org,1.3.5.1 malonate -,2.2.1.6 L-valine -,2.7.1.1 D-glucose 6-phosphate -,2.7.1.11 citrate -,2.7.1.30 alpha-glycerophosphate -,"2.7.1.40 D-fructose 1,6-bisphosphate +",2.7.2.4 L-threonine -,2.7.7.27 3-phosphoglycerate +,3.1.3.11 AMP -,6.4.1.1 acetyl-CoA +
0,Homo_sapiens,1.0,0.0,1.0,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0
1,Rattus_norvegicus,1.0,0.0,1.0,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0
2,Escherichia_coli,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0
3,Bos_taurus,1.0,-1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0
4,Saccharomyces_cerevisiae,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
95,Acetoanaerobium_sticklandii,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
96,Cricetulus_griseus,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
97,Helicobacter_pylori,0.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0,-1.0
98,Cucumis_sativus,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0


### Make file of predicted interactions for validation

In [136]:
#Prepare a dataframe
df_predicted_validation = pd.DataFrame(columns=['Organism', 'EC number', 'Metabolite (ChEBI)', 'Mode'])

In [138]:
#Iterate through the interactions dataframe to find predicted interactions
for ind, row in df_groups_ECs_interactions.iterrows():
    #Set EC, ChEBI and mode variables
    EC = row['EC']
    ChEBI = row['ChEBI']
    Mode = row['Mode']

    #Find the organisms for which interactions were predicted and not documented
    Organisms = set(row['Org_predicted']).difference(row['Org'])
    
    #Add the predicted interactions to the dataframe
    for Organism in Organisms:
        df_predicted_validation = pd.concat([pd.DataFrame([[Organism, EC, ChEBI, Mode]], columns=df_predicted_validation.columns), df_predicted_validation], ignore_index=True)
            

In [140]:
#Make a dictionary of all metabolite name synonyms
df_mets_grouped_all_names = df[['Met', 'ChEBI']].drop_duplicates().groupby(['ChEBI'])['Met'].apply(set).reset_index()
mets_dict_all_names = pd.Series(df_mets_grouped_all_names.Met.values, index=df_mets_grouped_all_names.ChEBI).to_dict()

#Map the ChEBI ids to all metabolite synonyms
df_predicted_validation['Metabolite (names)'] = df_predicted_validation['Metabolite (ChEBI)'].map(mets_dict_all_names)

In [145]:
#Make a dictionary of enzyme names for each EC number
df_enz_grouped = df[['Enz', 'EC']].drop_duplicates().groupby(['EC'])['Enz'].apply(set).reset_index()
enz_dict = pd.Series(df_enz_grouped.Enz.values, index=df_enz_grouped.EC).to_dict()

#Map EC number to enzyme name and change to string
df_predicted_validation['Enzyme (name)'] = df_predicted_validation['EC number'].map(enz_dict)
df_predicted_validation['Enzyme (name)'] = df_predicted_validation['Enzyme (name)'].apply(lambda x: ''.join(x))

In [147]:
#Make a dictionary of modes as symbols and words 
modes_dict = {'-': 'Inhibition', '+': 'Activation'}

#Map modes to words
df_predicted_validation['Mode (act/inh)'] = df_predicted_validation['Mode'].map(modes_dict)

In [151]:
#Save file to csv
df_predicted_validation.to_csv('predicted_interactions_to_be_validated.txt', index=False, sep='\t')