In [1]:
import pandas as pd
from collections import Counter

### Load and organize data

In [2]:
#Load the cleaned up interaction data
df = pd.read_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/interactions/BRENDA_interactions_intracellular.txt', header=0, index_col=0)

#Create a dataframe of the top hundred organisms
brenda_top_hundred = pd.read_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/support/top_hundred_organisms.txt', index_col=0, header=0, sep='\t')

#Create dataframe with interactions from only top hundred organisms
df_top_organisms_brenda = df[df['Org'].isin(list(brenda_top_hundred['Org']))]

In [4]:
#One of the top hundred organisms in the data from BRENDA is "mammal"
#In the phylogenetic tree I created, these are the organisms on the "Mammal" branch:
#(See the file 'create_phylotree.ipynb' for how these were retrieved)
mammals = ['Cavia_porcellus',
 'Rattus_norvegicus',
 'Rattus_sp.',
 'Mus_musculus',
 'Cricetulus_griseus',
 'Mesocricetus_auratus',
 'Oryctolagus_cuniculus',
 'Homo_sapiens',
 'Ovis_aries',
 'Bos_taurus',
 'Sus_scrofa',
 'Equus_caballus',
 'Canis_lupus']


#Replace the underscore with space
mammals_new = []
for mammal in mammals:
    mammals_new.append(mammal.replace('_', ' '))

In [5]:
#Set every organism listed as "Mammalia" to the list of mammals
for ind, row in df_top_organisms_brenda.iterrows():
    if row['Org'] == 'Mammalia':
        df_top_organisms_brenda['Org'].at[ind] = mammals_new

#Explode dataframe on Org to get one organism on each row, and remove duplicates
df_top_organisms_brenda = df_top_organisms_brenda.explode('Org').drop_duplicates(['EC', 'Org', 'Mode', 'ChEBI'])

### Annotate the phylogenetic tree with interactions

#### Find top EC numbers and their respective top metabolic regulator

##### First: Find top ten EC numbers

In [7]:
#Group the organisms by EC number to get a set of organisms for each EC number
df_groups_top_ECs = df_top_organisms_brenda.groupby('EC')['Org'].apply(set).reset_index()

In [8]:
#Create a column with the number of organisms every EC number is documented for
#This is done to find the top ten documented EC numbers
df_groups_top_ECs['Count'] = df_groups_top_ECs.agg({'Org':len})

#Take the top ten EC numbers (these are to be mapped to the tree)
df_top_ten_ECs = df_groups_top_ECs.sort_values('Count', ascending=False).head(10)

##### Second: Find the top interactions for each of the top ten EC numbers

In [10]:
#Add an "interaction" column (CHEBI:mode) to the interaction data 
df_top_organisms_brenda['Interaction'] = df_top_organisms_brenda['ChEBI'] + ':' + df_top_organisms_brenda['Mode']

#Group the interactions by EC number to get a set of interactions for each EC number
df_groups_top_ECs_with_mets = df_top_organisms_brenda.groupby('EC')['Interaction'].apply(list).reset_index()

In [11]:
#Create column with the count of each interaction for each EC number
#This is done to get the top interaction (metabolite+mode) for each EC number
df_groups_top_ECs_with_mets['Counts'] = df_groups_top_ECs_with_mets['Interaction'].apply(lambda x: dict(Counter(x)))

In [12]:
#Make a dataframe of the top ten EC numbers and their interaction counts
df_top_ten_ECs_interactions = pd.merge(df_top_ten_ECs, df_groups_top_ECs_with_mets[['EC', 'Counts']], how='left', on='EC')

#Make a column with the top interaction for each EC number
df_top_ten_ECs_interactions['Top interaction'] = df_top_ten_ECs_interactions['Counts'].apply(lambda x: max(x, key=x.get))

##### Third: Find the organisms in which these protein-metabolite interactions are documented

In [13]:
#Group organisms in BRENDA by EC number and interaction 
#This is done to get the organisms in which the interactions (EC number:ChEBI:mode) are documented
df_groups_ECs_interactions = df_top_organisms_brenda.groupby(['EC', 'Interaction'])['Org'].apply(set).reset_index() 

#Make a column of the interaction (EC number+interaction) in the dataframe of all interactions
df_groups_ECs_interactions['EC+Interaction'] = df_groups_ECs_interactions['EC'] + ':' + df_groups_ECs_interactions['Interaction']

#Make a column of the interaction (EC number+interaction) in the dataframe of the top ten EC numbers
df_top_ten_ECs_interactions['EC+Interaction'] = df_top_ten_ECs_interactions['EC'] + ':' + df_top_ten_ECs_interactions['Top interaction']

#Drop every row from the BRENDA data that does not represent the top ten interactions (EC number:ChEBI:mode)
#This is done to get a dataframe of the top ten EC numbers, their top metabolic regulator + mode, and the organisms in which these are documented

for ind, row in df_groups_ECs_interactions.iterrows():
    if row['EC+Interaction'] not in list(df_top_ten_ECs_interactions['EC+Interaction']):
        df_groups_ECs_interactions.drop(ind, inplace=True)

In [14]:
#Make column of the ChEBI id of the top metabolic regulator
df_groups_ECs_interactions['ChEBI'] = df_groups_ECs_interactions['Interaction'].apply(lambda x: x[:-2])

#Make column of the mode of the top metabolic regulation
df_groups_ECs_interactions['Mode'] = df_groups_ECs_interactions['Interaction'].apply(lambda x: x[-1])

###### Map ChEBI id to metabolite name for easier visualization

In [16]:
#Make a dictionary of one synonym for each metabolite
df_mets_grouped = df[['Met', 'ChEBI']].drop_duplicates().groupby(['ChEBI'])['Met'].apply(list).apply(lambda x: x[0]).reset_index()
mets_dict = pd.Series(df_mets_grouped.Met.values, index=df_mets_grouped.ChEBI).to_dict()

#Map the ChEBI ids to metabolite name
df_groups_ECs_interactions['Met'] = df_groups_ECs_interactions['ChEBI'].map(mets_dict)

In [21]:
#Save the dataframe of mapped interactions to csv
df_groups_ECs_interactions.to_csv('C:/Users/Elin/Documents/GitHub/predicting-allostery/datafiles/phylotree/interactions.csv')

#### Make binary annotation file for iTOL

In [18]:
#Create a dataframe of the top hundred organisms, minus mammalia
#Results in 99 organisms
df_itol_binary = brenda_top_hundred
df_itol_binary = df_itol_binary[~df_itol_binary['Org'].isin(['Mammalia'])]

In [19]:
#Create the dataframe of binary values for plotting interactions

#Iterate through every organism in the tree
for ind1, row1 in df_itol_binary.iterrows():
    #Iterate through every interaction we are interested in plotting 
    for ind2, row2 in df_groups_ECs_interactions.iterrows():
        #Make the interaction label
        interaction = f"{row2['EC']} {row2['Met']} {row2['Mode']}"
        
        #Check if the interaction is documented for the organism in question
        if row1['Org'] in row2['Org']:
            #Set the binary value to 1 if interaction is documented
            df_itol_binary.loc[ind1, interaction] = 1
        else:
            #Set the binary value to 0 if interaction is not documented
            df_itol_binary.loc[ind1, interaction] = -1
            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [None]:
#Replace ' ' with underscore in organism names
df_itol_binary['Org'] = df_itol_binary['Org'].replace(' ', '_', regex=True)

#Copy the binary dataframe to clipboard for creating the file that is uploaded to iTOL
df_itol_binary.to_clipboard(index=False)