In [43]:
# enrichment and feature ranking
import pandas as pd
from pickle import load
import cobra
import medusa
import os
import json
import numpy as np

In [29]:
# Directories holding the per-species inputs read by this cell.
ENSEMBLE_DIR = "../results/deep_ensembles/"
CLASSIFICATION_DIR = "../results/classification_results/"
COLLAPSED_FEATURE_DIR = "../results/collapsed_features/"

# Per-species containers, all keyed by species name.
ensembles = {}
importances = {}
cluster_membership = {}
collapsed_features = {}
# exclude species that had non-variant ensemble structure or predictions.
exclude = ['Bacillus megaterium','Stenotrophomonas maltophilia','Achromobacter xylosoxidans','Achromobacter piechaudii']
# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the order of every dict built from it) reproducible.
for species_file in sorted(os.listdir(ENSEMBLE_DIR)):
    # splitext only strips the final extension, so a species name containing a
    # period is preserved, and anything that is not a .pickle file (hidden
    # files, backups, sub-directories) can be skipped instead of crashing.
    species_name, extension = os.path.splitext(species_file)
    if extension != ".pickle" or species_name in exclude:
        continue

    print(species_name)
    # NOTE(security): unpickling can execute arbitrary code. Only load
    # ensemble files that were generated by this project's own pipeline.
    with open(os.path.join(ENSEMBLE_DIR, species_file), 'rb') as infile:
        ensembles[species_name] = load(infile)

    # load the feature importance data for all species
    importances[species_name] = pd.read_csv(
        os.path.join(CLASSIFICATION_DIR, species_name + '_feature_importance.csv'),
        sep=',', index_col=0)

    # save the cluster membership information
    cluster_membership[species_name] = pd.read_csv(
        os.path.join(CLASSIFICATION_DIR, species_name + '_clusters.csv'),
        sep=',')

    # load the collapsed feature sets
    with open(os.path.join(COLLAPSED_FEATURE_DIR, species_name + '_collapsed_features.json'), 'r') as jsonfile:
        collapsed_features[species_name] = json.load(jsonfile)

Streptococcus equinus
Aeromonas salmonicida
Flavobacterium johnsoniae
Ralstonia solanacearum
Streptococcus mitis
Bacillus pumilus
Enterococcus faecalis
Pseudomonas putida
Haemophilus parasuis
Chryseobacterium gleum
Staphylococcus haemolyticus
Neisseria mucosa
Listeria monocytogenes
Neisseria flavescens
Streptococcus pneumoniae
Haemophilus influenzae
Sphingobacterium spiritivorum
Ralstonia pickettii
Corynebacterium glutamicum
Staphylococcus epidermidis
Pseudomonas mendocina
Streptococcus gallolyticus
Staphylococcus aureus
Streptococcus oralis
Neisseria meningitidis
Corynebacterium efficiens
Streptococcus vestibularis
Pseudomonas stutzeri
Listeria seeligeri


In [40]:
# Gather, per individual reaction, the values needed to average fractional
# importance across species.
# Each row label of a species' importance table names a collapsed feature set;
# every reaction in that set is credited with the row's 'importance' and with
# the absolute gap between its 'fraction active in 0' and
# 'fraction active in 1' columns.
reaction_fractional_importances = {}
reaction_fractional_differences = {}
for species, importance_table in importances.items():
    for reaction in importance_table.index.values:
        # all members of the collapsed feature set labelled by `reaction`
        members = collapsed_features[species][reaction]
        # keep only the text before the first underscore (drops the
        # upper/lower bound and _c suffix); the set removes the duplicates
        # this creates
        base_ids = {member.split('_')[0] for member in members}

        # values shared by every member of this feature set
        feature_row = importance_table.loc[reaction]
        importance = feature_row['importance']
        difference = abs(feature_row['fraction active in 0'] -
                         feature_row['fraction active in 1'])

        for r in base_ids:
            # setdefault creates the list the first time a reaction is seen
            reaction_fractional_importances.setdefault(r, []).append(importance)
            reaction_fractional_differences.setdefault(r, []).append(difference)

In [65]:
# Reduce the per-reaction lists to one summary value each: the mean importance,
# the mean activity difference, and how many values were collected (stored
# under 'species count').
mean_fractional_importances = {
    rxn: np.mean(values)
    for rxn, values in reaction_fractional_importances.items()
}
mean_fractional_differences = {
    rxn: np.mean(values)
    for rxn, values in reaction_fractional_differences.items()
}
reaction_model_counts = {
    rxn: len(values)
    for rxn, values in reaction_fractional_importances.items()
}

# one column per summary statistic, one row per reaction
for_df = {
    'mean fractional importance': mean_fractional_importances,
    'mean fractional difference': mean_fractional_differences,
    'species count': reaction_model_counts,
}
means_as_df = pd.DataFrame.from_dict(for_df)

In [66]:
# rank reactions from highest to lowest mean fractional importance
ranking_column = 'mean fractional importance'
sorted_df = means_as_df.sort_values(ranking_column, ascending=False)
sorted_df

Unnamed: 0,mean fractional difference,mean fractional importance,species count
rxn11676,0.361326,0.089114,7
rxn05291,0.264500,0.085019,11
rxn09016,0.259828,0.084754,3
rxn00541,0.325900,0.078196,5
rxn00356,0.550151,0.077244,2
rxn00213,0.183506,0.067700,3
rxn00214,0.330547,0.060542,2
rxn10139,0.155914,0.058693,1
rxn01919,0.287927,0.057819,11
rxn12298,0.283202,0.055131,11
