In [1]:
# This notebook demonstrates that there is no meaningful difference between classifier performance
# assessed using a 70/30 training/test split and the out-of-bag accuracy obtained
# when using 100% of the data as RF input (i.e., the traditional RF approach).

import os
import medusa
from pickle import load
import pandas as pd
import json
import numpy

from sklearn.ensemble import RandomForestClassifier


In [2]:
# Load the gene knockout dataframes: one CSV per species, keyed by the species
# name taken from the file name (the text before the first '.').
knockout_dir = '../results/gene_knockouts/'
gene_knockout_results = {}
# sorted() makes the species order (and therefore all downstream printed output)
# independent of the platform-specific order returned by os.listdir.
for fname in sorted(os.listdir(knockout_dir)):
    # Skip anything that is not a CSV (hidden files, notebook checkpoints, ...).
    # Without this guard such entries are turned into a '<name>.csv' path that
    # does not exist and read_csv raises.
    if not fname.endswith('.csv'):
        continue
    species = fname.split('.')[0]
    # Read the file that was actually listed instead of re-assembling its name.
    gene_knockout_results[species] = pd.read_csv(
        os.path.join(knockout_dir, fname), sep=',', index_col=0)

In [3]:
# Load the pickled ensemble for every species except those named in `exclude`;
# the ensembles are needed by the later cells.
# NOTE(review): unpickling executes code embedded in the file. Only run this
# against ensemble files that were generated locally / come from a trusted source.
ensemble_dir = "../results/deep_ensembles/"
exclude = ['Bacillus megaterium','Stenotrophomonas maltophilia']
ensembles = {}
# sorted() keeps the species order independent of the os.listdir order.
for species_file in sorted(os.listdir(ensemble_dir)):
    # Ignore directory entries that are not pickles (hidden files, checkpoints);
    # otherwise they are rewritten into a '<name>.pickle' path that may not exist.
    if not species_file.endswith(".pickle"):
        continue
    species_name = species_file.split(".")[0]
    if species_name in exclude:
        continue
    with open(os.path.join(ensemble_dir, species_file), 'rb') as infile:
        ensembles[species_name] = load(infile)

In [4]:
# Build one boolean feature-state table per species from its ensemble.
# Each column is a feature id; the column values are that feature's states.
feature_frames = {}
for species, ensemble in ensembles.items():
    # Map every feature id to its states, then let pandas assemble the table.
    states_by_feature = {feature.id: feature.states for feature in ensemble.features}
    feature_frame = pd.DataFrame.from_dict(states_by_feature)

    # Cast to bool for faster computation: a state of 0 becomes False and any
    # other value becomes True (i.e. the feature is active).
    feature_frames[species] = feature_frame.astype(bool)

In [5]:
# Collapse perfectly correlated features.
# For every species, feature_sets[species] maps a representative feature (the
# first one, in column order, with a given state pattern) to the list of all
# features whose column is identical to it (the representative included).
#
# Columns are grouped in a single pass by hashing their raw contents instead of
# comparing every pair of columns with Series.equals. The groups and their order
# are the same, but the cost is linear in the number of features. This also
# removes the original `is not` test, which compared strings by identity.
# The byte-level key is exact for the boolean frames built earlier in this
# notebook; it would not be reliable for object-dtype columns.
feature_sets = {}
for species, frame in feature_frames.items():
    groups = {}
    for feature in frame.columns:
        column = frame[feature]
        # dtype is part of the key because Series.equals also requires equal dtypes.
        key = (str(column.dtype), column.values.tobytes())
        groups.setdefault(key, []).append(feature)
    # The first member of each group is the feature that was seen first.
    feature_sets[species] = {members[0]: members for members in groups.values()}


In [6]:
# Keep only one representative column per group of perfectly correlated features.
# The keys of feature_sets[species] are exactly those representatives.
for species, frame in feature_frames.items():
    representatives = list(feature_sets[species].keys())
    feature_frames[species] = frame[representatives]

In [7]:
# Threshold the gene essentiality simulations and collapse perfectly correlated
# genes into a single variable.
# collapsed_knockouts[species]: boolean frame with one column per representative gene.
# gene_sets[species]: representative gene -> list of all genes with an identical
#                     knockout profile (the representative included).
collapsed_knockouts = {}
gene_sets = {}
for species, knockout_values in gene_knockout_results.items():
    # Binarize as essential/nonessential: True means growth when knocked out
    # (simulated value above the 1E-6 threshold).
    grows = knockout_values > 1E-6
    # Drop genes (rows) that have the same outcome in every column -- all False or
    # all True -- since they carry no information, then transpose so that genes
    # become the columns.
    n_growing = grows.sum(axis=1)
    informative = (n_growing > 0) & (n_growing < len(grows.columns))
    collapsed_knockout = grows.loc[informative].T

    print(species)
    print("Genes before collapse: " + str(len(collapsed_knockout.columns)))

    # Group genes by the raw contents of their column in a single pass. This gives
    # the same groups, in the same order, as comparing every pair of columns with
    # Series.equals, without the quadratic scan and without comparing gene names
    # by identity (`is not`). The byte-level key is exact here because the frame
    # is boolean after thresholding.
    groups = {}
    for gene in collapsed_knockout.columns:
        profile = collapsed_knockout[gene]
        key = (str(profile.dtype), profile.values.tobytes())
        groups.setdefault(key, []).append(gene)
    # The first gene seen with a given profile represents its group.
    gene_sets[species] = {members[0]: members for members in groups.values()}

    collapsed_knockouts[species] = collapsed_knockout[list(gene_sets[species].keys())]
    print("Genes after collapse: " + str(len(collapsed_knockouts[species].columns)))


Pseudomonas putida
Genes before collapse: 25
Genes after collapse: 9
Haemophilus parasuis
Genes before collapse: 28
Genes after collapse: 21
Neisseria meningitidis
Genes before collapse: 34
Genes after collapse: 19
Neisseria mucosa
Genes before collapse: 24
Genes after collapse: 15
Streptococcus gallolyticus
Genes before collapse: 9
Genes after collapse: 8
Haemophilus influenzae
Genes before collapse: 17
Genes after collapse: 12
Sphingobacterium spiritivorum
Genes before collapse: 20
Genes after collapse: 12
Streptococcus vestibularis
Genes before collapse: 12
Genes after collapse: 11
Streptococcus oralis
Genes before collapse: 17
Genes after collapse: 14
Listeria monocytogenes
Genes before collapse: 14
Genes after collapse: 9
Staphylococcus haemolyticus
Genes before collapse: 14
Genes after collapse: 11
Streptococcus equinus
Genes before collapse: 6
Genes after collapse: 6
Corynebacterium glutamicum
Genes before collapse: 20
Genes after collapse: 11
Pseudomonas mendocina
Genes before 

In [8]:
# For each species, load the previously saved cluster assignments and compare two
# accuracy estimates for a random forest that predicts the cluster from the
# collapsed feature states:
#   * the out-of-bag (OOB) accuracy of a forest fit on a random 70% of the rows
#   * the accuracy of that same forest on the held-out 30%
# No distances, PCoA or k-means are computed here; the clusters are read from
# '../results/classification_results/<species>_clusters.csv'.
RANDOM_SEED = 0  # fixed so that the split and the forest are reproducible across runs
train_classifiers = {}
for species in collapsed_knockouts.keys():
    if species in ["Stenotrophomonas maltophilia"]:
        continue
    # Species that were excluded when the ensembles were loaded have no feature
    # frame; skip them instead of failing with a KeyError below.
    if species not in feature_frames:
        continue
    # Only species with more than two collapsed gene variables are classified.
    if len(collapsed_knockouts[species].keys()) <= 2:
        continue

    cluster_assignments = pd.read_csv('../results/classification_results/'+species+'_clusters.csv',
                                      sep=',',
                                      index_col=0)
    cluster_assignments.columns = ['cluster']

    # Join cluster labels and feature states on the row index
    # (presumably one row per ensemble member -- TODO confirm).
    feature_frame = feature_frames[species]
    input_cols = feature_frame.columns
    all_data = cluster_assignments.merge(feature_frame, left_index=True, right_index=True)

    # Create a 70/30 train/test split.
    trainset = all_data.sample(frac=0.7, random_state=RANDOM_SEED)  # sample 70% of the data
    testset = all_data[~all_data.index.isin(trainset.index)]  # the other 30% is the test set

    # Fit the random forest on the training rows only; oob_score=True makes the
    # out-of-bag accuracy available as .oob_score_.
    train_classifiers[species] = RandomForestClassifier(
        n_estimators=500, oob_score=True, random_state=RANDOM_SEED).fit(
        trainset[input_cols], trainset['cluster'])

    # Report the OOB accuracy next to the accuracy on the held-out test set.
    print(species,
          "OOB accuracy: " + str(train_classifiers[species].oob_score_),
          "test set accuracy: " +
          str(train_classifiers[species].score(testset[input_cols], testset['cluster'])))


Pseudomonas putida OOB accuracy: 0.988448844884 test set accuracy: 0.988416988417
Haemophilus parasuis OOB accuracy: 0.988555078684 test set accuracy: 0.986622073579
Neisseria meningitidis OOB accuracy: 0.992753623188 test set accuracy: 0.983050847458
Neisseria mucosa OOB accuracy: 0.985337243402 test set accuracy: 0.98976109215
Streptococcus gallolyticus OOB accuracy: 0.989928057554 test set accuracy: 0.98322147651
Haemophilus influenzae OOB accuracy: 0.989781021898 test set accuracy: 0.993197278912
Streptococcus oralis OOB accuracy: 0.997101449275 test set accuracy: 0.996621621622
Streptococcus pneumoniae OOB accuracy: 0.981348637016 test set accuracy: 0.983277591973
Streptococcus vestibularis OOB accuracy: 0.997138769671 test set accuracy: 1.0
Staphylococcus haemolyticus OOB accuracy: 0.984240687679 test set accuracy: 0.969899665552
Streptococcus equinus OOB accuracy: 0.974063400576 test set accuracy: 0.976430976431
Corynebacterium glutamicum OOB accuracy: 0.991253644315 test set ac