## Setup

In [164]:
import pandas as pd
import numpy as np
import config

## Read data

In [165]:
species = pd.read_csv(config.TRAIN_DIR + "taxonomy.csv", index_col = 0)
species[species < 0.00001] = 0
X = species.T
X.head()

Species,s__Abiotrophia_defectiva,s__Acidaminococcus_fermentans,s__Acidaminococcus_intestini,s__Actinomyces_graevenitzii,s__Actinomyces_odontolyticus,s__Actinomyces_oris,s__Actinomyces_turicensis,s__Actinomyces_viscosus,s__Adlercreutzia_equolifaciens,s__Aggregatibacter_segnis,...,s__Subdoligranulum_variabile,s__Succinatimonas_hippei,s__Sutterella_wadsworthensis,s__Turicibacter_sanguinis,s__Varibaculum_cambriense,s__Veillonella_atypica,s__Veillonella_dispar,s__Veillonella_parvula,s__Weissella_cibaria,s__Weissella_confusa
SAMD00036192,0.0,0.0,0.0,0.003491,0.000377,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.006551,0.0,0.0,0.000979,4.8e-05,0.003579,0.0,0.0
SAMD00036193,0.0,0.0,0.0,0.00011,0.0,0.0,0.0,0.0,0.000614,0.0,...,0.0,0.0,0.000183,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SAMD00036194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001276,0.0,...,0.0,0.0,0.001943,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SAMD00036197,0.0,0.0,0.0,0.000435,1.4e-05,0.0,0.0,5.1e-05,4.7e-05,0.0,...,0.0,0.0,0.007306,0.0,0.0,0.0019,0.000175,0.000812,0.0,0.0
SAMD00036204,0.0,0.0,0.0,0.001297,2.8e-05,4.1e-05,0.0,0.0,0.001195,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [166]:
isHealthy = pd.read_csv(config.TRAIN_DIR + "isHealthy.csv", index_col=0)
y = isHealthy.T
y.head()

Unnamed: 0,isHealthy
SAMD00036192,True
SAMD00036193,True
SAMD00036194,False
SAMD00036197,True
SAMD00036204,True


## Demo GMHI!!

In [167]:
from sklearn.base import BaseEstimator

class GMHI(BaseEstimator):
    def __init__(self, theta_f = 1.4, theta_d = 0.1, thresh = 0.00001, use_shannon = True,
                R_MH = 7, R_MN = 31):
        self.theta_f = theta_f
        self.theta_d = theta_d
        self.thresh = 0.00001
        self.use_shannon = use_shannon
        self.fitted = False
        self.R_MH = R_MH
        self.R_MN = R_MN
    
    def fit(self, X, y):
        """
            X is a df, (num_examples, num_features)
            y is a df, (num_examples, 1)
            X and y have column names indicating species names
            Selects health abundant and health scarce species based on differences and fold changes
        """
        
        self.fitted = True
        
        # get healthy and unhealthy samples
        healthies = X.iloc[y.values, :]
        unhealthies = X.iloc[~y.values, :]
        
        # get proportions for each species
        proportion_healthy = self.get_proportions(healthies)
        proportion_unhealthy = self.get_proportions(unhealthies)
        
        # get differences and fold change
        diff = proportion_healthy - proportion_unhealthy
        fold = proportion_healthy / proportion_unhealthy
        
        # based on proportion differences and fold change, select health abundant
        # and health scarce
        self.health_abundant = self.select_species(diff, fold)
        self.health_scarce = self.select_species(-1 * diff, 1 / fold)
        
    def select_species(self, diff, fold):
        return list(diff[
            (diff['Proportion'] > self.theta_d) & (fold['Proportion'] > self.theta_f)
        ].index)
        
    def get_proportions(self, df):
        p = (df > self.thresh).sum() / df.shape[0]
        proportion = pd.DataFrame({"Proportion" : p})
        return proportion
    
    def predict(self, X):
        """
            X is a df, (num_examples, num_features)
            X has column names indicating species names
        """
        if not self.fitted:
            return None
        X_healthy_features = X[self.health_abundant]
        X_unhealthy_features = X[self.health_scarce]
        psi_MH = self.get_psi(X_healthy_features) / self.R_MH
        psi_MN = self.get_psi(X_unhealthy_features) / self.R_MN
        return np.log((psi_MH + 0.00001) / (psi_MN) + 0.00001) > 0
        
    def get_psi(self, X):
        psi = self.richness(X)
        if self.use_shannon:
            psi *= self.shannon(X)
        return psi
        
    def get_species(self):
        """
            Returns the lists of health abundant and health scarce species as a tuple, if fitted
        """
        if not self.fitted:
            return None
        return self.health_abundant, self.health_scarce
    
    def richness(self, X):
        frame = pd.DataFrame((X > 0).sum(axis=1))
        return frame
    
    def shannon(self, X):
        logged = X.copy()
        logged[logged > 0] = np.log(logged[logged > 0])
        shannoned = logged * X * -1
        sums = shannoned.sum(axis=1)
        sums = pd.DataFrame(sums)
        return sums

In [168]:
from sklearn.metrics import balanced_accuracy_score
gmhi = GMHI(use_shannon=True)
gmhi.fit(X, y)
y_hat = gmhi.predict(X)
score = balanced_accuracy_score(y, y_hat)
score

0.7005453864976157