## Setup

In [1]:
import pandas as pd
import config
import numpy as np

In [61]:
thresh = 0.00001 # 1e-5
species = pd.read_csv(config.TRAIN_DIR + "taxonomy.csv", index_col = 0).T
pathways = pd.read_csv(config.TRAIN_DIR + "pathways.csv", index_col = 0).T
X = (pd.concat([species, pathways], axis = 1) > thresh) * 1
X.head()

Unnamed: 0,s__Abiotrophia_defectiva,s__Acidaminococcus_fermentans,s__Acidaminococcus_intestini,s__Actinomyces_graevenitzii,s__Actinomyces_odontolyticus,s__Actinomyces_oris,s__Actinomyces_turicensis,s__Actinomyces_viscosus,s__Adlercreutzia_equolifaciens,s__Aggregatibacter_segnis,...,TRNA-CHARGING-PWY: tRNA charging,TRPSYN-PWY: L-tryptophan biosynthesis,TYRFUMCAT-PWY: L-tyrosine degradation I,UBISYN-PWY: superpathway of ubiquinol-8 biosynthesis (prokaryotic),UDPNACETYLGALSYN-PWY: UDP-N-acetyl-D-glucosamine biosynthesis II,UDPNAGSYN-PWY: UDP-N-acetyl-D-glucosamine biosynthesis I,URDEGR-PWY: superpathway of allantoin degradation in plants,URSIN-PWY: ureide biosynthesis,VALDEG-PWY: L-valine degradation I,VALSYN-PWY: L-valine biosynthesis
SAMD00036192,0,0,0,1,1,0,0,0,0,0,...,1,1,0,1,0,1,0,0,0,1
SAMD00036193,0,0,0,1,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
SAMD00036194,0,0,0,0,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
SAMD00036197,0,0,0,1,1,0,0,1,1,0,...,1,1,0,0,0,1,0,0,0,1
SAMD00036204,0,0,0,1,1,1,0,0,1,0,...,1,1,0,1,0,1,1,0,0,1


In [62]:
# How sparse is X?
X.sum().sum() / (X.shape[0] * X.shape[1])

0.41999011764898053

In [63]:
y = pd.read_csv(config.TRAIN_DIR + "isHealthy.csv", index_col=0).T
y.head()

Unnamed: 0,isHealthy
SAMD00036192,True
SAMD00036193,True
SAMD00036194,False
SAMD00036197,True
SAMD00036204,True


## GMHI 2.0 model Class

In [120]:
import numpy as np
from sklearn.base import BaseEstimator

class GMHI2(BaseEstimator):
    def __init__(self):
        self.fitted = False
    
    def fit(self, X, y):
        """X is dataframe (num_examples, num_features). y is dataframe (num_examples, 1)"""
        self.fitted = True
        
        # convert to numpy
        X = X.values
        y = y.values
        
        m = X.shape[0] # examples
        n = X.shape[1] # features
        
        # get healthy and unhealthy samples
        healthies = X[y.flatten(), :]
        unhealthies = X[~y.flatten(), :]
        
        # get number of healthy and unhealthy samples
        num_healthy = healthies.shape[0]
        num_unhealthy = unhealthies.shape[0]
        
        # for each feature, see the proportion of samples with a 1 as its value for that feature
        prop_healthy = healthies.mean(axis=0)
        prop_unhealthy = unhealthies.mean(axis=0)
        
        # to avoid divide by zero, replace zero with smallest possible nonzero proportion for each feature
        prop_healthy[prop_healthy == 0] = 1 / num_healthy
        prop_unhealthy[prop_unhealthy == 0] = 1 / num_unhealthy
        
        # calculate theta with difference and fold changes
        diff = prop_healthy - prop_unhealthy
        foldh = prop_healthy / prop_unhealthy
        foldn = prop_unhealthy / prop_healthy
        
        theta = diff * np.log(np.maximum(foldh, foldn))
        theta_positive = theta.copy()
        theta_negative = theta.copy()
        
        theta_positive[theta_positive < 0] = 0
        theta_negative[theta_negative > 0] = 0
        
        score_positive = (healthies @ theta_positive).mean()
        score_negative = abs((unhealthies @ theta_negative).mean())
        
        theta_positive /= score_positive
        theta_negative /= score_negative
        
        self.theta = theta_positive + theta_negative
    
    def predict(self, X):
        """Returns predictions for X"""
        if not self.fitted:
            return None
        return (X @ self.theta) > 0

In [121]:
with open(config.OUTPUT_DIR + "features.txt") as f:
    features = f.read().splitlines()
features[:20]

['s__Acidaminococcus_fermentans',
 's__Acidaminococcus_intestini',
 's__Actinomyces_odontolyticus',
 's__Alistipes_finegoldii',
 's__Alistipes_indistinctus',
 's__Alistipes_putredinis',
 's__Alistipes_senegalensis',
 's__Alistipes_shahii',
 's__Alistipes_sp_AP11',
 's__Anaerostipes_hadrus',
 's__Anaerotruncus_colihominis',
 's__Atopobium_parvulum',
 's__Bacteroidales_bacterium_ph8',
 's__Bacteroides_caccae',
 's__Bacteroides_cellulosilyticus',
 's__Bacteroides_clarus',
 's__Bacteroides_coprocola',
 's__Bacteroides_dorei',
 's__Bacteroides_eggerthii',
 's__Bacteroides_finegoldii']

In [122]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

X_reduce = X[features]

# Instantiate
model = GMHI2()

# Use 10 fold cv, shuffle data
kfold = KFold(10, shuffle=True, random_state=42)

scores_bal = cross_val_score(model, X_reduce, y, cv=kfold, scoring = "balanced_accuracy")
display(scores_bal, np.mean(scores_bal))

array([0.73276209, 0.71341463, 0.71844328, 0.69977452, 0.67994505,
       0.69983352, 0.71426347, 0.72619387, 0.71706454, 0.72739563])

0.7129090608061435

In [112]:
thresh = 0.00001 # 1e-5
species_val = pd.read_csv(config.VAL_DIR + "taxonomy241.csv", index_col = 0).T
pathways_val = pd.read_csv(config.VAL_DIR + "pathways241.csv", index_col = 0).T
X_val = (pd.concat([species_val, pathways_val], axis = 1) > thresh) * 1
X_val.head()

Unnamed: 0,s__Abiotrophia_defectiva,s__Acidaminococcus_fermentans,s__Acidaminococcus_intestini,s__Actinomyces_graevenitzii,s__Actinomyces_odontolyticus,s__Actinomyces_oris,s__Actinomyces_turicensis,s__Actinomyces_viscosus,s__Adlercreutzia_equolifaciens,s__Aggregatibacter_segnis,...,TRNA-CHARGING-PWY: tRNA charging,TRPSYN-PWY: L-tryptophan biosynthesis,TYRFUMCAT-PWY: L-tyrosine degradation I,UBISYN-PWY: superpathway of ubiquinol-8 biosynthesis (prokaryotic),UDPNACETYLGALSYN-PWY: UDP-N-acetyl-D-glucosamine biosynthesis II,UDPNAGSYN-PWY: UDP-N-acetyl-D-glucosamine biosynthesis I,URDEGR-PWY: superpathway of allantoin degradation in plants,URSIN-PWY: ureide biosynthesis,VALDEG-PWY: L-valine degradation I,VALSYN-PWY: L-valine biosynthesis
RHB_RAM_0004,0,0,0,0,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
RHB_RAM_0007,0,0,0,0,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
RHB_RAM_0013,0,0,0,0,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
RHB_RAM_0015,0,0,0,0,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
RHB_RAM_0017,0,0,0,0,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1


In [113]:
y_val = pd.read_csv(config.VAL_DIR + "isHealthy241.csv", index_col=0)
y_val.head()

Unnamed: 0_level_0,isHealthy
Sample_ID,Unnamed: 1_level_1
RHB_RAM_0004,False
RHB_RAM_0007,False
RHB_RAM_0013,False
RHB_RAM_0015,False
RHB_RAM_0017,False


In [114]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
X_val_reduce = X_val[features]

model = GMHI2()
model.fit(X_reduce, y)
y_hat_val = model.predict(X_val_reduce)
score_bal = balanced_accuracy_score(y_hat_val, y_val)
score_bal

0.6432155760738659