## Setup

In [219]:
import pandas as pd
import config
import numpy as np

In [220]:
thresh = 0.00001 # 1e-5
species = pd.read_csv(config.TRAIN_DIR + "taxonomy.csv", index_col = 0).T
# pathways = pd.read_csv(config.TRAIN_DIR + "pathways.csv", index_col = 0).T
X = (pd.concat([species], axis = 1) > thresh) * 1
X.head()

Species,s__Abiotrophia_defectiva,s__Acidaminococcus_fermentans,s__Acidaminococcus_intestini,s__Actinomyces_graevenitzii,s__Actinomyces_odontolyticus,s__Actinomyces_oris,s__Actinomyces_turicensis,s__Actinomyces_viscosus,s__Adlercreutzia_equolifaciens,s__Aggregatibacter_segnis,...,s__Subdoligranulum_variabile,s__Succinatimonas_hippei,s__Sutterella_wadsworthensis,s__Turicibacter_sanguinis,s__Varibaculum_cambriense,s__Veillonella_atypica,s__Veillonella_dispar,s__Veillonella_parvula,s__Weissella_cibaria,s__Weissella_confusa
SAMD00036192,0,0,0,1,1,0,0,0,0,0,...,0,0,1,0,0,1,1,1,0,0
SAMD00036193,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
SAMD00036194,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
SAMD00036197,0,0,0,1,1,0,0,1,1,0,...,0,0,1,0,0,1,1,1,0,0
SAMD00036204,0,0,0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [221]:
# How sparse is X?
X.sum().sum() / (X.shape[0] * X.shape[1])

0.2288177884788525

In [222]:
y = pd.read_csv(config.TRAIN_DIR + "isHealthy.csv", index_col=0).T
y.head()

Unnamed: 0,isHealthy
SAMD00036192,True
SAMD00036193,True
SAMD00036194,False
SAMD00036197,True
SAMD00036204,True


## GMHI 2.0 model Class

In [269]:
import numpy as np
from sklearn.base import BaseEstimator

class GMHI2(BaseEstimator):
    def __init__(self):
        self.fitted = False
    
    def fit(self, X, y):
        """X is dataframe (num_examples, num_features). y is dataframe (num_examples, 1)"""
        self.fitted = True
        
        # convert to numpy
        X = X.values
        y = y.values
        
        m = X.shape[0] # examples
        n = X.shape[1] # features
        
        # get healthy and unhealthy samples
        healthies = X[y.flatten(), :]
        unhealthies = X[~y.flatten(), :]
        
        # get number of healthy and unhealthy samples
        num_healthy = healthies.shape[0]
        num_unhealthy = unhealthies.shape[0]
        
        # for each feature, see the proportion of samples with a 1 as its value for that feature
        prop_healthy = healthies.mean(axis=0)
        prop_unhealthy = unhealthies.mean(axis=0)
        
        # to avoid divide by zero, replace zero with smallest possible nonzero proportion for each feature
        prop_healthy[prop_healthy == 0] = 1 / num_healthy
        prop_unhealthy[prop_unhealthy == 0] = 1 / num_unhealthy
        
        # calculate theta with difference and fold changes
        diff = prop_healthy - prop_unhealthy
        foldh = prop_healthy / prop_unhealthy
        foldn = prop_unhealthy / prop_healthy
        
#         diff[diff < 0] = -1
#         diff[diff > 0] = 1
        
        theta = diff * np.log(np.maximum(foldh, foldn))
        theta_positive = theta.copy()
        theta_negative = theta.copy()
        
        theta_positive[theta_positive < 0] = 0
        theta_negative[theta_negative > 0] = 0
        
        score_positives = (healthies @ theta_positive)
        score_negatives = -1 * (unhealthies @ theta_negative)
        score_positives.sort()
        score_negatives.sort()
        score_positive = score_positives[-20]
        score_negative = score_negatives[-20]
        print(score_positives, score_negatives)
        
        theta_positive /= score_positive
        theta_negative /= score_negative
        
        self.theta = theta_positive + theta_negative
    
    def predict(self, X):
        """Returns predictions for X"""
        if not self.fitted:
            return None
        return (X @ self.theta) > 0

In [272]:
with open(config.OUTPUT_DIR + "features.txt") as f:
    features = f.read().splitlines()
features

['s__Acidaminococcus_fermentans',
 's__Acidaminococcus_intestini',
 's__Actinomyces_odontolyticus',
 's__Actinomyces_turicensis',
 's__Actinomyces_viscosus',
 's__Aggregatibacter_segnis',
 's__Akkermansia_muciniphila',
 's__Alistipes_finegoldii',
 's__Alistipes_indistinctus',
 's__Alistipes_onderdonkii',
 's__Alistipes_putredinis',
 's__Alistipes_senegalensis',
 's__Alistipes_shahii',
 's__Alistipes_sp_AP11',
 's__Alistipes_sp_HGB5',
 's__Alloscardovia_omnicolens',
 's__Anaerofustis_stercorihominis',
 's__Anaeroglobus_geminatus',
 's__Anaerostipes_hadrus',
 's__Anaerotruncus_colihominis',
 's__Atopobium_parvulum',
 's__Atopobium_rimae',
 's__Bacteroidales_bacterium_ph8',
 's__Bacteroides_caccae',
 's__Bacteroides_cellulosilyticus',
 's__Bacteroides_clarus',
 's__Bacteroides_coprocola',
 's__Bacteroides_coprophilus',
 's__Bacteroides_dorei',
 's__Bacteroides_eggerthii',
 's__Bacteroides_faecis',
 's__Bacteroides_finegoldii',
 's__Bacteroides_fragilis',
 's__Bacteroides_intestinalis',
 '

In [273]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

X_reduce = X[features]

# Instantiate
model = GMHI2()

# Use 10 fold cv, shuffle data
kfold = KFold(10, shuffle=True, random_state=42)

bal_accs = cross_val_score(model, X_reduce, y, cv=kfold, scoring = "balanced_accuracy")
# accs = cross_val_score(model, X_reduce, y, cv=kfold, scoring = "accuracy")
display(bal_accs, np.mean(bal_accs))
# display(accs, np.mean(accs))

[0.02077554 0.03226059 0.05509467 ... 1.52274922 1.53224072 1.61200718] [4.35403904e-05 1.36729614e-02 2.17601948e-02 ... 7.14183452e+00
 7.42916842e+00 7.56409806e+00]
[0.03454498 0.05400969 0.05588268 ... 1.54800243 1.55457897 1.6225231 ] [1.63492394e-04 9.25997240e-03 2.59661162e-02 ... 7.42508058e+00
 7.45621366e+00 7.63944496e+00]
[0.01882776 0.02953612 0.05707159 ... 1.50150051 1.50251789 1.56668211] [2.70519844e-04 1.29017253e-02 2.83199717e-02 ... 7.47231351e+00
 7.54893949e+00 7.66012839e+00]
[0.01511736 0.02670491 0.04494777 ... 1.47003743 1.48383513 1.54682609] [3.32451125e-04 1.62974797e-02 3.28585312e-02 ... 7.80385069e+00
 7.83275863e+00 7.98145522e+00]
[0.01701611 0.03019349 0.04861724 ... 1.45690728 1.47711131 1.48387938] [4.84064695e-05 1.65048117e-02 2.95209333e-02 ... 7.51873499e+00
 7.56925013e+00 7.69477876e+00]
[0.01733603 0.02844057 0.04774265 ... 1.48720383 1.50064396 1.55980022] [8.50299514e-05 1.08418329e-02 3.26531131e-02 ... 7.61296390e+00
 7.68841229e+00 7.

array([0.68207615, 0.673474  , 0.6990008 , 0.67775665, 0.65549451,
       0.65477248, 0.68057348, 0.69247051, 0.68364279, 0.69304667])

0.6792308022813375

In [274]:
thresh = 0.00001 # 1e-5
species_val = pd.read_csv(config.VAL_DIR + "taxonomy679.csv", index_col = 0).T
# pathways_val = pd.read_csv(config.VAL_DIR + "pathways241.csv", index_col = 0).T
X_val = (pd.concat([species_val], axis = 1) > thresh) * 1
X_val.head()

Unnamed: 0,s__Abiotrophia_defectiva,s__Acidaminococcus_fermentans,s__Acidaminococcus_intestini,s__Actinomyces_graevenitzii,s__Actinomyces_odontolyticus,s__Actinomyces_oris,s__Actinomyces_turicensis,s__Actinomyces_viscosus,s__Adlercreutzia_equolifaciens,s__Aggregatibacter_segnis,...,s__Subdoligranulum_variabile,s__Succinatimonas_hippei,s__Sutterella_wadsworthensis,s__Turicibacter_sanguinis,s__Varibaculum_cambriense,s__Veillonella_atypica,s__Veillonella_dispar,s__Veillonella_parvula,s__Weissella_cibaria,s__Weissella_confusa
RHB_RAM_0004,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
RHB_RAM_0007,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
RHB_RAM_0013,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
RHB_RAM_0015,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
RHB_RAM_0017,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,1,0,0


In [275]:
y_val = pd.read_csv(config.VAL_DIR + "isHealthy679.csv", index_col=0)
y_val.sum()

isHealthy    118
dtype: int64

In [277]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
X_val_reduce = X_val[features]

model = GMHI2()
model.fit(X_reduce, y)
y_hat_val = model.predict(X_val_reduce)
# acc = accuracy_score(y_hat_val, y_val)
bal_acc = balanced_accuracy_score(y_hat_val, y_val)
bal_acc

[0.01785049 0.02888577 0.04903915 ... 1.48550109 1.49132937 1.56155311] [9.25570870e-05 1.31500443e-02 2.64602577e-02 ... 7.51099011e+00
 7.56405114e+00 7.70527013e+00]


0.6329375457683859