## Setup

In [1]:
import pandas as pd
data_dir = "../data/"
output_dir = "../output/"

In [15]:
thresh = 0.00001 # 1e-5
species = pd.read_csv(data_dir + "taxonomy.csv", index_col = 0).T
pathways = pd.read_csv(data_dir + "pathways.csv", index_col = 0).T
X = (pd.concat([species, pathways], axis = 1) > thresh) * 1
X.head()

Unnamed: 0,s__Abiotrophia_defectiva,s__Acidaminococcus_fermentans,s__Acidaminococcus_intestini,s__Actinomyces_graevenitzii,s__Actinomyces_odontolyticus,s__Actinomyces_oris,s__Actinomyces_turicensis,s__Actinomyces_viscosus,s__Adlercreutzia_equolifaciens,s__Aggregatibacter_segnis,...,TRNA-CHARGING-PWY: tRNA charging,TRPSYN-PWY: L-tryptophan biosynthesis,TYRFUMCAT-PWY: L-tyrosine degradation I,UBISYN-PWY: superpathway of ubiquinol-8 biosynthesis (prokaryotic),UDPNACETYLGALSYN-PWY: UDP-N-acetyl-D-glucosamine biosynthesis II,UDPNAGSYN-PWY: UDP-N-acetyl-D-glucosamine biosynthesis I,URDEGR-PWY: superpathway of allantoin degradation in plants,URSIN-PWY: ureide biosynthesis,VALDEG-PWY: L-valine degradation I,VALSYN-PWY: L-valine biosynthesis
SAMD00036192,0,0,0,1,1,0,0,0,0,0,...,1,1,0,1,0,1,0,0,0,1
SAMD00036193,0,0,0,1,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
SAMD00036194,0,0,0,0,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
SAMD00036197,0,0,0,1,1,0,0,1,1,0,...,1,1,0,0,0,1,0,0,0,1
SAMD00036204,0,0,0,1,1,1,0,0,1,0,...,1,1,0,1,0,1,1,0,0,1


In [16]:
# How sparse is X?
X.sum().sum() / (X.shape[0] * X.shape[1])

0.41930290214856464

In [8]:
y = pd.read_csv(data_dir + "/isHealthy.csv", index_col=0).T
y.head()

Unnamed: 0,isHealthy
SAMD00036192,True
SAMD00036193,True
SAMD00036194,False
SAMD00036197,True
SAMD00036204,True


## GMHI 2.0 model Class

In [136]:
import numpy as np

class GMHI2:
    def __init__(self, feature_list):
        """
        feature_list is the list of features that the model will use. The model will discard all other features
        """
        self.feature_list = feature_list
    
    def fit(self, X, y):
        """
        X is a boolean dataframe. 1 indicates that the sample has a nonzero val for the microbe/pathway feature
        Discards features not in the feature_list. 
        
        y is an (m x 1) vector with the sample labels, with 1 indicating a healthy sample.
        
        Calculates theta, the weights for each feature with the following equation: 
            weight = ln(fold_change) * diff
        Where fold_change and diff are the fold_change and difference between the 
        """
        
        # Select the features passed in the constructor
        X_reduce = X[self.feature_list]
        
        # Get the proportion fold changes and differences for each feature
        pivot = self.get_pivot(X_reduce, y)
        
        # Calculate the positive and negative weights
        theta_good, theta_bad = self.get_weights(pivot)
        
        # Split features based on whether or not they have a positive or negative weight
        X_good, X_bad = self.split_features(X_reduce, theta_good, theta_bad)
        
        # for vectorized implementation
        theta_goods, theta_bads = self.get_thetas(theta_good), self.get_thetas(theta_bad)
        
        y_hat_goods = X_good @ theta_goods
        y_hat_bads = X_bad @ theta_bads
        display(y_hat_bads.shape)
        
    def get_chi(self, X, y):
        
        
    def score(self, X_test, y_test):
        """
        Returns balanced accuracy using the calculated weights.
        """
        X_test_reduce = X_test[self.feature_list]
    
    def get_pivot(self, X, y):
        """
        Returns a dataframe with the fold changes and differences in proportions between healthy and unhealthy
        samples
        """
        div = 0.0001
        pivot = pd.concat([X[y['isHealthy']].mean(), X[~y['isHealthy']].mean()], axis = 1)
        pivot.columns = ['healthy', 'not']
        pivot['diff'] = pivot['healthy'] - pivot['not']
        pivot['foldh'] = (pivot['healthy'] + div) / (pivot['not'] + div)
        pivot['foldn'] = (pivot['not'] + div) / (pivot['healthy'] + div)
        return pivot
    
    def get_weights(self, pivot):
        """
        Based on proportion fold changes and differences, calculate theta, the weights for each feature
        """
        copy = pivot.copy()
        copy['weight'] = 0
        copy.loc[copy['diff'] > 0, 'weight'] = np.log(copy[copy['diff'] > 0]['foldh'])
        copy.loc[copy['diff'] < 0, 'weight'] = np.log(copy[copy['diff'] < 0]['foldn'])
        copy['theta'] = copy['weight'] * copy['diff']
        theta = copy[['theta']]
        theta_good = theta[theta['theta'] > 0].sort_values('theta', ascending=False)
        theta_bad = theta[theta['theta'] < 0].sort_values('theta', ascending=True)
        return theta_good, theta_bad
    
    def split_features(self, X, theta_good, theta_bad):
        """"
        Split features based on whether or not they have a positive or negative weight
        """
        X_good = X[list(theta_good.index)]
        X_bad = X[list(theta_bad.index)]
        return X_good, X_bad
        
    def get_thetas(self, theta):
        thetas = np.hstack([theta] * theta.shape[0])
        for i in range(theta.shape[0]):
            # keep only the i most important features in ith column vector
            thetas[i:, i] = 0
        return thetas

In [137]:
with open(output_dir + "feature_list.txt") as f:
    feature_list = f.read().splitlines()
feature_list[:5]

['s__Acidaminococcus_fermentans',
 's__Acidaminococcus_intestini',
 's__Actinomyces_odontolyticus',
 's__Alistipes_finegoldii',
 's__Alistipes_indistinctus']

In [138]:
model = GMHI2(feature_list)
model.fit(X, y)

(4347, 94)

#### Split data

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [6]:
from sklearn.model_selection import KFold
X_fold = X.copy()
y_fold = y.copy()

In [7]:
kf = KFold(n_splits=10, random_state=42, shuffle=True)
X_fold['fold_index'] = 0
y_fold['fold_index'] = 0
fold_index = 0
for _, fold in kf.split(X, y):
    X_fold.iloc[fold, -1] = fold_index
    y_fold.iloc[fold, -1] = fold_index
    fold_index += 1
    print(fold)

[   8   17   23   33   45   51   61   70   84   96  109  120  132  134
  144  149  150  151  152  157  166  175  179  180  184  188  192  196
  205  211  220  238  254  270  274  287  290  296  297  305  308  309
  314  315  318  366  371  376  387  410  414  429  432  438  443  457
  461  463  468  471  478  494  497  505  511  530  538  544  555  561
  584  594  596  599  602  642  643  668  718  720  721  731  734  751
  764  794  803  810  812  817  829  831  857  862  869  881  888  907
  915  960  969  978 1001 1024 1027 1029 1032 1047 1051 1052 1055 1061
 1071 1084 1104 1106 1123 1130 1146 1157 1161 1162 1181 1183 1207 1225
 1231 1235 1237 1255 1261 1263 1288 1295 1302 1309 1315 1321 1322 1340
 1344 1351 1356 1361 1392 1393 1398 1402 1411 1417 1424 1432 1437 1438
 1448 1450 1454 1479 1482 1483 1485 1498 1501 1505 1534 1537 1572 1588
 1612 1615 1620 1627 1634 1642 1657 1659 1662 1665 1691 1702 1727 1728
 1738 1745 1749 1770 1777 1778 1780 1788 1808 1813 1820 1832 1839 1840
 1842 

## Calculate Weights

In [35]:
div = 0.0001
pivot = pd.concat([X_train[y_train['isHealthy']].mean(), X_train[~y_train['isHealthy']].mean()], axis = 1)
pivot.columns = ['healthy', 'not']
pivot['diff'] = pivot['healthy'] - pivot['not']
pivot['foldh'] = (pivot['healthy'] + div) / (pivot['not'] + div)
pivot['foldn'] = (pivot['not'] + div) / (pivot['healthy'] + div)

In [36]:
pivot['weight'] = 0
pivot

Unnamed: 0,healthy,not,diff,foldh,foldn,weight
s__Acidaminococcus_fermentans,0.037162,0.077073,-0.039910,0.482842,2.071070,0
s__Acidaminococcus_intestini,0.080659,0.117228,-0.036569,0.688316,1.452820,0
s__Actinomyces_odontolyticus,0.090794,0.150259,-0.059465,0.604512,1.654226,0
s__Alistipes_finegoldii,0.723818,0.553756,0.170061,1.307049,0.765082,0
s__Alistipes_indistinctus,0.541385,0.472798,0.068587,1.145036,0.873335,0
...,...,...,...,...,...,...
PWY0-1241: ADP-L-glycero-&beta;-D-manno-heptose biosynthesis,0.622044,0.738990,-0.116946,0.841771,1.187972,0
PWY0-781: aspartate superpathway,0.979730,0.970207,0.009522,1.009814,0.990281,0
PWY4LZ-257: superpathway of fermentation (Chlamydomonas reinhardtii),0.965794,0.950777,0.015017,1.015792,0.984453,0
PWY66-389: phytol degradation,0.185811,0.396373,-0.210562,0.468912,2.132598,0


In [37]:
import numpy as np
pivot.loc[pivot['diff'] > 0, 'weight'] = np.log(pivot[pivot['diff'] > 0]['foldh'])
pivot

Unnamed: 0,healthy,not,diff,foldh,foldn,weight
s__Acidaminococcus_fermentans,0.037162,0.077073,-0.039910,0.482842,2.071070,0.000000
s__Acidaminococcus_intestini,0.080659,0.117228,-0.036569,0.688316,1.452820,0.000000
s__Actinomyces_odontolyticus,0.090794,0.150259,-0.059465,0.604512,1.654226,0.000000
s__Alistipes_finegoldii,0.723818,0.553756,0.170061,1.307049,0.765082,0.267772
s__Alistipes_indistinctus,0.541385,0.472798,0.068587,1.145036,0.873335,0.135436
...,...,...,...,...,...,...
PWY0-1241: ADP-L-glycero-&beta;-D-manno-heptose biosynthesis,0.622044,0.738990,-0.116946,0.841771,1.187972,0.000000
PWY0-781: aspartate superpathway,0.979730,0.970207,0.009522,1.009814,0.990281,0.009766
PWY4LZ-257: superpathway of fermentation (Chlamydomonas reinhardtii),0.965794,0.950777,0.015017,1.015792,0.984453,0.015669
PWY66-389: phytol degradation,0.185811,0.396373,-0.210562,0.468912,2.132598,0.000000


In [38]:
pivot.loc[pivot['diff'] < 0, 'weight'] = np.log(pivot[pivot['diff'] < 0]['foldn'])
pivot.head()

Unnamed: 0,healthy,not,diff,foldh,foldn,weight
s__Acidaminococcus_fermentans,0.037162,0.077073,-0.03991,0.482842,2.07107,0.728065
s__Acidaminococcus_intestini,0.080659,0.117228,-0.036569,0.688316,1.45282,0.373507
s__Actinomyces_odontolyticus,0.090794,0.150259,-0.059465,0.604512,1.654226,0.503333
s__Alistipes_finegoldii,0.723818,0.553756,0.170061,1.307049,0.765082,0.267772
s__Alistipes_indistinctus,0.541385,0.472798,0.068587,1.145036,0.873335,0.135436


In [39]:
pivot['weight2'] = abs(pivot['weight'] * pivot['diff'])
pivot

Unnamed: 0,healthy,not,diff,foldh,foldn,weight,weight2
s__Acidaminococcus_fermentans,0.037162,0.077073,-0.039910,0.482842,2.071070,0.728065,0.029057
s__Acidaminococcus_intestini,0.080659,0.117228,-0.036569,0.688316,1.452820,0.373507,0.013659
s__Actinomyces_odontolyticus,0.090794,0.150259,-0.059465,0.604512,1.654226,0.503333,0.029931
s__Alistipes_finegoldii,0.723818,0.553756,0.170061,1.307049,0.765082,0.267772,0.045538
s__Alistipes_indistinctus,0.541385,0.472798,0.068587,1.145036,0.873335,0.135436,0.009289
...,...,...,...,...,...,...,...
PWY0-1241: ADP-L-glycero-&beta;-D-manno-heptose biosynthesis,0.622044,0.738990,-0.116946,0.841771,1.187972,0.172248,0.020144
PWY0-781: aspartate superpathway,0.979730,0.970207,0.009522,1.009814,0.990281,0.009766,0.000093
PWY4LZ-257: superpathway of fermentation (Chlamydomonas reinhardtii),0.965794,0.950777,0.015017,1.015792,0.984453,0.015669,0.000235
PWY66-389: phytol degradation,0.185811,0.396373,-0.210562,0.468912,2.132598,0.757341,0.159467


## Select features based on weights

In [40]:
X_good = X_train[pivot[pivot['diff'] > 0].nlargest(210,
    "weight2").index]
X_good_test = X_test[pivot[pivot['diff'] > 0].nlargest(210,
    "weight2").index]
X_good

Unnamed: 0,s__Bifidobacterium_angulatum,s__Bifidobacterium_catenulatum,s__Sutterella_wadsworthensis,s__Bifidobacterium_adolescentis,s__Lachnospiraceae_bacterium_8_1_57FAA,s__Bacteroidales_bacterium_ph8,s__Alistipes_senegalensis,s__Lactobacillus_acidophilus,s__Alistipes_finegoldii,s__Ruminococcus_callidus,...,s__Lachnospiraceae_bacterium_7_1_58FAA,PWY-6595: superpathway of guanosine nucleotides degradation (plants),s__Ruminococcus_bromii,s__Haemophilus_parainfluenzae,PWY4LZ-257: superpathway of fermentation (Chlamydomonas reinhardtii),HISDEG-PWY: L-histidine degradation I,DAPLYSINESYN-PWY: L-lysine biosynthesis I,PWY0-781: aspartate superpathway,"P4-PWY: superpathway of L-lysine, L-threonine and L-methionine biosynthesis I",s__Clostridium_bartlettii
SAMEA3879600,False,False,False,True,True,True,True,False,False,True,...,True,True,True,True,True,True,True,True,True,False
SAMN00715181,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,True,False,True,True,True,False
SAMN00142634,False,False,True,False,False,True,True,False,True,False,...,True,True,True,True,True,True,True,True,True,False
SAMEA3664664,False,True,False,True,False,True,True,True,True,True,...,True,True,True,False,True,True,True,True,True,True
SAMN00142107,False,False,True,True,False,True,False,False,True,True,...,True,True,True,True,True,True,True,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMN00792087,False,False,True,False,False,True,False,False,True,True,...,False,True,True,True,True,True,True,True,True,True
SAMEA1972159,False,False,False,False,False,False,False,False,False,False,...,True,False,True,False,True,False,True,True,True,False
SAMN00070035,False,False,True,True,True,False,False,False,False,False,...,False,True,True,False,False,True,True,True,True,False
SAMN06110010,False,True,False,False,False,True,True,False,True,True,...,True,True,True,True,True,True,True,True,True,True


In [41]:
X_bad = X_train[pivot[pivot['diff'] < 0].nlargest(214,
    "weight2").index]
X_bad_test = X_test[pivot[pivot['diff'] < 0].nlargest(214,
    "weight2").index]
X_bad

Unnamed: 0,s__Solobacterium_moorei,s__Granulicatella_adiacens,s__Clostridium_ramosum,s__Clostridium_bolteae,s__Peptostreptococcus_stomatis,s__Clostridium_hathewayi,s__candidate_division_TM7_single_cell_isolate_TM7c,s__Streptococcus_anginosus,s__Streptococcus_gordonii,s__Gemella_sanguinis,...,s__Gordonibacter_pamelaeae,PWY-5189: tetrapyrrole biosynthesis II (from glycine),s__Bilophila_wadsworthia,"P125-PWY: superpathway of (R,R)-butanediol biosynthesis",CRNFORCAT-PWY: creatinine degradation I,PWY66-398: TCA cycle III (animals),PWY-5345: superpathway of L-methionine biosynthesis (by sulfhydrylation),PWY-5941: glycogen degradation II (eukaryotic),NAGLIPASYN-PWY: lipid IVA biosynthesis,s__Prevotella_timonensis
SAMEA3879600,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,True,False,True,True,True,False
SAMN00715181,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,True,False,True,False,True,False
SAMN00142634,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,False
SAMEA3664664,True,False,False,True,True,False,False,False,False,False,...,True,True,True,False,True,True,True,True,True,False
SAMN00142107,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,True,True,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMN00792087,True,True,False,True,False,False,False,False,False,False,...,True,True,False,False,True,False,True,True,True,False
SAMEA1972159,False,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
SAMN00070035,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
SAMN06110010,False,False,False,False,False,True,False,False,False,False,...,True,True,True,False,False,True,True,True,True,False


## Get Weight Vector

In [42]:
theta_good = pivot[pivot['diff'] > 0]['weight2'].sort_values(
ascending=False)
print(theta_good.shape)
theta_good

(58,)


s__Bifidobacterium_angulatum                                                     0.213492
s__Bifidobacterium_catenulatum                                                   0.141284
s__Sutterella_wadsworthensis                                                     0.126205
s__Bifidobacterium_adolescentis                                                  0.088545
s__Lachnospiraceae_bacterium_8_1_57FAA                                           0.084819
s__Bacteroidales_bacterium_ph8                                                   0.082788
s__Alistipes_senegalensis                                                        0.079830
s__Lactobacillus_acidophilus                                                     0.054238
s__Alistipes_finegoldii                                                          0.045538
s__Ruminococcus_callidus                                                         0.044205
PWY-7235: superpathway of ubiquinol-6 biosynthesis (eukaryotic)                  0.041287
s__Bifidob

In [43]:
theta_bad = pivot[pivot['diff'] < 0]['weight2'].nlargest(214)
print(theta_bad.shape)
theta_bad

(78,)


s__Solobacterium_moorei                                                     0.368499
s__Granulicatella_adiacens                                                  0.329314
s__Clostridium_ramosum                                                      0.240393
s__Clostridium_bolteae                                                      0.238182
s__Peptostreptococcus_stomatis                                              0.223071
                                                                              ...   
PWY66-398: TCA cycle III (animals)                                          0.000609
PWY-5345: superpathway of L-methionine biosynthesis (by sulfhydrylation)    0.000529
PWY-5941: glycogen degradation II (eukaryotic)                              0.000099
NAGLIPASYN-PWY: lipid IVA biosynthesis                                      0.000045
s__Prevotella_timonensis                                                    0.000043
Name: weight2, Length: 78, dtype: float64

## Convert from dataframes to numpy matrices for performance optimization

In [44]:
y_numpy = y_train.to_numpy().flatten()
y_numpy_test = y_test.to_numpy().flatten()
y_numpy

array([ True,  True,  True, ...,  True,  True, False])

In [45]:
healthyIdx = np.where(y_train)[0]
unhealthyIdx = np.where(~y_train)[0]
healthyIdx_test = np.where(y_test)[0]
unhealthyIdx_test = np.where(~y_test)[0]
unhealthyIdx

array([   9,   11,   12, ..., 3907, 3908, 3911])

In [46]:
X_good2 = X_good.to_numpy().astype(float)
X_bad2 = X_bad.to_numpy().astype(float)
theta_good2 = theta_good.to_numpy()
theta_bad2 = theta_bad.to_numpy()
X_good2_test = X_good_test.to_numpy().astype(float)
X_bad2_test = X_bad_test.to_numpy().astype(float)

## Create theta_goods, theta_bads. Each row is a weight vector.
## For feature selection purposes, the ith weight vector has 0s from index i to the end in order to remove those features 

In [47]:
theta_goods = np.vstack([theta_good2] * theta_good2.shape[0])
for i in range(theta_good2.shape[0]):
    theta_goods[i, i:] = 0
theta_goods[10]

array([0.21349199, 0.14128405, 0.12620454, 0.08854507, 0.08481894,
       0.08278796, 0.07982981, 0.05423764, 0.04553759, 0.04420541,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])

In [48]:
theta_bads = np.vstack([theta_bad2] * theta_bad2.shape[0])
for i in range(theta_bad2.shape[0]):
    theta_bads[i, i:] = 0
theta_bads[20, :]

array([0.36849887, 0.32931399, 0.24039344, 0.23818227, 0.22307111,
       0.21179569, 0.20985811, 0.20149964, 0.19421738, 0.19107523,
       0.18773107, 0.18526934, 0.17871955, 0.16813999, 0.15946743,
       0.15328814, 0.13046514, 0.12996069, 0.12269974, 0.1107841 ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

## Calculate balanced Accuracy

In [49]:
from numba import njit
@njit(nogil=True)
def getChi(X_good, X_bad,
           theta_goods, theta_bads, healthyIdx, unhealthyIdx, num_good, num_bad):
    good_term = (X_good @ theta_goods[num_good, :])
    bad_term = (X_bad @ theta_bads[num_bad, :])
    y_hat = good_term - bad_term
    y_hat = y_hat.reshape(len(y_hat), 1)
    p_h = (y_hat[healthyIdx] > 0).sum() / len(healthyIdx)
    p_n = (y_hat[unhealthyIdx] < 0).sum() / len(unhealthyIdx)
    return 0.5 * (p_h + p_n)

In [56]:
%%time
getChi(X_good2, X_bad2,
        theta_goods, theta_bads,
        healthyIdx, unhealthyIdx, 
      44, 26) # Best hyperparameters

CPU times: user 2.8 ms, sys: 279 µs, total: 3.07 ms
Wall time: 412 µs


0.735058771530598

## Find the best combination of number of features

In [51]:
# from numba import prange
# @njit(nogil=True, parallel=True)
def run():
    scores = [0.0]
    for num_good in range(theta_good2.shape[0]):
        for num_bad in range(theta_bad2.shape[0]):
            chi = getChi(X_good2, X_bad2, 
                theta_goods, theta_bads,
                healthyIdx, unhealthyIdx, 
                num_good, num_bad)
            scores.append(chi)
    return scores

In [52]:
%%time
scores = run()
max(scores)

CPU times: user 2.37 s, sys: 15 ms, total: 2.39 s
Wall time: 299 ms


0.735058771530598

In [55]:
scores.index(max(scores)) % theta_bad2.shape[0]

27

In [58]:
getChi(X_good2_test, X_bad2_test,
        theta_goods, theta_bads,
        healthyIdx_test, unhealthyIdx_test, 
      44, 26)

0.7565019215300741

In [11]:
y_fold

Unnamed: 0,isHealthy,fold_index
SAMD00036192,True,2
SAMD00036193,True,5
SAMD00036194,False,6
SAMD00036197,True,7
SAMD00036204,True,9
...,...,...
SAMN07534852,True,9
SAMN07534870,True,4
SAMN07534977,True,5
SAMN07534978,True,6


In [10]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=0.09, penalty='l1',
    fit_intercept=False, max_iter=500, random_state=42,
                        solver='liblinear')

In [23]:
for C in [0.01, 0.03, 0.1, 0.3, 1, 3, 10]:
    logreg = LogisticRegression(C=C, penalty='elasticnet',
    fit_intercept=False, max_iter=500, random_state=42,
                        solver='saga', l1_ratio=0.5)
    mean_acc = 0
    for fold_idx in range(10):
        X_test, y_test = (X_fold[X_fold['fold_index'] == fold_idx].iloc[:, :-1], 
            y_fold[y_fold['fold_index'] == fold_idx].iloc[:, 0])
        X_train, y_train = (X_fold[X_fold['fold_index'] != fold_idx].iloc[:, :-1], 
            y_fold[y_fold['fold_index'] != fold_idx].iloc[:, 0])
        clf = logreg.fit(X_train, y_train)
        acc = clf.score(X_test, y_test)
        mean_acc += acc
    mean_acc /= 10
    print(mean_acc)
    

0.7285518300757455
0.753167010964564
0.762830658403517
0.7644409131839611




KeyboardInterrupt: 