## Setup

In [4]:
import pandas as pd
import config

In [5]:
# Presence/absence cut-off: an abundance strictly above this counts as "present".
thresh = 1e-5

# Both tables are transposed after loading so that rows are samples and
# columns are features (species names / pathway names).
species = pd.read_csv(config.TRAIN_DIR + "taxonomy.csv", index_col=0).transpose()
pathways = pd.read_csv(config.TRAIN_DIR + "pathways.csv", index_col=0).transpose()

# Put species and pathway columns side by side, then binarise to 0/1.
abundance = pd.concat([species, pathways], axis=1)
X = abundance.gt(thresh).astype(int)
X.head()

Unnamed: 0,s__Abiotrophia_defectiva,s__Acidaminococcus_fermentans,s__Acidaminococcus_intestini,s__Actinomyces_graevenitzii,s__Actinomyces_odontolyticus,s__Actinomyces_oris,s__Actinomyces_turicensis,s__Actinomyces_viscosus,s__Adlercreutzia_equolifaciens,s__Aggregatibacter_segnis,...,TRNA-CHARGING-PWY: tRNA charging,TRPSYN-PWY: L-tryptophan biosynthesis,TYRFUMCAT-PWY: L-tyrosine degradation I,UBISYN-PWY: superpathway of ubiquinol-8 biosynthesis (prokaryotic),UDPNACETYLGALSYN-PWY: UDP-N-acetyl-D-glucosamine biosynthesis II,UDPNAGSYN-PWY: UDP-N-acetyl-D-glucosamine biosynthesis I,URDEGR-PWY: superpathway of allantoin degradation in plants,URSIN-PWY: ureide biosynthesis,VALDEG-PWY: L-valine degradation I,VALSYN-PWY: L-valine biosynthesis
SAMD00036192,0,0,0,1,1,0,0,0,0,0,...,1,1,0,1,0,1,0,0,0,1
SAMD00036193,0,0,0,1,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
SAMD00036194,0,0,0,0,0,0,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
SAMD00036197,0,0,0,1,1,0,0,1,1,0,...,1,1,0,0,0,1,0,0,0,1
SAMD00036204,0,0,0,1,1,1,0,0,1,0,...,1,1,0,1,0,1,1,0,0,1


In [7]:
# Load the health labels and transpose so that rows are samples,
# giving a single `isHealthy` column.
labels_path = config.TRAIN_DIR + "isHealthy.csv"
y = pd.read_csv(labels_path, index_col=0).transpose()
y.head()

Unnamed: 0,isHealthy
SAMD00036192,True
SAMD00036193,True
SAMD00036194,False
SAMD00036197,True
SAMD00036204,True


## Use L1-penalized (lasso) logistic regression to select features

In [35]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# L1-penalised logistic regression used as a feature selector: the L1 penalty
# pushes many coefficients to exactly zero, so the features that keep a
# non-zero weight are the ones the model relies on.
logreg = LogisticRegression(
    C=0.1,
    penalty='l1',
    fit_intercept=False,
    max_iter=500,
    random_state=42,
    solver='saga',
    n_jobs=-1,
)

# X and y are read from separate CSV files, so look the labels up by sample ID
# instead of assuming both files list the samples in the same order. A sample
# that has features but no label raises KeyError rather than silently training
# on shifted labels.
clf = logreg.fit(X, np.ravel(y.loc[X.index]))

In [36]:
# One row per column of X, paired with its fitted weight.
# `coef_` is 2-D; only its first row is read here.
coefficients = pd.DataFrame({'feature': X.columns, 'coef': clf.coef_[0]})
coefficients

Unnamed: 0,feature,coef
0,s__Abiotrophia_defectiva,0.000000
1,s__Acidaminococcus_fermentans,-0.113528
2,s__Acidaminococcus_intestini,-0.099616
3,s__Actinomyces_graevenitzii,0.000000
4,s__Actinomyces_odontolyticus,0.252240
...,...,...
840,UDPNAGSYN-PWY: UDP-N-acetyl-D-glucosamine bios...,0.000000
841,URDEGR-PWY: superpathway of allantoin degradat...,0.000000
842,URSIN-PWY: ureide biosynthesis,0.000000
843,VALDEG-PWY: L-valine degradation I,0.000000


In [39]:
# Keep only the features whose coefficient magnitude exceeds 0.1.
strong_coef = coefficients['coef'].abs().gt(0.1)
select = coefficients.loc[strong_coef]
display(select)

# Plain Python list of the surviving feature names.
features = select['feature'].tolist()
features

Unnamed: 0,feature,coef
1,s__Acidaminococcus_fermentans,-0.113528
4,s__Actinomyces_odontolyticus,0.252240
11,s__Alistipes_finegoldii,0.179591
15,s__Alistipes_senegalensis,0.187771
16,s__Alistipes_shahii,0.141365
...,...,...
637,PWY-6629: superpathway of L-tryptophan biosynt...,-0.114855
737,"PWY-7371: 1,4-dihydroxy-6-naphthoate biosynthe...",0.186640
745,PWY-7391: isoprene biosynthesis II (engineered),-0.131367
801,PWY4LZ-257: superpathway of fermentation (Chla...,0.104694


['s__Acidaminococcus_fermentans',
 's__Actinomyces_odontolyticus',
 's__Alistipes_finegoldii',
 's__Alistipes_senegalensis',
 's__Alistipes_shahii',
 's__Alistipes_sp_AP11',
 's__Atopobium_parvulum',
 's__Bacteroidales_bacterium_ph8',
 's__Bacteroides_caccae',
 's__Bacteroides_clarus',
 's__Bacteroides_coprocola',
 's__Bacteroides_dorei',
 's__Bacteroides_eggerthii',
 's__Bacteroides_fragilis',
 's__Bacteroides_plebeius',
 's__Bifidobacterium_adolescentis',
 's__Bifidobacterium_angulatum',
 's__Bifidobacterium_catenulatum',
 's__Bifidobacterium_longum',
 's__Bifidobacterium_pseudocatenulatum',
 's__Bilophila_wadsworthia',
 's__Butyricimonas_synergistica',
 's__Clostridium_bartlettii',
 's__Clostridium_bolteae',
 's__Clostridium_celatum',
 's__Clostridium_citroniae',
 's__Clostridium_leptum',
 's__Clostridium_methylpentosum',
 's__Clostridium_perfringens',
 's__Coprococcus_comes',
 's__Dialister_invisus',
 's__Dorea_formicigenerans',
 's__Enterobacter_cloacae',
 's__Erysipelotrichaceae_

In [38]:
# Persist the selected feature names, one per line.
out_path = config.OUTPUT_DIR + "features.txt"
with open(out_path, "w") as f:
    f.writelines(feat + "\n" for feat in features)