In [1]:
import pandas as pd
import numpy as np
from utils import get_species, get_labels, get_labels_all

In [2]:
# Load the three tables provided by the project's `utils` helpers.
X = get_species()
y = get_labels()
y_all = get_labels_all()

In [3]:
# Boolean mask over columns: a column is selected when more than 1% of the
# rows hold a value above 0.00001.
cols = X.gt(0.00001).mean(axis=0).gt(0.01).values
cols

array([ True,  True,  True, ..., False, False,  True])

In [4]:
# Keep only the columns flagged in `cols`, addressed by integer position.
X = X.iloc[:, np.flatnonzero(cols)]
X.shape

(12532, 542)

In [5]:
# Drop every column whose name mentions "virus" or "unclassified".
kept_columns = [
    name for name in X.columns
    if not ("virus" in name or "unclassified" in name)
]
X = X[kept_columns]
X.shape

(12532, 413)

In [6]:
# Flag the rows labelled with one of the three weight-related categories.
remove = y_all.isin(["Underweight", "Overweight", "Obesity"]).values

In [7]:
# Apply the same row filter to the features and to both label tables so the
# three stay aligned.
keep = ~remove
X = X.iloc[keep, :]
y = y.iloc[keep, :]
y_all = y_all.iloc[keep, :]

In [8]:
# Sorted, de-duplicated values of the first index level (one entry per study).
first_level = X.index.get_level_values(0)
studies = np.unique(first_level)
studies

array(['GMHI-10', 'GMHI-11', 'GMHI-13', 'GMHI-14', 'GMHI-15', 'GMHI-17',
       'GMHI-19', 'GMHI-2', 'GMHI-21', 'GMHI-23', 'GMHI-24', 'GMHI-26',
       'GMHI-27', 'GMHI-28', 'GMHI-3', 'GMHI-31', 'GMHI-32', 'GMHI-33',
       'GMHI-4', 'GMHI-5', 'GMHI-9', 'GMHI-V-35', 'GMHI-V-36',
       'GMHI-V-38', 'GMHI-V-40', 'GMHI-V-41', 'P103', 'P11', 'P110',
       'P113', 'P121', 'P13', 'P132', 'P135', 'P136', 'P140', 'P15',
       'P17', 'P2', 'P21', 'P24', 'P32', 'P34', 'P39', 'P4', 'P47', 'P48',
       'P53', 'P56', 'P57', 'P59', 'P63', 'P69', 'P74', 'P76', 'P77',
       'P8', 'P80', 'P81', 'P86', 'P87', 'P88', 'P89', 'P9', 'P94', 'P95',
       'P96', 'P98'], dtype=object)

In [89]:
# Shuffle the studies with a fixed seed and split at the study level: the
# first `prop` share of the permutation is used for training, the rest for testing.
np.random.seed(42)
perm = np.random.permutation(len(studies))
prop = 0.9
n_train = int(len(studies) * prop)
train_idx, test_idx = np.split(perm, [n_train])
train_studies, test_studies = studies[train_idx], studies[test_idx]

In [90]:
# Threshold used to turn each feature value into a 0.0 / 1.0 indicator.
c = 0.00001

# Features: 1.0 where the value exceeds `c`, 0.0 otherwise.
X_train = (X.loc[train_studies].values > c).astype(float)
X_test = (X.loc[test_studies].values > c).astype(float)

# Labels: multiplied by 1.0 to obtain a float array.
y_train = y.loc[train_studies].values * 1.0
y_test = y.loc[test_studies].values * 1.0

In [102]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Pooled model fitted on all training studies at once.
# `LogisticRegression` stays imported because the per-study models built later
# in the notebook use it; the instance that used to be assigned to `clf` here
# was overwritten on the very next statement and has been removed.
# `early_stopping` is not passed: scikit-learn only honours it for the "sgd"
# and "adam" solvers, so it had no effect with solver="lbfgs".
clf = MLPClassifier(
    solver="lbfgs",
    alpha=1,
    random_state=42,
    hidden_layer_sizes=(10, 10),
    max_iter=2000,
)

In [104]:
from sklearn.metrics import balanced_accuracy_score

# Fit on the training studies (labels flattened to 1-D), then score the
# predictions for the held-out studies.
clf.fit(X_train, np.ravel(y_train))
pred = clf.predict(X_test)
balanced_accuracy_score(y_test, pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.6214204031541493

In [32]:
# NOTE(review): as written this repeats the prediction made right after fitting.
# The saved output of `pred` shows study identifiers (e.g. 'GMHI-V-41', 'P39'),
# and the routing loop later uses `pred[i]` as a key into `classifiers`, so this
# cell was presumably executed while `clf` was a study-level classifier whose
# definition is no longer in the notebook — TODO restore that cell (under its
# own variable name) so the notebook survives Restart & Run All.
pred = clf.predict(X_test)

In [52]:
# Inspect the predicted values.
pred

array(['GMHI-V-41', 'P39', 'GMHI-V-38', ..., 'GMHI-V-36', 'P4', 'P56'],
      dtype=object)

In [53]:
# One entry per training study: a fitted L1-penalised logistic regression, or,
# when the study contains a single label value, that value itself.
classifiers = {}

for study in train_studies:
    labels_present = np.unique(y.loc[study])
    if len(labels_present) >= 2:
        logreg = LogisticRegression(random_state=42, penalty="l1", solver="liblinear", C=1)
        logreg.fit(X.loc[study].values > c, y.loc[study].values.flatten())
        classifiers[study] = logreg
    else:
        # Nothing to learn from a single class; remember the constant label.
        classifiers[study] = labels_present[0]

In [85]:
# Inspect the per-study entries (fitted estimators or constant labels).
classifiers

{'P48': LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear'),
 'GMHI-32': True,
 'GMHI-15': True,
 'GMHI-23': LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear'),
 'P110': False,
 'P32': LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear'),
 'P81': LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear'),
 'GMHI-17': LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear'),
 'P88': True,
 'GMHI-27': LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear'),
 'GMHI-V-41': LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear'),
 'P95': LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear'),
 'P53': True,
 'GMHI-10': LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear'),
 'P76': True,
 'P77': True,
 'P57': False,
 'GMHI-2': LogisticRegression(C=1, penalty='l1', random_state=42, solver='liblinear'),
 'P34': Fals

In [86]:
# Second-stage prediction: `pred[i]` names the training study whose entry in
# `classifiers` is used to label test row i.
# The loop uses its own variable names so that the fitted top-level model
# `clf` is not overwritten when this cell runs.
final_pred = []
for i, row in enumerate(X_test):
    study_model = classifiers[pred[i]]
    # Single-label studies are stored as a bare boolean; emit it directly.
    if isinstance(study_model, (bool, np.bool_)):
        final_pred.append(study_model)
        continue
    # predict() expects a 2-D array, so present the row as shape (1, n_features).
    final_pred.append(study_model.predict(row.reshape(1, -1))[0])

In [87]:
# Inspect the routed predictions (one value per test row).
final_pred

[True,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,


In [88]:
# Balanced accuracy of the routed per-study predictions on the held-out studies.
balanced_accuracy_score(y_true=y_test, y_pred=final_pred)

0.5579031043427328