In [73]:
import pandas as pd
import numpy as np
from utils import get_species, get_labels, get_labels_all
from utils import get_taxonomy

In [74]:
X, y, y_all = get_taxonomy(), get_labels(), get_labels_all()

In [75]:
remove = (
    (y_all == "Underweight").values.flatten() | 
    (y_all == "Overweight").values.flatten() |
    (y_all == "Obesity").values.flatten() |
    (y_all == "Obese").values.flatten() |
    (X['UNKNOWN'] >= 100).values.flatten()
    )

In [76]:
X, y, y_all = X.iloc[~remove, :], y.iloc[~remove, :], y_all.iloc[~remove, :]

In [77]:
X.shape

(10004, 3200)

In [49]:
studies = np.unique(X.index.get_level_values(0))

In [50]:
np.random.seed(42)
perm = np.random.permutation(len(studies))
prop = 0.90
train_idx, test_idx = perm[:int(len(studies) * prop)], perm[int(len(studies) * prop):]
train_studies = studies[train_idx]
test_studies = studies[test_idx]

In [51]:
c = 0.00001
X_train, y_train = X.loc[train_studies], y.loc[train_studies]
X_test, y_test = X.loc[test_studies], y.loc[test_studies]

In [78]:
from torch import nn

class AE(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.encoder_hidden_layer = nn.Linear(
            in_features=kwargs["input_shape"], out_features=128
        ).float()
        self.encoder_output_layer = nn.Linear(
            in_features=128, out_features=128
        ).float()
        self.decoder_hidden_layer = nn.Linear(
            in_features=128, out_features=128
        ).float()
        self.decoder_output_layer = nn.Linear(
            in_features=128, out_features=kwargs["input_shape"]
        ).float()

    def forward(self, features):
        activation = self.encoder_hidden_layer(features)
        activation = torch.relu(activation)
        code = self.encoder_output_layer(activation)
        code = torch.relu(code)
        activation = self.decoder_hidden_layer(code)
        activation = torch.relu(activation)
        activation = self.decoder_output_layer(activation)
        reconstructed = torch.relu(activation)
        
        return reconstructed

In [79]:
import torch
from torch import optim

#  use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# create a model from `AE` autoencoder class
# load it to the specified device, either gpu or cpu
model = AE(input_shape=3200).to(device)

# create an optimizer object
# Adam optimizer with learning rate 1e-3
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# mean-squared error loss
criterion = nn.MSELoss()

In [80]:
X_tensor = torch.as_tensor(1.0 * (X > c).values, device=device).float()
X_train_tensor = torch.as_tensor(1.0 * (X_train > c).values, device=device).float()
X_test_tensor = torch.as_tensor(1.0 * (X_test > c).values, device=device).float()
X_tensor

tensor([[1., 1., 1.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [1., 1., 1.,  ..., 0., 0., 1.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.]])

In [81]:
X_loader = torch.utils.data.DataLoader(
    X_tensor, batch_size=128, shuffle=True, pin_memory=True
)

train_loader = torch.utils.data.DataLoader(
    X_train_tensor, batch_size=128, shuffle=True, num_workers=4, pin_memory=True
)

test_loader = torch.utils.data.DataLoader(
    X_test_tensor, batch_size=32, shuffle=False, num_workers=4
)

In [82]:
epochs = 100

for epoch in range(epochs):
    loss = 0
    for batch_features in X_loader:
        # load it to the active device
        batch_features = batch_features.view(-1, 3200).to(device)
        
        # reset the gradients back to zero
        # PyTorch accumulates gradients on subsequent backward passes
        optimizer.zero_grad()
        
        # compute reconstructions
        outputs = model(batch_features)
        
        # compute training reconstruction loss
        train_loss = criterion(outputs, batch_features)
        
        # compute accumulated gradients
        train_loss.backward()
        
        # perform parameter update based on current gradients
        optimizer.step()
        
        # add the mini-batch training loss to epoch loss
        loss += train_loss.item()
    
    # compute the epoch training loss
    loss = loss / len(train_loader)
    
    # display the epoch training loss
    print("epoch : {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch : 1/100, loss = 0.042360
epoch : 2/100, loss = 0.028050
epoch : 3/100, loss = 0.023879
epoch : 4/100, loss = 0.022232
epoch : 5/100, loss = 0.021266
epoch : 6/100, loss = 0.020514
epoch : 7/100, loss = 0.019961
epoch : 8/100, loss = 0.019409
epoch : 9/100, loss = 0.019030
epoch : 10/100, loss = 0.018730
epoch : 11/100, loss = 0.018458
epoch : 12/100, loss = 0.018197
epoch : 13/100, loss = 0.017978
epoch : 14/100, loss = 0.017762
epoch : 15/100, loss = 0.017559
epoch : 16/100, loss = 0.017426
epoch : 17/100, loss = 0.017269
epoch : 18/100, loss = 0.017135
epoch : 19/100, loss = 0.017031
epoch : 20/100, loss = 0.016903
epoch : 21/100, loss = 0.016819
epoch : 22/100, loss = 0.016682
epoch : 23/100, loss = 0.016609
epoch : 24/100, loss = 0.016526
epoch : 25/100, loss = 0.016462
epoch : 26/100, loss = 0.016403
epoch : 27/100, loss = 0.016309
epoch : 28/100, loss = 0.016240
epoch : 29/100, loss = 0.016220
epoch : 30/100, loss = 0.016152
epoch : 31/100, loss = 0.016097
epoch : 32/100, l

In [83]:
features = X_tensor
activation = model.encoder_hidden_layer(features)
activation = torch.relu(activation)
code = model.encoder_output_layer(activation)
code = torch.relu(code)
code

tensor([[0.0000, 1.5800, 1.6527,  ..., 0.0000, 1.2841, 0.0000],
        [0.0000, 2.0115, 1.3123,  ..., 0.0000, 2.3784, 0.0000],
        [0.0000, 1.8580, 1.3846,  ..., 0.0000, 2.8246, 0.0000],
        ...,
        [0.0000, 2.6081, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 3.0618, 1.1666,  ..., 0.0000, 1.2878, 0.0000],
        [0.0000, 1.5808, 1.6466,  ..., 0.0000, 1.8156, 0.0000]],
       grad_fn=<ReluBackward0>)

In [84]:
code.shape

torch.Size([10004, 128])

In [85]:
from sklearn.preprocessing import OrdinalEncoder

sample_studies = np.array(X.index.get_level_values(0))
o = OrdinalEncoder()
groups = o.fit_transform(sample_studies.reshape((len(sample_studies), 1))).flatten()
groups

array([ 9.,  9.,  9., ..., 35., 35., 35.])

In [86]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import StratifiedKFold
logo = LeaveOneGroupOut()
kfold = StratifiedKFold(10, shuffle=True, random_state=42)

In [99]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

c = 0.00001
clf = LogisticRegression(random_state=42, penalty="l1", solver="liblinear", C=0.3, class_weight="balanced")

pipe = Pipeline([('scaler', MinMaxScaler()), ('clf', clf)])

predictions = cross_val_predict(pipe, code.detach().numpy(), y.values.flatten(), 
                         groups=groups, 
                                cv=logo, verbose=2, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done  68 out of  68 | elapsed:  1.3min finished


In [100]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score

balanced_accuracy_score(y, predictions), accuracy_score(y, predictions)

(0.6867823060214364, 0.6858256697321071)