In [11]:
# This file is going to test the performance of simple MLP model with different ratios of missing features
# The missing features are randomly selected and the missing ratio is controlled by the parameter missing_ratio

import ast
import os
import gc
import random
import warnings
import sys

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset

import pickle
from utils import *

warnings.filterwarnings('ignore')

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# neural network binary classifier
class NNbinaryClassifier(nn.Module):
    def __init__(self, input_size, n_hidden_layers=1, hidden_size=512, dropout_prob=0.1):
        super().__init__()

        self.layers = nn.ModuleList()
        self.layers.extend([nn.Linear(input_size, hidden_size), nn.ReLU(), nn.Dropout(dropout_prob)])
        for _ in range(n_hidden_layers - 1):
            self.layers.extend([nn.Linear(hidden_size, hidden_size), nn.ReLU(), nn.Dropout(dropout_prob)])
        self.layers.extend([nn.Linear(hidden_size, 1), nn.Sigmoid()])

    def forward(self, inputs, labels):
        logits = inputs
        for layer in self.layers:
            logits = layer(logits)
        loss = BCELoss()(logits.view(-1), labels)
        return loss, logits
    
# define the training function
def train_fn(model, train_loader, optimizer):
    model.train()
    losses = []
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss, _ = model(inputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return np.mean(losses)

# define the validation function
def valid_fn(model, valid_loader):
    model.eval()
    losses = []
    predictions = []
    targets = []
    for inputs, labels in valid_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        with torch.no_grad():
            loss, logits = model(inputs, labels)
        losses.append(loss.item())
        predictions.append(logits.cpu().numpy())
        targets.append(labels.cpu().numpy())
    return np.mean(losses), np.concatenate(predictions), np.concatenate(targets)

# define the function to deactive the features
def deactive_features(X_train, X_val, X_test, missing_ratio, random_seed=42):
    random.seed(random_seed)
    np.random.seed(random_seed)
    n_samples, n_features = X_train.shape
    n_missing_features = int(n_features * missing_ratio)
    missing_features = random.sample(range(n_features), n_missing_features)
    X_train[:, missing_features] = 0
    X_val[:, missing_features] = 0
    X_test[:, missing_features] = 0
    return X_train, X_val, X_test


# define the function to train the model
def train_NN_with_missing_features(drug, split_num, missing_ratio):
    # load the data
    with open(f'../Data/idx_splits/Walker_single_binary/{drug}.pkl', 'rb') as f:
            X = pickle.load(f)[split_num]
    y = getWalkerLabels(drug)[split_num]
    X_train, X_val, X_test = X
    y_train, y_val, y_test = y
    
    # Deactive the features
    X_train, X_val, X_test = deactive_features(X_train, X_val, X_test, missing_ratio, random_seed=42)

    # Set the seed
    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)

    # convert to torch dataset
    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float), torch.tensor(y_train, dtype=torch.float))
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=64)
    val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float), torch.tensor(y_val, dtype=torch.float))
    val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=64)
    test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float), torch.tensor(y_test, dtype=torch.float))
    test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=64)

    # define the model
    model = NNbinaryClassifier(input_size=X_train.shape[1]).to(device)
    optimizer = Adam(model.parameters(), lr=0.001)

    # early stopping after 5 epochs
    best_loss = np.inf
    best_epoch = 0

    for epoch in range(100):
        train_loss = train_fn(model, train_dataloader, optimizer)
        val_loss, val_preds, val_targets = valid_fn(model, val_dataloader)
        if val_loss < best_loss:
            best_loss = val_loss
            best_epoch = epoch
        if epoch - best_epoch > 5:
            break
    
    # save the test results
    test_loss, test_preds, test_targets = valid_fn(model, test_dataloader)
    # print the test AUROC with drug name, split number and missing ratio
    print(f'{drug} Split{split_num} Missing ratio {missing_ratio} AUROC: {roc_auc_score(test_targets, test_preds)}')

    del model, train_dataloader, val_dataloader, test_dataloader, train_dataset, val_dataset, test_dataset
    gc.collect()

    return (test_preds, test_targets)
    


In [12]:
tmp = train_NN_with_missing_features('PZA', 0, 0)

PZA Split0 Missing ratio 0 AUROC: 0.9595275719267655


In [18]:
for drug in drug_list:
    for missing_ratio in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        train_NN_with_missing_features(drug, 0, missing_ratio)

INH Split0 Missing ratio 0 AUROC: 0.9704974139826579
INH Split0 Missing ratio 0.1 AUROC: 0.9403702322794876
INH Split0 Missing ratio 0.2 AUROC: 0.9301584114874619
INH Split0 Missing ratio 0.3 AUROC: 0.9078354891444632
INH Split0 Missing ratio 0.4 AUROC: 0.9108055053648307
INH Split0 Missing ratio 0.5 AUROC: 0.9081828408268882
INH Split0 Missing ratio 0.6 AUROC: 0.8748960334987237
INH Split0 Missing ratio 0.7 AUROC: 0.8318292049470528
INH Split0 Missing ratio 0.8 AUROC: 0.7406366414597695
INH Split0 Missing ratio 0.9 AUROC: 0.6345159224099196
RIF Split0 Missing ratio 0 AUROC: 0.9758857986814693
RIF Split0 Missing ratio 0.1 AUROC: 0.9509082736379464
RIF Split0 Missing ratio 0.2 AUROC: 0.9486054824623991
RIF Split0 Missing ratio 0.3 AUROC: 0.9389127115499872
RIF Split0 Missing ratio 0.4 AUROC: 0.9374964325465909
RIF Split0 Missing ratio 0.5 AUROC: 0.9296783584006394
RIF Split0 Missing ratio 0.6 AUROC: 0.8959694911384457
RIF Split0 Missing ratio 0.7 AUROC: 0.8695988755386855
RIF Split0 Mis

FileNotFoundError: [Errno 2] No such file or directory: '../Data/idx_splits/Walker_single_binary/RFB.pkl'

In [None]:
tmp = train_NN_with_missing_featu