In [1]:
import pandas as pd
import numpy as np
import wandb
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
# mute wandb outputs
import os
os.environ["WANDB_SILENT"] = "true"

 TODO:
 1. Use a temporal split to create the train, val and test sets ✅
 2. Scale individually ✅
 3. Simplify the code (including organizing all GNNs into a base class)

In [2]:
# Run on the GPU when torch reports one as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [3]:
# login wandb
# The API key is taken from the WANDB_API_KEY environment variable instead of
# being written into the notebook. When the variable is unset, key=None is
# passed and wandb falls back to its own credential lookup.
# NOTE(review): a key was previously committed in this cell; treat it as leaked
# and revoke it in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

True

# Read Dataset

In [4]:
# Read the pilot table from its CSV file into a DataFrame.
pilot_csv_path = '/blue/yonghui.wu/lideyi/AKI_GNN/raw_data/norm_df_pilot.csv'
onset_df_pilot = pd.read_csv(pilot_csv_path)

In [5]:
# The label column and the three split-indicator columns are not model inputs;
# every other column is used as a feature.
non_feature_columns = ('AKI_TARGET', 'TRAIN_SET', 'VAL_SET', 'TEST_SET')
feature_columns = [col for col in onset_df_pilot.columns if col not in non_feature_columns]


def _select_split(split_flag: str):
    """Return (X, y) NumPy arrays for the rows whose `split_flag` column equals 1."""
    in_split = onset_df_pilot[split_flag] == 1
    X = onset_df_pilot.loc[in_split, feature_columns].copy(deep=True).values
    y = onset_df_pilot.loc[in_split, 'AKI_TARGET'].copy(deep=True).values
    return X, y


X_train, y_train = _select_split('TRAIN_SET')
X_val, y_val = _select_split('VAL_SET')
X_test, y_test = _select_split('TEST_SET')

# Random Forest

In [6]:
def evaluate_RF(X_train: np.array, y_train: np.array, X_val: np.array, y_val: np.array,
                X_test: np.array, y_test: np.array, wandb_project_name: str,
                parameters: dict) -> pd.DataFrame:
    """Tune a random forest with a W&B sweep, then score the best configuration on the test split.

    Args:
        X_train, y_train: Training features and labels, used for every sweep
            run and for the final refit.
        X_val, y_val: Validation features and labels, used to score each sweep run.
        X_test, y_test: Test features and labels, used only for the final evaluation.
        wandb_project_name: W&B project under which the sweep is created.
        parameters: Sweep parameter specification, forwarded to build_sweep_config.

    Returns:
        Whatever test_best_RF returns for the selected configuration.
    """
    def run_single_trial() -> None:
        # The agent calls this with no arguments; the hyperparameters are read
        # from wandb.config inside train_RF.
        train_RF(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, config=None)

    sweep_id = wandb.sweep(build_sweep_config(parameters), project=wandb_project_name)
    wandb.agent(sweep_id, run_single_trial)
    return test_best_RF(X_train, y_train, X_test, y_test, sweep_id)

In [7]:
def build_sweep_config(parameters: dict) -> dict:
    """Wrap a parameter specification in a grid-search sweep configuration.

    Args:
        parameters: Sweep parameter specification; stored as-is under 'parameters'.

    Returns:
        A dict with method 'grid' and a metric entry that maximizes 'val_F1'.
    """
    objective = {'name': 'val_F1', 'goal': 'maximize'}
    return {'method': 'grid', 'metric': objective, 'parameters': parameters}

In [8]:
def train_RF(X_train: np.array, y_train: np.array, X_val: np.array, y_val: np.array, config = None) -> None:
    """Run one sweep trial: fit a random forest and log its validation macro-F1.

    Args:
        X_train, y_train: Data the forest is fitted on.
        X_val, y_val: Data the fitted forest is scored on.
        config: Passed to wandb.init; the hyperparameters actually used are
            read back from wandb.config once the run is initialized.
    """
    # Initialize a new wandb run
    with wandb.init(config=config):
        hyperparams = wandb.config
        forest = build_RF(hyperparams.max_depth, hyperparams.min_samples_leaf,
                          hyperparams.min_samples_split, hyperparams.n_estimators)
        forest.fit(X_train, y_train)
        # 'val_F1' is the metric name the sweep configuration optimizes.
        wandb.log({"val_F1": evaluate_on_val(X_val, y_val, forest)})

In [9]:
def build_RF(max_depth: int, min_samples_leaf: int,
             min_samples_split: int, n_estimators: int,
             random_state: int = 888) -> RandomForestClassifier:
    """Create an unfitted random-forest classifier with the given hyperparameters.

    The forest is seeded so that fitting the same hyperparameters on the same
    data gives the same model. Without a seed, the forest scored on the
    validation split during the sweep and the forest refitted for the test
    evaluation would differ from run to run.

    Args:
        max_depth: Maximum depth of each tree.
        min_samples_leaf: Minimum number of samples required at a leaf.
        min_samples_split: Minimum number of samples required to split a node.
        n_estimators: Number of trees.
        random_state: Seed for the forest's randomness. The default mirrors the
            seed the MLP in this notebook uses; pass None for unseeded behavior.

    Returns:
        The configured, unfitted RandomForestClassifier.
    """
    return RandomForestClassifier(max_depth=max_depth,
                                  min_samples_leaf=min_samples_leaf,
                                  min_samples_split=min_samples_split,
                                  n_estimators=n_estimators,
                                  random_state=random_state)

In [10]:
def evaluate_on_val(X_val: np.array, y_val: np.array, RF: RandomForestClassifier) -> float:
    """Return the macro-averaged F1 of `RF`'s predictions on the validation data.

    Args:
        X_val: Validation features passed to RF.predict.
        y_val: True validation labels.
        RF: A fitted classifier.
    """
    return f1_score(y_val, RF.predict(X_val), average='macro')

In [11]:
def test_best_RF(X_train: np.array, y_train: np.array, 
                 X_test: np.array, y_test: np.array, sweep_id: str) -> pd.DataFrame:
    best_config = fetch_best_config(sweep_id)
    best_RF = build_RF(best_config['max_depth'], best_config['min_samples_leaf'], 
                    best_config['min_samples_split'], best_config['n_estimators'])
    best_RF.fit(X_train, y_train)
    best_RF_performance = test_RF(X_test, y_test, best_RF)
    return best_RF_performance


In [12]:
def fetch_best_config(sweep_id: str) -> dict:
    """Return the hyperparameters of the sweep run with the highest 'val_F1'.

    Only runs whose summary holds a finite numeric 'val_F1' are candidates, so
    a run that never logged the metric, or logged NaN, cannot be picked as the
    best one. When several runs share the top score, the first of them in
    `sweep.runs` is returned.

    Args:
        sweep_id: Identifier passed to `wandb.Api().sweep`.

    Returns:
        The `config` of the best-scoring run.

    Raises:
        ValueError: If no run in the sweep has a usable 'val_F1'.
    """
    # Authenticate with W&B
    api = wandb.Api()
    sweep = api.sweep(sweep_id)

    scored_runs = []
    for run in sweep.runs:
        # NOTE(review): presumably the summary holds the last value logged for
        # the metric -- confirm against the W&B documentation.
        raw_score = run.summary.get("val_F1")
        try:
            score = float(raw_score)
        except (TypeError, ValueError):
            # Metric missing (None) or not convertible to a number.
            continue
        if np.isfinite(score):
            scored_runs.append((score, run))

    if not scored_runs:
        raise ValueError(
            f"No run in sweep {sweep_id!r} has a finite numeric 'val_F1'; "
            "cannot select a best configuration."
        )

    # Find the best run
    _, best_run = max(scored_runs, key=lambda scored: scored[0])
    return best_run.config

In [13]:
import sys
sys.path.append(os.path.abspath("/home/lideyi/AKI_GNN/notebooks/utils"))
from metrics import performance_per_class

In [14]:
def test_RF(X_test: np.array, y_test: np.array, best_model: RandomForestClassifier) -> dict:
    """Evaluate a fitted classifier on the test data with performance_per_class.

    Args:
        X_test: Test features.
        y_test: True test labels.
        best_model: A fitted classifier exposing predict and predict_proba.

    Returns:
        The result of performance_per_class(y_test, predicted labels, predicted
        probabilities). NOTE(review): annotated as dict here while the callers
        are annotated as returning pd.DataFrame -- confirm against the helper.
    """
    predicted_labels = best_model.predict(X_test)
    predicted_probabilities = best_model.predict_proba(X_test)
    return performance_per_class(y_test, predicted_labels, predicted_probabilities)

In [None]:
# Candidate values per random-forest hyperparameter. build_sweep_config uses
# the 'grid' method, so the sweep covers every combination of these values.
RF_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}
RF_parameters = {name: {'values': candidates} for name, candidates in RF_grid.items()}
RF_performance = evaluate_RF(X_train, y_train, X_val, y_val, X_test, y_test, "AKI_GNN_RF", RF_parameters)

Create sweep with ID: ccnpid7f
Sweep URL: https://wandb.ai/ericli/AKI_GNN_RF/sweeps/ccnpid7f


In [None]:
# Display the result that evaluate_RF returned for the random-forest sweep.
RF_performance

# Multi-layer Perceptron

In [None]:
def evaluate_MLP(X_train: np.array, y_train: np.array, X_val: np.array, y_val: np.array,
                X_test: np.array, y_test: np.array, wandb_project_name: str,
                parameters: dict) -> pd.DataFrame:
    """Tune an MLP with a W&B sweep, then score the best configuration on the test split.

    Args:
        X_train, y_train: Training features and labels, used for every sweep
            run and for the final retraining.
        X_val, y_val: Validation features and labels, used to score each sweep run.
        X_test, y_test: Test features and labels, used only for the final evaluation.
        wandb_project_name: W&B project under which the sweep is created.
        parameters: Sweep parameter specification, forwarded to build_sweep_config.

    Returns:
        Whatever test_best_MLP returns for the selected configuration.
    """
    def run_single_trial() -> None:
        # The agent calls this with no arguments; the hyperparameters are read
        # from wandb.config inside train_MLP_main.
        train_MLP_main(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, config=None)

    sweep_id = wandb.sweep(build_sweep_config(parameters), project=wandb_project_name)
    wandb.agent(sweep_id, run_single_trial)
    return test_best_MLP(X_train, y_train, X_test, y_test, sweep_id)

In [None]:
def train_MLP_main(X_train: np.array, y_train: np.array, X_val: np.array, y_val: np.array, config = None) -> None:
    """Run one sweep trial: build an MLP from the run's hyperparameters and train it.

    Args:
        X_train, y_train: Training data; batches are shuffled.
        X_val, y_val: Validation data; batches keep their order.
        config: Passed to wandb.init; the hyperparameters actually used are
            read back from wandb.config once the run is initialized.
    """
    # Initialize a new wandb run
    with wandb.init(config=config):
        hyperparams = wandb.config
        # The model is built before the loaders, as in the original ordering:
        # constructing the MLP reseeds torch's global RNG.
        model = build_MLP(hyperparams.input_dim, hyperparams.n_class, hyperparams.hidden_dims,
                          hyperparams.dropout, hyperparams.activation)
        optimizer = build_optimizer(model, hyperparams.optimizer, hyperparams.lr)
        train_loader = build_dataloader(X_train, y_train, hyperparams.batch_size, shuffle=True)
        val_loader = build_dataloader(X_val, y_val, hyperparams.batch_size, shuffle=False)
        train_MLP(model, hyperparams.epochs, optimizer, train_loader, val_loader)

In [None]:
def train_MLP(model: torch.nn.Module, epochs: int, optimizer: torch.optim.Optimizer,
              train_loader: torch.utils.data.DataLoader, val_loader: torch.utils.data.DataLoader,
              log: bool = True) -> None:
    """Train `model` for a fixed number of epochs, optionally validating and logging.

    Args:
        model: Network to train in place.
        epochs: Number of passes over `train_loader`.
        optimizer: Optimizer bound to `model`'s parameters.
        train_loader: Batches used for training.
        val_loader: Batches used for validation after each epoch, or None to
            skip validation.
        log: When True, the epoch's metrics are sent to wandb.

    All metrics of an epoch are logged in a single wandb.log call, so the
    train and validation values of the same epoch share one logging step.
    """
    for _ in range(epochs):
        train_F1, avg_loss_train = train_epoch(model, optimizer, train_loader)
        epoch_metrics = {"train_loss": avg_loss_train, "train_F1": train_F1}
        if val_loader is not None:
            val_F1, avg_loss_val = val_epoch(model, val_loader)
            epoch_metrics["val_loss"] = avg_loss_val
            epoch_metrics["val_F1"] = val_F1
        if log:
            wandb.log(epoch_metrics)

In [None]:
def build_MLP(input_dim: int, n_class: int, hidden_dims: list, dropout: float, activation: str) -> torch.nn.Module:
    """Construct an MLP with the given architecture and move it to the notebook's `device`."""
    model = MLP(input_dim, n_class, hidden_dims, dropout, activation)
    return model.to(device)

In [None]:
class MLP(torch.nn.Module):
    """Feed-forward classifier: Linear -> activation -> Dropout per hidden size, then a Linear output layer.

    Args:
        input_dim: Number of input features.
        n_class: Number of output units of the final Linear layer.
        hidden_dims: Width of each hidden layer, in order.
        dropout: Dropout probability applied after every hidden activation.
        activation: One of 'relu', 'sigmoid' or 'tanh'.

    Raises:
        ValueError: If `activation` is not one of the supported names.
    """

    def __init__(self, input_dim: int, n_class: int, hidden_dims: list, dropout: float, activation: str):
        super().__init__()
        # Reseeds torch's global RNG so the layer weights are initialized the
        # same way every time a model is constructed.
        torch.random.manual_seed(888)

        # Define the activation function
        activation_classes = {
            'relu': torch.nn.ReLU,
            'sigmoid': torch.nn.Sigmoid,
            'tanh': torch.nn.Tanh,
        }
        if activation not in activation_classes:
            raise ValueError("Unsupported activation function. Choose from 'relu', 'sigmoid', or 'tanh'.")
        # A single activation module is created and reused after every hidden layer.
        activation_fn = activation_classes[activation]()

        # Consecutive pairs of widths give each hidden Linear layer's (in, out) sizes.
        widths = [input_dim, *hidden_dims]
        layers = []
        for in_dim, out_dim in zip(widths[:-1], widths[1:]):
            layers += [torch.nn.Linear(in_dim, out_dim), activation_fn, torch.nn.Dropout(dropout)]

        # append classifier layer
        layers.append(torch.nn.Linear(widths[-1], n_class))
        self.network = torch.nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the layer stack to `x` and return the final Linear layer's output."""
        return self.network(x)

In [None]:
def build_dataloader(X: np.array, y: np.array, batch_size: int, shuffle: bool) -> torch.utils.data.DataLoader:
    """Wrap feature and label arrays in a DataLoader.

    Args:
        X: Features; converted to a float32 tensor.
        y: Labels; converted to a long (int64) tensor.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader shuffles the samples.

    Returns:
        A DataLoader yielding (features, labels) batches.
    """
    features = torch.tensor(X, dtype=torch.float32)
    labels = torch.tensor(y, dtype=torch.long)
    paired = torch.utils.data.TensorDataset(features, labels)
    return torch.utils.data.DataLoader(paired, batch_size=batch_size, shuffle=shuffle)

In [None]:
def build_optimizer(model: torch.nn.Module, optimizer: str, lr: float) -> torch.optim.Optimizer:
    """Create the optimizer named by `optimizer` for `model`'s parameters.

    Args:
        model: Network whose parameters are optimized.
        optimizer: 'sgd' (SGD with momentum 0.9) or 'adam'.
        lr: Learning rate.

    Returns:
        The constructed torch optimizer.

    Raises:
        ValueError: If `optimizer` is not a supported name. Previously the
            name string itself was returned and the failure only surfaced
            later, when the training loop tried to use it as an optimizer.
    """
    if optimizer == "sgd":
        return torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    if optimizer == "adam":
        return torch.optim.Adam(model.parameters(), lr=lr)
    raise ValueError(f"Unsupported optimizer {optimizer!r}. Choose from 'sgd' or 'adam'.")

In [None]:
# train the model for one epoch
def train_epoch(model: torch.nn.Module, optimizer: torch.optim.Optimizer,
                train_loader: torch.utils.data.DataLoader) -> tuple[float, float]:
    """Run one optimization pass over `train_loader`.

    Args:
        model: Network to update; switched to training mode.
        optimizer: Optimizer stepped once per batch.
        train_loader: Yields (features, labels) batches.

    Returns:
        (macro-F1 over all samples seen, mean of the per-batch cross-entropy losses).
    """
    model.train()
    batch_losses = []
    targets, predictions = [], []
    for features, labels in train_loader:
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(features)
        batch_loss = torch.nn.functional.cross_entropy(logits, labels)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
        targets.extend(labels.cpu().numpy())
        # The predicted class is the index of the largest logit.
        predictions.extend(logits.argmax(dim=1).detach().cpu().numpy())
    return f1_score(targets, predictions, average='macro'), sum(batch_losses) / len(train_loader)

In [None]:
# validate the model on the validation set, return the macro-F1
def val_epoch(model: torch.nn.Module, val_loader: torch.utils.data.DataLoader) -> tuple[float, float]:
    """Evaluate `model` on `val_loader` without updating it.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        val_loader: Yields (features, labels) batches.

    Returns:
        (macro-F1 over all samples, mean of the per-batch cross-entropy losses).
    """
    model.eval()
    batch_losses = []
    targets, predictions = [], []
    with torch.no_grad():
        for features, labels in val_loader:
            features = features.to(device)
            labels = labels.to(device)
            logits = model(features)
            batch_losses.append(torch.nn.functional.cross_entropy(logits, labels).item())
            targets.extend(labels.cpu().numpy())
            # The predicted class is the index of the largest logit.
            predictions.extend(logits.argmax(dim=1).detach().cpu().numpy())
    return f1_score(targets, predictions, average='macro'), sum(batch_losses) / len(val_loader)

In [None]:
def test_best_MLP(X_train: np.array, y_train: np.array,
                 X_test: np.array, y_test: np.array, sweep_id: str) -> pd.DataFrame:
    """Retrain the sweep's best MLP configuration and evaluate it on the test split.

    The retraining run uses no validation loader and logs nothing to wandb.

    Args:
        X_train, y_train: Data the best configuration is retrained on.
        X_test, y_test: Data the retrained model is evaluated on.
        sweep_id: Sweep whose best run supplies the hyperparameters.

    Returns:
        Whatever test_MLP returns for the retrained model.
    """
    cfg = fetch_best_config(sweep_id)

    # Same order as build_MLP's positional parameters.
    architecture_keys = ('input_dim', 'n_class', 'hidden_dims', 'dropout', 'activation')
    best_model = build_MLP(*[cfg[key] for key in architecture_keys])
    optimizer = build_optimizer(best_model, cfg['optimizer'], cfg['lr'])

    train_loader = build_dataloader(X_train, y_train, cfg['batch_size'], shuffle=True)
    test_loader = build_dataloader(X_test, y_test, cfg['batch_size'], shuffle=False)

    train_MLP(best_model, cfg['epochs'], optimizer, train_loader, None, log=False)
    return test_MLP(test_loader, best_model)

In [None]:
def test_MLP(test_loader: torch.utils.data.DataLoader, best_model: torch.nn.Module) -> pd.DataFrame:
    best_model.eval()
    y_true, y_pred, y_pred_proba = [], [], []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred_batch = best_model(X_batch)
            y_true.extend(y_batch.cpu().numpy())
            y_pred.extend(y_pred_batch.argmax(dim=1).detach().cpu().numpy())
            # get y_pred_proba
            y_pred_proba_batch = torch.nn.functional.softmax(y_pred_batch, dim=1).cpu().numpy()
            y_pred_proba.append(y_pred_proba_batch)
             
    y_true, y_pred, y_pred_proba = np.array(y_true), np.array(y_pred), np.concatenate(y_pred_proba)
    performance = performance_per_class(y_true, y_pred, y_pred_proba)
    return performance

In [None]:
# Hyperparameters that are searched: build_sweep_config uses the 'grid' method,
# so the sweep covers every combination of these candidate values.
MLP_search_space = {
    'hidden_dims': [[64, 32, 16], [128, 64, 32, 16]],
    'dropout': [0.1, 0.3, 0.5],
    'activation': ['relu', 'sigmoid', 'tanh'],
    'optimizer': ['sgd', 'adam'],
    'lr': [0.001, 0.01, 0.1],
}
# Settings that are held constant for every run of the sweep.
MLP_fixed_settings = {
    'n_class': len(np.unique(y_train)),
    'input_dim': X_train.shape[1],
    'batch_size': 64,
    'epochs': 20,
}
MLP_parameters = {
    **{name: {'values': candidates} for name, candidates in MLP_search_space.items()},
    **{name: {'value': constant} for name, constant in MLP_fixed_settings.items()},
}

# NOTE(review): the result name is misspelled ("performacne"); it is kept
# because a later cell displays the variable under this exact name.
MLP_performacne = evaluate_MLP(X_train, y_train, X_val, y_val, X_test, y_test, "AKI_GNN_MLP", MLP_parameters)

Create sweep with ID: n6aa7cc7
Sweep URL: https://wandb.ai/ericli/AKI_GNN_MLP/sweeps/n6aa7cc7


In [None]:
# Display the result that evaluate_MLP returned for the MLP sweep
# (the variable keeps the spelling used where it is assigned).
MLP_performacne