In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import wandb


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GroupShuffleSplit

from amp_parkinson_preprocess import *

# Input CSV locations handed to create_train_test
train_peptides_path = "./train_data/train_peptides.csv"
train_proteins_path = "./train_data/train_proteins.csv"
train_clinical_data_path = "./train_data/train_clinical_data.csv"

# Unpack the nine values returned by create_train_test (imported from
# amp_parkinson_preprocess) into individually named variables.
(
    X_train,
    X_test,
    y_train,
    y_test,
    preprocessor,
    features,
    targets,
    ids,
    groups,
) = create_train_test(
    train_peptides_path,
    train_proteins_path,
    train_clinical_data_path,
)


ValueError: Shape of passed values is (2357, 1202), indices imply (2357, 1198)

In [None]:

# Put features and targets side by side (column-wise) for each split so that
# a single frame per split can be grouped and padded afterwards.
train_data = pd.concat((X_train, y_train), axis="columns")
test_data = pd.concat((X_test, y_test), axis="columns")

# 2. Pad the sequences to the same length
def pad_dataframe(df, look_back=4, features=None, targets=None):
    """Build fixed-length, front-padded feature/target tensors per patient.

    The frame is grouped by its ``patient_id`` column. For every patient the
    last ``look_back`` rows (in the order they appear in ``df``) are copied
    into the tail of zero-initialised tensors, so shorter histories are
    left-padded with zeros and the most recent row always sits at index -1.

    Args:
        df: DataFrame containing a ``patient_id`` column plus every column
            named in ``features`` and ``targets``.
        look_back: Number of time steps kept per patient (must be >= 1).
        features: Column name(s) used as model inputs. Required.
        targets: Column name(s) used as prediction targets. Required.

    Returns:
        dict mapping each patient id to a tuple
        ``(padded_features, padded_targets)`` of float tensors with shapes
        ``(look_back, n_features)`` and ``(look_back, n_targets)``.

    Raises:
        ValueError: if ``features`` or ``targets`` is missing, or if
            ``look_back`` is smaller than 1.
    """
    if features is None or targets is None:
        raise ValueError("pad_dataframe requires both `features` and `targets` column names")
    if look_back < 1:
        raise ValueError("look_back must be a positive integer")

    # Normalise to plain lists; a bare string means a single column.
    feature_cols = [features] if isinstance(features, str) else list(features)
    target_cols = [targets] if isinstance(targets, str) else list(targets)
    n_features = len(feature_cols)
    n_targets = len(target_cols)

    padded_data = {}
    # NOTE(review): rows are used in their existing order; presumably df is
    # already sorted chronologically within each patient - confirm upstream.
    for patient_id, group in df.groupby('patient_id'):
        # Use dedicated local names so the column lists are not overwritten
        # between patients.
        group_features = group.loc[:, feature_cols]
        group_targets = group.loc[:, target_cols]

        padded_features = torch.zeros((look_back, n_features))
        padded_targets = torch.zeros((look_back, n_targets))

        seq_len = min(group.shape[0], look_back)
        padded_features[-seq_len:, :] = torch.tensor(group_features.values[-seq_len:], dtype=torch.float)
        padded_targets[-seq_len:, :] = torch.tensor(group_targets.values[-seq_len:], dtype=torch.float)

        padded_data[patient_id] = (padded_features, padded_targets)
    return padded_data


# Pad the train_data and test_data DataFrames.
# `features` and `targets` are the values returned by create_train_test;
# presumably they hold the feature / target column names - TODO confirm.
# They are passed by keyword so the calls do not depend on parameter order.
padded_train_data = pad_dataframe(train_data, features=features, targets=targets)
padded_test_data = pad_dataframe(test_data, features=features, targets=targets)


In [None]:

# Custom dataset
class PatientDataset(Dataset):
    """Index-aligned pairing of input sequences with their targets.

    ``data`` and ``targets`` are stored as given; item ``i`` of the dataset
    is the tuple ``(data[i], targets[i])``.
    """

    def __init__(self, data, targets):
        self.data, self.targets = data, targets

    def __len__(self):
        # The dataset size is defined by the inputs alone.
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.targets[index]

# Build the datasets from the padded per-patient dictionaries.
# The train/test split already exists (create_train_test returned X_train /
# X_test, which were padded into padded_train_data / padded_test_data), so it
# is reused here. The earlier version of this cell re-split on `patients` and
# read from `padded_data`, but neither name is defined in this notebook.
# NOTE(review): presumably create_train_test splits by patient (it returns
# `groups`); confirm so no patient appears in both splits.
train_patients = list(padded_train_data.keys())
test_patients = list(padded_test_data.keys())

train_data = [padded_train_data[patient_id][0] for patient_id in train_patients]
train_targets = [padded_train_data[patient_id][1] for patient_id in train_patients]

test_data = [padded_test_data[patient_id][0] for patient_id in test_patients]
test_targets = [padded_test_data[patient_id][1] for patient_id in test_patients]

train_dataset = PatientDataset(train_data, train_targets)
test_dataset = PatientDataset(test_data, test_targets)

# One patient sequence per batch; only the training order is shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# 3. Define the LSTM model
class LSTMModel(nn.Module):
    """LSTM encoder with two linear heads fed by the final hidden state.

    Args:
        input_size: Width of each time step in the input sequence.
        hidden_size: LSTM hidden-state width.
        output_size: Width of the first head (``fc1``, the targets).
        feature_size: The second head (``fc2``) emits ``feature_size - 1``
            values (the features except visit_month, per the original note).
        num_layers: Number of stacked LSTM layers.
        dropout_prob: Dropout probability, used both between LSTM layers and
            on the final hidden state before the heads.
    """

    def __init__(self, input_size, hidden_size, output_size, feature_size, num_layers, dropout_prob):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_prob,
        )
        self.dropout = nn.Dropout(p=dropout_prob)
        # Head 1: targets
        self.fc1 = nn.Linear(in_features=hidden_size, out_features=output_size)
        # Head 2: features, one fewer than feature_size
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=feature_size - 1)

    def forward(self, x):
        """Return ``(target_output, feature_output)`` for a batch-first input."""
        _, (final_hidden, _) = self.lstm(x)
        # Only the hidden state of the last stacked layer feeds the heads.
        summary = self.dropout(final_hidden[-1])
        return self.fc1(summary), self.fc2(summary)


class EarlyStoppingLossThreshold:
    """Stop training when the test loss stays above a threshold for too long.

    A counter of consecutive epochs whose test loss exceeds
    ``loss_threshold`` is kept in ``wait``; any epoch at or below the
    threshold resets it. Once the counter reaches ``patience`` the callback
    asks the caller to stop.
    """

    def __init__(self, loss_threshold, patience):
        self.loss_threshold = loss_threshold
        self.patience = patience
        self.wait = 0

    def on_epoch_end(self, epoch, train_loss, test_loss):
        """Update the counter; return True when training should stop.

        ``epoch`` and ``train_loss`` are accepted but not used.
        """
        # Threshold met: start counting from scratch again.
        if test_loss <= self.loss_threshold:
            self.wait = 0
            return False

        self.wait += 1
        if self.wait < self.patience:
            return False

        print(f"Early stopping: Loss did not reach the threshold of {self.loss_threshold} for {self.patience} epochs.")
        return True



#Custom SMAPE function for loss calculation
def SMAPE(y_true, y_pred):
    """Mean of ``|y_true - y_pred| / (|y_true| + |y_pred| + 1e-8)``.

    The small constant keeps the ratio finite when both inputs are zero.
    With this normalisation (no factor of 2 or 100) each term lies in [0, 1].
    """
    denominator = y_true.abs() + y_pred.abs() + 1e-8
    ratio = (y_true - y_pred).abs() / denominator
    return ratio.mean()


# Prefer CUDA when PyTorch reports it as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")



In [None]:
def _run_epoch(model, dataloader, criterion_targets, criterion_features,
               optimizer=None, target_weight=0.6, feature_weight=0.4):
    """Run one pass over ``dataloader`` and return the mean losses.

    When ``optimizer`` is given the model is put in training mode and updated
    on the weighted sum of both losses; otherwise the model is evaluated with
    gradients disabled.

    Returns:
        Tuple ``(mean target loss, mean feature loss)`` over all batches.
    """
    training = optimizer is not None
    model.train(training)
    target_losses = []
    feature_losses = []

    with torch.set_grad_enabled(training):
        for data, targets in dataloader:
            data, targets = data.to(device), targets.to(device)
            if training:
                optimizer.zero_grad()
            output_targets, output_features = model(data)
            # Batches are (batch, time, columns): select the last time step of
            # every sample. Indexing with [-1] alone would pick the last
            # *sample* and broadcast the loss over every time step.
            loss1 = criterion_targets(output_targets, targets[:, -1, :])
            # Column 0 is skipped to match the model's feature head, which has
            # one output fewer than there are input features.
            loss2 = criterion_features(output_features, data[:, -1, 1:])
            if training:
                loss = target_weight * loss1 + feature_weight * loss2
                loss.backward()
                optimizer.step()
            target_losses.append(loss1.item())
            feature_losses.append(loss2.item())

    return np.mean(target_losses), np.mean(feature_losses)


def train_model():
    """Train and evaluate one LSTMModel configuration inside a wandb run.

    Hyperparameters (``hidden_size``, ``num_layers``, ``dropout_prob``,
    ``learning_rate``, ``num_epochs``) are read from the run's config. Each
    epoch logs the target and feature SMAPE for both splits together with
    their 0.6 / 0.4 weighted totals, then consults the early-stopping
    callback on the weighted totals.

    Relies on the notebook-level ``train_dataset``, ``train_dataloader``,
    ``test_dataloader``, ``device``, ``LSTMModel``, ``SMAPE`` and
    ``EarlyStoppingLossThreshold``.
    """
    # init project
    run = wandb.init(project='amp-parkinson-kaggle')
    config = run.config

    # Derive the layer sizes from an actual sample so they always match the
    # tensors the dataloaders will yield.
    sample_features, sample_targets = train_dataset[0]
    input_size = sample_features.shape[-1]
    output_size = sample_targets.shape[-1]

    # Relative weight of the target loss vs. the auxiliary feature loss
    target_weight = 0.6
    feature_weight = 0.4

    loss_threshold = 0.6
    patience = 250
    early_stopping_callback = EarlyStoppingLossThreshold(loss_threshold, patience)

    # Set up the model, optimizer, and other components
    model = LSTMModel(input_size, config.hidden_size, output_size, input_size, config.num_layers, config.dropout_prob).to(device)

    # torch.compile only exists from PyTorch 2.0 on; feature detection avoids
    # comparing version strings lexicographically.
    if hasattr(torch, "compile"):
        model = torch.compile(model)

    criterion1 = SMAPE
    criterion2 = SMAPE

    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    # Train the model
    for epoch in range(config.num_epochs):
        train_loss, train_feature_loss = _run_epoch(
            model, train_dataloader, criterion1, criterion2,
            optimizer=optimizer, target_weight=target_weight, feature_weight=feature_weight,
        )
        total_train_loss = target_weight * train_loss + feature_weight * train_feature_loss

        test_loss, test_feature_loss = _run_epoch(model, test_dataloader, criterion1, criterion2)
        total_test_loss = target_weight * test_loss + feature_weight * test_feature_loss

        # Log the losses to wandb
        wandb.log({'updrs_train_smape': train_loss, 'updrs_test_smape': test_loss, 'feature_smape_train': train_feature_loss, 'feature_smape_test': test_feature_loss, 'total_train_loss': total_train_loss, 'total_test_loss': total_test_loss})

        # Check for early stopping
        should_stop = early_stopping_callback.on_epoch_end(epoch, total_train_loss, total_test_loss)
        if should_stop:
            break

        # Both metrics are SMAPE values (the feature criterion is SMAPE too).
        print(f'Epoch [{epoch+1}/{config.num_epochs}], UPDRS Train SMAPE: {train_loss:.4f}, UPDRS Test SMAPE: {test_loss:.4f}, Feature Train SMAPE: {train_feature_loss:.4f}, Feature Test SMAPE: {test_feature_loss:.4f}')



In [None]:
# Bayesian hyperparameter sweep definition.
sweep_config = {
    'method': 'bayes',
    'metric': {
        'goal': 'minimize',
        # Must be a key that train_model actually passes to wandb.log;
        # 'test_loss' is never logged. 'updrs_test_smape' is the UPDRS SMAPE
        # on the test split (switch to 'total_test_loss' to optimise the
        # weighted total instead).
        'name': 'updrs_test_smape'
    },
    'parameters': {
        # Fixed for every run
        'num_epochs': {
            'value': 500
        },
        # Discrete choices
        'hidden_size': {
            'values': [32, 64, 128, 256]
        },
        'num_layers': {
            'values': [2, 3, 4]
        },
        # Continuous ranges
        'dropout_prob': {
            'min': 0.1,
            'max': 0.8
        },
        'learning_rate': {
            'min': 1e-4,
            'max': 1e-2
        }
    }
}

# Register the sweep under the project and keep its id for the agent.
sweep_id = wandb.sweep(
    sweep_config,
    project='amp-parkinson-kaggle',
)

# Launch an agent that calls train_model once per trial, up to 2000 trials.
wandb.agent(
    sweep_id,
    function=train_model,
    count=2000,
)