In [101]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import find_peaks

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import yaml

from typing import Dict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

import optuna
from optuna.trial import TrialState

%load_ext kedro.ipython
device = torch.device('mps')

The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython


In [106]:
%reload_ext kedro.ipython

In [107]:
mox_bin = catalog.load("mox_bin")
# mox_bin.tail()
lstm_params = catalog.load("params:complete_lstm_model.model_options")
study_params = catalog.load("params:complete_lstm_model.study_options")
model_input_table = catalog.load("model_input_table")

In [108]:
print(lstm_params)
print(lstm_params['num_epochs'])

{'num_classes': 1, 'num_epochs': 10, 'batch_size': 32, 'learning_rate': 0.001, 'test_size': 0.2, 'random_state': 3, 'input_size': 1, 'sequence_length': 1500, 'hidden_size': 128, 'num_layers': 2, 'val_size': 0.2}
10


In [109]:

# Hyper-parameters 

num_classes = lstm_params['num_classes']
num_epochs = lstm_params['num_epochs']
batch_size = lstm_params['batch_size']
learning_rate = lstm_params['learning_rate']

"""

Each feature as a time step in your sequence, you could set sequence_length to 150 and input_size to 1.
This would mean you are feeding in sequences of length 150, with each time step in the sequence having 1 feature.

"""

input_size = lstm_params['input_size']
sequence_length = lstm_params['sequence_length'] # the window it trains with can be selected
hidden_size = lstm_params['hidden_size']
num_layers = lstm_params['num_layers']
random_state = lstm_params['random_state']

# batch_size = lstm_params['batch_size']  # You can adjust the batch size according to your needs

test_size = lstm_params['test_size']
val_size = lstm_params['val_size']

In [110]:
# NaN processing
# forward fill NaN values
def _ffill_NaN (X_dataset: np.ndarray) -> np.ndarray:
    X_dataset_df = pd.DataFrame(X_dataset)
    X_dataset_df.ffill(inplace=True)
    # Convert back to numpy arrays
    return X_dataset

In [117]:
# Clean NaN values from input table
_ffill_NaN(model_input_table)
# check for NaN values
model_input_table.isnull().sum()


bin_0        [1;36m0[0m
bin_1        [1;36m0[0m
bin_2        [1;36m0[0m
bin_3        [1;36m0[0m
bin_4        [1;36m0[0m
            ..
bin_1496     [1;36m0[0m
bin_1497     [1;36m0[0m
bin_1498     [1;36m0[0m
bin_1499     [1;36m0[0m
res_ratio    [1;36m0[0m
Length: [1;36m1501[0m, dtype: int64

In [119]:
# Implement LSTM functions below
# there is no validation set in this example
# load mox_table as input

def split_data(model_input_table: pd.DataFrame) -> torch.tensor:
    # Split data into features and target
    X = model_input_table[model_input_table.columns[:-1]].values  # Assuming last column is the target
    y = model_input_table[model_input_table.columns[-1]].values
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state)
    
    # Further split to create a validation set
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, 
        test_size = val_size, 
        random_state = random_state)
    
    # Initialize StandardScaler
    scaler = StandardScaler()
    # Fit on training data
    scaler.fit(X_train)

    # Transform both training and testing data
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Ensure y_train and y_test are in the correct format
    if isinstance(y_train, pd.Series):
        y_train = y_train.values
    if isinstance(y_val, pd.Series):
        y_val = y_val.values
    if isinstance(y_test, pd.Series):
        y_test = y_test.values

    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_scaled.astype(np.float32))
    y_train_tensor = torch.tensor(y_train.astype(np.float32))

    X_val_tensor = torch.tensor(X_val_scaled.astype(np.float32))
    y_val_tensor = torch.tensor(y_val.astype(np.float32))

    X_test_tensor = torch.tensor(X_test_scaled.astype(np.float32))
    y_test_tensor = torch.tensor(y_test.astype(np.float32))

    return X_train_tensor, X_val_tensor, X_test_tensor, y_train_tensor, y_val_tensor, y_test_tensor


# create X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor from split_data(df)
X_train_tensor, X_val_tensor, X_test_tensor, y_train_tensor, y_val_tensor, y_test_tensor = split_data(model_input_table)
# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
# Initialize DataLoaders
# batch_size = lstm_params['batch_size']  # You can adjust the batch size according to your needs
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)


In [129]:
# OPTUNA model

# Optuna managed model
class RNN(nn.Module):
    # def __init__(self, input_size, hidden_size, num_layers, num_classes):
    def __init__(self, trial, input_size, num_classes):
        super(RNN, self).__init__()

        # Optuna suggests the number of layers and hidden size
        self.num_layers = trial.suggest_int("num_layers", 1, 3)
        self.hidden_size = trial.suggest_int("hidden_size", 30, 100)

        # Optuna suggests the dropout ratio of each layer
        dropout_rate = trial.suggest_float("dropout_rate", 0, 0.5)

        # Define the LSTM layer
        self.lstm = nn.LSTM(
            input_size, 
            self.hidden_size, 
            self.num_layers, 
            batch_first=True,
            dropout=(dropout_rate if self.num_layers > 1 else 0), 
            )
        self.fc = nn.Linear(self.hidden_size, num_classes)
        
    def forward(self, x):
        # Set initial hidden states (and cell states for LSTM)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        out, _ = self.lstm(x, (h0,c0))  
        out = out[:, -1, :]
        out = self.fc(out)
        return out

def define_model(trial, input_size, num_classes):
    model = RNN(trial, input_size, num_classes)
    return model

In [130]:
# Optuna managed training
def objective(trial, input_size, num_classes):
    model = define_model(trial, input_size, num_classes).to(device)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # Training of the model
    n_total_steps = len(train_loader)
    for epoch in range(num_epochs):
        model.train()

        for i, (bins, target) in enumerate(train_loader):
            bins = bins.reshape(-1, sequence_length, input_size).to(device)
            target = target.squeeze().to(device)

            # Forward pass
            outputs = model(bins)
            # Example of reshaping/squeezing if applicable
            outputs = outputs.squeeze()  # Removes dimensions of size 1
            # outputs = outputs[:64]  # Adjust if you need to slice the outputs
            # target = target.squeeze().to(device)  # Add an extra dimension to match outputs
            loss = criterion(outputs, target)
        
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():  # Don't calculate gradients
            total_loss = 0
            count = 0

            for bins, target in test_loader:  # Replace with your validation loader
                bins = bins.reshape(-1, sequence_length, input_size).to(device)
                target = target.squeeze().to(device)  # Add an extra dimension to match outputs
                
                outputs = model(bins)
                outputs = outputs.squeeze()

                loss = criterion(outputs, target)
                total_loss += loss.item()
                count += 1
        
        model.train() # Set the model back to training mode

        rmse = np.sqrt(total_loss / count)
        trial.report(rmse, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
    return rmse
            # print(f'Epoch [{epoch+1}/{num_epochs}], RMSE on validation data: {rmse}')


In [131]:
if __name__ == "__main__":
    study = optuna.create_study(direction="minimize")
    # study.optimize(objective, n_trials=5, timeout=600)
    study.optimize(lambda trial: objective(trial, input_size, num_classes), n_trials=1, timeout=600)

    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2024-01-24 15:34:24,876] A new study created in memory with name: no-name-fc4797e9-8210-4801-8a7f-83f7fbca92a0


[I 2024-01-24 15:38:53,993] Trial 0 finished with value: 0.08793570348750154 and parameters: {'num_layers': 3, 'hidden_size': 74, 'dropout_rate': 0.42405752634736116, 'optimizer': 'Adam', 'lr': 0.0029712251186688977}. Best is trial 0 with value: 0.08793570348750154.


Study statistics: 
  Number of finished trials:  1
  Number of pruned trials:  0
  Number of complete trials:  1
Best trial:
  Value:  0.08793570348750154
  Params: 
    num_layers: 3
    hidden_size: 74
    dropout_rate: 0.42405752634736116
    optimizer: Adam
    lr: 0.0029712251186688977
