In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from conv_transformer import ConvTransformerModel
from cnn_lstm import ParallelCNNLSTMModel
from utils import get_loaders, import_checkpoint, save_checkpoint
import torch
import multiprocessing
import mlflow
import mlflow.pytorch
import torch
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix,roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import psutil
from torch.cuda.amp import GradScaler, autocast
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import math


# MLflow Setup

In [2]:
os.environ['AWS_ACCESS_KEY_ID'] = 'dIgexhE2iDrGls2qargL'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'IzEzgQpztotDnrIInJdUfUIYngpjJoT18d0FDZf7'
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://localhost:9000'
os.environ['MLFLOW_S3_IGNORE_TLS'] = 'true'
os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
mlflow.set_tracking_uri("http://localhost:5000")

print('tracking uri:', mlflow.get_tracking_uri())

tracking uri: http://localhost:5000


In [3]:
# Configuration
DATA_DIR = '../data/data_normalized_exp2'
SEQ_LENGTH = 500
BATCH_SIZE = 64
NUM_EPOCHS = 50
LEARNING_RATE = 0.0001
EXPERIMENT_NAME = "IEEG_Classification_Final_Comp"
# RUN_NAME = "CNN"
PIN_MEMORY = True
LOAD_MODEL = False
NUM_WORKERS = multiprocessing.cpu_count()
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
INPUT_SIZE = SEQ_LENGTH
NUM_CLASSES = 5
CHECKPOINTS_PATH = '../models/checkpoints'

# Before Training

In [4]:
def get_model_size(model):
    param_size = 0
    buffer_size = 0
    for param in model.parameters():
        param_size += param.numel() * param.element_size()
    for buffer in model.buffers():
        buffer_size += buffer.numel() * buffer.element_size()
    size_all_mb = (param_size + buffer_size) / 1024 ** 2
    return size_all_mb

In [5]:
def train_model(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader, optimizer: optim.Optimizer, 
                criterion: nn.Module, num_epochs: int, device: torch.device, save_checkpoint_interval: int = 10, 
                early_stopping_patience: int = 15, checkpoint_dir: str = '../models/checkpoints', accumulation_steps: int = 2,
                cnn=False, model_name='CNN'):
    """
    Train a deep learning model with the given parameters and log metrics to MLflow.

    Args:
        model (nn.Module): The model to train.
        train_loader (DataLoader): DataLoader for the training data.
        val_loader (DataLoader): DataLoader for the validation data.
        optimizer (optim.Optimizer): Optimizer for updating model parameters.
        criterion (nn.Module): Loss function.
        num_epochs (int): Number of epochs to train.
        device (torch.device): Device to use for training (CPU or GPU).
        save_checkpoint_interval (int, optional): Interval for saving checkpoints. Default is 10.
        early_stopping_patience (int, optional): Patience for early stopping. Default is 15.
        checkpoint_dir (str, optional): Directory to save checkpoints. Default is 'checkpoints'.
        accumulation_steps (int, optional): Number of steps to accumulate gradients before updating weights. Default is 2.
        cnn (bool, optional): If True, use CNN mode. Default is False.
        model_name (str, optional): Name of the model for saving checkpoints. Default is 'CNN'.
    """
    scaler = GradScaler()  # For mixed precision training
    best_val_loss = float('inf')  # Track the best validation loss for early stopping
    patience_counter = 0  # Counter for early stopping

    # Ensure checkpoint directory exists
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        y_true_train = []
        y_pred_train = []

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
        optimizer.zero_grad()  # Reset gradients at the start of each epoch

        for batch_idx, (inputs, labels) in enumerate(progress_bar):
            inputs, labels = inputs.to(device), labels.to(device)

            with autocast():  # Mixed precision training
                if not cnn:
                    outputs = model(inputs)
                else:
                    outputs, _ = model(inputs)
                loss = criterion(outputs, labels.squeeze())

            scaler.scale(loss).backward()  # Backpropagation

            scaler.step(optimizer)  # Update weights
            scaler.update()
            optimizer.zero_grad()  # Reset gradients after updating weights

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            y_true_train.extend(labels.squeeze().cpu().numpy())
            y_pred_train.extend(predicted.cpu().numpy())

            avg_loss = running_loss / (batch_idx + 1)
            train_accuracy = accuracy_score(y_true_train, y_pred_train)
            precision, recall, f1, _ = precision_recall_fscore_support(y_true_train, y_pred_train, average='weighted', zero_division=0)

            progress_bar.set_postfix(train_loss=avg_loss, train_accuracy=train_accuracy, train_precision=precision, train_recall=recall, train_f1=f1)

        # Log training metrics to MLflow
        mlflow.log_metric("train_loss", avg_loss, step=epoch)
        mlflow.log_metric("train_accuracy", train_accuracy, step=epoch)
        mlflow.log_metric("train_precision", precision, step=epoch)
        mlflow.log_metric("train_recall", recall, step=epoch)
        mlflow.log_metric("train_f1", f1, step=epoch)

        # Validation step
        model.eval()
        val_loss = 0.0
        y_true_val = []
        y_pred_val = []

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                with autocast():  # Mixed precision inference
                    if not cnn:
                        outputs = model(inputs)
                    else:
                        outputs, _ = model(inputs)
                    loss = criterion(outputs, labels.squeeze())

                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                y_true_val.extend(labels.squeeze().cpu().numpy())
                y_pred_val.extend(predicted.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = accuracy_score(y_true_val, y_pred_val)
        val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(y_true_val, y_pred_val, average='weighted', zero_division=0)

        # Log validation metrics to MLflow
        mlflow.log_metric("val_loss", avg_val_loss, step=epoch)
        mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)
        mlflow.log_metric("val_precision", val_precision, step=epoch)
        mlflow.log_metric("val_recall", val_recall, step=epoch)
        mlflow.log_metric("val_f1", val_f1, step=epoch)

        # Update the progress bar with validation metrics
        progress_bar.set_postfix(train_loss=avg_loss, train_accuracy=train_accuracy, train_precision=precision, train_recall=recall, train_f1=f1, val_loss=avg_val_loss, val_accuracy=val_accuracy, val_precision=val_precision, val_recall=val_recall, val_f1=val_f1)

        # Save checkpoint every 'save_checkpoint_interval' epochs
        if (epoch + 1) % save_checkpoint_interval == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_{model_name}.pth.tar')
            save_checkpoint({'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}, checkpoint_path)
            mlflow.log_artifact(checkpoint_path, artifact_path="checkpoints")

        # Early stopping based on validation loss
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0  # Reset counter if we get a new best validation loss
        else:
            patience_counter += 1

        if patience_counter >= early_stopping_patience:
            print(f"Early stopping at epoch {epoch + 1} due to no improvement in validation loss.")
            mlflow.log_param("epochs_actual", epoch + 1)
            break

        # Clear CUDA cache after each epoch
        torch.cuda.empty_cache()

    # Clear CUDA cache at the end of training
    torch.cuda.empty_cache()

In [6]:

def evaluate_model(model: nn.Module, test_loader: DataLoader, dataset: Dataset, device: torch.device, 
                   img_path: str, run_name: str, batch_size: int = 16, cnn=False):
    """
    Evaluate a deep learning model and log metrics to MLflow.

    Args:
        model (nn.Module): The model to evaluate.
        test_loader (DataLoader): DataLoader for the test data.
        dataset (Dataset): The dataset containing the test data.
        device (torch.device): Device to use for evaluation (CPU or GPU).
        img_path (str): Path to save the confusion matrix image.
        run_name (str): Name of the MLflow run.
        batch_size (int, optional): Batch size for evaluation. Default is 16.
    """
    model.eval()
    y_true_test = []
    y_pred_test = []
    feature_maps = []

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="Evaluating", unit="batch"):
            inputs, labels = inputs.to(device), labels.to(device)
            if not cnn: 
                outputs = model(inputs)

            else:
                outputs, feature_map = model(inputs)
                feature_maps.append([fm.cpu() for fm in feature_map])  # Move feature maps to CPU to free GPU memory

            _, predicted = torch.max(outputs, 1)
            y_true_test.extend(labels.squeeze().cpu().numpy())
            y_pred_test.extend(predicted.cpu().numpy())

            # Clear cache to free up memory
            torch.cuda.empty_cache()

    test_accuracy = accuracy_score(y_true_test, y_pred_test)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true_test, y_pred_test, average='weighted', zero_division=0)

    print(f'Accuracy of the model on the test data: {test_accuracy:.2f}%')
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.log_metric("test_precision", precision)
    mlflow.log_metric("test_recall", recall)
    mlflow.log_metric("test_f1", f1)

    # Confusion matrix
    cm = confusion_matrix(y_true_test, y_pred_test)
    cm_df = pd.DataFrame(cm, index=dataset.label_encoder.classes_, columns=dataset.label_encoder.classes_)

    plt.figure(figsize=(10, 7))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    img_file = os.path.join(img_path, f"confusion_matrix_{run_name}.png")
    plt.savefig(img_file)
    mlflow.log_artifact(img_file)
    plt.close()

    # Save feature maps and labels to file
    if cnn:
        feature_maps_file = os.path.join(img_path, f"feature_maps_{run_name}.pt")
        torch.save((feature_maps, y_true_test, y_pred_test), feature_maps_file)
    # mlflow.log_artifact(feature_maps_file, artifact_path="feature_maps")

    return y_true_test, y_pred_test


# CONV Transformer

In [7]:
train_loader, val_loader, test_loader, dataset = get_loaders(data_dir=DATA_DIR, with_val_loader=True, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
                                        pin_memory=PIN_MEMORY, test_size=0.1, seq_length=SEQ_LENGTH, model_type="cnn")

Total dataset size: 131150
Train indices length: 118035, Test indices length: 13115
Train indices after val split: 106231, Val indices: 11804
Train dataset length: 106231, Val dataset length: 11804, Test dataset length: 13115


In [8]:
for loader_name, loader in zip(['train_loader', 'test_loader','val_loader'], [train_loader, test_loader, val_loader]):
    for i, (inputs, labels) in enumerate(loader):
        print(f"{loader_name} - Batch {i}: inputs shape = {inputs.shape}, labels shape = {labels.shape}")
        if i == 0:  # Only print the first batch for brevity
            break

train_loader - Batch 0: inputs shape = torch.Size([64, 1, 500]), labels shape = torch.Size([64])
test_loader - Batch 0: inputs shape = torch.Size([64, 1, 500]), labels shape = torch.Size([64])
val_loader - Batch 0: inputs shape = torch.Size([64, 1, 500]), labels shape = torch.Size([64])


In [9]:
# Model parameters
input_size = SEQ_LENGTH  # Use the sequence length provided by your dataset
num_classes = 5  # Number of classes for classification
conv_filters = [64, 128]  # Reduced number of filters to save memory
transformer_dim = 128  # Smaller transformer dimension
num_heads = 4  # Fewer attention heads
transformer_depth = 2  # Fewer transformer layers
fc_neurons = [512, 128]  # Reduced fully connected layer sizes
dropout = 0.3  # Dropout rate

In [10]:
model = ConvTransformerModel(
    input_size=input_size,
    num_classes=num_classes,
    transformer_dim=transformer_dim,
    num_heads=num_heads,
    transformer_depth=transformer_depth,
    fc_neurons=fc_neurons,
    dropout=dropout,
    activation=nn.ReLU()
).to(DEVICE)

In [11]:
print(f'Model size: {get_model_size(model):.3f} MB')
model

Model size: 2.365 MB


ConvTransformerModel(
  (conv_embedding_stem): ConvEmbeddingStem(
    (conv1): Conv1d(1, 64, kernel_size=(10,), stride=(2,), padding=(4,), bias=False)
    (act1): GELU(approximate='none')
    (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
    (act2): GELU(approximate='none')
    (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv1d(128, 128, kernel_size=(3,), stride=(2,), padding=(1,), bias=False)
    (act3): GELU(approximate='none')
    (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (transformer_blocks): ModuleList(
    (0-1): 2 x MultiheadSelfAttentionBlock(
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=False)
    

In [12]:
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)
criterion =  nn.CrossEntropyLoss()  

In [13]:
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/11', creation_time=1716942946817, experiment_id='11', last_update_time=1716942946817, lifecycle_stage='active', name='IEEG_Classification_Final_Comp', tags={}>

In [14]:

# Train and Evaluate the Model with MLflow
run_name = "CONV_Transformer_sl500"
with mlflow.start_run(run_name=run_name) as run:
    # Log parameters
    mlflow.log_param("epochs", NUM_EPOCHS)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("learning_rate", LEARNING_RATE)
    mlflow.log_param("model", "Conv_Tranformer_sl500")
    mlflow.log_param("input_size", SEQ_LENGTH)
    mlflow.log_param("num_classes", NUM_CLASSES)
    mlflow.log_dict(dataset.get_class_mapping(), "class_mapping.json")

    # Train and Evaluate the Model
    train_model(model, train_loader,val_loader, optimizer, criterion, NUM_EPOCHS, DEVICE, 
                save_checkpoint_interval=10, checkpoint_dir=CHECKPOINTS_PATH, 
                model_name="Conv_Tranformer_sl500", early_stopping_patience=40, cnn=False)
    evaluate_model(model, test_loader, dataset, DEVICE, img_path='../plots', run_name=run_name)

    # Log the model
    mlflow.pytorch.log_model(model, "model_Conv_Tranformer_sl500")

2024/05/29 01:27:23 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 1/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 2/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 3/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 4/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 5/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 6/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 7/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 8/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 9/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 10/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Checkpoint saved successfully.


Epoch 11/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 12/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 13/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 14/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 15/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 16/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 17/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 18/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 19/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 20/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Checkpoint saved successfully.


Epoch 21/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 22/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 23/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 24/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 25/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 26/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 27/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 28/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 29/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 30/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Checkpoint saved successfully.


Epoch 31/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 32/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 33/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 34/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 35/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 36/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 37/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 38/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 39/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 40/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Checkpoint saved successfully.


Epoch 41/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 42/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 43/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 44/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 45/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 46/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 47/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 48/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 49/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 50/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Checkpoint saved successfully.


Evaluating:   0%|          | 0/205 [00:00<?, ?batch/s]

  return F.conv1d(input, weight, bias, self.stride,


Accuracy of the model on the test data: 0.97%
Precision: 0.9701, Recall: 0.9706, F1 Score: 0.9703


2024/05/29 02:42:04 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/05/29 02:42:04 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


# CNN + LSTM Parallel

In [7]:
train_loader, val_loader, test_loader, dataset = get_loaders(data_dir=DATA_DIR, with_val_loader=True, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
                                        pin_memory=PIN_MEMORY, test_size=0.1, seq_length=SEQ_LENGTH, model_type="cnn")

Total dataset size: 131150
Train indices length: 118035, Test indices length: 13115
Train indices after val split: 106231, Val indices: 11804
Train dataset length: 106231, Val dataset length: 11804, Test dataset length: 13115


In [8]:
model = ParallelCNNLSTMModel(input_size=SEQ_LENGTH, 
                            num_classes=NUM_CLASSES,
                            input_size_lstm=1,
                            # conv_filters=[64,128,256],
                            # fc_neurons=[1024,128],
                            # lstm_hidden_size=64,
                            lstm_num_layers=3
                        ).to(DEVICE)
print(f'Model size: {get_model_size(model):.3f} MB')
model

Model size: 65.849 MB


ParallelCNNLSTMModel(
  (cnn_head): CNN_Head(
    (conv_layers): ModuleList(
      (0): Conv1d(1, 64, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (2): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (3): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    )
    (bn_layers): ModuleList(
      (0): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (activation): ReLU()
    (dropout): Dropout(p=0.3, inplace=False)
    (maxpool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (lstm): LSTM(1, 64, num_layers=3, batch_first=True)
  (fc_lst

In [9]:
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)
criterion =  nn.CrossEntropyLoss() 

In [10]:
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id
    if experiment.lifecycle_stage == 'deleted':
        mlflow.tracking.MlflowClient().restore_experiment(experiment_id)

In [11]:
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/11', creation_time=1716942946817, experiment_id='11', last_update_time=1716942946817, lifecycle_stage='active', name='IEEG_Classification_Final_Comp', tags={}>

In [12]:

# Train and Evaluate the Model with MLflow
run_name = "P_CNN_LSTM_Experiment_sl500"
with mlflow.start_run(run_name=run_name) as run:
    # Log parameters
    mlflow.log_param("epochs", NUM_EPOCHS)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("learning_rate", LEARNING_RATE)
    mlflow.log_param("model", "P_CNN_LSTM_sl500")
    mlflow.log_param("input_size", SEQ_LENGTH)
    mlflow.log_param("num_classes", NUM_CLASSES)
    mlflow.log_dict(dataset.get_class_mapping(), "class_mapping.json")

    # Train and Evaluate the Model
    train_model(model, train_loader, val_loader,optimizer, criterion, NUM_EPOCHS, DEVICE, 
                save_checkpoint_interval=10, checkpoint_dir=CHECKPOINTS_PATH, 
                model_name="P_CNN_LSTM_sl500", early_stopping_patience=15, cnn=False)
    evaluate_model(model, test_loader, dataset, DEVICE, img_path='../plots', run_name=run_name)

    # Log the model
    mlflow.pytorch.log_model(model, "model_P_CNN_LSTM_sl500")

2024/05/29 10:36:59 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 1/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return F.conv1d(input, weight, bias, self.stride,
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return F.conv1d(input, weight, bias, self.stride,


Epoch 2/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 3/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 4/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 5/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 6/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 7/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 8/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 9/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 10/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Checkpoint saved successfully.


Epoch 11/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 12/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 13/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 14/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 15/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 16/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 17/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 18/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 19/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 20/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Checkpoint saved successfully.


Epoch 21/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 22/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 23/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 24/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 25/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 26/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 27/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 28/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 29/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 30/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Checkpoint saved successfully.


Epoch 31/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 32/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Epoch 33/50:   0%|          | 0/1660 [00:00<?, ?batch/s]

Early stopping at epoch 33 due to no improvement in validation loss.


Evaluating:   0%|          | 0/205 [00:00<?, ?batch/s]

  return F.conv1d(input, weight, bias, self.stride,


Accuracy of the model on the test data: 0.96%
Precision: 0.9603, Recall: 0.9606, F1 Score: 0.9601


2024/05/29 11:26:56 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/05/29 11:26:56 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
