In [None]:
# When running in Google Colab, uncomment the lines below to install the required libraries.

# !pip install torch
# (`os` is part of the Python standard library, so it needs no installation)
# !pip install transformers
# !pip install numpy
# !pip install pandas

In [None]:
import os
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.distributions.beta import Beta
import numpy as np
import pandas as pd
import random
import pickle
import base64
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import re

In [None]:
# Root directory for the datasets and output artefacts
# (presumably a mounted Google Drive folder when run in Colab -- confirm).
DATASET_FILEPATH = './drive/MyDrive/Thesis/'
# Used as a path component when locating the train/validation CSV files.
DATASET_SEED = 2
# Seed for PyTorch's global random number generator.
# NOTE(review): only torch is seeded here; `random` and NumPy are imported but not seeded.
SEED = 42
torch.manual_seed(SEED)
EMB_SIZE = 'base' # 'base' 768 embeddings or 'large' 1024 embeddings
# File that receives the weights of the best model found by the grid search.
SAVE_WEIGHTS_PATH = os.path.join(DATASET_FILEPATH, 'weights-and-graphs/grid-search-avg/model.pth')

In [None]:
# Locations of the pre-processed CSV splits, all resolved relative to DATASET_FILEPATH.
train_csv_file = os.path.join(DATASET_FILEPATH, f'{EMB_SIZE}/{DATASET_SEED}/processed/train_dataset.csv')
validation_csv_file = os.path.join(DATASET_FILEPATH, f'{EMB_SIZE}/{DATASET_SEED}/processed/validation_dataset.csv')
# The second component must not start with '/': os.path.join discards every earlier
# component when a later one is absolute, which would silently drop DATASET_FILEPATH
# and point at '/base/...' in the filesystem root.
aug_train_csv_file = os.path.join(DATASET_FILEPATH, 'base/aug-dataset/processed/train_dataset.csv')

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda


In [None]:
def to_tensor(base64_str):
    """
    Decode a base64 string and unpickle the bytes it contains.

    Used as a pandas `converters` callback for the embedding columns of the CSV files.

    :param base64_str: Base64-encoded text wrapping a pickled object.
    :returns: The unpickled Python object.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load CSV files from a trusted source.
    pickled_bytes = base64.b64decode(base64_str.encode())
    return pickle.loads(pickled_bytes)

# Columns read from the train / validation CSV files.
selected_columns = ['audio_file_name', 'classification', 'wav2vec_embeddings', 'hubert_embeddings', 'bert_embeddings']

# The two audio embedding columns are stored as base64 text and decoded on load;
# 'bert_embeddings' has no converter and is therefore kept exactly as read from the CSV.
_embedding_converters = {column: to_tensor for column in ('hubert_embeddings', 'wav2vec_embeddings')}

train_df, validation_df = (
    pd.read_csv(csv_path, usecols=selected_columns, converters=_embedding_converters)
    for csv_path in (train_csv_file, validation_csv_file)
)

In [None]:
def process_training_set(train_df, oversample_minority=False, undersample_majority=False, drop_segments=5):
  """
  Re-sample the training dataset by duplicating class-0 rows and/or dropping rows with a large audio length group.

  The input DataFrame is never modified; a new DataFrame with a fresh 0..n-1 index is always returned.

  :param train_df: DataFrame containing the training data with columns ['classification', 'audio_file_name'] among others.
  :param oversample_minority: Boolean, if True, every row with classification == 0 is appended once more (i.e. duplicated).
  :param undersample_majority: Boolean, if True, rows whose audio length group exceeds `drop_segments` are dropped.
                               The group is the last run of digits found in 'audio_file_name'. The filter is applied
                               to every row, regardless of its classification.
  :param drop_segments: Largest audio length group that is kept when `undersample_majority` is True (default 5).
  :returns: DataFrame with the desired processed training data.
  :raises IndexError: If `undersample_majority` is True and a file name contains no digits.
  """
  if oversample_minority:
    class_0 = train_df[train_df['classification'] == 0]
    train_df = pd.concat([train_df, class_0])

  if undersample_majority:
    def get_audio_length_group(file_name):
      return int(re.findall(r'\d+', file_name)[-1])

    # Build the mask as a standalone Series instead of a temporary column, so the
    # caller's DataFrame is left untouched (no extra column, no SettingWithCopyWarning).
    audio_length_group = train_df['audio_file_name'].apply(get_audio_length_group)
    train_df = train_df[audio_length_group <= drop_segments]

  # Some indices are duplicated / removed, so hand back a freshly indexed frame.
  # Not done in place: with both flags off, `train_df` is still the caller's object.
  return train_df.reset_index(drop=True)

def print_dataset_balance(df):
    """
    Print, for each classification value, its row count and its share of the dataset in percent.

    :param df: DataFrame containing the data with a 'classification' column.
    """
    balance = df['classification'].value_counts().reset_index()
    balance.columns = ['classification', 'count']
    # Share of each class relative to all rows, expressed as a percentage with one decimal place.
    class_share = balance['count'] / balance['count'].sum()
    balance['percentage'] = (class_share * 100).round(1)
    print(balance)

def augment_train_dataset(df, augmented_df_filepath):
  """
  Introduce additional 'non-interruption' samples to the dataset, which have been extracted from the GAP dataset with an LLM.

  :param df: Original DataFrame containing the training data.
  :param augmented_df_filepath: Filepath to the CSV containing the augmented data.
  :returns: A combined DataFrame (original rows first, augmented rows after) with a fresh 0..n-1 index.
            Columns present in `df` but not read from the augmented CSV are filled with NaN for the augmented rows.
  """
  selected_columns = ['audio_file_name','classification', 'wav2vec_embeddings', 'hubert_embeddings']
  # Read from the path that was passed in, rather than from a module-level global,
  # so the function honours its own argument.
  aug_train_df = pd.read_csv(
      augmented_df_filepath,
      usecols=selected_columns,
      converters={'wav2vec_embeddings': to_tensor, 'hubert_embeddings' : to_tensor},
  )
  return pd.concat([df, aug_train_df], ignore_index=True)

In [None]:
class EmbeddingsDataset(Dataset):
    """Dataset that pairs each audio embedding with its label."""

    def __init__(self, audio_embeddings, labels):
        """
        :param audio_embeddings: Collection of embeddings supporting len() and [] lookup.
        :param labels: Collection of labels, looked up with the same index as the embeddings.
        """
        self.labels = labels
        self.audio_embeddings = audio_embeddings

    def __len__(self):
        # The embeddings collection determines the dataset size.
        return len(self.audio_embeddings)

    def __getitem__(self, idx):
        # Returned as an (embedding, label) pair.
        return self.audio_embeddings[idx], self.labels[idx]

In [None]:
class AudioModel(nn.Module):
    """
    Feed-forward classifier over a single audio embedding vector.

    The network is a stack of Linear -> ReLU -> Dropout blocks (one per entry in `hidden_dims`)
    followed by a final Linear layer. No activation is applied to the output, so the model
    returns raw scores.
    """

    def __init__(self, audio_embedding_dim=768, hidden_dims=(256,), output_dim=1, dropout_rate=0):
        """
        :param audio_embedding_dim: Size of the input embedding vector.
        :param hidden_dims: Sequence of hidden layer widths; may be empty, in which case the model
                            is a single Linear layer from input to output.
        :param output_dim: Number of output units.
        :param dropout_rate: Dropout probability applied after every hidden layer.
        """
        super().__init__()

        layers = []
        prev_dim = audio_embedding_dim
        for dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = dim

        self.model = nn.Sequential(*layers)

        # Size the output layer from the width actually produced by the stack above
        # (rather than hidden_dims[-1]) so an empty `hidden_dims` is supported too.
        self.output_layer = nn.Linear(prev_dim, output_dim)

    def forward(self, audio_embedding):
        """
        :param audio_embedding: Tensor whose last dimension equals `audio_embedding_dim`.
        :returns: Tensor of raw scores whose last dimension equals `output_dim`.
        """
        hidden = self.model(audio_embedding)
        return self.output_layer(hidden)

In [None]:
# Binary cross-entropy computed directly on raw logits (the sigmoid is applied inside the loss).
# This matches AudioModel, whose forward pass returns the un-activated output of its last Linear layer.
criterion = nn.BCEWithLogitsLoss()

In [None]:
def binary_accuracy(preds, y):
    """
    Compute the fraction of predictions that match the true labels.

    The raw predictions are passed through a sigmoid and rounded to 0/1 before comparison.
    The result is a ratio, e.g. 8 correct out of 10 gives 0.8 (not 8).

    :param preds: Tensor of predicted values (raw scores, before the sigmoid).
    :param y: Tensor of true labels.
    :returns: Accuracy as a floating point tensor value.
    """
    hard_preds = torch.sigmoid(preds).round()
    # Cast the boolean matches to float so that the sum can be divided into a ratio.
    matches = (hard_preds == y).float()
    return matches.sum() / len(matches)

def evaluate(model, iterator, criterion):
    """
    Score the model on a dataset without updating its weights (used for validation).

    The model is switched to eval mode and gradients are disabled for the whole pass.
    Loss and accuracy are averaged over batches (each batch counts equally), whereas the
    macro F1 score is computed once over the predictions of all samples.

    :param model: PyTorch model to be evaluated.
    :param iterator: Iterator that provides (audio_embeddings, labels) batches and supports len().
    :param criterion: Loss function used to compute the loss during evaluation.
    :returns: Tuple containing average loss, average accuracy and macro average F1 score.
    """
    model.eval()
    total_loss, total_acc = 0, 0
    predicted_classes, true_classes = [], []

    with torch.no_grad():
        for batch_embeddings, batch_labels in iterator:
            logits = model(batch_embeddings).squeeze(1)
            total_loss += criterion(logits, batch_labels.float()).item()
            total_acc += binary_accuracy(logits, batch_labels).item()

            # Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions for the F1 score.
            thresholded = (torch.sigmoid(logits) > 0.5).int()
            predicted_classes.extend(thresholded.cpu().numpy())
            true_classes.extend(batch_labels.cpu().numpy())

    macro_f1 = f1_score(true_classes, predicted_classes, average='macro')
    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches, macro_f1

def train(model, iterator, optimizer, criterion):
    """
    Run one training epoch: a forward pass, backward pass and optimiser step for every batch.

    :param model: The PyTorch model to be trained.
    :param iterator: Iterator that provides (audio_embeddings, labels) batches and supports len().
    :param optimizer: Optimizer used to update the model's parameters.
    :param criterion: Loss function used to compute the loss during training.
    :returns: Tuple containing the average loss and the average accuracy over all batches.
    """
    model.train()
    running_loss = 0
    running_acc = 0

    for batch_embeddings, batch_labels in iterator:
        # Clear gradients left over from the previous batch before the new backward pass.
        optimizer.zero_grad()
        logits = model(batch_embeddings).squeeze(1)
        batch_loss = criterion(logits, batch_labels.float())
        batch_acc = binary_accuracy(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
# Hyperparameter search space. Every combination of the values below is trained once:
# 2 learning rates x 1 batch size x 2 optimisers x 3 architectures x 2 embeddings x 2 dataset methods = 48 runs.
param_grid = {
    'learning-rate': [0.0005, 0.001],
    'batch-size': [16],
    # 'Adam' selects torch.optim.Adam; any other value makes train_model use SGD with momentum 0.9.
    'optimiser' : ['Adam', 'SGD'],
    # Each architecture gives the hidden layer widths and the dropout rate passed to AudioModel.
    'architecture' : [
        {
            'layers' : [512,256],
            'dropout-rate': 0.2,
        },
        {
            'layers' : [768, 512, 256],
            'dropout-rate': 0.3,
        },
        {
            'layers' : [1024, 768, 512, 256],
            'dropout-rate': 0.4,
        },
    ],
    # Which embedding column of the dataset is fed to the classifier.
    'embeddings' : ['wav2vec', 'hubert'],
    # 'method 2' augments the training set from the augmented CSV; any other value re-samples it.
    'dataset' : ['method 1', 'method 2']
}

In [None]:
def train_model(gridsearch_params):
  """
  Trains a classifier model on audio embeddings (either wav2vec or hubert) based on parameters received
  from a grid search. One of the key hyperparameters is the method which dictates the strategy to balance the dataset.

  Depending on the chosen dataset method, this function either:
  1. Over-samples the minority class and under-samples by audio length group (referred to as "method 1"), or
  2. Augments the dataset with the samples from the augmented CSV (referred to as "method 2").

  Each embedding (a sequence of vectors) is reduced to a single vector by averaging over its first dimension.
  The classifier's input size is taken from the pooled training embeddings.

  Checkpoints are taken based on the macro average F1 score on the validation set. Only epochs from the
  fifth onwards are eligible, and the weights of the eligible epoch with the highest score are returned.
  Training stops early once the score has fallen for three consecutive epochs and at least five epochs
  have been completed; otherwise it runs for 20 epochs.

  Reads the module-level `train_df`, `validation_df`, `aug_train_csv_file` and `criterion`.

  :param gridsearch_params: A dictionary containing parameters sourced from a grid search. Keys used are
                            'dataset' (which determines the chosen method of dataset processing),
                            'embeddings', 'batch-size', 'architecture' (which further includes 'layers'
                            and 'dropout-rate'), 'optimiser', and 'learning-rate'.
  :returns: A tuple containing (1) a copy of the model weights from the eligible epoch with the highest
            macro average F1 score, and (2) that macro average F1 score.
  """
  MAX_EPOCHS = 20
  MIN_EPOCHS = 5  # epochs that must be completed before checkpointing / early stopping may happen
  PATIENCE = 3    # consecutive falls in macro F1 that trigger early stopping

  if gridsearch_params['dataset'] == 'method 2':
    balanced_train_df = augment_train_dataset(train_df.copy(deep=True), aug_train_csv_file)
  else:
    # process_training_set has no `prune` parameter, so it must not be passed one.
    balanced_train_df = process_training_set(train_df.copy(deep=True), oversample_minority=True, undersample_majority=True)

  if gridsearch_params['embeddings'] == 'wav2vec':
    embedding_column = 'wav2vec_embeddings'
  else:
    embedding_column = 'hubert_embeddings'

  def mean_pool(embedding):
    return torch.mean(embedding, dim=0)

  # Apply the mean across each embedding, for the selected embedding type only. `apply` returns new
  # Series, so neither the balanced training frame nor the global validation frame is modified.
  train_data = balanced_train_df[embedding_column].apply(mean_pool)
  valid_data = validation_df[embedding_column].apply(mean_pool)
  train_labels, valid_labels = balanced_train_df['classification'], validation_df['classification']

  train_dataset = EmbeddingsDataset(train_data, train_labels)
  valid_dataset = EmbeddingsDataset(valid_data, valid_labels)

  train_loader = DataLoader(train_dataset, batch_size=gridsearch_params['batch-size'], shuffle=True)
  valid_loader = DataLoader(valid_dataset, batch_size=gridsearch_params['batch-size'])

  # Size the input layer from the data so both embedding sizes work without a hard-coded width.
  embedding_dim = int(train_data.iloc[0].shape[-1])
  classifier_model = AudioModel(
      audio_embedding_dim=embedding_dim,
      hidden_dims=gridsearch_params['architecture']['layers'],
      dropout_rate=gridsearch_params['architecture']['dropout-rate'],
  )

  if gridsearch_params['optimiser'] == 'Adam':
    optimizer = torch.optim.Adam(classifier_model.parameters(), lr=gridsearch_params['learning-rate'])
  else:
    optimizer = torch.optim.SGD(classifier_model.parameters(), lr=gridsearch_params['learning-rate'], momentum=0.9)

  macro_f1_scores = []
  best_model_weights = None
  best_macro_f1 = -1
  consecutive_falls = 0  # keep track of consecutive falls in the validation macro F1 score

  for epoch in range(MAX_EPOCHS):
    train(classifier_model, train_loader, optimizer, criterion)
    _, _, epoch_macro_f1 = evaluate(classifier_model, valid_loader, criterion)
    macro_f1_scores.append(epoch_macro_f1)

    # `epoch` is zero-based, so the fifth epoch has index MIN_EPOCHS - 1. The same gate is used
    # for checkpointing and early stopping, so a checkpoint always exists when training stops.
    min_epochs_done = epoch + 1 >= MIN_EPOCHS

    if min_epochs_done and epoch_macro_f1 > best_macro_f1:
      best_macro_f1 = epoch_macro_f1
      # state_dict() hands back references to the live parameter tensors, which later optimiser
      # steps update in place. Clone them so the checkpoint really is this epoch's weights.
      best_model_weights = {
          name: tensor.detach().clone()
          for name, tensor in classifier_model.state_dict().items()
      }

    if epoch > 0 and epoch_macro_f1 < macro_f1_scores[-2]:
      consecutive_falls += 1
    else:
      consecutive_falls = 0

    if min_epochs_done and consecutive_falls >= PATIENCE:
      break

  # The reported score is the one belonging to the returned weights.
  validation_performance = best_macro_f1
  print('Finished training model: ', gridsearch_params, ', with highest Macro-weighted average F1 score: ', validation_performance)

  return best_model_weights, validation_performance

In [None]:
from itertools import product

def grid_search(param_grid):
    """
    Conducts a grid search over the specified parameter space to find the parameters that maximize
    the performance returned by the `train_model` function.

    Every combination of parameter values is trained in turn. Whenever a combination beats the best
    performance seen so far, the weights returned for it are written to SAVE_WEIGHTS_PATH, replacing
    any previously saved weights. The directory of SAVE_WEIGHTS_PATH is created if it does not exist.

    :param param_grid: A dictionary where keys are parameter names and values are lists of possible values
                       for that parameter.
    :returns: A tuple containing (1) a dictionary of the best parameters identified during the grid search
              (None if the grid is empty) and (2) the highest performance obtained with those parameters
              (negative infinity if the grid is empty).
    """
    # Create the checkpoint directory up front, so that a missing folder cannot make
    # torch.save fail only after a full training run has completed.
    save_dir = os.path.dirname(SAVE_WEIGHTS_PATH)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    param_names = list(param_grid.keys())
    best_params = None
    best_performance = float('-inf')

    # Walk the Cartesian product of all value lists, one combination at a time.
    for values in product(*param_grid.values()):
        params = dict(zip(param_names, values))
        best_weights, performance = train_model(params)
        if performance > best_performance:
            best_performance = performance
            best_params = params
            torch.save(best_weights, SAVE_WEIGHTS_PATH)

    return best_params, best_performance

# Run the full grid search (one training run per hyperparameter combination in param_grid)
# and report the best combination together with its macro average F1 score.
best_hyperparameters, best_performance = grid_search(param_grid)
print('\nOptimal hyperparameters for grid search with macro average F1 of ',  best_performance,' :')
print(best_hyperparameters)

Finished training model:  {'learning-rate': 0.0005, 'batch-size': 16, 'optimiser': 'Adam', 'architecture': {'layers': [512, 256], 'dropout-rate': 0.2}, 'embeddings': 'wav2vec', 'dataset': 'method 1'} , with highest Macro-weighted average F1 score:  0.557764880410171
Finished training model:  {'learning-rate': 0.0005, 'batch-size': 16, 'optimiser': 'Adam', 'architecture': {'layers': [512, 256], 'dropout-rate': 0.2}, 'embeddings': 'wav2vec', 'dataset': 'method 2'} , with highest Macro-weighted average F1 score:  0.59899389148401
Finished training model:  {'learning-rate': 0.0005, 'batch-size': 16, 'optimiser': 'Adam', 'architecture': {'layers': [512, 256], 'dropout-rate': 0.2}, 'embeddings': 'hubert', 'dataset': 'method 1'} , with highest Macro-weighted average F1 score:  0.6259073427909227
Finished training model:  {'learning-rate': 0.0005, 'batch-size': 16, 'optimiser': 'Adam', 'architecture': {'layers': [512, 256], 'dropout-rate': 0.2}, 'embeddings': 'hubert', 'dataset': 'method 2'} ,