In [None]:
# When running in Google Colab, uncomment the lines below to install the required libraries.
# Note: `os` is part of the Python standard library and cannot (and need not) be installed with pip.

# !pip install torch
# !pip install transformers
# !pip install numpy
# !pip install pandas

In [None]:
import base64
import copy
import os
import pickle
import re

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score
from torch.nn.utils import weight_norm
from torch.utils.data import Dataset, DataLoader

In [None]:
#### Edit variables and filepaths here ####
# Root directory under which the datasets and the saved weights live
# (presumably a mounted Google Drive folder when run in Colab -- confirm).
DATASET_FILEPATH = './drive/MyDrive/Thesis/'
# Selects which dataset split is read: CSVs are loaded from base/<DATASET_SEED>/processed/.
DATASET_SEED = 2
# File that the grid search writes the best model's weights to.
SAVE_WEIGHTS_PATH = os.path.join(DATASET_FILEPATH, 'weights-and-graphs/grid-search-tcn/model.pth')
# Seed for PyTorch's global random number generator (weight init, dropout, shuffling).
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fb34f6e9f70>

In [None]:
# Locations of the pre-processed CSV files, all resolved relative to DATASET_FILEPATH.
train_csv_file = os.path.join(DATASET_FILEPATH, f'base/{DATASET_SEED}/processed/train_dataset.csv')
# The validation split needs its own name: the loading cell reads `validation_csv_file`,
# and re-using `train_csv_file` here would silently replace the training path.
validation_csv_file = os.path.join(DATASET_FILEPATH, f'base/{DATASET_SEED}/processed/validation_dataset.csv')
# No leading slash: os.path.join discards every earlier component when a later one is
# absolute, which would drop DATASET_FILEPATH and point at the filesystem root.
aug_train_csv_file = os.path.join(DATASET_FILEPATH, 'base/aug-dataset/processed/train_dataset.csv')

In [None]:
# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device)

In [None]:
def to_tensor(base64_str):
    """
    Decode a base64 string and unpickle the bytes it contains.

    :param base64_str: Base64-encoded text holding a pickled object.
    :returns: The object restored by `pickle.loads`.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only feed this CSVs you trust.
    pickled_bytes = base64.b64decode(base64_str.encode())
    return pickle.loads(pickled_bytes)


# Only these columns are read from the CSV files; the two embedding columns are stored
# as base64 text and are decoded back into Python objects while the file is parsed.
selected_columns = ['audio_file_name','classification', 'wav2vec_embeddings', 'hubert_embeddings']
embedding_converters = {'wav2vec_embeddings': to_tensor, 'hubert_embeddings' : to_tensor}

train_df = pd.read_csv(
    train_csv_file,
    usecols=selected_columns,
    converters=embedding_converters,
)
validation_df = pd.read_csv(
    validation_csv_file,
    usecols=selected_columns,
    converters=embedding_converters,
)

Device:  cuda


In [None]:
def process_training_set(train_df, oversample_minority=False, undersample_majority=False):
  """
  Re-sample the training dataset, with options to oversample the minority class and to
  undersample based on audio length groups.

  The input frame is never modified: no helper column is added to it and its index is
  left as-is. A new frame is returned in every case.

  :param train_df: DataFrame containing the training data with columns ['classification', 'audio_file_name'] among others.
  :param oversample_minority: Boolean, if True, every row with classification == 0 is appended a second time.
  :param undersample_majority: Boolean, if True, rows whose audio length group (the last run of digits
                               in 'audio_file_name') is greater than DROP_SEGMENTS are dropped.
                               The filter is applied to all rows regardless of class.
  :returns: New DataFrame with the re-sampled rows and a fresh 0..n-1 index.
  :raises IndexError: if undersampling is requested and a file name contains no digits.
  """
  DROP_SEGMENTS = 5  # largest audio length group that is kept when undersampling

  def get_audio_length_group(file_name):
    # The group number is the last run of digits appearing in the file name.
    return int(re.findall(r'\d+', file_name)[-1])

  if oversample_minority:
    class_0 = train_df[train_df['classification'] == 0]
    train_df = pd.concat([train_df, class_0])
  if undersample_majority:
    # Keep the groups in a standalone Series instead of a temporary column, so the
    # caller's frame is not altered when no oversampling copy was made above.
    audio_length_group = train_df['audio_file_name'].apply(get_audio_length_group)
    train_df = train_df[audio_length_group <= DROP_SEGMENTS]

  # some indices are duplicated / removed so we have to reset them
  # (non-inplace, so a frame passed in by the caller keeps its own index)
  return train_df.reset_index(drop=True)

def print_dataset_balance(df):
    """
    Print how many rows each classification value has and its share of the dataset.

    The printed table has one row per distinct value with the columns
    'classification', 'count' and 'percentage' (rounded to one decimal place).

    :param df: DataFrame containing the data with a 'classification' column.
    """
    tally = df['classification'].value_counts()
    balance = tally.reset_index().set_axis(['classification', 'count'], axis=1)
    share = (balance['count'] / balance['count'].sum()) * 100
    balance['percentage'] = share.round(1)
    print(balance)

def augment_train_dataset(df, augmented_df_filepath):
  """
  Introduce additional 'non-interruption' samples to the dataset, which have been extracted from the GAP dataset with an LLM.

  The augmented rows are read from `augmented_df_filepath` (the argument, not a module-level
  path), restricted to the columns used for training, and appended after the rows of `df`.

  :param df: Original DataFrame containing the training data.
  :param augmented_df_filepath: Filepath to the CSV containing the augmented data.
  :returns: A combined DataFrame of the original and augmented training data, with a fresh 0..n-1 index.
  """
  selected_columns = ['audio_file_name','classification', 'wav2vec_embeddings', 'hubert_embeddings']
  aug_train_df = pd.read_csv(
      augmented_df_filepath,
      usecols=selected_columns,
      # embeddings are stored as base64 text and decoded while parsing
      converters={'wav2vec_embeddings': to_tensor, 'hubert_embeddings' : to_tensor},
  )
  return pd.concat([df, aug_train_df], ignore_index=True)

In [None]:
# Creating the Dataset
class AudioEmbeddingsDataset(Dataset):
    """
    Pairs a collection of embeddings with a parallel collection of labels.

    Both collections are indexed with the same key, and each item is returned
    as a `(label, embedding)` tuple -- label first.
    """

    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of embeddings.
        return len(self.embeddings)

    def __getitem__(self, idx):
        sample = self.embeddings[idx]
        return self.labels[idx], sample

# Creating DataLoader with custom collate function
FIXED_LENGTH = 250 # fixed sequence length that the model expects as an input

def collate_fn(batch):
    """
    Function to be passed to the DataLoader class which processes a batch of data points before being passed to the model in training.
    The TCN must process data points of length 250, so each data point in the batch is adjusted to fit this requirement:
    longer sequences are truncated to their first FIXED_LENGTH steps and shorter ones are zero-padded at the end.

    :param batch: sequence of (label, embedding) pairs as produced by the dataset. After `squeeze(0)`
                  each embedding must be 2-D, indexed as (time step, feature).
    :returns: Tuple `(embeddings, labels)` where `embeddings` is the stacked batch with FIXED_LENGTH
              time steps per item and `labels` is a float32 tensor with one entry per item.
    """
    labels, embeddings = zip(*batch)
    labels = torch.tensor(labels, dtype=torch.float32)

    # Truncate or zero-pad all sequences to a fixed length
    fixed_length_embeddings = []
    for emb in embeddings:
        emb = emb.squeeze(0)  # drop the leading singleton dimension, if there is one
        missing_steps = FIXED_LENGTH - emb.shape[0]
        if missing_steps < 0:
            emb = emb[:FIXED_LENGTH, :]
        else:
            # new_zeros creates the padding with the embedding's own dtype and device, so
            # concatenation cannot fail or change dtype for non-float32 / non-CPU embeddings.
            padding = emb.new_zeros((missing_steps, emb.shape[1]))
            emb = torch.cat([emb, padding])
        fixed_length_embeddings.append(emb)

    embeddings = torch.stack(fixed_length_embeddings)
    return embeddings, labels

In [None]:
class NormReLUChannelNormalization(nn.Module):
    """
    ReLU followed by a per-channel rescaling.

    After the ReLU, every slice along dimension 2 is divided by its own maximum
    absolute value plus `epsilon`, so a slice of zeros stays zero instead of
    producing a division by zero.
    """

    def __init__(self, epsilon=1e-5):
        super().__init__()
        self.epsilon = epsilon
        self.relu = nn.ReLU()

    def forward(self, x):
        rectified = self.relu(x)
        # largest magnitude along dim 2, kept as a size-1 dim so it broadcasts in the division
        peak = rectified.abs().max(dim=2, keepdim=True).values
        return rectified / (peak + self.epsilon)

class WaveNetActivation(nn.Module):
    """Gated activation: the element-wise product tanh(x) * sigmoid(x) of the same input."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return torch.tanh(x) * torch.sigmoid(x)

class Chomp1d(nn.Module):
    """
    Removes the last `chomp_size` positions along the final dimension of a 3-D tensor.

    :param chomp_size: Number of trailing positions to drop. A value of 0 leaves the
                       input length unchanged.
    """

    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        # `x[:, :, :-0]` is `x[:, :, :0]`, i.e. an EMPTY tensor, so a zero chomp
        # (e.g. kernel_size == 1, which needs no padding) must bypass the slice.
        if self.chomp_size == 0:
            return x.contiguous()
        return x[:, :, :-self.chomp_size].contiguous()

class ResidualBlock(nn.Module):
    """
    Two dilated, weight-normalised 1-D convolutions with a residual connection.

    Each convolution is followed by a Chomp1d, the supplied activation and dropout.
    The block input (passed through a 1x1 convolution when the channel counts differ)
    is added to the result and a final ReLU is applied.

    :param in_channels: Number of channels of the block input.
    :param out_channels: Number of channels produced by both convolutions.
    :param dilation: Dilation used by both convolutions.
    :param kernel_size: Kernel size used by both convolutions.
    :param activation: Module applied after each convolution; the same instance is used twice.
    :param dropout: Dropout probability applied after each activation.
    """
    def __init__(self, in_channels, out_channels, dilation, kernel_size, activation, dropout=0):
        super(ResidualBlock, self).__init__()
        # Conv1d adds `padding` to both ends of the sequence; Chomp1d then removes the same
        # amount from the end, so the output has the input's length.
        chomp_size = (kernel_size-1) * dilation
        padding = (kernel_size-1) * dilation
        self.conv1 = weight_norm(nn.Conv1d(in_channels, out_channels, kernel_size,
                                           stride=1, padding=padding, dilation=dilation))
        self.chomp1 = Chomp1d(chomp_size)
        self.dropout = nn.Dropout(dropout)
        self.activation = activation
        self.conv2 = weight_norm(nn.Conv1d(out_channels, out_channels, kernel_size,
                                           stride=1, padding=padding, dilation=dilation))
        self.chomp2 = Chomp1d(chomp_size)
        # The layers are registered both as attributes above and as entries of `net`,
        # so each appears under two names in the module tree.
        self.net = nn.Sequential(self.conv1, self.chomp1, self.activation, self.dropout,
                                 self.conv2, self.chomp2, self.activation, self.dropout)
        # A 1x1 convolution matches the residual's channel count to the block output when needed.
        self.downsample = nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else None
        self.relu = nn.ReLU()
        self.init_weights()

    def init_weights(self):
        """Draw the convolution weights from a normal distribution with mean 0 and std 0.01."""
        # NOTE(review): conv1/conv2 are wrapped in weight_norm, which derives `weight` from
        # separate parameters; this initialisation may therefore not persist -- confirm.
        self.conv1.weight.data.normal_(0, 0.01)
        self.conv2.weight.data.normal_(0, 0.01)
        if self.downsample is not None:
            self.downsample.weight.data.normal_(0, 0.01)

    def forward(self, x):
        """Return relu(net(x) + residual), where the residual is x or its 1x1 projection."""
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)

class TemporalConvNet(nn.Module):
    """
    A stack of ResidualBlocks, one per entry of `out_channels`.

    Block `i` uses dilation 2**i, takes the previous block's output channels as its
    input channels (the first block takes `in_channels`), and alternates its activation:
    NormReLUChannelNormalization for even `i`, WaveNetActivation for odd `i`.

    :param in_channels: Number of channels of the network input.
    :param out_channels: Sequence with the output channel count of every block.
    :param kernel_size: Kernel size shared by all blocks.
    :param dropout: Dropout probability shared by all blocks.
    """

    def __init__(self, in_channels, out_channels, kernel_size=2, dropout=0.2):
        super().__init__()
        # input width of every block: the network input first, then each previous block's output
        block_inputs = [in_channels] + list(out_channels[:-1])

        blocks = []
        for level, (n_in, n_out) in enumerate(zip(block_inputs, out_channels)):
            if level % 2 == 0:
                activation = NormReLUChannelNormalization()
            else:
                activation = WaveNetActivation()
            blocks.append(
                ResidualBlock(n_in, n_out, dilation=2 ** level,
                              kernel_size=kernel_size, activation=activation, dropout=dropout)
            )

        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        return self.network(x)

class TCN(nn.Module):
    """
    TemporalConvNet followed by a linear layer that produces one value per input sequence.

    :param in_channels: Number of features per time step of the input.
    :param out_channels: Sequence with the output channel count of every TemporalConvNet block.
    :param kernel_size: Kernel size passed to the TemporalConvNet.
    :param dropout: Dropout probability passed to the TemporalConvNet.
    """

    def __init__(self, in_channels, out_channels, kernel_size=2, dropout=0.3):
        super().__init__()
        self.tcn = TemporalConvNet(in_channels, out_channels, kernel_size=kernel_size, dropout=dropout)
        self.linear = nn.Linear(out_channels[-1], 1)

    def forward(self, x):
        # swap dims 1 and 2 before the convolutional stack
        features = self.tcn(x.transpose(1, 2))
        # only the final position along the last dimension feeds the linear layer
        final_step = features[:, :, -1]
        return self.linear(final_step)


In [None]:
# Model training code
# Loss function handed to train() and evaluate() by train_model. BCEWithLogitsLoss applies the
# sigmoid itself, so the model's raw (un-squashed) outputs are passed to it directly.
criterion = nn.BCEWithLogitsLoss()

def binary_accuracy(preds, y):
    """
    Fraction of predictions that match the true labels.

    The raw predictions are passed through a sigmoid and rounded to 0 or 1 before
    being compared with `y`. The result is a ratio, e.g. 8 correct out of 10 gives
    0.8 (not 8).

    :param preds: Tensor of predicted values (logits).
    :param y: Tensor of true labels.
    :returns: Accuracy as a tensor holding a single floating point value.
    """
    hard_preds = torch.sigmoid(preds).round().squeeze()
    hits = hard_preds.eq(y).float()
    return hits.sum() / len(hits)

def evaluate(model, iterator, criterion):
    """
    Measure the model's performance on a dataset without updating it (used for validation).

    The model is moved to `device`, switched to eval mode and run over every batch with
    gradient tracking disabled. Loss and accuracy are averaged over the batches; the macro
    average F1 score is computed once over all predictions, thresholded at 0.5.

    :param model: PyTorch model to be evaluated.
    :param iterator: Iterator that provides batches of (embeddings, labels) for evaluation.
    :param criterion: Loss function used to compute the loss during evaluation.
    :returns: Tuple containing average loss, average accuracy and macro average F1 score over all batches.
    """
    model = model.to(device)
    model.eval()

    running_loss = 0
    running_acc = 0
    predicted_classes = []
    true_classes = []

    with torch.no_grad():
        for batch_embeddings, batch_labels in iterator:
            batch_embeddings = batch_embeddings.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_embeddings).squeeze(1)

            running_loss += criterion(logits, batch_labels).item()
            running_acc += binary_accuracy(logits, batch_labels).item()

            # hard 0/1 decisions, collected for the F1 score
            decisions = (torch.sigmoid(logits) > 0.5).int()
            predicted_classes.extend(decisions.cpu().numpy())
            true_classes.extend(batch_labels.cpu().numpy())

        macro_f1 = f1_score(true_classes, predicted_classes, average='macro')

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches, macro_f1

def train(model, iterator, optimizer, criterion):
    """
    Run one training epoch over the dataset.

    The model is moved to `device` and switched to train mode; for every batch the
    gradients are cleared, the loss is back-propagated and the optimizer takes one step.

    :param model: The PyTorch model to be trained.
    :param iterator: Iterator that provides batches of (embeddings, labels) for training.
    :param optimizer: Optimizer used to update the model's parameters.
    :param criterion: Loss function used to compute the loss during training.
    :returns: Tuple containing the average loss and the average accuracy over all batches.
    """
    model = model.to(device)
    model.train()

    running_loss = 0
    running_acc = 0

    for batch_embeddings, batch_labels in iterator:
        batch_embeddings = batch_embeddings.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        logits = model(batch_embeddings).squeeze(1)
        batch_loss = criterion(logits, batch_labels)
        batch_acc = binary_accuracy(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
# Search space for the grid search: every key maps to the list of values to try, and
# grid_search trains one model per combination (1 * 1 * 2 * 3 * 2 * 2 * 2 = 48 runs).
# The keys are read by train_model.
param_grid = {
    # learning rate passed to the optimiser
    'learning-rate': [0.001],
    # batch size used by both the training and the validation DataLoader
    'batch-size': [16],
    # 'Adam' selects torch.optim.Adam; any other value selects SGD with momentum 0.9
    'optimiser' : ['Adam', 'SGD'],
    # TCN architectures: 'layers' holds the output channels of each residual block and
    # 'dropout-rate' is the dropout used inside the blocks
    'tcn' : [
        {
            # small
            'layers' : [512, 256],
            'dropout-rate' : 0.2,
        },
        {
            # medium
            'layers' : [768, 384, 192],
            'dropout-rate' : 0.3,
        },
        {
            # large
            'layers' : [1024, 768, 384],
            'dropout-rate' : 0.4,
        }
    ],
    # kernel size of the convolutions in every residual block
    'kernel-size' : [2, 3],
    # 'wav2vec' uses the wav2vec embeddings; any other value uses the hubert embeddings
    'embeddings' : ['wav2vec', 'hubert'],
    # 'method 2' augments the training set; any other value re-samples it
    'dataset' : ['method 1', 'method 2']
}

In [None]:
def train_model(gridsearch_params):
  """
  Trains a classifier model on audio embeddings (either wav2vec or hubert) based on parameters received
  from a grid search. One of the key hyperparameters is the method which dictates the strategy to balance the dataset.

  Depending on the chosen dataset method, this function either:
  1. Over-samples the minority class and under-samples by audio length (referred to as "method 1"), or
  2. Augments the dataset with instances of False interruptions (referred to as "method 2").

  Checkpoints are taken based on the macro average F1 score on the validation set. Checkpointing and
  early stopping both start at the fifth epoch: the weights of the best-scoring epoch from the fifth
  epoch onwards are kept, and training stops early once the score has fallen three epochs in a row
  (and at least five epochs have been completed). The returned score always belongs to the returned weights.

  :param gridsearch_params: A dictionary containing parameters sourced from a grid search. Key parameters
                            include 'dataset' (which determines the chosen method of dataset processing),
                            'embeddings', 'batch-size', 'tcn' (which further includes 'layers' and 'dropout-rate'),
                            'kernel-size', 'optimiser', and 'learning-rate'.
  :returns: A tuple containing (1) a snapshot of the model weights of the epoch with the highest macro
            average F1 score from the fifth epoch onwards, and (2) that highest macro average F1 score.
  """
  MAX_EPOCHS = 20
  MIN_EPOCHS = 5   # epochs to complete before checkpoints are taken / early stopping may trigger
  PATIENCE = 3     # consecutive falls in macro average F1 score that end the training

  if gridsearch_params['dataset'] == 'method 2':
    balanced_train_df = augment_train_dataset(train_df.copy(deep=True), aug_train_csv_file)
  else:
    # process_training_set only accepts the two re-sampling switches; passing any other
    # keyword (such as `prune`) raises a TypeError.
    balanced_train_df = process_training_set(train_df.copy(deep=True), oversample_minority=True, undersample_majority=True)

  # Pick the embedding column once; anything other than 'wav2vec' selects hubert.
  if gridsearch_params['embeddings'] == 'wav2vec':
    embedding_column = 'wav2vec_embeddings'
  else:
    embedding_column = 'hubert_embeddings'

  train_dataset = AudioEmbeddingsDataset(balanced_train_df[embedding_column], balanced_train_df['classification'])
  valid_dataset = AudioEmbeddingsDataset(validation_df[embedding_column], validation_df['classification'])

  BATCH_SIZE = gridsearch_params['batch-size']
  train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
  valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

  # 768 input channels -- presumably the feature size of the wav2vec / hubert embeddings (confirm)
  classifier_model = TCN(768, gridsearch_params['tcn']['layers'], kernel_size=gridsearch_params['kernel-size'], dropout=gridsearch_params['tcn']['dropout-rate']).to(device)
  if gridsearch_params['optimiser'] == 'Adam':
    optimizer = torch.optim.Adam(classifier_model.parameters(), lr=gridsearch_params['learning-rate'])
  else:
    optimizer = torch.optim.SGD(classifier_model.parameters(), lr=gridsearch_params['learning-rate'], momentum=0.9)

  macro_f1_scores = []
  best_model_weights = None
  best_macro_f1 = -1
  consecutive_drops = 0  # keep track of consecutive drops in macro avg F1 score

  for epoch in range(MAX_EPOCHS):
      train(classifier_model, train_loader, optimizer, criterion)
      _, _, epoch_macro_f1 = evaluate(classifier_model, valid_loader, criterion)
      macro_f1_scores.append(epoch_macro_f1)

      # `epoch` is zero-based, so the fifth epoch has index MIN_EPOCHS - 1
      min_epochs_done = epoch >= MIN_EPOCHS - 1

      if min_epochs_done and epoch_macro_f1 > best_macro_f1:
        best_macro_f1 = epoch_macro_f1
        # state_dict() hands out references to the live parameters, which later epochs keep
        # updating; a deep copy freezes the weights as they are at this epoch.
        best_model_weights = copy.deepcopy(classifier_model.state_dict())

      if epoch > 0 and epoch_macro_f1 < macro_f1_scores[-2]:
          consecutive_drops += 1
      else:
          consecutive_drops = 0

      if consecutive_drops >= PATIENCE and min_epochs_done: # ensure at least 5 epochs are completed
          break

  # Same window as the checkpoint condition, so the score matches the returned weights.
  validation_performance = best_macro_f1

  print('Finished training model: ', gridsearch_params, ', with highest Macro-weighted average F1 score: ', validation_performance)
  return best_model_weights, validation_performance

In [None]:
from itertools import product

def grid_search(param_grid):
    """
    Conducts a grid search over the specified parameter space to find the best parameters that maximize
    the performance of the `train_model` function.

    For every combination of parameters in the grid, the function trains the model and saves the returned
    model weights to SAVE_WEIGHTS_PATH if the current combination yields better performance than all
    previous ones. The directory of SAVE_WEIGHTS_PATH is created up front if it does not exist, so a
    missing folder cannot make the first save fail after training has already finished. A combination
    for which `train_model` returns no weights (None) is reported and skipped, so the saved file and
    the returned parameters always belong to the same model.

    :param param_grid: A dictionary where keys are parameter names and values are lists of possible values
                       for that parameter.
    :returns: A tuple containing (1) a dictionary of the best parameters identified during the grid search
              (None if no combination produced weights) and (2) the highest macro average F1 score obtained
              using those best parameters (-inf if no combination produced weights).
    """
    # create a list of all parameter combinations
    all_params = [dict(zip(param_grid.keys(), v)) for v in product(*param_grid.values())]
    best_params = None
    best_performance = float('-inf')  # since we are maximising F1 score

    save_dir = os.path.dirname(SAVE_WEIGHTS_PATH)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    for params in all_params:
        best_weights, performance = train_model(params)
        if best_weights is None:
            # nothing to persist: do not let this run replace a previously saved model
            print('No checkpoint was produced, skipping: ', params)
            continue
        if performance > best_performance:
            best_performance = performance
            best_params = params
            torch.save(best_weights, SAVE_WEIGHTS_PATH)

    return best_params, best_performance

# Run the full grid search (one training run per parameter combination in param_grid)
# and report the best combination together with its macro average F1 score.
best_hyperparameters, best_performance = grid_search(param_grid)
print('\nOptimal hyperparameters for grid search with macro average F1 of ',  best_performance,' :')
print(best_hyperparameters)

Finished training model:  {'learning-rate': 0.001, 'batch-size': 16, 'optimiser': 'Adam', 'tcn': {'layers': [512, 256], 'dropout-rate': 0.2}, 'kernel-size': 2, 'embeddings': 'wav2vec', 'dataset': 'method 1'} , with highest Macro-weighted average F1 score:  0.5940071468380159
Finished training model:  {'learning-rate': 0.001, 'batch-size': 16, 'optimiser': 'Adam', 'tcn': {'layers': [512, 256], 'dropout-rate': 0.2}, 'kernel-size': 2, 'embeddings': 'wav2vec', 'dataset': 'method 2'} , with highest Macro-weighted average F1 score:  0.6204624415957829
Finished training model:  {'learning-rate': 0.001, 'batch-size': 16, 'optimiser': 'Adam', 'tcn': {'layers': [512, 256], 'dropout-rate': 0.2}, 'kernel-size': 2, 'embeddings': 'hubert', 'dataset': 'method 1'} , with highest Macro-weighted average F1 score:  0.7096774193548386
Finished training model:  {'learning-rate': 0.001, 'batch-size': 16, 'optimiser': 'Adam', 'tcn': {'layers': [512, 256], 'dropout-rate': 0.2}, 'kernel-size': 2, 'embeddings':