In [None]:
# When running in Google Colab, uncomment the lines below to install the required libraries.
# Note: `os` is part of the Python standard library and cannot be installed with pip.

# !pip install torch
# !pip install transformers
# !pip install numpy
# !pip install pandas

[31mERROR: Could not find a version that satisfies the requirement os (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for os[0m[31m
[0mCollecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-

In [None]:
import os
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
from torch.distributions.beta import Beta
from torch.nn.utils import weight_norm
import numpy as np
import pandas as pd
import random
import pickle
import time
import base64
from sklearn.metrics import average_precision_score, recall_score, classification_report, PrecisionRecallDisplay
import matplotlib.pyplot as plt
import re

In [None]:
#### Edit variables and filepaths here ####

# Seed PyTorch's global RNG up front so later stochastic steps are repeatable.
SEED = 42
torch.manual_seed(SEED)

# Model / feature selection.
LARGE_LSTM = False  # picks between the two LSTM hyperparameter sets used when the model is built
EMBEDDINGS = 'wav2vec_embeddings'  # CSV column that holds the serialised embeddings
EMB_SIZE = 'base' # 'base' 768 embeddings or 'large' 1024 embeddings
DATASET_SEED = 2  # forms part of the dataset directory path below

# Locations, all relative to the dataset root.
DATASET_FILEPATH = './drive/MyDrive/Thesis/'
SAVE_WEIGHTS_PATH = os.path.join(DATASET_FILEPATH, 'weights-and-graphs/lstm-base/model.pth')
test_csv_file = os.path.join(DATASET_FILEPATH, f'{EMB_SIZE}/{DATASET_SEED}/processed/test_dataset.csv')

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cpu


In [None]:
def to_tensor(base64_str):
    """
    Decodes a base64 string and unpickles the resulting bytes.

    :param base64_str: base64-encoded text containing a pickled object.
    :return: the unpickled object.
    """
    # SECURITY: pickle.loads can execute arbitrary code embedded in its input.
    # Only use this on CSV files you produced yourself or otherwise trust.
    raw_bytes = base64.b64decode(base64_str.encode())
    return pickle.loads(raw_bytes)

# Read only the columns needed for evaluation; the embeddings column is
# decoded cell by cell through to_tensor while the CSV is parsed.
selected_columns = ['audio_file_name', 'classification', EMBEDDINGS]
test_df = pd.read_csv(
    test_csv_file,
    usecols=selected_columns,
    converters={EMBEDDINGS: to_tensor},
)

FileNotFoundError: ignored

In [None]:
def print_dataset_balance(df):
    """
    Prints how many rows fall into each classification, with percentages.

    The printed table has one row per distinct value of the 'classification'
    column and the columns 'classification', 'count' and 'percentage'
    (share of all rows, rounded to one decimal place).

    :param df: DataFrame containing the data with a 'classification' column.
    """
    balance = (
        df['classification']
        .value_counts()
        .rename_axis('classification')
        .reset_index(name='count')
    )
    n_rows = balance['count'].sum()
    balance['percentage'] = balance['count'].div(n_rows).mul(100).round(1)
    print(balance)

In [None]:
# Show the class distribution of the loaded test set.
print_dataset_balance(test_df)

In [None]:
class AudioEmbeddingsDataset(Dataset):
    """
    Map-style dataset over three parallel collections: embeddings, file names and labels.

    Samples are addressed by position (0 .. len - 1), which is what a DataLoader
    sampler produces. The collections may be plain sequences or pandas Series.

    :param embeddings: per-sample embeddings.
    :param file_names: per-sample audio file names.
    :param labels: per-sample classification labels.
    """

    def __init__(self, embeddings, file_names, labels):
        self.embeddings = embeddings
        self.file_names = file_names
        self.labels = labels

    def __len__(self):
        """Returns the number of samples (the length of the embeddings collection)."""
        return len(self.embeddings)

    @staticmethod
    def _item_at(container, idx):
        """Returns the idx-th element of container by position."""
        # A pandas Series indexed with [] performs a *label* lookup, which raises
        # KeyError or returns the wrong row whenever the index is not 0..n-1
        # (for example after filtering or a train/test split). Use .iloc when the
        # container offers it; fall back to normal indexing for lists and tensors.
        positional = getattr(container, 'iloc', None)
        if positional is None:
            return container[idx]
        return positional[idx]

    def __getitem__(self, idx):
        """
        Returns the sample at position idx.

        :param idx: zero-based position of the sample.
        :return: tuple of (label, file_name, embedding).
        """
        embedding = self._item_at(self.embeddings, idx)
        label = self._item_at(self.labels, idx)
        file_name = self._item_at(self.file_names, idx)
        return label, file_name, embedding

# Nominal sequence length. NOTE(review): collate_fn below does not reference this
# value; it pads to the longest sequence in each batch and never truncates.
FIXED_LENGTH = 400

def collate_fn(batch):
    """
    Collates dataset samples into a single batch for the DataLoader.

    Sequences are zero-padded to the length of the longest sequence in the batch
    so they can be stacked into one tensor; the original (unpadded) lengths are
    returned alongside so the padding can be ignored downstream.

    :param batch: list of (label, file_name, embedding) tuples from the dataset.
    :return: tuple of (padded embeddings with the batch dimension first,
             tuple of file names, float32 label tensor, list of original lengths).
    """
    labels, file_names, sequences = zip(*batch)

    # Record the true lengths before padding hides them.
    lengths = [seq.shape[0] for seq in sequences]
    padded = pad_sequence(sequences, batch_first=True)
    label_tensor = torch.tensor(labels, dtype=torch.float32)
    return padded, file_names, label_tensor, lengths

# Split the loaded test frame into identifiers, targets and model inputs,
# then wrap them in the dataset consumed by the DataLoader.
test_file_names = test_df['audio_file_name']
test_labels = test_df['classification']
audio_test_data = test_df[EMBEDDINGS]

test_dataset = AudioEmbeddingsDataset(
    embeddings=audio_test_data,
    file_names=test_file_names,
    labels=test_labels,
)

In [None]:
BATCH_SIZE = 1 # to allow for analysis of results

# Evaluation does not benefit from shuffling: the reported metrics do not depend
# on sample order, and a fixed order keeps the per-sample results (file names,
# scores) aligned and repeatable every time the evaluation cell is re-run.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [None]:
# LSTM Classifier
class Classifier(nn.Module):
    """
    LSTM sequence classifier: the final hidden state of the last LSTM layer is
    passed through dropout and a linear layer.

    :param embedding_dim: size of each input time step.
    :param hidden_dim: LSTM hidden size per direction.
    :param output_dim: number of output units of the linear head.
    :param n_layers: number of stacked LSTM layers.
    :param bidirectional: whether the LSTM runs in both directions.
    :param dropout_rate: dropout applied between LSTM layers (only when
        n_layers > 1) and to the final hidden state before the linear head.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout_rate):
        super().__init__()
        self.bidirectional = bidirectional
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout_rate if n_layers > 1 else 0)
        # The head consumes one final hidden state per direction. Previously the
        # input width was always hidden_dim * 2, which is only right when
        # bidirectional=True; parameter shapes are unchanged in that case, so
        # existing bidirectional checkpoints still load.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, embedding, lengths):
        """
        Computes one output row per sequence in the batch.

        :param embedding: padded batch of sequences with the batch dimension first.
        :param lengths: unpadded length of every sequence in the batch.
        :return: output of the linear head, shape (batch, output_dim).
        """
        # Packing makes the LSTM stop at each sequence's true length, so the
        # returned hidden state is not contaminated by padding. The LSTM's own
        # batch_first setting does not apply to packed input.
        packed = pack_padded_sequence(embedding, lengths, batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.rnn(packed)

        # hidden is laid out as (num_layers * num_directions, batch, hidden_dim);
        # for a bidirectional LSTM the last two entries are the final layer's
        # forward and backward states, otherwise only the last entry belongs to
        # the final layer.
        if self.bidirectional:
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_state = hidden[-1, :, :]
        return self.fc(self.dropout(final_state))

In [None]:
# load the trained model

# Architecture hyperparameters; they have to match the checkpoint being loaded,
# otherwise load_state_dict raises on mismatched parameter shapes.
EMBEDDING_DIMENSION = 768
OUTPUT_DIMENSION = 1
BI_DIRECTIONAL = True
DROPOUT_RATE = 0
if LARGE_LSTM:
    NUM_HIDDEN_UNITS = 768
    NUM_LSTM_LAYERS = 4
else:
    NUM_HIDDEN_UNITS = 256
    NUM_LSTM_LAYERS = 2

model = Classifier(EMBEDDING_DIMENSION, NUM_HIDDEN_UNITS, OUTPUT_DIMENSION, NUM_LSTM_LAYERS, BI_DIRECTIONAL, DROPOUT_RATE).to(device)

# Use the path from the configuration cell instead of a second hard-coded copy,
# and remap the stored tensors onto the active device so that a checkpoint
# saved on a GPU can still be loaded on a CPU-only runtime.
model.load_state_dict(torch.load(SAVE_WEIGHTS_PATH, map_location=device))
print('Loaded model in')
model.eval()  # set the model to evaluation mode

In [None]:
TRUE_THRESHOLD = 0.5

y_name = []
y_true = []
y_score = []
y_assigned = []
latencies = []

def test(model, iterator):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()

    with torch.no_grad():
        for audio_embeddings, file_names, labels, lengths in iterator:
            start_time = time.time()
            output = model(audio_embeddings, lengths).squeeze(1)
            pred = torch.sigmoid(output)
            rounded_pred = torch.where(pred >= TRUE_THRESHOLD, torch.tensor(1, device=pred.device), torch.tensor(0, device=pred.device))
            end_time = time.time()

            latency = end_time - start_time
            latencies.append(latency)

            y_true.append(float(labels[0]))
            y_score.append(float(pred[0]))
            y_assigned.append(float(rounded_pred[0]))
            y_name.append(file_names[0])

# Run inference over the whole test loader; results accumulate in the module-level lists defined above.
test(model, test_loader)

In [None]:
# Per-class precision / recall / F1 for the thresholded predictions.
# NOTE(review): target_names are matched to the labels in sorted order, so this
# assumes label 0 means 'non-interruption' and label 1 means 'interruption' - confirm.
print(classification_report(y_true, y_assigned, target_names=['non-interruption', 'interruption']))

In [None]:
# Precision-recall curve computed from the raw sigmoid scores.
# The result is bound to `pr_display` rather than `display` so that IPython's
# built-in display() helper is not shadowed for the rest of the notebook, and
# the legend names the model actually being evaluated (an LSTM classifier).
pr_display = PrecisionRecallDisplay.from_predictions(
    y_true, y_score, name="LSTM classifier"
)

_ = pr_display.ax_.set_title("2-class Precision-Recall curve")
plt.show()

In [None]:
# Average the recorded inference times (seconds) and report them in milliseconds.
total_seconds = sum(latencies)
mean_latency = (total_seconds / len(latencies)) * 1000
print(f"Mean Latency: {mean_latency:.1f} milliseconds")