This is the base BERT-MLP pipeline.

## Global modules import

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import random as rnd
import sys
from operator import itemgetter

import numpy as np
import torch
from sklearn.model_selection import train_test_split

## Local modules import

In [3]:
# Add the directory two levels above the working directory to the import
# search path. The path is relative, so it resolves against the kernel's
# current working directory.
# NOTE(review): presumably this is where data_loading, bert_encoder and mlp
# live -- confirm.
sys.path.append("../..")

## Loading data

In [4]:
from data_loading import create_word_lists, tidy_sentence_length

In [6]:
# Read the corpus file and keep only the list stored under "records".
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# decoding does not depend on the platform's locale default.
with open("../../data/corpus_data.json", encoding="utf-8") as json_file:
    data = json.load(json_file)["records"]

In [7]:
# Pull the two transcript fields out of every record, preserving record order.
human_transcripts = list(map(itemgetter("human_transcript"), data))
stt_transcripts = list(map(itemgetter("stt_transcript"), data))

In [8]:
# Derive five parallel outputs from the records via the project helper.
# NOTE(review): presumably one entry per sentence, each a per-word list
# (the cells below iterate them as nested lists) -- confirm in data_loading.
human_words, stt_words, word_labels, word_grams, word_sems = create_word_lists(data)

In [48]:
# Pass the STT-side sequences through the project helper and rebind all five
# names to its outputs.
# NOTE(review): human_transcripts and human_words are not passed through this
# call; if the helper drops or reorders sentences they will no longer line up
# with the stt_* lists -- confirm.
stt_transcripts, stt_words, word_labels, word_grams, word_sems = tidy_sentence_length(
    stt_transcripts, stt_words, word_labels, word_grams, word_sems
)

# PIPELINE START
---

## Train-test split

In [49]:
# Build one flag per sentence for the stratified split: True when at least
# one word label in that sentence is truthy.
max_length = max(len(row) for row in word_labels)

# Rows are right-padded with False so they stack into a rectangular array.
padded_labels = np.array(
    [row + [False] * (max_length - len(row)) for row in word_labels]
)
stat_labels = padded_labels.any(axis=1)

In [50]:
# Split sentence positions (not the data itself) 80/20 so the same partition
# can be applied to every parallel list. The split is seeded and stratified
# on the per-sentence flag in stat_labels.
indices = list(range(len(stt_transcripts)))
tr_indices, te_indices = train_test_split(
    indices, test_size=0.2, random_state=0, shuffle=True, stratify=stat_labels
)

In [51]:
def _make_extractor(positions):
    """Return a callable that picks ``positions`` out of a sequence as a tuple.

    ``itemgetter(*positions)`` returns a bare item (not a 1-tuple) when given
    exactly one index and raises ``TypeError`` when given none. This helper
    always returns a tuple, so downstream code can iterate the result
    regardless of how many positions the split produced.
    """
    positions = tuple(positions)

    def extract(sequence):
        return tuple(sequence[position] for position in positions)

    return extract


extract_train = _make_extractor(tr_indices)
extract_test = _make_extractor(te_indices)

In [52]:
# Apply the train extractor to each of the five parallel sequences.
(
    tr_stt_transcripts,
    tr_stt_words,
    tr_word_labels,
    tr_word_grams,
    tr_word_sems,
) = map(
    extract_train,
    (stt_transcripts, stt_words, word_labels, word_grams, word_sems),
)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Same five sequences, test partition.
(
    te_stt_transcripts,
    te_stt_words,
    te_word_labels,
    te_word_grams,
    te_word_sems,
) = map(
    extract_test,
    (stt_transcripts, stt_words, word_labels, word_grams, word_sems),
)

## BERT part

In [53]:
import torch
from transformers import BertTokenizer, BertModel

In [54]:
from bert_encoder import encode_sentence

In [55]:
# Load the pretrained "bert-base-uncased" tokenizer and encoder. The model is
# configured to return the hidden states of all layers.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
# Switch to evaluation mode; the trailing semicolon keeps the notebook from
# echoing the returned module.
model_bert.eval();

In [56]:
# Accumulators for the encoded sentences; the encoding cells append to them.
# (The first assignment previously carried a stray "../../" prefix, which is
# a syntax error.)
tr_stt_vectors = []
te_stt_vectors = []

In [None]:
# Encode every training sentence and collect the results in order.
tr_stt_vectors.extend(
    encode_sentence(sentence, words, model_bert, tokenizer)
    for sentence, words in zip(tr_stt_transcripts, tr_stt_words)
)

In [None]:
# Encode every test sentence and collect the results in order.
te_stt_vectors.extend(
    encode_sentence(sentence, words, model_bert, tokenizer)
    for sentence, words in zip(te_stt_transcripts, te_stt_words)
)

In [None]:
def _flat_int_tensor(nested):
    """Flatten a sequence of sequences into a 1-D tensor of ``int`` values."""
    return torch.tensor([int(item) for group in nested for item in group])


# Training partition: stack the per-sentence encodings row-wise and flatten
# the per-word annotations to match.
tr_tensor = torch.vstack(tr_stt_vectors)
tr_label_tensor = _flat_int_tensor(tr_word_labels)
tr_grams_tensor = _flat_int_tensor(tr_word_grams)
tr_sems_tensor = _flat_int_tensor(tr_word_sems)


# Test partition, built the same way.
te_tensor = torch.vstack(te_stt_vectors)
te_label_tensor = _flat_int_tensor(te_word_labels)
te_grams_tensor = _flat_int_tensor(te_word_grams)
te_sems_tensor = _flat_int_tensor(te_word_sems)

## MLP part

In [12]:
import itertools
import pandas as pd

from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm

import torch.nn as nn
import torch.optim as optim

In [13]:
from mlp import MLP, cross_validate_model, train_model, calc_stats

In [14]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS,
# falling back to the CPU.
if torch.cuda.is_available():
    torch_device = "cuda"
elif torch.backends.mps.is_available():
    torch_device = "mps"
else:
    torch_device = "cpu"

In [15]:
# Hyper-parameter search space: 3 * 5 * 4 * 3 = 180 combinations.
# NOTE(review): the epoch values are listed as 10, 25, 20 (not ascending) --
# confirm these are the intended values and not a typo.
epochs_options = [10, 25, 20]
hidden_layers_options = [1, 4, 8, 12, 16]
neurons_per_layer_options = [32, 64, 128, 256]
learning_rate_options = [1e-3, 1e-4, 1e-5]

In [16]:
# Cartesian product of the four option lists; each element is an
# (epochs, hidden_layers, neurons_per_layer, learning_rate) tuple.
grid = itertools.product(
    epochs_options,
    hidden_layers_options,
    neurons_per_layer_options,
    learning_rate_options,
)
# Materialise the iterator so its length is known; `grid` is exhausted afterwards.
parameter_combinations = [*grid]

In [17]:
# Running best for the grid search: any finite loss beats the initial value.
best_loss, best_params = float("inf"), None

# The search runs on the training partition only.
features, labels = tr_tensor, tr_label_tensor

# Binary cross-entropy objective and a seeded, shuffled 5-fold stratified splitter.
criterion = nn.BCELoss()
splitter = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [18]:
# One row per evaluated hyper-parameter combination; filled by the grid-search
# loop and later turned into a DataFrame.
grid_search_data = []

In [19]:
# Exhaustive grid search: cross-validate one freshly built model per
# hyper-parameter combination and remember the combination with the lowest
# held-out loss.
for combination in tqdm(parameter_combinations):
    epochs, hidden_layers, neurons_per_layer, learning_rate = combination

    # New model and optimizer for every combination.
    model = MLP(features.shape[1], hidden_layers, neurons_per_layer)
    model = model.to(torch_device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # NOTE(review): presumably mean/std of the per-fold train and held-out
    # losses -- confirm in mlp.cross_validate_model.
    tr_loss, tr_loss_std, te_loss, te_loss_std = cross_validate_model(
        model,
        features,
        labels,
        criterion,
        optimizer,
        splitter,
        n_epochs=epochs,
        num_workers=0,
        device=torch_device,
    )

    # Record the hyper-parameters followed by the four loss statistics; the
    # rows are converted to a DataFrame after the search.
    values_to_add = [*combination, tr_loss, tr_loss_std, te_loss, te_loss_std]
    grid_search_data.append(values_to_add)

    if te_loss < best_loss:
        best_loss, best_params = te_loss, combination

  from .autonotebook import tqdm as notebook_tqdm


100%|██████████| 180/180 [2:52:49<00:00, 57.61s/it]   


In [20]:
out_path = "../outputs/BERT/"
# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, and failing here would lose the results of the
# whole grid search.
os.makedirs(out_path, exist_ok=True)

# Column names match the order in which each grid-search row was assembled.
columns = [
    "epochs",
    "hidden_layers",
    "neurons_per_layer",
    "learning_rate",
    "tr_loss",
    "tr_loss_std",
    "te_loss",
    "te_loss_std",
]
gs_frame = pd.DataFrame(grid_search_data, columns=columns)
gs_frame.to_csv(os.path.join(out_path, "gs_data_test.csv"), index=False)

## Test the best model

In [21]:
from mlp import STTDataset
from torch.utils.data import DataLoader

In [22]:
# Wrap the full training partition in a shuffled mini-batch loader.
num_workers = 0  # This works fastest on my machine
train_data = STTDataset(tr_tensor, tr_label_tensor)
train_loader = DataLoader(
    train_data,
    batch_size=128,
    num_workers=num_workers,
    shuffle=True,
)

# Rebuild the model with the hyper-parameters selected by the grid search.
epochs, hidden_layers, neurons_per_layer, learning_rate = best_params

criterion = nn.BCELoss()
model = MLP(features.shape[1], hidden_layers, neurons_per_layer)
model = model.to(torch_device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [23]:
# Train the selected configuration on the whole training partition.
# The call's return value is echoed by the notebook as the cell output.
# NOTE(review): presumably the final training loss -- confirm in mlp.train_model.
train_model(
    model, criterion, optimizer, train_loader, n_epochs=epochs, device=torch_device
)

0.010613417972827239

In [24]:
# Serve the whole test partition as a single batch.
# shuffle must be False: the predictions from this loader are later scored
# against te_label_tensor, which is in dataset order. A shuffled loader would
# permute the predictions and pair each one with the wrong label.
test_data = STTDataset(te_tensor, te_label_tensor)
test_loader = DataLoader(
    test_data, batch_size=len(test_data), shuffle=False, num_workers=num_workers
)

In [25]:
# Score the trained model on the test loader with gradients disabled.
# `pred` and `loss` keep the values from the last batch processed; the loader
# is configured with a batch size equal to the dataset length.
# Note that this rebinds the notebook-level name `labels`.
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(torch_device)
        labels = labels.to(torch_device)
        # Drop the second dimension so predictions line up with the labels.
        pred = model(inputs).squeeze(dim=1)
        loss = criterion(pred, labels.float()).item()

In [26]:
# Classification metrics for the test predictions.
# NOTE(review): `pred` comes out of test_loader while te_label_tensor is in
# dataset order, so this pairing is only valid if the loader does not shuffle
# -- verify. `pred` also lives on torch_device while te_label_tensor was
# created on the CPU; confirm calc_stats handles that.
accuracy, precision, recall, f1 = calc_stats(pred, te_label_tensor)

In [27]:
# Persist the test metrics as a one-row CSV next to the grid-search output.
results = pd.DataFrame(
    data=[[loss, accuracy, precision, recall, f1]],
    columns=["loss", "accuracy", "precision", "recall", "f1"],
)
results_file = os.path.join(out_path, "bert_test.csv")
results.to_csv(results_file, index=False)