Here we use second-to-last layer of BERT encoder as word representations to be inputted to MLP.

## Global modules import

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import numpy as np
import random as rnd
import sys
import torch

from sklearn.model_selection import train_test_split
from operator import itemgetter

## Local modules import

In [3]:
sys.path.append("../..")

## Loading data

In [4]:
from data_loading import create_word_lists, tidy_sentence_length

In [5]:
with open("../../data/corpus_data.json") as json_file:
    data = json.load(json_file)
data = data["records"]

In [6]:
human_transcripts = [entry["human_transcript"] for entry in data]
stt_transcripts = [entry["stt_transcript"] for entry in data]

In [7]:
human_words, stt_words, word_labels, word_grams, word_sems = create_word_lists(data)

In [8]:
stt_transcripts, stt_words, word_labels, word_grams, word_sems = tidy_sentence_length(
    stt_transcripts, stt_words, word_labels, word_grams, word_sems
)

# PIPELINE START
---

## Train-test split

In [9]:
max_length = max(map(len, word_labels))
padded_labels = [row + [False] * (max_length - len(row)) for row in word_labels]
padded_labels = np.array(padded_labels)
stat_labels = np.any(padded_labels, axis=1)

In [10]:
indices = list(range(len(stt_transcripts)))
tr_indices, te_indices = train_test_split(
    indices, test_size=0.2, random_state=0, shuffle=True, stratify=stat_labels
)

In [11]:
extract_train = itemgetter(*tr_indices)
extract_test = itemgetter(*te_indices)

In [12]:
tr_stt_transcripts = extract_train(stt_transcripts)
tr_stt_words = extract_train(stt_words)

tr_word_labels = extract_train(word_labels)
tr_word_grams = extract_train(word_grams)
tr_word_sems = extract_train(word_sems)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

te_stt_transcripts = extract_test(stt_transcripts)
te_stt_words = extract_test(stt_words)

te_word_labels = extract_test(word_labels)
te_word_grams = extract_test(word_grams)
te_word_sems = extract_test(word_sems)

## BERT part

In [13]:
import torch
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
from bert_encoder import encode_sentence

In [15]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model_bert.eval();

In [16]:
tr_stt_vectors = []
te_stt_vectors = []

Encode the corpus, with `vectorization="stl"`:

In [17]:
for sentence, words in zip(tr_stt_transcripts, tr_stt_words):
    tr_stt_vectors.append(
        encode_sentence(sentence, words, model_bert, tokenizer, vectorization="stl")
    )

In [18]:
for sentence, words in zip(te_stt_transcripts, te_stt_words):
    te_stt_vectors.append(
        encode_sentence(sentence, words, model_bert, tokenizer, vectorization="stl")
    )

In [19]:
tr_tensor = torch.vstack(tr_stt_vectors)
tr_label_tensor = torch.tensor(
    [int(element) for sublist in tr_word_labels for element in sublist]
)
tr_grams_tensor = torch.tensor(
    [int(element) for sublist in tr_word_grams for element in sublist]
)
tr_sems_tensor = torch.tensor(
    [int(element) for sublist in tr_word_sems for element in sublist]
)


te_tensor = torch.vstack(te_stt_vectors)
te_label_tensor = torch.tensor(
    [int(element) for sublist in te_word_labels for element in sublist]
)
te_grams_tensor = torch.tensor(
    [int(element) for sublist in te_word_grams for element in sublist]
)
te_sems_tensor = torch.tensor(
    [int(element) for sublist in te_word_sems for element in sublist]
)

## MLP part

In [20]:
import itertools
import pandas as pd

from sklearn.model_selection import StratifiedKFold

import os
from tqdm import tqdm

import torch.nn as nn
import torch.optim as optim

In [21]:
from mlp import MLP, cross_validate_model, train_model, calc_stats

In [22]:
torch_device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [23]:
best_params = (20, 1, 256, 0.0001)

In [24]:
out_path = "."

## Test the best model

In [25]:
from mlp import STTDataset
from torch.utils.data import DataLoader

In [26]:
train_data = STTDataset(tr_tensor, tr_label_tensor)
num_workers = 0  # This works fastest on my machine
train_loader = DataLoader(
    train_data, batch_size=128, shuffle=True, num_workers=num_workers
)

epochs, hidden_layers, neurons_per_layer, learning_rate = best_params

criterion = nn.BCELoss()
model = MLP(tr_tensor.shape[1], hidden_layers, neurons_per_layer).to(torch_device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [27]:
train_model(
    model, criterion, optimizer, train_loader, n_epochs=epochs, device=torch_device
)

0.040139796198024165

In [28]:
test_data = STTDataset(te_tensor, te_label_tensor)
test_loader = DataLoader(
    test_data, batch_size=len(test_data), shuffle=True, num_workers=num_workers
)

In [29]:
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(torch_device), labels.to(torch_device)
        pred = model(inputs)
        pred = torch.squeeze(pred, dim=1)
        loss = criterion(pred, labels.to(torch.float)).item()

In [30]:
accuracy, precision, recall, f1 = calc_stats(pred, te_label_tensor)

In [35]:
f1

0.019607843137254905

In [31]:
results = pd.DataFrame(
    [[loss, accuracy, precision, recall, f1]],
    columns=["loss", "accuracy", "precision", "recall", "f1"],
)
results.to_csv(os.path.join(out_path, "results.csv"), index=False)

In [32]:
all_te_words = [element for sublist in te_stt_words for element in sublist]
all_te_labels = [element for sublist in te_word_labels for element in sublist]
all_te_predictions = (pred.to("cpu").numpy().flatten() > 0.5).astype(int)

In [33]:
german_words = []
german_predictions = []
for i in range(len(all_te_words)):
    if all_te_labels[i]:
        german_words.append(all_te_words[i])
        german_predictions.append(all_te_predictions[i])

predicted_labels = pd.DataFrame(
    {"word": german_words, "prediction": german_predictions}
)
predicted_labels.to_csv(os.path.join(out_path, "word_labels.csv"))

In [34]:
predicted_labels.sort_values(by="prediction", ascending=False).head(20)

Unnamed: 0,word,prediction
1,bilde,1
58,east,1
135,tok,1
384,sachin,1
291,can,1
0,ash,0
265,at,0
273,walk,0
272,the,0
271,the,0
