## Global modules import

In [150]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [151]:
import json
import numpy as np
import random as rnd
import sys
import torch

from sklearn.model_selection import train_test_split
from operator import itemgetter

## Local modules import

In [152]:
sys.path.append('../..')

## Loading data

In [153]:
from data_loading import create_word_lists, tidy_sentence_length

In [154]:
with open('../../data/corpus_data.json') as json_file:
    data = json.load(json_file)
data = data['records']

In [155]:
human_transcripts = [entry['human_transcript'] for entry in data]
stt_transcripts   = [entry['stt_transcript'] for entry in data]

In [156]:
human_words, stt_words, word_labels, word_grams, word_sems = \
    create_word_lists(data)

Some of the sentences are too long, so we need to shorten them. The sentences are basically concatenations of individual words with spaces in between, without any interpuction, so they are reconstructed from word lists when necessary.

In [157]:
stt_transcripts, stt_words, word_labels, word_grams, word_sems = \
    tidy_sentence_length(stt_transcripts, stt_words, word_labels, word_grams, word_sems)

# PIPELINE START
---

## Train-test split

We need to extract which sentences contain German words in order to stratify the data split:

In [158]:
max_length = max(map(len, word_labels))
padded_labels = [row + [False] * (max_length - len(row)) for row in word_labels]
padded_labels = np.array(padded_labels)
stat_labels = np.any(padded_labels, axis=1)

Here, we split only indices and not data itself, because the data contains arrays of variable length, which does not work with `train_test_split`:

In [159]:
indices = list(range(len(stt_transcripts)))
tr_indices, te_indices = train_test_split(indices, test_size=0.2, random_state=0, shuffle=True, stratify=stat_labels)

These are hepler functions that will extract data selected by indices:

In [160]:
extract_train = itemgetter(*tr_indices)
extract_test  = itemgetter(*te_indices)

Finally, do data splitting:

In [161]:
tr_stt_transcripts   = extract_train(stt_transcripts)
tr_stt_words         = extract_train(stt_words)

tr_word_labels       = extract_train(word_labels)
tr_word_grams        = extract_train(word_grams)
tr_word_sems         = extract_train(word_sems)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

te_stt_transcripts   = extract_test(stt_transcripts)
te_stt_words         = extract_test(stt_words)

te_word_labels       = extract_test(word_labels)
te_word_grams        = extract_test(word_grams)
te_word_sems         = extract_test(word_sems)

## BERT part

In [162]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from tqdm import tqdm

In [163]:
def encode_with_perplexity(
    sentence: str,
    words: list,
    model: BertForMaskedLM,
    tokenizer: BertTokenizer,
    vectorization: str = "sum"
):

    sentence_ = "[CLS] " + sentence + " [SEP]"

    # Tokenize, extract dictionary ids, and set
    # segment ids to 1 (we use outputs/BERT_GRAM/bert_gram_pipeline.ipynbjust 1 sentence)
    tokens = tokenizer.tokenize(sentence_)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    segments_ids = [1] * len(tokens)  # The whole text is just one sentence

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Find token embeddings
    with torch.no_grad():  # We aren't doing backprop
        outputs = model(tokens_tensor, segments_tensors)

    token_embeddings = torch.stack(outputs.hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)  # We use just 1 sentence
    token_embeddings = token_embeddings.permute(1, 0, 2)  # tokens, layers, features
    token_embeddings = token_embeddings[1:-1]  # We don't need [CLS] and [SEP]

    token_probabilities = torch.squeeze(outputs.logits, dim=0)
    token_probabilities = torch.nn.functional.softmax(token_probabilities, dim=-1)
    token_perplexities = 2**torch.sum(token_probabilities*torch.log(token_probabilities), dim=-1)

    # Choose how to extract vectors from hidden layers
    if vectorization == "sum":
        token_vectors = torch.sum(token_embeddings[:, -4:, :], dim=1)
    elif vectorization == "stl":
        token_vectors = token_embeddings[:, -2, :]
    elif vectorization == "concat":
        token_vectors = torch.cat(
            (
                token_embeddings[:, -4, :],
                token_embeddings[:, -3, :],
                token_embeddings[:, -2, :],
                token_embeddings[:, -1, :],
            ),
            dim=1,
        )
    else:
        return None

    # Finally, we need to combine tokens into words,
    # as some words were split in tokenization
    word_token_lengths = []
    for word in words:
        word_token_lengths.append(len(tokenizer.encode(word, add_special_tokens=False)))

    # Use mean value to combine
    tid = 0
    word_vectors = []
    word_perplexities = []
    for wl in word_token_lengths:
        word_vectors.append(torch.mean(token_vectors[tid : tid + wl], dim=0))
        word_perplexities.append(torch.mean(token_perplexities[tid : tid + wl], dim=0))
        tid = tid + wl

    return torch.stack(word_vectors), torch.stack(word_perplexities)

In [164]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertForMaskedLM.from_pretrained('bert-base-uncased', output_hidden_states=True)
model_bert.eval();

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Encode the corpus:

In [165]:
tr_stt_vectors = []
tr_stt_perplexities = []

for sentence, words in tqdm(zip(tr_stt_transcripts, tr_stt_words), total=len(tr_stt_transcripts)):
    word_vectors, word_perplexities = encode_with_perplexity(sentence, words, model_bert, tokenizer)
    tr_stt_vectors.append(word_vectors)
    tr_stt_perplexities.append(word_perplexities)

  0%|          | 0/5434 [00:00<?, ?it/s]

100%|██████████| 5434/5434 [06:14<00:00, 14.51it/s]


In [166]:
te_stt_vectors = []
te_stt_perplexities = []

for sentence, words in tqdm(zip(te_stt_transcripts, te_stt_words), total=len(te_stt_transcripts)):
    word_vectors, word_perplexities = encode_with_perplexity(sentence, words, model_bert, tokenizer)
    te_stt_vectors.append(word_vectors)
    te_stt_perplexities.append(word_perplexities)

  0%|          | 1/1359 [00:00<03:21,  6.75it/s]

100%|██████████| 1359/1359 [01:34<00:00, 14.42it/s]


In [167]:
tr_tensor       = torch.vstack(tr_stt_vectors)
tr_label_tensor = torch.tensor([int(element) for sublist in tr_word_labels for element in sublist])
tr_grams_tensor = torch.tensor([int(element) for sublist in tr_word_grams  for element in sublist])
tr_sems_tensor  = torch.tensor([int(element) for sublist in tr_word_sems   for element in sublist])
tr_perps_tensor = torch.cat(tr_stt_perplexities)


te_tensor = torch.vstack(te_stt_vectors)
te_label_tensor = torch.tensor([int(element) for sublist in te_word_labels for element in sublist])
te_grams_tensor = torch.tensor([int(element) for sublist in te_word_grams  for element in sublist])
te_sems_tensor  = torch.tensor([int(element) for sublist in te_word_sems   for element in sublist])
te_perps_tensor = torch.cat(te_stt_perplexities)

## MLP part

For quicker experimenting, load saved data:

In [200]:
import itertools
import pandas as pd

from sklearn.model_selection import StratifiedKFold

import os
from tqdm import tqdm

import torch.nn as nn
import torch.optim as optim

In [201]:
from mlp import MLP, cross_validate_model, train_model, calc_stats

Use CUDA accelleration if possible:

In [202]:
torch_device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

Use the best parameters from the grid search:

In [255]:
best_params = (20, 1, 512, 0.0001)

Create dataframe to store hyperparameter data and save it:

In [256]:
out_path = '.'

## Test the best model

In [257]:
from mlp import STTDataset
from torch.utils.data import DataLoader

Train the model on the whole dataset with the best parameters:

In [258]:
train_data = STTDataset(tr_tensor, tr_label_tensor)
train_data.add_feature(tr_perps_tensor)
num_workers = 0  # This works fastest on my machine
train_loader = DataLoader(
            train_data, batch_size=128, shuffle=True, num_workers=num_workers
        )

epochs, hidden_layers, neurons_per_layer, learning_rate = best_params

german_proportion = tr_label_tensor.to(torch.float).mean()
weights = torch.tensor([1/(1-german_proportion), 1/german_proportion])

criterion = nn.BCELoss()
model = MLP(train_data.embeddings.shape[1], hidden_layers, neurons_per_layer).to(torch_device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [259]:
train_model(model, criterion, optimizer, train_loader, n_epochs=epochs, device=torch_device, class_weights=weights)

0.011104600767240576

Test the model on the test set:

In [260]:
test_data = STTDataset(te_tensor, te_label_tensor)
test_data.add_feature(te_perps_tensor)
test_loader = DataLoader(
            test_data, batch_size=len(test_data), shuffle=True, num_workers=num_workers
        )

In [261]:
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(torch_device), labels.to(torch_device)
        pred = model(inputs)
        pred = torch.squeeze(pred, dim=1)
        loss = criterion(pred, labels.to(torch.float)).item()

In [262]:
accuracy, precision, recall, f1 = calc_stats(pred, te_label_tensor)

In [263]:
f1

0.01712328767123288

In [264]:
results = pd.DataFrame([[loss, accuracy, precision, recall, f1]],
                        columns = ['loss', 'accuracy', 'precision', 'recall', 'f1'])
results.to_csv(os.path.join(out_path, 'results.csv'), index=False)

In [265]:
all_te_words = [element for sublist in te_stt_words for element in sublist]
all_te_labels = [element for sublist in te_word_labels for element in sublist]
all_te_predictions = (pred.to('cpu').numpy().flatten() > 0.5).astype(int)

In [266]:
german_words = []
german_predictions = []
for i in range(len(all_te_words)):
    if all_te_labels[i]:
        german_words.append(all_te_words[i])
        german_predictions.append(all_te_predictions[i])

predicted_labels = pd.DataFrame(
    {'word': german_words, 'prediction': german_predictions}
)
predicted_labels.to_csv(
    os.path.join(out_path, 'word_labels.csv')
)

In [267]:
predicted_labels.sort_values(by='prediction', ascending=False).head(20)

Unnamed: 0,word,prediction
145,princess,1
283,show,1
184,and,1
166,pale,1
87,we,1
0,ash,0
267,bonbons,0
274,finest,0
273,walk,0
272,the,0
