## Global modules import

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import numpy as np
import random as rnd
import os
import sys
import torch

from sklearn.model_selection import train_test_split
from operator import itemgetter

## Local modules import

In [3]:
sys.path.append('../..')

# PIPELINE START
---

## Loading data

In [4]:
import pickle

Data has been split before augmentation.

In [5]:
data_path = "../../intermediate_data/translator_basic/"

In [6]:
with open(os.path.join(data_path, 'words_higher_perc.pkl'), 'rb') as file:
    tr_stt_words = pickle.load(file)
with open(os.path.join(data_path, 'labels_higher_perc.pkl'), 'rb') as file:
    tr_word_labels = pickle.load(file)

tr_stt_transcripts   = [" ".join(words) for words in tr_stt_words]


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


with open(os.path.join(data_path, 'test_words_higher_perc.pkl'), 'rb') as file:
    te_stt_words = pickle.load(file)
with open(os.path.join(data_path, 'test_labels_higher_perc.pkl'), 'rb') as file:
    te_word_labels = pickle.load(file)

te_stt_transcripts   = [" ".join(words) for words in te_stt_words]


## BERT part

In [7]:
import torch
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from bert_encoder import encode_sentence

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model_bert.eval();

In [10]:
tr_stt_vectors = []
te_stt_vectors = []

Encode the corpus:

In [11]:
for sentence, words in zip(tr_stt_transcripts, tr_stt_words):
    tr_stt_vectors.append(
        encode_sentence(sentence, words, model_bert, tokenizer)
    )

In [12]:
for sentence, words in zip(te_stt_transcripts, te_stt_words):
    te_stt_vectors.append(
        encode_sentence(sentence, words, model_bert, tokenizer)
    )

In [13]:
tr_tensor       = torch.vstack(tr_stt_vectors)
tr_label_tensor = torch.tensor([int(element) for sublist in tr_word_labels for element in sublist])


te_tensor = torch.vstack(te_stt_vectors)
te_label_tensor = torch.tensor([int(element) for sublist in te_word_labels for element in sublist])

## MLP part

In [14]:
import itertools
import pandas as pd

from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm

import torch.nn as nn
import torch.optim as optim

In [15]:
from mlp import MLP, cross_validate_model, train_model, calc_stats

Use CUDA accelleration if possible:

In [16]:
torch_device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

Define the grid:

In [17]:
epochs_options = [10, 25, 20]
hidden_layers_options = [1, 4, 8, 12, 16]
neurons_per_layer_options = [32, 64, 128, 256]
learning_rate_options = [1e-3, 1e-4, 1e-5]

In [18]:
grid = itertools.product(epochs_options, hidden_layers_options, neurons_per_layer_options, learning_rate_options)
parameter_combinations = list(grid)

Define global variables:

In [19]:
best_loss = float('inf')
best_params = None

features = tr_tensor
labels = tr_label_tensor

criterion = nn.BCELoss()
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

german_proportion = tr_label_tensor.to(torch.float).mean()
weights = torch.tensor([1/(1-german_proportion), 1/german_proportion])

Create a temporary array to store intermediate data:

In [20]:
grid_search_data = []

Do a grid search for best hyperparameters:

In [21]:
for combination in tqdm(parameter_combinations):
    epochs, hidden_layers, neurons_per_layer, learning_rate = combination
    
    model = MLP(features.shape[1], hidden_layers, neurons_per_layer).to(torch_device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    tr_loss, tr_loss_std, te_loss, te_loss_std = \
        cross_validate_model(model,
                             features,
                             labels,
                             criterion,
                             optimizer,
                             splitter,
                             n_epochs=epochs,
                             num_workers=0,
                             device=torch_device,
                             class_weights=weights
        )
    
    values_to_add = [epochs, hidden_layers, neurons_per_layer, learning_rate,\
        tr_loss, tr_loss_std, te_loss, te_loss_std]

    # Add preliminary data to dataframe
    grid_search_data.append(values_to_add)

    if te_loss < best_loss:
        best_loss = te_loss
        best_params = combination

  0%|          | 0/180 [00:00<?, ?it/s]

100%|██████████| 180/180 [3:08:16<00:00, 62.76s/it]   


Create dataframe to store hyperparameter data and save it:

In [22]:
out_path = '.'
columns = ['epochs', 'hidden_layers', 'neurons_per_layer', 'learning_rate', 'tr_loss', 'tr_loss_std', 'te_loss', 'te_loss_std']
gs_frame = pd.DataFrame(grid_search_data, columns=columns)
gs_frame.to_csv(os.path.join(out_path, 'gs_data_test.csv'), index=False)

In [33]:
best_params

(20, 1, 256, 0.0001)

## Test the best model

In [23]:
from mlp import STTDataset
from torch.utils.data import DataLoader

Train the model on the whole dataset with the best parameters:

In [24]:
train_data = STTDataset(tr_tensor, tr_label_tensor)
num_workers = 0  # This works fastest on my machine
train_loader = DataLoader(
            train_data, batch_size=128, shuffle=True, num_workers=num_workers
        )

german_proportion = tr_label_tensor.to(torch.float).mean()
weights = torch.tensor([1/(1-german_proportion), 1/german_proportion])
epochs, hidden_layers, neurons_per_layer, learning_rate = best_params

criterion = nn.BCELoss()
model = MLP(features.shape[1], hidden_layers, neurons_per_layer).to(torch_device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [25]:
train_model(model, criterion, optimizer, train_loader, n_epochs=epochs, device=torch_device, class_weights=weights)

0.05676008819047448

Test the model on the test set:

In [26]:
test_data = STTDataset(te_tensor, te_label_tensor)
test_loader = DataLoader(
            test_data, batch_size=len(test_data), shuffle=True, num_workers=num_workers
        )

In [27]:
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(torch_device), labels.to(torch_device)
        pred = model(inputs)
        pred = torch.squeeze(pred, dim=1)
        loss = criterion(pred, labels.to(torch.float)).item()

In [28]:
accuracy, precision, recall, f1 = calc_stats(pred, te_label_tensor)

In [29]:
results = pd.DataFrame([[loss, accuracy, precision, recall, f1]],
                        columns = ['loss', 'accuracy', 'precision', 'recall', 'f1'])
results.to_csv(os.path.join(out_path, 'results.csv'), index=False)

In [30]:
all_te_words = [element for sublist in te_stt_words for element in sublist]
all_te_labels = [element for sublist in te_word_labels for element in sublist]
all_te_predictions = (pred.to('cpu').numpy().flatten() > 0.5).astype(int)

In [31]:
german_words = []
german_predictions = []
for i in range(len(all_te_words)):
    if all_te_labels[i]:
        german_words.append(all_te_words[i])
        german_predictions.append(all_te_predictions[i])

predicted_labels = pd.DataFrame(
    {'word': german_words, 'prediction': german_predictions}
)
predicted_labels.to_csv(
    os.path.join(out_path, 'word_labels.csv')
)

In [32]:
predicted_labels.sort_values(by='prediction', ascending=False).head(20)

Unnamed: 0,word,prediction
27,so,1
91,means,1
157,shading,1
62,shelton,1
167,makes,1
376,monte,1
349,was,1
295,outro,1
294,met,0
263,howls,0
