# BERT encoder

This notebook encodes words into tensors with BERT and saves the encodings to temporary pickle files.

In [2]:
# Directory that receives the pickled tensors written by the "Save results" cells.
out_path = 'intermediate_data/BERT'

## Global modules import

In [3]:
# Re-import edited modules automatically before each cell executes, so changes
# to the local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import json
import numpy as np
from operator import itemgetter
import random as rnd
import sys
import torch

## Local modules import

In [5]:
from data_loading import create_word_lists, tidy_sentence_length

## Loading data

In [6]:
# Read the corpus and keep only its list of records.
# The encoding is given explicitly: without it open() falls back to the
# platform locale, which can mis-decode non-ASCII characters in the transcripts.
with open('data/corpus_data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)['records']

In [7]:
# Pull the two transcript fields out of every record, preserving record order.
human_transcripts = list(map(itemgetter('human_transcript'), data))
stt_transcripts = list(map(itemgetter('stt_transcript'), data))

In [8]:
# Build the per-sentence word lists and the matching per-word annotations.
(
    human_words,
    stt_words,
    word_labels,
    word_grams,
    word_sems,
) = create_word_lists(data)

Some of the sentences are too long, so we need to shorten them. Each sentence is simply its words joined by spaces, without any punctuation, so sentences are reconstructed from the word lists when necessary.

In [9]:
# Shorten over-long sentences. The results replace the inputs, so re-running
# this cell feeds the already-tidied data back into tidy_sentence_length.
(
    stt_transcripts,
    stt_words,
    word_labels,
    word_grams,
    word_sems,
) = tidy_sentence_length(
    stt_transcripts,
    stt_words,
    word_labels,
    word_grams,
    word_sems,
)

## Train-test split

In [10]:
from sklearn.model_selection import train_test_split

We need to extract which sentences contain German words in order to stratify the data split:

In [11]:
# Length of the longest sentence, used as the common row width.
max_length = len(max(word_labels, key=len))

# Right-pad every label row with False so the rows form a rectangular array.
padded_labels = np.array(
    [row + [False] * (max_length - len(row)) for row in word_labels]
)

# One flag per sentence: True when at least one of its word labels is truthy.
stat_labels = padded_labels.any(axis=1)

Here, we split only indices and not data itself, because the data contains arrays of variable length, which does not work with `train_test_split`:

In [12]:
# Split sentence positions instead of the sentences themselves.
indices = [*range(len(stt_transcripts))]

# 80/20 split with a fixed seed, stratified on the per-sentence flag.
tr_indices, te_indices = train_test_split(
    indices,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=stat_labels,
)

These are helper functions that extract the data selected by the indices:

In [13]:
# One selector per split; applied to a sequence, each returns a tuple holding
# the items at that split's indices.
extract_train, extract_test = (
    itemgetter(*selected) for selected in (tr_indices, te_indices)
)

Finally, do data splitting:

In [14]:
# Training portion: run the train selector over every parallel sequence.
(
    tr_stt_transcripts,
    tr_stt_words,
    tr_word_labels,
    tr_word_grams,
    tr_word_sems,
) = map(
    extract_train,
    (stt_transcripts, stt_words, word_labels, word_grams, word_sems),
)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Test portion: same sequences, test selector.
(
    te_stt_transcripts,
    te_stt_words,
    te_word_labels,
    te_word_grams,
    te_word_sems,
) = map(
    extract_test,
    (stt_transcripts, stt_words, word_labels, word_grams, word_sems),
)

## BERT part

In [15]:
import torch
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
from bert_encoder import encode_sentence

In [17]:
# Tokenizer and model are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model_bert = BertModel.from_pretrained(
    BERT_CHECKPOINT,
    output_hidden_states=True,
)

# Switch to evaluation mode; the trailing semicolon hides the module repr.
model_bert.eval();

In [18]:
# Two independent, initially empty containers for the encoded sentences.
tr_stt_vectors, te_stt_vectors = [], []

Encode the corpus:

In [19]:
# Encode every training sentence.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot duplicate vectors and misalign them with the label tensors.
tr_stt_vectors = [
    encode_sentence(sentence, words, model_bert, tokenizer)
    for sentence, words in zip(tr_stt_transcripts, tr_stt_words)
]

In [20]:
# Encode every test sentence.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot duplicate vectors and misalign them with the label tensors.
te_stt_vectors = [
    encode_sentence(sentence, words, model_bert, tokenizer)
    for sentence, words in zip(te_stt_transcripts, te_stt_words)
]

In [21]:
def _flatten_to_int_tensor(nested):
    """Flatten a sequence of sequences into a 1-D tensor of int-cast values."""
    return torch.tensor([int(value) for row in nested for value in row])


# Training split: stack the encodings and flatten each annotation list.
tr_tensor = torch.vstack(tr_stt_vectors)
tr_label_tensor = _flatten_to_int_tensor(tr_word_labels)
tr_grams_tensor = _flatten_to_int_tensor(tr_word_grams)
tr_sems_tensor = _flatten_to_int_tensor(tr_word_sems)

# Test split: same construction.
te_tensor = torch.vstack(te_stt_vectors)
te_label_tensor = _flatten_to_int_tensor(te_word_labels)
te_grams_tensor = _flatten_to_int_tensor(te_word_grams)
te_sems_tensor = _flatten_to_int_tensor(te_word_sems)

## Save results

In [25]:
import pickle
import os

Save training:

In [26]:
# Create the output directory first; without this the first open() raises
# FileNotFoundError when the directory does not exist yet.
os.makedirs(out_path, exist_ok=True)

# File name -> tensor for the training split.
tr_outputs = {
    'tr_tensor.pkl': tr_tensor,
    'tr_label_tensor.pkl': tr_label_tensor,
    'tr_grams_tensor.pkl': tr_grams_tensor,
    'tr_sems_tensor.pkl': tr_sems_tensor,
}

# Pickle each tensor to its own file; readers should unpickle trusted files only.
for filename, tensor in tr_outputs.items():
    with open(os.path.join(out_path, filename), 'wb') as file:
        pickle.dump(tensor, file)

Save testing:

In [27]:
# Create the output directory first; without this the first open() raises
# FileNotFoundError when the directory does not exist yet.
os.makedirs(out_path, exist_ok=True)

# File name -> tensor for the test split.
te_outputs = {
    'te_tensor.pkl': te_tensor,
    'te_label_tensor.pkl': te_label_tensor,
    'te_grams_tensor.pkl': te_grams_tensor,
    'te_sems_tensor.pkl': te_sems_tensor,
}

# Pickle each tensor to its own file; readers should unpickle trusted files only.
for filename, tensor in te_outputs.items():
    with open(os.path.join(out_path, filename), 'wb') as file:
        pickle.dump(tensor, file)