# Using BERT to classify the narrators [WIP]

In [1]:
# make logs less verbose
# (set here, ahead of the TensorFlow/Keras imports that happen in later cells)
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# suppress UnparsedFlagAccessError warning
# The flag registry is asked to parse a one-element argv up front, so flags
# count as parsed before any later code reads them.
# NOTE(review): this overwrites the kernel's original sys.argv.
import sys
from absl import flags

sys.argv = ["preserve_unused_tokens=False"]
flags.FLAGS(sys.argv)

['preserve_unused_tokens=False']

## Process data

In [2]:
import pandas as pd

# Load the dataset; later cells read its "page" and "text" columns.
df = pd.read_json("./data/jawbone.json")

Augment the data with the label (narrator).

In [3]:
import json

# Load the narrator -> page-ids mapping. The context manager closes the file
# even when JSON parsing raises, and the encoding is pinned because JSON
# interchange files are UTF-8 regardless of the platform's locale default.
with open("./data/unofficial_solution.json", encoding="utf-8") as f:
    possible_solutions = json.load(f)


def get_narrator(id):
    """Return the first key of ``possible_solutions`` whose collection contains ``id``.

    Keys are visited in the mapping's own order; ``None`` is returned when no
    collection contains ``id``.
    """
    matching_keys = (
        narrator for narrator, pages in possible_solutions.items() if id in pages
    )
    return next(matching_keys, None)


# Look up the narrator of every page; pages absent from the solution get None.
df["narrator"] = df["page"].apply(get_narrator)
df

Unnamed: 0,page,text,narrator
0,1,I sit down alone at the appointed table and ta...,Bill Hardy
1,2,I plunged for the last time. The few remaining...,Oscar Mills
2,3,"At my meeting with Clement yesterday, he had b...",John Walker
3,4,And I really think I would have preferred the ...,Bill Hardy
4,5,I hated my eye for being caught by what didn’t...,May Doncaster
...,...,...,...
95,96,Now I think I will try a cup of what they inso...,Bill Hardy
96,97,To have slept and to wake right up surrounded ...,John Walker
97,98,He stood and looked down at me; but I was not ...,Sir Paul Trinder
98,99,"And she wore a mauve love-knot on her breast, ...",Henry (dog)


In [4]:
from sklearn.model_selection import train_test_split

# Stratified split on the narrator column: 34 rows for training, the rest for testing.
train_df, test_df = train_test_split(
    df,
    train_size=34,
    stratify=df.narrator.values,
    random_state=42,
)
for split_df in (train_df, test_df):
    print(split_df.shape)

(34, 3)
(66, 3)


In [5]:
from sklearn import preprocessing
from keras.utils import to_categorical

# labels (narrators) as categorical variable
# The encoder is fitted on every label in `df`, not only the training rows, and
# the one-hot width is pinned to the number of known classes. That keeps the
# target width stable even if the split leaves a rare narrator out of
# `train_df`; when all narrators are present the encoding is unchanged.
label = preprocessing.LabelEncoder()
label.fit(df["narrator"])
y = to_categorical(
    label.transform(train_df["narrator"]), num_classes=len(label.classes_)
)

In [6]:
import tensorflow_hub as hub

# TODO: use Hugging Face instead
# Load the BERT encoder from TF Hub by its handle URL. It is created with
# trainable=True; the same layer object is later used to locate the tokenizer
# vocabulary and is passed to build_model.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True
)

In [7]:
from bert import tokenization
import numpy as np

# Build the tokenizer from the vocabulary file path and the lower-casing
# setting that are read off the loaded hub layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)


# from https://www.kaggle.com/code/nayansakhiya/text-classification-using-bert
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated so that ``[CLS]`` and ``[SEP]`` still fit,
    converted to ids and right-padded with 0 up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` (returning a list of
            tokens) and ``convert_tokens_to_ids(tokens)`` (returning a list of
            ints).
        max_len: Length of every encoded row, special tokens included.
            Must be at least 2.

    Returns:
        Tuple ``(token_ids, pad_masks, segment_ids)`` of ``int32`` arrays, each
        of shape ``(len(texts), max_len)``. ``pad_masks`` is 1 on real tokens
        and 0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, since there would be no
            room for the two special tokens.
    """
    if max_len < 2:
        raise ValueError(
            f"max_len must be at least 2 to fit [CLS] and [SEP], got {max_len}"
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        word_pieces = tokenizer.tokenize(text)[: max_len - 2]
        input_sequence = ["[CLS]"] + word_pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(
            tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        )
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # Explicit int32 matches the tf.int32 model inputs; the reshape keeps a
        # (0, max_len) shape when `texts` is empty instead of a flat float array.
        return np.array(rows, dtype=np.int32).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

In [None]:
import tensorflow as tf


# adapted from https://www.kaggle.com/code/nayansakhiya/text-classification-using-bert
# TODO: improve
def build_model(bert_layer, max_len=512, num_classes=8, optimizer="adam"):
    """Build and compile a softmax classifier on top of a BERT encoder layer.

    Args:
        bert_layer: Callable layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns a
            ``(pooled_output, sequence_output)`` pair.
        max_len: Fixed sequence length of the three int32 inputs.
        num_classes: Number of units in the softmax output layer. Defaults to
            8, the number of narrator labels.
        optimizer: Optimizer forwarded to ``model.compile``. Defaults to
            ``"adam"``.
            NOTE(review): when ``bert_layer`` is trainable, the default Adam
            learning rate may be too large for fine-tuning; consider passing
            an optimizer instance with a much smaller rate - TODO confirm.

    Returns:
        A compiled ``tf.keras`` model taking the three inputs and producing a
        ``(batch, num_classes)`` probability output, using categorical
        cross-entropy loss and the accuracy metric.
    """
    input_word_ids = tf.keras.Input(
        shape=(max_len,), dtype=tf.int32, name="input_word_ids"
    )
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token output is used below; the pooled output is ignored.
    _pooled_output, sequence_output = bert_layer(
        [input_word_ids, input_mask, segment_ids]
    )

    # Representation at sequence position 0, where the encoder input places [CLS].
    clf_output = sequence_output[:, 0, :]

    lay = tf.keras.layers.Dense(64, activation="relu")(clf_output)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    lay = tf.keras.layers.Dense(32, activation="relu")(lay)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    out = tf.keras.layers.Dense(num_classes, activation="softmax")(lay)

    model = tf.keras.models.Model(
        inputs=[input_word_ids, input_mask, segment_ids], outputs=out
    )
    model.compile(
        optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
    )

    return model

In [None]:
# Encode both splits to the same fixed sequence length.
max_len = 250
train_input, test_input = (
    bert_encode(split_df.text.values, tokenizer, max_len=max_len)
    for split_df in (train_df, test_df)
)

In [None]:
# Build the classifier with the same sequence length that was used to encode
# the inputs, then print its layer summary.
model = build_model(bert_layer, max_len=max_len)
model.summary()

In [None]:
# Train for 5 epochs in batches of 2, with validation_split=0.1; the returned
# object is kept in `train_sh`.
train_sh = model.fit(
    x=train_input,
    y=y,
    batch_size=2,
    epochs=5,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# TODO: inference