# Using BERT to Identify Bill Hardy (Binary Classification) [WIP]

Now that K-means clustering seems to indicate that some of the narrators can be inferred from the pages, we can attempt to use a language model to predict whether a page belongs to a given narrator. 

Let's start with a binary classification to **predict whether or not a sentence belongs to Bill Hardy**.

<img src="https://glthr.com/cj/characters/bill-min.jpg" alt="drawing" width="400"/>

In [1]:
%%capture

# make logs less verbose
# (the variable is set here, ahead of the TensorFlow imports in the later cells)
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# suppress UnparsedFlagAccessError warning
# NOTE(review): this replaces sys.argv for the whole kernel session; absl
# presumably treats the single entry as the program name, so it only marks the
# flags as parsed rather than setting `preserve_unused_tokens` -- confirm
import sys
from absl import flags

sys.argv = ["preserve_unused_tokens=False"]
flags.FLAGS(sys.argv)

# suppress Pandas Future Warning (deprecation of DataFrame.swapaxes)
# NOTE: the filter is not restricted to pandas -- every FutureWarning raised
# after this point is ignored
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

## Process data

In [2]:
import pandas as pd

# load the pages of the book into a DataFrame (the `page` and `text` columns are used below)
df = pd.read_json("./data/jawbone.json")

Augment the data with the label: `True` for Bill Hardy, `False` otherwise.

In [3]:
import json

# Load the unofficial solution (narrator name -> page ids).
# The context manager closes the file even if json.load raises, and the explicit
# encoding avoids depending on the platform's default text encoding.
with open("./data/unofficial_solution.json", encoding="utf-8") as f:
    possible_solutions = json.load(f)


def is_bill(id):
    """Tell whether the page `id` is attributed to Bill Hardy.

    Looks the page up in `possible_solutions` (narrator name -> collection of
    page ids) and stops at the first narrator whose collection contains it.

    Returns:
        True if that narrator is "Bill Hardy", False if it is anyone else,
        and None when no narrator lists the page.
    """
    return next(
        (
            narrator == "Bill Hardy"
            for narrator, pages in possible_solutions.items()
            if id in pages
        ),
        None,
    )


# label each page: True for Bill Hardy, False for another narrator, None if the page is unlisted
df["narrator"] = df["page"].apply(is_bill)
df

Unnamed: 0,page,text,narrator
0,1,I sit down alone at the appointed table and ta...,True
1,2,I plunged for the last time. The few remaining...,False
2,3,"At my meeting with Clement yesterday, he had b...",False
3,4,And I really think I would have preferred the ...,True
4,5,I hated my eye for being caught by what didn’t...,False
...,...,...,...
95,96,Now I think I will try a cup of what they inso...,True
96,97,To have slept and to wake right up surrounded ...,False
97,98,He stood and looked down at me; but I was not ...,False
98,99,"And she wore a mauve love-knot on her breast, ...",False


In [4]:
# tokenize paragraphs
import nltk
from nltk import sent_tokenize


def tokenize_sentences(paragraph):
    """Split `paragraph` into sentences, dropping fragments without any letter or digit.

    Args:
        paragraph: text passed as-is to NLTK's `sent_tokenize`.

    Returns:
        List of the sentences that contain at least one alphanumeric character,
        in their original order.
    """

    def has_alnum(sentence):
        return any(char.isalnum() for char in sentence)

    return list(filter(has_alnum, sent_tokenize(paragraph)))


# one row per sentence: split each page's text, then unnest the resulting lists
df = (
    df.assign(text=df["text"].apply(tokenize_sentences))
    .explode("text")
    .reset_index(drop=True)
)
df

Unnamed: 0,page,text,narrator
0,1,I sit down alone at the appointed table and ta...,True
1,1,"Call me nervous, call me fey, if you will; at ...",True
2,1,It has not had much work since it flew so nimb...,True
3,1,"As I watch the sea, Casy Ferris passes with do...",True
4,1,"Of course, to-day is the day.",True
...,...,...,...
1225,100,O beastly woman.,True
1226,100,"You know not how ill’s all here, about my hear...",True
1227,100,"Henry, I feel it, is for the first and last ti...",True
1228,100,"Good-bye, Henry.",True


In [5]:
from sklearn.model_selection import train_test_split

# 40% of the sentences are used for training, the remainder for testing;
# stratifying keeps the narrator ratio identical in both splits
split_options = dict(train_size=0.4, random_state=42, stratify=df.narrator.values)
train_df, test_df, y_train, y_test = train_test_split(
    df, df["narrator"], **split_options
)

for split in (train_df, test_df):
    print(split.shape)

(492, 3)
(738, 3)


In [6]:
from sklearn import preprocessing
from keras.utils import to_categorical

# Learn the label -> integer mapping from the training labels only, then reuse
# that same mapping for the test labels. Re-fitting on the test split (the
# previous `fit_transform`) could assign different integers if the two splits
# did not contain the same set of labels; `transform` raises on unseen labels.
label = preprocessing.LabelEncoder()
y_train = label.fit_transform(train_df["narrator"])
y_test = label.transform(test_df["narrator"])

In [7]:
import tensorflow_hub as hub

# pre-trained cased BERT from TF Hub; trainable so its weights are fine-tuned with the classifier
BERT_MODEL_URL = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/2"
bert_layer = hub.KerasLayer(BERT_MODEL_URL, trainable=True)

In [8]:
from bert import tokenization
import numpy as np

# build the tokenizer from the vocabulary and casing flag exposed by the loaded BERT layer
resolved_bert = bert_layer.resolved_object
vocab_file, do_lower_case = (
    resolved_bert.vocab_file.asset_path.numpy(),
    resolved_bert.do_lower_case.numpy(),
)
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)


# from https://www.kaggle.com/code/nayansakhiya/text-classification-using-bert
def bert_encode(texts, tokenizer, max_len=512):
    """Encode texts into the three fixed-length integer inputs passed to BERT.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every encoded row; must be at least 2 so that the
            ``[CLS]`` and ``[SEP]`` markers fit.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number of texts, max_len)``. ``masks`` is 1 on real tokens and
        0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: if ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # below 2 the rows would come out longer than max_len and ragged
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # keep room for the two special tokens
        pieces = tokenizer.tokenize(text)[: max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(
            tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        )
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def as_matrix(rows):
        # explicit dtype and reshape keep a (0, max_len) integer matrix for empty input
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return as_matrix(all_tokens), as_matrix(all_masks), as_matrix(all_segments)

In [9]:
import tensorflow as tf


# adapted from https://www.kaggle.com/code/nayansakhiya/text-classification-using-bert
def build_model(bert_layer, max_len=512, learning_rate=2e-5):
    """Build and compile the binary "is Bill Hardy" classifier on top of BERT.

    The network feeds the three BERT inputs through `bert_layer`, takes the
    sequence output at position 0 (where `bert_encode` places ``[CLS]``) and
    passes it through a small dense head ending in a single sigmoid unit.

    Args:
        bert_layer: callable Keras layer taking
            ``[input_word_ids, input_mask, segment_ids]`` and returning
            ``(pooled_output, sequence_output)``.
        max_len: length of the encoded input rows; must match the `max_len`
            used when encoding the texts.
        learning_rate: Adam learning rate. Defaults to 2e-5 because the BERT
            weights are fine-tuned together with the head; Adam's default
            (1e-3) is generally too large for that.

    Returns:
        A compiled `tf.keras` model with binary cross-entropy loss and an
        accuracy metric.
    """
    input_word_ids = tf.keras.Input(
        shape=(max_len,), dtype=tf.int32, name="input_word_ids"
    )
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # the pooled output is not used; only the per-token sequence output is
    _pooled_output, sequence_output = bert_layer(
        [input_word_ids, input_mask, segment_ids]
    )

    # representation at the first position of every sequence
    clf_output = sequence_output[:, 0, :]

    lay = tf.keras.layers.Dense(64, activation="relu")(clf_output)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    lay = tf.keras.layers.Dense(32, activation="relu")(lay)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    lay = tf.keras.layers.Dense(32, activation="relu")(lay)
    out = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)(
        lay
    )  # is or is not Bill Hardy

    model = tf.keras.models.Model(
        inputs=[input_word_ids, input_mask, segment_ids], outputs=out
    )
    model.compile(
        tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [10]:
# encode both splits with the same tokenizer and the default maximum length
X_train, X_test = (
    bert_encode(split_df.text.values, tokenizer) for split_df in (train_df, test_df)
)

In [11]:
# build the classifier with the default input length (512, the same default as bert_encode)
# and print its architecture
model = build_model(bert_layer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_word_ids (InputLayer  [(None, 512)]                0         []                            
 )                                                                                                
                                                                                                  
 input_mask (InputLayer)     [(None, 512)]                0         []                            
                                                                                                  
 segment_ids (InputLayer)    [(None, 512)]                0         []                            
                                                                                                  
 keras_layer (KerasLayer)    [(None, 768),                1083102   ['input_word_ids[0][0]',  

In [12]:
# fine-tune for 3 epochs with mini-batches of 4 encoded sentences
# NOTE(review): the test split is also used as validation data, so the validation
# metrics are not an independent estimate if they are used to tune the model
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=3,
    batch_size=4,
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2ac16e350>

In [13]:
# Draw a reproducible random sample of 60% of the sentences.
# DataFrame.sample replaces the former np.split(df.sample(frac=1), ...), which
# relied on the deprecated DataFrame.swapaxes and on an unseeded shuffle.
# NOTE(review): the sample is taken from the whole `df`, so it overlaps with the
# rows of `train_df`; metrics computed on it will be optimistic.
sample_df = df.sample(n=int(0.6 * len(df)), random_state=42)
X_sample = bert_encode(sample_df.text.values, tokenizer)
y_sample = sample_df["narrator"]

In [14]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# sigmoid outputs of the model for every sampled sentence
# (presumably one column per row, hence the flatten before thresholding -- confirm)
prediction = model.predict(X_sample)



In [None]:
# 1-D vector of predicted probabilities, turned into 0/1 labels with a 0.5 cut-off
p_pred = prediction.flatten()
y_pred = (p_pred > 0.5).astype(int)

In [None]:
# import matplotlib.pyplot as plt


# cm = confusion_matrix(y_sample, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# disp.plot()
# plt.show()

(WIP)