## Entity Disambiguation with an all-Born pipeline.

We can finally tackle our target problem, i.e., entity disambiguation!

In [None]:
import os
import numpy as np
from datetime import datetime
import random

from tqdm.notebook import tqdm
from IPython.display import display

from sklearn.metrics import accuracy_score, classification_report, log_loss

import spacy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from bornrule import BornClassifier

from wiki_tools import get_data_from_snippets

from models import MultilayerBornModel, FineTunedBornClassifier
from utils import construct_traintest_dataframe, apply_categorical_mapping, get_mlp_data, get_latest_model, get_latest_vectoriser, get_latest_encmap

## Data processing

In [None]:
# Load the snippet data; the keys of the returned mapping are used below as the candidate entities.
data_dict = get_data_from_snippets()

We first choose a target entity to disambiguate.

In [None]:
# Pick the entity to disambiguate uniformly at random among the available keys.
# No seed is set, so a different entity may be chosen on every run.
candidate_entities = list(data_dict.keys())
target_entity = random.choice(candidate_entities)
target_entity

Then we collect a set of entities for training the disambiguating Multilayer Born model.

In [None]:
# spaCy's small English pipeline; it is handed to the dataframe builder below.
nlp = spacy.load("en_core_web_sm")

# train_size is kept small to avoid excessive resource utilisation
traintest_df = construct_traintest_dataframe(
    nlp,
    target_entity,
    data_dict,
    train_size=2,
)

In [None]:
# Peek at 10 randomly drawn rows (no random_state is given, so the rows differ between runs).
traintest_df.sample(10)

Finally, we need to make the dataset amenable to the Born models.

In [None]:
# Training split: every row whose entity is NOT the target entity, re-indexed from 0.
train_df = (
    traintest_df
    .loc[traintest_df['entity'].ne(target_entity)]
    .reset_index(drop=True)
)
train_df.info()

In [None]:
# Test split: only the rows that belong to the target entity, re-indexed from 0.
test_df = (
    traintest_df
    .loc[traintest_df['entity'].eq(target_entity)]
    .reset_index(drop=True)
)
test_df.info()

In [None]:
# Load the latest saved categorical encoding maps; `mappings['ner_tag']` is used further down
# to count the NER classes.
mappings = get_latest_encmap()

In [None]:
# Encode the training frame with the loaded mappings, then show 5 random rows as a sanity check.
enc_train_df = apply_categorical_mapping(train_df, mappings)
enc_train_df.sample(5)

In [None]:
# Encode the test frame with the same mappings used for training, then show 5 random rows.
enc_test_df = apply_categorical_mapping(test_df, mappings)
enc_test_df.sample(5)

Notice that we extract two targets: the targets for NER (for the Born Classifier), and the ones for NED (for the Multilayer Born model).

In [None]:
# Extract the two target vectors from the encoded frames.
#
# The label columns are intentionally left in `enc_train_df` / `enc_test_df`:
# `DataFrame.drop` returns a new frame instead of modifying the original, so calling it
# without assigning the result has no effect. Dropping is also unnecessary, because the
# feature matrices are built from an explicit list of feature columns, which keeps the
# labels out of the features. Leaving the frames untouched keeps this cell re-runnable.

# NER targets (for the Born Classifier)
y_ner_train = enc_train_df['ner_tag'].to_list()
y_ner_test = enc_test_df['ner_tag'].to_list()

# NED targets (for the Multilayer Born model)
y_train = enc_train_df['disambig_label'].to_list()
y_test = enc_test_df['disambig_label'].to_list()

In [None]:
# Latest saved vectoriser; it is only applied here (`transform`), never re-fitted.
dict_vec = get_latest_vectoriser()

# we need the output to be as expected, i.e., we should only vectorise the columns
# we pre-trained the born classifier on
cols_to_vectorise = ['sentence_id', 'token', 'pos', 'dep']

# Same recipe for both splits: select the feature columns, turn every row into a
# dict, and let the vectoriser encode the list of dicts.
X_train, X_test = (
    dict_vec.transform(frame[cols_to_vectorise].to_dict(orient='records'))
    for frame in (enc_train_df, enc_test_df)
)

In [None]:
# Display the loaded vectoriser's repr for inspection.
dict_vec

## NER with the Born Classifier

### (Digression) Does fine-tuning help?

We've seen that ensembling doesn't help much, so it's only natural to ask whether fine-tuning will.
The cells below investigate this question.

In [None]:
# a model trained from scratch on the new data
born_new = BornClassifier()
born_new.fit(X_train, y_ner_train)

# the model trained on the massive wikipedia data
born_trained = get_latest_model('clf')

# the fine-tuned model (the learning rate is set to an arbitrary value)
n_classes = len(mappings['ner_tag'])
born_fine = FineTunedBornClassifier(born_trained, n_classes=n_classes, learning_rate=0.8)
born_fine.fit(X_train, y_ner_train)

In [None]:
# Evaluate the from-scratch Born classifier on the held-out target entity.
y_pred_new = born_new.predict(X_test)

# Restrict the report to the NER tags seen during training. The label set must come from the
# NER targets (`y_ner_train`): `y_true` holds NER tags, so filtering by the NED targets
# (`y_train`) would select the wrong rows of the report.
print(classification_report(y_true=y_ner_test, y_pred=y_pred_new, labels=np.unique(y_ner_train), zero_division=0))

In [None]:
# Evaluate the pre-trained Born classifier on the held-out target entity.
y_pred_trained = born_trained.predict(X_test)

# Restrict the report to the NER tags seen during training. The label set must come from the
# NER targets (`y_ner_train`): `y_true` holds NER tags, so filtering by the NED targets
# (`y_train`) would select the wrong rows of the report.
print(classification_report(y_true=y_ner_test, y_pred=y_pred_trained, labels=np.unique(y_ner_train), zero_division=0))

The classification report above clearly motivates the use of a pre-trained model: because our current dataset is quite small, some NER tags have very little support and, on these NER tags, the pre-trained model does much better!

In [None]:
# Evaluate the fine-tuned Born classifier on the held-out target entity.
y_pred_fine = born_fine.predict(X_test)

# Restrict the report to the NER tags seen during training. The label set must come from the
# NER targets (`y_ner_train`): `y_true` holds NER tags, so filtering by the NED targets
# (`y_train`) would select the wrong rows of the report.
# `zero_division=0` matches the other two reports, so labels that are never predicted yield 0
# instead of emitting warnings.
print(classification_report(y_true=y_ner_test, y_pred=y_pred_fine, labels=np.unique(y_ner_train), zero_division=0))

Thus, from what we stated above, combining a small Born classifier with our pre-trained one should allow us to get the best of both worlds: good performance on the current data, because of the ex-novo Born classifier, and better generalisation due to the pre-trained model.

This is what the classification report above shows! (Note that the advantage might not always be so clear, because there is randomness each time the notebook is re-run. To see the difference better, I suggest increasing `train_size` in the `construct_traintest_dataframe` call above.)

### Fine-tuning the Born Classifier for NED

Now that we've seen that fine-tuning does, in fact, help, let's create the fine-tuned Born classifier and move on to NED.

In [None]:
# Number of NER classes, taken from the NER tag mapping.
n_classes = len(mappings['ner_tag'])

# Start again from a fresh copy of the latest pre-trained classifier ...
born_pretrained = get_latest_model('clf')

# ... and fine-tune it on the NER targets of the current training set.
born_finetuned = FineTunedBornClassifier(
    born_pretrained,
    n_classes=n_classes,
    learning_rate=0.4,
)
born_finetuned.fit(X_train, y_ner_train)

## NED with Multilayer Born

In [None]:
# we keep train_size small mostly so that this and the following cells don't consume excessive resources
# (even with train_size=2, this cell takes ~9m and training the model shows peaks of RAM usage touching ~30GB!)
X_mlp_train, y_mlp_train = get_mlp_data(X_train, y_train, born_finetuned)
X_mlp_test, y_mlp_test = get_mlp_data(X_test, y_test, born_finetuned)

In [None]:
# Mini-batches of 64 (features, integer class index) pairs, reshuffled every epoch.
dataset = TensorDataset(X_mlp_train, y_mlp_train.long())
dataloader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
)

# Network shape: input width -> 512 -> 64 -> number of classes.
input_size = X_mlp_train.shape[1] # ~600K (!)
# NOTE(review): n_classes was computed from the NER tag mapping; confirm it is also the
# right output width for the NED labels the model is trained on.
out_size = n_classes # 19

# we reduce dimension aggressively because the input vectors are sparse
# (TruncatedSVD results in an explained variance of ~0.99 with only 1000 features)
layer_sizes = [input_size, 512, 64, out_size]
model = MultilayerBornModel(layer_sizes)

# Optimisation set-up: cross-entropy objective minimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=0.01,
)

We can finally train the Multilayer Born model!

In [None]:
num_epochs = 4

# Loss of every single batch across all epochs (kept for later inspection).
batch_losses = []
for epoch in range(num_epochs):
    # Running sum of the batch losses of the current epoch.
    total_loss = 0

    batch_pbar = tqdm(
        dataloader,
        total=len(dataloader),
        desc=f"Epoch {epoch+1}, Batch",
        position=1,
        leave=False,
    )
    for batch_X, batch_y in batch_pbar:
        # Clear the gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()

        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the bookkeeping and the progress bar.
        loss_value = loss.item()
        total_loss += loss_value
        batch_losses.append(loss_value)
        batch_pbar.set_postfix(batch_loss=f'{loss_value:.4f}')
    display(batch_pbar)

    # Mean batch loss over the epoch.
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}] => Loss: {avg_loss:.4f}")

We could definitely benefit from more training...

Still, that is not the main point. More importantly, we need to save our progress!

In [None]:
# Timestamp (day-month-year, then time) that makes every saved checkpoint name unique.
timestamp = datetime.now().strftime("%d%m%Y-%H%M%S")

# Make sure the output directory exists: torch.save does not create missing parent
# directories and would otherwise fail on a fresh checkout.
runs_dir = "runs"
os.makedirs(runs_dir, exist_ok=True)

# Persist only the learned parameters (state dict), not the whole module object.
mlp_path = os.path.join(runs_dir, f"mlp_born_{timestamp}.pt")
torch.save(model.state_dict(), mlp_path)
print(f"[+] Multilayer Born saved to {mlp_path}.")

## Let's test it!

In [None]:
# Rebuild the network with exactly the architecture that was trained and saved:
# the input width must equal the number of feature columns (no extra `+ 1`), otherwise
# the saved weights do not fit the first layer and `X_mlp_test` cannot be fed through it.
input_size = X_mlp_train.shape[1]
out_size = n_classes
layer_sizes = [input_size, 512, 64, out_size]
model = MultilayerBornModel(layer_sizes)

# Restore the most recently saved parameters.
model.load_state_dict(get_latest_model("mlp"))

# Switch to inference mode before evaluating.
model.eval()

In [None]:
# Forward pass only; no gradients are needed for evaluation.
with torch.no_grad():
    y_pred_proba = model(X_mlp_test)

# Predicted class = index of the largest output per row.
y_pred_labels = torch.argmax(y_pred_proba, dim=1)

# convert tensors to numpy arrays for sklearn metrics
y_pred_proba_np = y_pred_proba.cpu().numpy()
y_pred_labels_np = y_pred_labels.cpu().numpy()
# The ground truth must be the tensor returned by `get_mlp_data` (`y_mlp_test`);
# `y_test` is a plain Python list and has no `.cpu()` method.
y_test_np = y_mlp_test.cpu().numpy()

accuracy = accuracy_score(y_test_np, y_pred_labels_np)
print(f"Accuracy: {accuracy:.4f}")

# The test split may not contain every class, so tell log_loss which class each output
# column stands for; without `labels` it raises when y_true has fewer classes than columns.
# NOTE(review): assumes class indices are 0..C-1 and that the model outputs are
# probabilities (rows summing to 1) — confirm against MultilayerBornModel.
all_labels = np.arange(y_pred_proba_np.shape[1])
logloss = log_loss(y_test_np, y_pred_proba_np, labels=all_labels)
print(f"Log Loss: {logloss:.4f}")

print("\nClassification report:")
# zero_division=0 keeps the report quiet for labels that are never predicted.
print(classification_report(y_test_np, y_pred_labels_np, zero_division=0))

# def predict_proba(model, X):
#     model.eval()
#     with torch.no_grad():
#         probabilities = model(X)
#     return probabilities

# def predict(model, X):
#     probabilities = predict_proba(model, X)
#     return torch.argmax(probabilities, dim=1)