## Entity Disambiguation with an all-Born pipeline.

We can finally tackle our target problem, i.e., entity disambiguation!

In [25]:
import os
import numpy as np
from datetime import datetime
import random

from tqdm.notebook import tqdm
from IPython.display import display

from sklearn.metrics import classification_report

import spacy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from bornrule import BornClassifier

from wiki_tools import get_data_from_snippets

from models import MultilayerBornModel, FineTunedBornClassifier
from utils import construct_traintest_dataframe, apply_categorical_mapping, get_mlp_data, get_latest_model, get_latest_vectoriser, get_latest_encmap

## Data processing

In [2]:
data_dict = get_data_from_snippets()

We first choose a target entity to disambiguate.

In [3]:
target_entity = random.choice(list(data_dict.keys()))
target_entity

'Illyria'

Then we collect a set of entities for training the disambiguating Multilayer Born model.

In [4]:
nlp = spacy.load("en_core_web_sm")
# train_size is kept small to avoid excessive resourse utilisation
traintest_df = construct_traintest_dataframe(nlp, target_entity, data_dict, train_size=2)

In [5]:
traintest_df.sample(10)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
109,3,and,CCONJ,cc,NONE,Pile,singer,1
1308,2,the,DET,det,NONE,Tribe,1999_TV_series,0
2719,2,be,AUX,auxpass,NONE,Illyria,Angel,1
479,1,Web,NOUN,oprd,NONE,Pile,band,0
38,2,!,PUNCT,punct,NONE,Pile,singer,1
2060,2,in,ADP,prep,NONE,Illyria,1816–49,0
2006,0,mentioned,VERB,ROOT,NONE,Illyria,1816–49,0
2259,7,the,DET,det,NONE,Illyria,1816–49,0
1131,2,Cloud,PROPN,pobj,NONE,Tribe,1999_TV_series,0
897,0,compilation,NOUN,compound,NONE,Pile,band,0


Finally, we need to make the dataset ameanable for the Born models.

In [6]:
train_df = traintest_df[traintest_df['entity'] != target_entity].reset_index(drop=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   sentence_id     2000 non-null   int64 
 1   token           2000 non-null   object
 2   pos             2000 non-null   object
 3   dep             2000 non-null   object
 4   ner_tag         2000 non-null   object
 5   entity          2000 non-null   object
 6   disambig_key    2000 non-null   object
 7   disambig_label  2000 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 125.1+ KB


In [7]:
test_df = traintest_df[traintest_df['entity'] == target_entity].reset_index(drop=True)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   sentence_id     1218 non-null   int64 
 1   token           1218 non-null   object
 2   pos             1218 non-null   object
 3   dep             1218 non-null   object
 4   ner_tag         1218 non-null   object
 5   entity          1218 non-null   object
 6   disambig_key    1218 non-null   object
 7   disambig_label  1218 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 76.3+ KB


In [8]:
mappings = get_latest_encmap()

In [9]:
enc_train_df = apply_categorical_mapping(train_df, mappings)
enc_train_df.sample(5)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
1557,1,him,10,21,9,Tribe,comics,1
413,2,Yazawa,11,7,14,Pile,singer,1
651,3,On,1,39,9,Pile,band,0
1393,1,arrives,16,0,9,Tribe,comics,1
908,1,On,1,39,9,Pile,band,0


In [10]:
enc_test_df = apply_categorical_mapping(test_df, mappings)
enc_test_df.sample(5)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
166,6,Illyrien,11,15,12,Illyria,1816–49,0
918,2,series,7,36,9,Illyria,Angel,1
705,2,",",12,41,9,Illyria,Angel,1
522,2,Slovenian,0,6,10,Illyria,1816–49,0
673,0,\n,14,19,9,Illyria,Angel,1


Notice that we extract two targets: the targets for NER (for the Born Classifier), and the ones for NED (for the Multilayer Born model).

In [11]:
y_ner_train = enc_train_df['ner_tag'].to_list()
y_ner_test = enc_test_df['ner_tag'].to_list()
enc_train_df.drop('ner_tag', axis=1)
enc_test_df.drop('ner_tag', axis=1)

y_train = enc_train_df['disambig_label'].to_list()
y_test = enc_test_df['disambig_label'].to_list()
enc_train_df.drop('disambig_label', axis=1)
enc_test_df.drop('disambig_label', axis=1); # to suppress the output

In [12]:
dict_vec = get_latest_vectoriser()

# we need the output to be as expected, i.e., we should only vectorise the columns we pre-trained the born classifier on
cols_to_vectorise = ['sentence_id', 'token', 'pos', 'dep']

X_train = dict_vec.transform(enc_train_df[cols_to_vectorise].to_dict('records'))
X_test = dict_vec.transform(enc_test_df[cols_to_vectorise].to_dict('records'))

## NER with the Born Classifier

### (Digression) Does fine-tuning help?

We've seen that ensembling doesn't help much, so it's only natural to ask whether fine-tuning will.
The cells below investigate this question.

In [13]:
# a model trained from scratch on the new data
born_new = BornClassifier()
born_new.fit(X_train, y_ner_train)

# the model trained on the massive wikipedia data
born_trained = get_latest_model('clf')

# the fine-tuned model (the learning rate is set to an arbitrary value)
n_classes = len(mappings['ner_tag'])
born_fine = FineTunedBornClassifier(born_trained, n_classes=n_classes, learning_rate=0.8)
born_fine.fit(X_train, y_ner_train)

In [14]:
y_pred_new = born_new.predict(X_test)
print(classification_report(y_true=y_ner_test, y_pred=y_pred_new, labels=np.unique(y_train), zero_division=0))

              precision    recall  f1-score   support

           0       0.75      0.60      0.67         5
           1       0.00      0.00      0.00        33

   micro avg       0.60      0.08      0.14        38
   macro avg       0.38      0.30      0.33        38
weighted avg       0.10      0.08      0.09        38



In [15]:
y_pred_trained = born_trained.predict(X_test)
print(classification_report(y_true=y_ner_test, y_pred=y_pred_trained, labels=np.unique(y_train), zero_division=0))

              precision    recall  f1-score   support

           0       0.50      0.60      0.55         5
           1       0.77      0.30      0.43        33

   micro avg       0.68      0.34      0.46        38
   macro avg       0.63      0.45      0.49        38
weighted avg       0.73      0.34      0.45        38



The classification report above clearly motives the use of a pre-trained model: because our current dataset is quite small, we can have NER tags with very little support and, on these NER tags, the pre-trained model does much better!

In [16]:
y_pred_fine = born_fine.predict(X_test)
print(classification_report(y_true=y_ner_test, y_pred=y_pred_fine, labels=np.unique(y_train)))

              precision    recall  f1-score   support

           0       0.50      0.60      0.55         5
           1       0.77      0.30      0.43        33

   micro avg       0.68      0.34      0.46        38
   macro avg       0.63      0.45      0.49        38
weighted avg       0.73      0.34      0.45        38



Thus, from what we sated above, combining a small Born classifier with our pre-trained one should allow us to get the best of both worlds: good performance on the current data, because of the ex-novo Born classifier, and better generalisation due to the pre-trained model.

As the classification report above shows! (Note that the advantage might not be so clear all the time, because each time the notebook is re-run there is randomness. To better see the difference, I suggest increasing `train_size` in the `construct_traintest_dataframe` above.)

### Fine-tuning the Born Classifier for NED

Now that we've seen that fine-tuning does, in fact, help, let's create the fine-tuned Born classifier and move on to NED.

In [17]:
born_pretrained = get_latest_model('clf')

n_classes = len(mappings['ner_tag'])
born_finetuned = FineTunedBornClassifier(born_pretrained, n_classes=n_classes, learning_rate=0.4)
born_finetuned.fit(X_train, y_ner_train)

## NED with Multilayer Born

In [18]:
# we keep train_size small mostly so that this and the following cells don't consume excessive resources
# (even with train_size=2, this cell takes ~9m to complete and training the model shows peaks of RAM usage touching ~30GB!)
X_mlp_train, y_mlp_train = get_mlp_data(X_train, y_train, born_finetuned)
X_mlp_test, y_mlp_test = get_mlp_data(X_test, y_test, born_finetuned)

In [22]:
input_size = X_mlp_train.shape[1] # ~600K (!)
out_size = n_classes # 19

# we reduce dimension aggressively because the input vectors are sparse 
# (TruncatedSVD results in an explained variance of ~0.99 with only 1000 features)
layer_sizes = [input_size, 512, 64, out_size]
model = MultilayerBornModel(layer_sizes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

dataset = TensorDataset(X_mlp_train, y_mlp_train.long())
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

We can finally train the Multilayer Born model!

In [23]:
num_epochs = 4

batch_losses = []
for epoch in range(num_epochs):
    total_loss = 0

    batch_pbar = tqdm(dataloader, total=len(dataloader), desc=f"Epoch {epoch+1}, Batch", position=1, leave=False)
    for batch_X, batch_y in batch_pbar:
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        batch_losses.append(loss.item())
        
        batch_pbar.set_postfix({'batch_loss': f'{loss.item():.4f}'})
    display(batch_pbar)

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}] => Loss: {avg_loss:.4f}")

Epoch 1, Batch:   0%|          | 0/32 [00:00<?, ?it/s]

<tqdm.notebook.tqdm_notebook at 0x171ba6f90>

Epoch [1/4] => Loss: 2.5333


Epoch 2, Batch:   0%|          | 0/32 [00:00<?, ?it/s]

<tqdm.notebook.tqdm_notebook at 0x171ba51f0>

Epoch [2/4] => Loss: 2.4956


Epoch 3, Batch:   0%|          | 0/32 [00:00<?, ?it/s]

<tqdm.notebook.tqdm_notebook at 0x171ba48f0>

Epoch [3/4] => Loss: 2.4950


Epoch 4, Batch:   0%|          | 0/32 [00:00<?, ?it/s]

<tqdm.notebook.tqdm_notebook at 0x15f4aaf90>

Epoch [4/4] => Loss: 2.4925


We could definitely benefit from more training...

Still, that is not the main point. More importantly, we need to save our progress!

In [27]:
timestamp = datetime.now().strftime("%d%m%Y-%H%M%S")

mlp_path = os.path.join("runs", f"mlp_born_{timestamp}.pt")
torch.save(model, mlp_path)
print(f"[+] Multilayer Born saved to {mlp_path}.")

[+] Multilayer Born saved to runs/mlp_born_28082024-165213.
