## Entity Disambiguation with an all-Born pipeline.

We can finally tackle our target problem, i.e., entity disambiguation!

In [1]:
import os
import numpy as np
from datetime import datetime
import random
import itertools
from collections import Counter

from tqdm.notebook import tqdm
from IPython.display import display

from sklearn.metrics import classification_report
from scipy import sparse

import spacy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from bornrule import BornClassifier

from wiki_tools import get_data_from_snippets

from models import MultilayerBornModel, FineTunedBornClassifier
from utils import construct_traintest_dataframe, apply_categorical_mapping, get_mlp_data, get_latest_model, get_latest_vectoriser, get_latest_encmap

## Data processing

In [2]:
data_dict = get_data_from_snippets()

We first choose a target entity to disambiguate.

In [3]:
target_entity = random.choice(list(data_dict.keys()))
target_entity

'Blade'

Then we collect a set of entities for training the disambiguating Multilayer Born model.

In [4]:
nlp = spacy.load("en_core_web_sm")
# train_size is kept small to avoid excessive resourse utilisation
traintest_df = construct_traintest_dataframe(nlp, target_entity, data_dict, train_size=50)

In [5]:
traintest_df.sample(10)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
134856,3,unlike,ADP,prep,NONE,Destiny,video_game_series,4
98633,0,Gamow,PROPN,conj,PERSON,Washington,George_Washington_University,0
34877,0,guitar,NOUN,npadvmod,NONE,Boston,band,2
11581,1,Dark,PROPN,pobj,NONE,Charlotte,singer,4
37963,4,in,ADP,prep,NONE,Daytona,Daytona_500,1
21078,0,the,DET,det,NONE,Death,Castlevania,0
69034,3,vocals,NOUN,conj,NONE,Faith,The_Cure_album,4
67550,1,white,ADJ,amod,NONE,Faith,George_Michael_album,1
36995,0,the,DET,det,NONE,Daytona,Daytona_200,0
109552,0,it,PRON,nsubjpass,NONE,Genesis,cryptocurrency_company,4


(Note that the fact that we're carrying around a `ner_tag` column, which we will have to delete later, is an inefficiency of the current pipeline which could (should) be addressed sooner or later.)

Finally, we need to make the dataset ameanable for the Born models.

In [6]:
train_df = traintest_df[traintest_df['entity'] != target_entity].reset_index(drop=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140300 entries, 0 to 140299
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   sentence_id     140300 non-null  int64 
 1   token           140300 non-null  object
 2   pos             140300 non-null  object
 3   dep             140300 non-null  object
 4   ner_tag         140300 non-null  object
 5   entity          140300 non-null  object
 6   disambig_key    140300 non-null  object
 7   disambig_label  140300 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 8.6+ MB


In [7]:
test_df = traintest_df[traintest_df['entity'] == target_entity].reset_index(drop=True)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2888 entries, 0 to 2887
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   sentence_id     2888 non-null   int64 
 1   token           2888 non-null   object
 2   pos             2888 non-null   object
 3   dep             2888 non-null   object
 4   ner_tag         2888 non-null   object
 5   entity          2888 non-null   object
 6   disambig_key    2888 non-null   object
 7   disambig_label  2888 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 180.6+ KB


In [8]:
mappings = get_latest_encmap()

In [9]:
enc_train_df = apply_categorical_mapping(train_df, mappings)
enc_train_df.sample(5)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
122764,1,the,5,20,9,Lucifer,Shinee_song,3
59320,1,-,12,41,4,Z,military_symbol,3
82779,2,Boulevard,11,35,3,A,New_York_City_Subway_service,1
120285,3,launch,16,1,9,Aston,Aston_University,4
76681,1,series,7,29,9,X,manga,3


In [10]:
enc_test_df = apply_categorical_mapping(test_df, mappings)
enc_test_df.sample(5)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
905,1,during,1,39,9,Blade,Marvel_Cinematic_Universe,1
1129,1,unexpectedly,2,4,9,Blade,character,4
722,0,though,13,24,9,Blade,Marvel_Cinematic_Universe,1
2087,3,survive,16,13,9,Blade,New_Line_franchise_character,2
474,2,During,1,39,9,Blade,Marvel_Cinematic_Universe,1


For the test set, we need to separate the disambiguations (to classify the documents).

In [11]:
cols = set(enc_test_df['disambig_key'])
enc_test_shards = [enc_test_df.loc[enc_test_df['disambig_key'] == col] for col in cols]

In [12]:
for df_shard in enc_test_shards:
    # to make printing a bit prettier (avoid newlines)
    cols_to_print = ['sentence_id', 'token', 'entity', 'disambig_key']
    print(df_shard[cols_to_print].sample(3))
    print()

      sentence_id token entity disambig_key
1767            2     -  Blade     Blade_II
1776            2     $  Blade     Blade_II
1525            0  Note  Blade     Blade_II

     sentence_id   token entity   disambig_key
409            3    with  Blade  Puppet_Master
137            1   woman  Blade  Puppet_Master
313            0  killed  Blade  Puppet_Master

     sentence_id   token entity               disambig_key
921            1  Barton  Blade  Marvel_Cinematic_Universe
726            0      be  Blade  Marvel_Cinematic_Universe
588            3   Blade  Blade  Marvel_Cinematic_Universe

      sentence_id     token entity                  disambig_key
2371            0     black  Blade  New_Line_franchise_character
2582            0  official  Blade  New_Line_franchise_character
2788            1      well  Blade  New_Line_franchise_character

      sentence_id token entity disambig_key
1089            3    II  Blade    character
1305            0    's  Blade    character
1313

We need to extract two targets: the targets for NER (for the Born Classifier), and the ones for NED (for the Multilayer Born model).

In [13]:
y_ner_train = enc_train_df['ner_tag'].to_list()
# we do not need to shard NER (we use the shards to reconstruct the labels anyway to maintain consistency)
y_ner_test = list(itertools.chain.from_iterable([shard['ner_tag'].to_list() for shard in enc_test_shards]))
y_train = enc_train_df['disambig_label'].to_list()
y_tests = [shard['disambig_label'].to_list() for shard in enc_test_shards]

enc_test_shards = [shard.drop('ner_tag', axis=1) for shard in enc_test_shards]
enc_test_shards = [shard.drop('disambig_label', axis=1) for shard in enc_test_shards]

In [14]:
dict_vec = get_latest_vectoriser()

# we need the output to be as expected, i.e., we should only vectorise the columns we pre-trained the born classifier on
cols_to_vectorise = ['sentence_id', 'token', 'pos', 'dep']

X_train = dict_vec.transform(enc_train_df[cols_to_vectorise].to_dict('records'))
X_tests = [dict_vec.transform(shard[cols_to_vectorise].to_dict('records')) for shard in enc_test_shards]

## NER with the Born Classifier

### (Digression) Does fine-tuning help?

We've seen that ensembling doesn't help much, so it's only natural to ask whether fine-tuning will.
The cells below investigate this question.

In [20]:
# a model trained from scratch on the new data
born_new = BornClassifier()
born_new.fit(X_train, y_ner_train)

# the model trained on the massive wikipedia data
born_trained = get_latest_model('clf')

# the fine-tuned model (the learning rate is set to an arbitrary value)
n_classes = len(mappings['ner_tag'])
born_fine = FineTunedBornClassifier(born_trained, n_classes=n_classes, learning_rate=0.83)
born_fine.fit(X_train, y_ner_train)

In [21]:
# we need to temporarily re-unite all the stacks
X_test = sparse.vstack(X_tests)

In [22]:
y_pred_new = born_new.predict(X_test)
print(classification_report(y_true=y_ner_test, y_pred=y_pred_new, labels=np.unique(y_train), zero_division=0))

              precision    recall  f1-score   support

           0       0.56      0.67      0.61        21
           1       0.59      0.49      0.53        68
           2       0.02      1.00      0.04         4
           3       0.18      0.27      0.22        15
           4       0.54      0.47      0.50        30

   micro avg       0.22      0.50      0.30       138
   macro avg       0.38      0.58      0.38       138
weighted avg       0.51      0.50      0.49       138



In [23]:
y_pred_trained = born_trained.predict(X_test)
print(classification_report(y_true=y_ner_test, y_pred=y_pred_trained, labels=np.unique(y_train), zero_division=0))

              precision    recall  f1-score   support

           0       0.39      0.62      0.48        21
           1       0.94      0.44      0.60        68
           2       0.09      0.75      0.15         4
           3       0.20      0.07      0.10        15
           4       0.74      0.47      0.57        30

   micro avg       0.49      0.44      0.47       138
   macro avg       0.47      0.47      0.38       138
weighted avg       0.71      0.44      0.51       138



The classification report above clearly motives the use of a pre-trained model: because our current dataset is quite small, we can have NER tags with very little support and, on these NER tags, the pre-trained model does much better!

In [24]:
y_pred_fine = born_fine.predict(X_test)
print(classification_report(y_true=y_ner_test, y_pred=y_pred_fine, labels=np.unique(y_train), zero_division=0))

              precision    recall  f1-score   support

           0       0.43      0.62      0.51        21
           1       0.88      0.44      0.59        68
           2       0.09      0.75      0.15         4
           3       0.33      0.13      0.19        15
           4       0.74      0.47      0.57        30

   micro avg       0.50      0.45      0.47       138
   macro avg       0.49      0.48      0.40       138
weighted avg       0.70      0.45      0.52       138



Thus, from what we sated above, combining a small Born classifier with our pre-trained one should allow us to get the best of both worlds: good performance on the current data, because of the ex-novo Born classifier, and better generalisation due to the pre-trained model.

As the classification report above shows! (Note that the advantage might not be so clear all the time, because each time the notebook is re-run there is randomness. To better see the difference, I suggest increasing `train_size` in the `construct_traintest_dataframe` above.)

### Fine-tuning the Born Classifier for NED

Now that we've seen that fine-tuning does, in fact, help, let's create the fine-tuned Born classifier and move on to NED.

In [20]:
born_pretrained = get_latest_model('clf')

n_classes = len(mappings['ner_tag'])
born_finetuned = FineTunedBornClassifier(born_pretrained, n_classes=n_classes, learning_rate=0.4)
born_finetuned.fit(X_train, y_ner_train)

## NED with Multilayer Born

In [21]:
# we keep train_size small mostly so that this and the following cells don't consume excessive resources
# (even with train_size=2, this cell and the one below take ~9m and training the model shows peaks of RAM usage touching ~30GB!)
X_mlp_train, y_mlp_train = get_mlp_data(X_train, y_train, born_finetuned)

In [22]:
# a rather elegant one-liner!
X_mlp_tests, y_mlp_tests = zip(*[get_mlp_data(X_test, y_test, born_finetuned) for X_test, y_test in zip(X_tests, y_tests)])

In [23]:
input_size = X_mlp_train.shape[1] # ~600K (!)
out_size = n_classes # 19

# we reduce dimension aggressively because the input vectors are sparse 
# (TruncatedSVD results in an explained variance of ~0.99 with only 1000 features)
layer_sizes = [input_size, 512, 64, out_size]
model = MultilayerBornModel(layer_sizes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

dataset = TensorDataset(X_mlp_train, y_mlp_train.long())
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

We can finally train the Multilayer Born model!

In [24]:
num_epochs = 4

batch_losses = []
for epoch in range(num_epochs):
    total_loss = 0

    batch_pbar = tqdm(dataloader, total=len(dataloader), desc=f"Epoch {epoch+1}, Batch", position=1, leave=False)
    for batch_X, batch_y in batch_pbar:
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        batch_losses.append(loss.item())
        
        batch_pbar.set_postfix({'batch_loss': f'{loss.item():.4f}'})
    display(batch_pbar)

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}] => Loss: {avg_loss:.4f}")

Epoch 1, Batch:   0%|          | 0/36 [00:00<?, ?it/s]

KeyboardInterrupt: 

We could definitely benefit from more training...

In [None]:
timestamp = datetime.now().strftime("%d%m%Y-%H%M%S")

mlp_path = os.path.join("runs", f"mlp_born_{timestamp}.pt")
torch.save(model.state_dict(), mlp_path)
print(f"[+] Multilayer Born saved to {mlp_path}.")

## Let's test it!

In [25]:
model = MultilayerBornModel(layer_sizes)
model.load_state_dict(get_latest_model("mlp"))
model.eval() # this is not quite necessary in our case, but it doesn't hurt either

MultilayerBornModel(
  (layers): ModuleList(
    (0-1): 2 x LogitsBornLayer(
      (born): Born()
    )
    (2): Born()
  )
)

In [50]:
for X_mlp_test, y_mlp_test in zip(X_mlp_tests, y_mlp_tests):
    # we evaluate on each document
    with torch.no_grad():
        y_pred_proba = model(X_mlp_test)
        y_pred_labels = torch.argmax(y_pred_proba, dim=1)
        pred_disambig = Counter(y_pred_labels).most_common(1)[0][0]
        
        print(f"True disambiguation: {int(y_mlp_test[0].item())}.")
        print(f"Predicted disambiguation: {pred_disambig}.\n")

True disambiguation: 0.
Predicted disambiguation: 1.

True disambiguation: 1.
Predicted disambiguation: 1.



In [49]:
# we might want to remind ourselves of what the numerical lables correspond to
target_entity, test_df[['disambig_key', 'disambig_label']].value_counts()

('Fantastic',
 disambig_key    disambig_label
 1994_TV_series  0                 600
 2005_film       1                 528
 Name: count, dtype: int64)

Some more thourough testing would be quite welcome, but the computational load (and the time constraints) are what they are...