## Entity Disambiguation with an all-Born pipeline.

We can finally tackle our target problem, i.e., entity disambiguation!

In [1]:
import os
import numpy as np
from datetime import datetime
import random
import itertools
from collections import Counter

from tqdm.notebook import tqdm
from IPython.display import display

from sklearn.metrics import classification_report
from scipy import sparse

import spacy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from bornrule import BornClassifier

from wiki_tools import get_data_from_snippets

from models import MultilayerBornModel, FineTunedBornClassifier
from utils import construct_traintest_dataframe, apply_categorical_mapping, get_mlp_data, get_latest_model, get_latest_vectoriser, get_latest_encmap

## Data processing

In [2]:
data_dict = get_data_from_snippets()

We first choose a target entity to disambiguate.

In [3]:
target_entity = random.choice(list(data_dict.keys()))
target_entity

'Fantastic'

Then we collect a set of entities for training the disambiguating Multilayer Born model.

In [4]:
nlp = spacy.load("en_core_web_sm")
# train_size is kept small to avoid excessive resourse utilisation
traintest_df = construct_traintest_dataframe(nlp, target_entity, data_dict, train_size=2)

In [5]:
traintest_df.sample(10)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
2690,3,was,AUX,auxpass,NONE,Fantastic,2005_film,1
2779,4,),PUNCT,punct,NONE,Fantastic,2005_film,1
2936,2,to,PART,aux,NONE,Fantastic,1994_TV_series,0
2216,1,to,PART,aux,NONE,Shape,Shape_Arts,0
147,1,were,AUX,auxpass,NONE,Greenland,Greenland_shark,1
2877,0,396,NUM,dobj,MONEY,Fantastic,1994_TV_series,0
3151,1,he,PRON,appos,NONE,Fantastic,1994_TV_series,0
542,1,of,ADP,prep,NONE,Greenland,Greenland_halibut,0
1722,0,for,ADP,prep,NONE,Shape,Shape_Arts,0
1896,1,formerly,ADV,advmod,NONE,Shape,Shape_Arts,0


(Note that the fact that we're carrying around a `ner_tag` column, which we will have to delete later, is an inefficiency of the current pipeline which could (should) be addressed sooner or later.)

Finally, we need to make the dataset ameanable for the Born models.

In [6]:
train_df = traintest_df[traintest_df['entity'] != target_entity].reset_index(drop=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2271 entries, 0 to 2270
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   sentence_id     2271 non-null   int64 
 1   token           2271 non-null   object
 2   pos             2271 non-null   object
 3   dep             2271 non-null   object
 4   ner_tag         2271 non-null   object
 5   entity          2271 non-null   object
 6   disambig_key    2271 non-null   object
 7   disambig_label  2271 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 142.1+ KB


In [7]:
test_df = traintest_df[traintest_df['entity'] == target_entity].reset_index(drop=True)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   sentence_id     1128 non-null   int64 
 1   token           1128 non-null   object
 2   pos             1128 non-null   object
 3   dep             1128 non-null   object
 4   ner_tag         1128 non-null   object
 5   entity          1128 non-null   object
 6   disambig_key    1128 non-null   object
 7   disambig_label  1128 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 70.6+ KB


In [8]:
mappings = get_latest_encmap()

In [9]:
enc_train_df = apply_categorical_mapping(train_df, mappings)
enc_train_df.sample(5)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
528,1,migrated,16,15,9,Greenland,Greenland_halibut,0
1345,0,paced,16,6,9,Shape,song,1
1417,2,their,10,36,9,Shape,song,1
1799,3,",",12,41,9,Shape,Shape_Arts,0
360,0,on,1,39,9,Greenland,Greenland_shark,1


In [10]:
enc_test_df = apply_categorical_mapping(test_df, mappings)
enc_test_df.sample(5)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
473,1,/,15,41,9,Fantastic,2005_film,1
317,0,he,10,29,9,Fantastic,2005_film,1
115,4,at,1,39,9,Fantastic,2005_film,1
478,1,and,4,12,9,Fantastic,2005_film,1
624,1,It,10,29,9,Fantastic,1994_TV_series,0


For the test set, we need to separate the disambiguations (to classify the documents).

In [11]:
cols = set(enc_test_df['disambig_key'])
enc_test_shards = [enc_test_df.loc[enc_test_df['disambig_key'] == col] for col in cols]

In [12]:
for df_shard in enc_test_shards:
    # to make printing a bit prettier (avoid newlines)
    cols_to_print = ['sentence_id', 'token', 'entity', 'disambig_key']
    print(df_shard[cols_to_print].sample(3))
    print()

      sentence_id   token     entity    disambig_key
762             0  prison  Fantastic  1994_TV_series
557             0   fussy  Fantastic  1994_TV_series
1005            0   based  Fantastic  1994_TV_series

     sentence_id token     entity disambig_key
509            4     .  Fantastic    2005_film
123            4  Four  Fantastic    2005_film
213            2    on  Fantastic    2005_film



We need to extract two targets: the targets for NER (for the Born Classifier), and the ones for NED (for the Multilayer Born model).

In [13]:
y_ner_train = enc_train_df['ner_tag'].to_list()
# we do not need to shard NER (we use the shards to reconstruct the labels anyway to maintain consistency)
y_ner_test = list(itertools.chain.from_iterable([shard['ner_tag'].to_list() for shard in enc_test_shards]))
y_train = enc_train_df['disambig_label'].to_list()
y_tests = [shard['disambig_label'].to_list() for shard in enc_test_shards]

enc_test_shards = [shard.drop('ner_tag', axis=1) for shard in enc_test_shards]
enc_test_shards = [shard.drop('disambig_label', axis=1) for shard in enc_test_shards]

In [14]:
dict_vec = get_latest_vectoriser()

# we need the output to be as expected, i.e., we should only vectorise the columns we pre-trained the born classifier on
cols_to_vectorise = ['sentence_id', 'token', 'pos', 'dep']

X_train = dict_vec.transform(enc_train_df[cols_to_vectorise].to_dict('records'))
X_tests = [dict_vec.transform(shard[cols_to_vectorise].to_dict('records')) for shard in enc_test_shards]

## NER with the Born Classifier

### (Digression) Does fine-tuning help?

We've seen that ensembling doesn't help much, so it's only natural to ask whether fine-tuning will.
The cells below investigate this question.

In [15]:
# a model trained from scratch on the new data
born_new = BornClassifier()
born_new.fit(X_train, y_ner_train)

# the model trained on the massive wikipedia data
born_trained = get_latest_model('clf')

# the fine-tuned model (the learning rate is set to an arbitrary value)
n_classes = len(mappings['ner_tag'])
born_fine = FineTunedBornClassifier(born_trained, n_classes=n_classes, learning_rate=0.8)
born_fine.fit(X_train, y_ner_train)

In [16]:
# we need to temporarily re-unite all the stacks
X_test = sparse.vstack(X_tests)

In [17]:
y_pred_new = born_new.predict(X_test)
print(classification_report(y_true=y_ner_test, y_pred=y_pred_new, labels=np.unique(y_train), zero_division=0))

              precision    recall  f1-score   support

           0       0.42      0.77      0.54        30
           1       0.30      0.08      0.12        39

   micro avg       0.40      0.38      0.39        69
   macro avg       0.36      0.42      0.33        69
weighted avg       0.35      0.38      0.30        69



In [18]:
y_pred_trained = born_trained.predict(X_test)
print(classification_report(y_true=y_ner_test, y_pred=y_pred_trained, labels=np.unique(y_train), zero_division=0))

              precision    recall  f1-score   support

           0       0.18      0.07      0.10        30
           1       0.90      0.46      0.61        39

   micro avg       0.65      0.29      0.40        69
   macro avg       0.54      0.26      0.35        69
weighted avg       0.59      0.29      0.39        69



The classification report above clearly motives the use of a pre-trained model: because our current dataset is quite small, we can have NER tags with very little support and, on these NER tags, the pre-trained model does much better!

In [19]:
y_pred_fine = born_fine.predict(X_test)
print(classification_report(y_true=y_ner_test, y_pred=y_pred_fine, labels=np.unique(y_train), zero_division=0))

              precision    recall  f1-score   support

           0       0.18      0.07      0.10        30
           1       0.90      0.46      0.61        39

   micro avg       0.65      0.29      0.40        69
   macro avg       0.54      0.26      0.35        69
weighted avg       0.59      0.29      0.39        69



Thus, from what we sated above, combining a small Born classifier with our pre-trained one should allow us to get the best of both worlds: good performance on the current data, because of the ex-novo Born classifier, and better generalisation due to the pre-trained model.

As the classification report above shows! (Note that the advantage might not be so clear all the time, because each time the notebook is re-run there is randomness. To better see the difference, I suggest increasing `train_size` in the `construct_traintest_dataframe` above.)

### Fine-tuning the Born Classifier for NED

Now that we've seen that fine-tuning does, in fact, help, let's create the fine-tuned Born classifier and move on to NED.

In [20]:
born_pretrained = get_latest_model('clf')

n_classes = len(mappings['ner_tag'])
born_finetuned = FineTunedBornClassifier(born_pretrained, n_classes=n_classes, learning_rate=0.4)
born_finetuned.fit(X_train, y_ner_train)

## NED with Multilayer Born

In [21]:
# we keep train_size small mostly so that this and the following cells don't consume excessive resources
# (even with train_size=2, this cell takes ~9m and training the model shows peaks of RAM usage touching ~30GB!)
X_mlp_train, y_mlp_train = get_mlp_data(X_train, y_train, born_finetuned)

In [22]:
# a rather elegant one-liner!
X_mlp_tests, y_mlp_tests = zip(*[get_mlp_data(X_test, y_test, born_finetuned) for X_test, y_test in zip(X_tests, y_tests)])

In [23]:
input_size = X_mlp_train.shape[1] # ~600K (!)
out_size = n_classes # 19

# we reduce dimension aggressively because the input vectors are sparse 
# (TruncatedSVD results in an explained variance of ~0.99 with only 1000 features)
layer_sizes = [input_size, 512, 64, out_size]
model = MultilayerBornModel(layer_sizes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

dataset = TensorDataset(X_mlp_train, y_mlp_train.long())
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

We can finally train the Multilayer Born model!

In [24]:
num_epochs = 4

batch_losses = []
for epoch in range(num_epochs):
    total_loss = 0

    batch_pbar = tqdm(dataloader, total=len(dataloader), desc=f"Epoch {epoch+1}, Batch", position=1, leave=False)
    for batch_X, batch_y in batch_pbar:
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        batch_losses.append(loss.item())
        
        batch_pbar.set_postfix({'batch_loss': f'{loss.item():.4f}'})
    display(batch_pbar)

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}] => Loss: {avg_loss:.4f}")

Epoch 1, Batch:   0%|          | 0/36 [00:00<?, ?it/s]

KeyboardInterrupt: 

We could definitely benefit from more training...

In [None]:
timestamp = datetime.now().strftime("%d%m%Y-%H%M%S")

mlp_path = os.path.join("runs", f"mlp_born_{timestamp}.pt")
torch.save(model.state_dict(), mlp_path)
print(f"[+] Multilayer Born saved to {mlp_path}.")

## Let's test it!

In [25]:
model = MultilayerBornModel(layer_sizes)
model.load_state_dict(get_latest_model("mlp"))
model.eval() # this is not quite necessary in our case, but it doesn't hurt either

MultilayerBornModel(
  (layers): ModuleList(
    (0-1): 2 x LogitsBornLayer(
      (born): Born()
    )
    (2): Born()
  )
)

In [50]:
for X_mlp_test, y_mlp_test in zip(X_mlp_tests, y_mlp_tests):
    # we evaluate on each document
    with torch.no_grad():
        y_pred_proba = model(X_mlp_test)
        y_pred_labels = torch.argmax(y_pred_proba, dim=1)
        pred_disambig = Counter(y_pred_labels).most_common(1)[0][0]
        
        print(f"True disambiguation: {int(y_mlp_test[0].item())}.")
        print(f"Predicted disambiguation: {pred_disambig}.\n")

True disambiguation: 0.
Predicted disambiguation: 1.

True disambiguation: 1.
Predicted disambiguation: 1.



In [49]:
# we might want to remind ourselves of what the numerical lables correspond to
target_entity, test_df[['disambig_key', 'disambig_label']].value_counts()

('Fantastic',
 disambig_key    disambig_label
 1994_TV_series  0                 600
 2005_film       1                 528
 Name: count, dtype: int64)

Some more thourough testing would be quite welcome, but the computational load (and the time constraints) are what they are...