## Entity Disambiguation with an all-Born pipeline.

We can finally tackle our target problem, i.e., entity disambiguation!

In [1]:
from itertools import islice
import os
import pandas as pd
import numpy as np
import scipy.sparse as sp
from datetime import datetime
import random

from sklearn.feature_extraction import DictVectorizer

import matplotlib.pyplot as plt 
import seaborn as sns

import spacy

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from bornrule import BornClassifier

from wiki_tools import get_data_from_snippets
from utils import LogitsBorn, process_doc, encode_categorical, get_mlp_data

## Data processing

In [2]:
def process_text_snips(nlp, d, entity=None, disambig_key=None):
    if isinstance(d, dict):
        dfs = []
        for k, v in d.items():
            if entity is None:
                # top level, so k is the entity name
                df = process_text_snips(nlp, v, entity=k)
            else:
                # inside an entity, so k is the disambig key
                df = process_text_snips(nlp, v, entity=entity, disambig_key=k)
            dfs.append(df)
        return pd.concat(dfs, ignore_index=True)
    elif isinstance(d, list):
        dfs = [process_text_snips(nlp, item, entity=entity, disambig_key=disambig_key) for item in d]
        return pd.concat(dfs, ignore_index=True)
    elif isinstance(d, str):
        doc = nlp(d)
        processed = process_doc(doc)
        df = pd.DataFrame(processed)
        df['entity'] = entity
        df['disambig_key'] = disambig_key
        return df
    else:
        return pd.DataFrame()

def construct_traintest_dataframe(nlp, target_entity, data_dict, train_size=50):
    n_disambigs_target = len(data_dict[target_entity])
    valid_ents = [
        ent for ent in data_dict.keys() 
        if (ent != target_entity) and (len(data_dict[ent]) >= n_disambigs_target)
    ]
    
    if train_size:
        # if a training size is specified, only choose enough entities to match it
        valid_ents = random.sample(valid_ents, min(train_size, len(valid_ents)))
    valid_ents.append(target_entity)  # include the target entity to process it
    
    train_dict = {}
    for ent in valid_ents:
        # we want the same number of disambiguations for each entity in the training set
        train_dict[ent] = dict(islice(data_dict[ent].items(), n_disambigs_target))
    
    df = process_text_snips(nlp, train_dict)
    
    disambig_index_map = {
        entity: {key: idx for idx, key in enumerate(sorted(df[df['entity'] == entity]['disambig_key'].unique()))}
        for entity in df['entity'].unique()
    }
    
    df['disambig_label'] = df.apply(lambda row: disambig_index_map[row['entity']][row['disambig_key']], axis=1)
    
    return df

In [3]:
data_dict = get_data_from_snippets()

We first choose a target entity to disambiguate.

In [4]:
target_entity = random.choice(list(data_dict.keys()))
target_entity

'Faith'

Then we collect a set of entities for training the disambiguating Born MLP.

In [5]:
nlp = spacy.load("en_core_web_sm")
traintest_df = construct_traintest_dataframe(nlp, target_entity, data_dict)

In [6]:
traintest_df.sample(10)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
12576,3,years,NOUN,pobj,DATE,Star,magazine,4
12991,3,father,NOUN,nsubj,NONE,Star,2024_Indian_film,1
21210,3,via,ADP,prep,NONE,Sanctuary,Canadian_TV_series,1
128924,3,"""",PUNCT,punct,NONE,Death,Marvel_Comics,3
87517,3,and,CCONJ,cc,NONE,Flower,Meerkat_Manor,3
95736,3,\n,SPACE,dep,NONE,Somerset,Somerset_West,4
33993,3,"""",PUNCT,punct,NONE,Napoleon,card_game,3
96596,1,to,ADP,prep,NONE,Utopia,Björk_album,1
1871,0,",",PUNCT,punct,NONE,Vulcan,mythology,4
4967,0,-,PUNCT,punct,DATE,Poison,Wooding_novel,3


In [7]:
test_df = traintest_df[traintest_df['entity'] == target_entity].reset_index(drop=True)
test_df.head(4)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
0,0,A,DET,det,NONE,Faith,Pop_Smoke_album,2
1,0,trailer,NOUN,compound,NONE,Faith,Pop_Smoke_album,2
2,0,video,NOUN,nsubjpass,NONE,Faith,Pop_Smoke_album,2
3,0,for,ADP,prep,NONE,Faith,Pop_Smoke_album,2


In [8]:
train_df = traintest_df[traintest_df['entity'] != target_entity].reset_index(drop=True)

In [9]:
cols_to_encode = set(train_df.columns) - {'sentence_id', 'token', 'entity', 'disambig_key', 'disambig_label'}
enc_train_df, mappings = encode_categorical(train_df, cols_to_encode)
enc_train_df.sample(5)

Unnamed: 0,entity,disambig_label,disambig_key,token,sentence_id,ner_tag,dep,pos
7215,Manchester,0,2015–,formed,3,9,6,16
105907,Pluto,3,astrology,major,3,9,6,0
115317,Chaos,3,genus,",",0,9,41,12
5192,Poison,3,Wooding_novel,She,3,9,29,10
43968,Vega,2,radio_network,survey,3,9,35,7


Becuase we already have the mappings, we can save some computation by simply re-using them (which we'd have to do anyhow to make sure to keep consistency between train and test data)!

In [10]:
def apply_categorical_mapping(prepared_df, mappings):
    encoded_df = prepared_df.copy()
    for col, mapping in mappings.items():
        value_to_index = {value: index for index, value in enumerate(mapping)}
        encoded_df[col] = prepared_df[col].map(value_to_index)
        encoded_df[col] = encoded_df[col].fillna(len(mapping))
        encoded_df[col] = encoded_df[col].astype(np.int8)
    return encoded_df

In [11]:
enc_test_df = apply_categorical_mapping(test_df, mappings)
enc_test_df.sample(5)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key,disambig_label
1119,2,Faith,17,35,9,Faith,George_Michael_album,1
592,3,held,16,1,9,Faith,George_Michael_album,1
1226,2,Train,11,35,18,Faith,Galantis_and_Dolly_Parton_song,0
929,0,"""",12,41,9,Faith,George_Michael_album,1
1378,2,experience,7,8,9,Faith,Galantis_and_Dolly_Parton_song,0


Now we need to vectorise and set the `disambig_label` column as target, and then we can get to training.

In [12]:
y_ner_train = enc_train_df['ner_tag'].to_list()
y_ner_test = enc_test_df['ner_tag'].to_list()
enc_train_df.drop('ner_tag', axis=1)
enc_test_df.drop('ner_tag', axis=1)

y_train = enc_train_df['disambig_label'].to_list()
y_test = enc_test_df['disambig_label'].to_list()
enc_train_df.drop('disambig_label', axis=1)
enc_test_df.drop('disambig_label', axis=1)

Unnamed: 0,sentence_id,token,pos,dep,ner_tag,entity,disambig_key
0,0,A,5,20,9,Faith,Pop_Smoke_album
1,0,trailer,7,14,9,Faith,Pop_Smoke_album
2,0,video,7,30,9,Faith,Pop_Smoke_album
3,0,for,1,39,9,Faith,Pop_Smoke_album
4,0,Faith,11,35,12,Faith,Pop_Smoke_album
...,...,...,...,...,...,...,...
2841,3,",",12,41,9,Faith,The_Cure_album
2842,3,in,1,39,9,Faith,The_Cure_album
2843,3,the,5,20,9,Faith,The_Cure_album
2844,3,fog,7,35,9,Faith,The_Cure_album


In [13]:
dict_vec = DictVectorizer()

# we need the output to be as expected, i.e., we should only vectorise the columns we pre-trained the born classifier on
cols_to_vectorise = ['sentence_id', 'token', 'pos', 'dep']

X_train = dict_vec.fit_transform(enc_train_df[cols_to_vectorise].to_dict('records'))
X_test = dict_vec.transform(enc_test_df[cols_to_vectorise].to_dict('records'))

In [14]:
sp.vstack([X_train, X_test])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 569573 stored elements and shape (142464, 18338)>

## Training

In [15]:
# we do not use the pretrained model because
# 1) quite a few NER tags are missing, and the dimensionality is wrong
# 2) I've tried to do so in a number of different ways, but simply retraining a born classifier on the new data worked better every time...
born_clf = BornClassifier()
born_clf.fit(X_train, y_ner_train)

In [16]:
features = dict_vec.get_feature_names_out()
X_mlp_train, y_mlp_train = get_mlp_data(X_train, y_train, born_clf, features, mappings)
X_mlp_test, y_mlp_test = get_mlp_data(X_test, y_test, born_clf, features, mappings)

We now have everything we need to train the Born MLP!

In [35]:
input_size = len(X_mlp_train[0])
out_size = len(np.unique(y_train))

layer_sizes = [input_size, 512, 128, 32, out_size]
model = LogitsBorn(layer_sizes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

dataset = TensorDataset(X_mlp_train, y_mlp_train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [36]:
model

LogitsBorn(
  (layers): ModuleList(
    (0-2): 3 x LogitsBornLayer(
      (born): Born()
    )
    (3): Born()
  )
)

In [37]:
num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0
    for batch_X, batch_y in dataloader:
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# predictions = model(new_data)

Epoch [1/10], Loss: 1.6094
Epoch [2/10], Loss: 1.6088
Epoch [3/10], Loss: 1.5489
Epoch [4/10], Loss: 1.1256
Epoch [5/10], Loss: 1.0775
Epoch [6/10], Loss: 0.9976
Epoch [7/10], Loss: 0.9676
Epoch [8/10], Loss: 0.9265
Epoch [9/10], Loss: 0.9206
Epoch [10/10], Loss: 0.9170
