## Entity Disambiguation with an all-Born pipeline.

We can finally tackle our target problem, i.e., entity disambiguation!

In [1]:
from itertools import islice
import os
import pandas as pd
import numpy as np
from datetime import datetime
import random

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt 
import seaborn as sns

import spacy

from wiki_tools import get_data_from_snippets
from utils import get_latest_model, process_doc, LogitsBorn, encode_categorical

## Data processing

In [2]:
import pandas as pd
from itertools import islice
import random

def process_text_snips(nlp, d, entity=None, disambig_key=None):
    """Recursively flatten nested text snippets into a single DataFrame.

    The container is walked depth-first. At the top level (``entity is None``)
    dict keys are taken as entity names; inside an entity, dict keys are taken
    as disambiguation keys. Lists are walked element by element. Every string
    leaf is passed through ``nlp`` and then ``process_doc``, and the resulting
    rows are tagged with the ``entity`` / ``disambig_key`` they were found under.

    Args:
        nlp: Callable applied to each string leaf before ``process_doc``.
        d: A dict, list or str to process; any other type contributes no rows.
        entity: Entity name inherited from the enclosing dict level, if any.
        disambig_key: Disambiguation key inherited from the enclosing level, if any.

    Returns:
        pandas.DataFrame built from the ``process_doc`` output of every string
        leaf, with ``entity`` and ``disambig_key`` columns added. An empty
        DataFrame is returned when ``d`` is an empty container or an
        unsupported type.
    """
    if isinstance(d, str):
        df = pd.DataFrame(process_doc(nlp(d)))
        df['entity'] = entity
        df['disambig_key'] = disambig_key
        return df

    if isinstance(d, dict):
        if entity is None:
            # top level, so each key is the entity name
            dfs = [process_text_snips(nlp, v, entity=k) for k, v in d.items()]
        else:
            # inside an entity, so each key is the disambig key
            dfs = [
                process_text_snips(nlp, v, entity=entity, disambig_key=k)
                for k, v in d.items()
            ]
    elif isinstance(d, list):
        dfs = [
            process_text_snips(nlp, item, entity=entity, disambig_key=disambig_key)
            for item in d
        ]
    else:
        return pd.DataFrame()

    # pd.concat raises ValueError on an empty sequence, so an empty dict/list
    # is mapped to an empty frame, just like an unsupported leaf type.
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)

def construct_traintest_dataframe(nlp, target_entity, data_dict, train_size=50, seed=None):
    """Build one DataFrame holding the target entity plus sampled training entities.

    Only entities with at least as many disambiguations as the target are
    eligible, and every selected entity is truncated to the first
    ``len(data_dict[target_entity])`` disambiguations so that all entities
    share the same number of classes.

    Args:
        nlp: Callable forwarded to ``process_text_snips`` for each text snippet.
        target_entity: Key of ``data_dict`` naming the entity to disambiguate.
        data_dict: Dict mapping entity name -> dict of disambiguation key -> snippets.
        train_size: Maximum number of non-target entities to sample. A falsy
            value (``None`` or ``0``) keeps every eligible entity.
        seed: Optional seed for the entity sampling. When ``None`` the global
            ``random`` module state is used, exactly as before.

    Returns:
        pandas.DataFrame as produced by ``process_text_snips`` with an extra
        integer ``disambig_label`` column: the rank of each row's
        ``disambig_key`` among the sorted keys of its own entity.
    """
    n_disambigs_target = len(data_dict[target_entity])
    valid_ents = [
        ent for ent, disambigs in data_dict.items()
        if ent != target_entity and len(disambigs) >= n_disambigs_target
    ]

    if train_size:
        # if a training size is specified, only choose enough entities to match it
        rng = random if seed is None else random.Random(seed)
        valid_ents = rng.sample(valid_ents, min(train_size, len(valid_ents)))
    valid_ents.append(target_entity)  # include the target entity to process it

    # we want the same number of disambiguations for each entity in the training set
    train_dict = {
        ent: dict(islice(data_dict[ent].items(), n_disambigs_target))
        for ent in valid_ents
    }

    df = process_text_snips(nlp, train_dict)

    # One groupby pass collects the distinct keys per entity, instead of
    # re-filtering the whole frame once for every entity.
    keys_by_entity = df.groupby('entity', sort=False)['disambig_key'].unique()
    disambig_index_map = {
        entity: {key: idx for idx, key in enumerate(sorted(keys))}
        for entity, keys in keys_by_entity.items()
    }

    # Plain dict lookups over the two columns avoid the per-row Series
    # construction of DataFrame.apply(axis=1).
    df['disambig_label'] = [
        disambig_index_map[entity][key]
        for entity, key in zip(df['entity'], df['disambig_key'])
    ]

    return df

In [3]:
# Load the snippet data. It is used below as a dict keyed by entity name
# (`data_dict.keys()`, `data_dict[target_entity]`).
data_dict = get_data_from_snippets()

We first choose a target entity to disambiguate.

In [4]:
# Seed the global RNG so that the randomly chosen target entity (and the
# later random.sample of training entities) is identical on every
# Restart & Run All; change SEED to explore a different target.
SEED = 42
random.seed(SEED)
target_entity = random.choice(list(data_dict))
target_entity

'Scorpion'

Then we collect a set of entities for training the disambiguating Born MLP.

In [11]:
nlp = spacy.load("en_core_web_sm")
# Build the combined frame and discard the 'ner_tag' column in one chained
# expression, so the name is bound exactly once.
traintest_df = (
    construct_traintest_dataframe(nlp, target_entity, data_dict)
    .drop(columns='ner_tag')
)

In [12]:
# Spot-check 10 randomly drawn rows of the combined frame.
traintest_df.sample(10)

Unnamed: 0,sentence_id,token,pos,dep,entity,disambig_key,disambig_label
121512,0,five,NUM,nummod,Pluto,Disney,0
39366,1,Niggas,PROPN,pobj,Paris,Jay-Z_and_Kanye_West_song,0
132413,0,",",PUNCT,punct,Byzantine,Byzantine_fault,2
123328,3,and,CCONJ,cc,Pluto,mythology,4
72027,4,of,ADP,pcomp,Faith,South_Korean_TV_series,3
101601,0,Keawe,PROPN,pobj,Kamehameha,Kamehameha_I,0
2697,2,Ruby,PROPN,nsubj,Ruby,Supernatural,2
120501,3,23,NUM,nummod,Isis,TV_series,3
103374,1,increase,VERB,conj,Kamehameha,Kamehameha_Schools,2
31970,0,film,NOUN,attr,Spider,2002_film,0


In [13]:
# The rows of the target entity form the test set.
test_df = (
    traintest_df
    .loc[traintest_df['entity'].eq(target_entity)]
    .reset_index(drop=True)
)
test_df.head(4)

Unnamed: 0,sentence_id,token,pos,dep,entity,disambig_key,disambig_label
0,0,Critical,ADJ,amod,Scorpion,Mortal_Kombat,0
1,0,reception,NOUN,nsubj,Scorpion,Mortal_Kombat,0
2,0,of,ADP,prep,Scorpion,Mortal_Kombat,0
3,0,Scorpion,NOUN,poss,Scorpion,Mortal_Kombat,0


In [14]:
# Every row that does not belong to the target entity is training data.
train_df = (
    traintest_df
    .loc[traintest_df['entity'].ne(target_entity)]
    .reset_index(drop=True)
)

In [23]:
# Identifier, raw-token and label columns are excluded; the remaining
# columns are handed to encode_categorical.
cols_to_encode = set(train_df.columns).difference(
    ['sentence_id', 'token', 'entity', 'disambig_key', 'disambig_label']
)
enc_train_df, mappings = encode_categorical(train_df, cols_to_encode)
enc_train_df.sample(5)

Unnamed: 0,entity,token,sentence_id,disambig_label,disambig_key,dep,pos
46322,Flower,(,3,0,Japanese_group,40,12
65463,Swamp,human,1,4,comic_book,34,7
103207,Kamehameha,students,0,2,Kamehameha_Schools,34,7
122386,Pluto,October,0,2,Pluto_TV,34,11
85994,Electra,a,1,1,Pleiad,19,5


Because we already have the mappings, we can save some computation by simply re-using them (which we would have to do anyway to keep the encoding consistent between train and test data)!

In [21]:
def apply_categorical_mapping(prepared_df, mappings):
    """Encode columns of ``prepared_df`` with already-fitted category mappings.

    Args:
        prepared_df: DataFrame containing every column named in ``mappings``.
        mappings: Dict of column name -> sized sequence of known categories; a
            category's position in that sequence is its integer code.

    Returns:
        A copy of ``prepared_df`` in which each mapped column holds integer
        codes. Values absent from a column's mapping receive the extra code
        ``len(mapping)``. The input frame is left untouched.
    """
    encoded_df = prepared_df.copy()
    for col, mapping in mappings.items():
        # Values never seen when the mapping was built share one extra code.
        unknown_code = len(mapping)
        # int8 is kept whenever it can hold every code (the dtype used so far);
        # larger mappings get the next signed dtype that fits, so codes above
        # 127 are no longer silently wrapped by the cast.
        code_dtype = next(
            dtype for dtype in (np.int8, np.int16, np.int32, np.int64)
            if unknown_code <= np.iinfo(dtype).max
        )
        value_to_index = {value: index for index, value in enumerate(mapping)}
        encoded_df[col] = (
            prepared_df[col]
            .map(value_to_index)
            .fillna(unknown_code)
            .astype(code_dtype)
        )
    return encoded_df

In [24]:
# Encode the target-entity rows with the mappings returned for the training
# frame, then inspect 5 random rows of the result.
enc_test_df = apply_categorical_mapping(test_df, mappings)
enc_test_df.sample(5)

Unnamed: 0,sentence_id,token,pos,dep,entity,disambig_key,disambig_label
76,1,'s,9,11,Scorpion,Mortal_Kombat,0
819,1,is,3,0,Scorpion,TV_series,2
2660,1,prestigious,0,6,Scorpion,horse,3
1946,2,Macehead,11,8,Scorpion,Scorpion_II,1
1241,3,10,8,30,Scorpion,TV_series,2


## Model preparation

In [None]:
# NOTE(review): presumably loads the most recently saved 'clf' model —
# confirm against utils.get_latest_model.
born_clf = get_latest_model('clf')