# Prepare and run entity linking (EL) training

In [1]:
import pandas as pd
import typer
import json
from collections import Counter
from pathlib import Path
from sklearn.model_selection import train_test_split
import spacy
from spacy.tokens import DocBin, Span
from spacy.kb import KnowledgeBase, Candidate
import custom_functions

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def find_no_candidates(row, no_candidate_sub_str=['NEL','NER']):
    """
    Flag an annotation whose accepted options mark it as having no candidate.

    The option ids in ``row['accept']`` are joined with spaces and searched for
    each marker in ``no_candidate_sub_str``. Returns True as soon as one marker
    occurs in that text, False when none does.
    """
    for marker in no_candidate_sub_str:
        if marker in ' '.join(row['accept']):
            return True
    return False

def find_best_candidate(row,best_candidate=0):
    """
    In multiple choice annotations select the candidate with the longest description

    For every accepted option id in ``row['accept']`` the matching entries of
    ``row['options']`` are looked up. The description is the text that follows
    the first ``'a>:'`` in the option's ``html`` (up to the next such marker,
    if there is one).

    Parameters
    ----------
    row : mapping
        Must provide ``'accept'`` (list of option ids) and ``'options'``
        (list of dicts carrying ``'id'`` and ``'html'``).
    best_candidate : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    list
        Single-element list holding the accepted id with the longest
        description. Ties go to the id that appears first in ``row['accept']``.

    Raises
    ------
    ValueError
        If none of the accepted ids matches an entry of ``row['options']``.
    """
    description_lengths = {}
    for option_id in row['accept']:
        for candidate in row['options']:
            if candidate['id'] != option_id:
                continue
            parts = candidate['html'].split('a>:')
            # An option without the 'a>:' separator has no description: count it as
            # length 0 instead of failing with an IndexError.
            description_lengths[option_id] = len(parts[1]) if len(parts) > 1 else 0
    if not description_lengths:
        raise ValueError(
            f"none of the accepted ids {list(row['accept'])!r} matches an option"
        )
    return [max(description_lengths, key=description_lengths.get)]

def make_doc(example):
    """
    Construct a spaCy document object from one row of the dataset

    Only rows whose ``answer`` is ``"accept"`` produce a document; any other
    row yields ``None``. The text is tokenized with the notebook-level ``nlp``
    pipeline, the annotated mention becomes the document's single entity
    (carrying the accepted KB id), and only the first token is flagged as a
    sentence start. The KB id is also appended to the notebook-level
    ``gold_ids`` list.

    Parameters
    ----------
    example : mapping
        Must provide ``text``, ``answer``, ``accept``, ``start_char``,
        ``end_char`` and ``label``.

    Returns
    -------
    spacy.tokens.Doc or None

    Raises
    ------
    ValueError
        If the character offsets do not line up with token boundaries, in
        which case ``Doc.char_span`` yields no span to annotate.
    """
    sentence = example["text"]
    if example["answer"] != "accept":
        return None

    QID = example["accept"]
    doc = nlp.make_doc(sentence)
    entity = doc.char_span(
        example["start_char"],
        example["end_char"],
        label=example["label"],
        kb_id=QID,
    )
    # char_span gives None for offsets that fall inside a token; fail with a message
    # that identifies the annotation rather than an opaque error from `doc.ents`.
    if entity is None:
        raise ValueError(
            f"Span [{example['start_char']}:{example['end_char']}] does not align "
            f"with token boundaries in: {sentence!r}"
        )
    gold_ids.append(QID)
    doc.ents = [entity]
    # Flag only the first token as a sentence start so the text is one sentence
    for token in doc:
        token.is_sent_start = token.i == 0
    return doc

In [4]:
# Seed reused for the train/test split and for shuffling the training set
rng_seed=42

## Load annotation session data

In [5]:
# Read every Prodigy annotation session and stack them into one frame with a fresh
# 0..n-1 index. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
sessions_dir = Path('../../3_prodigy_annotations/assets/prodigy_sessions')
session_files = [sessions_dir / f'el_session_{session}.jsonl' for session in (1, 2, 3)]
df = pd.concat(
    [pd.read_json(session_file, lines=True) for session_file in session_files],
    ignore_index=True,
)

In [6]:
df.shape

(4978, 17)

In [7]:
# Drop annotations flagged as having no viable candidate
has_no_candidate = df.apply(find_no_candidates, axis=1)
df = df[~has_no_candidate]

In [8]:
df.shape

(2880, 17)

In [9]:
# Remove instances wrongly accepted without any selected options.
# The length lives in a standalone Series instead of a temporary column: that avoids
# writing into a filtered frame and the positional `axis` argument of `drop`, which
# pandas 2.0 no longer accepts.
accept_len = df['accept'].apply(lambda x: len(''.join(x)))
df = df[accept_len != 0]

  after removing the cwd from sys.path.


In [10]:
df.shape

(2876, 17)

In [11]:
# Where several candidates were accepted, keep the one with the longest description
best_candidates = df.apply(find_best_candidate, axis=1)
# Unwrap the single-element list so the column holds the bare id
df['accept'] = best_candidates.apply(lambda ids: ids[0])

In [12]:
# Pull mention text, character offsets and label out of the first span of each annotation
first_span = df['spans'].apply(lambda spans: spans[0])
for column, span_key in (('ents', 'text'), ('start_char', 'start'), ('end_char', 'end'), ('label', 'label')):
    df[column] = first_span.apply(lambda span, span_key=span_key: span[span_key])

In [13]:
df['ents'].nunique()

1449

## Make train and dev set

In [15]:
# Split on the unique input hashes, so all rows sharing a hash land on the same side
sampling_col = '_input_hash'
unique_inputs = df[sampling_col].unique()
index_train, index_test = train_test_split(unique_inputs, test_size=0.1, random_state=rng_seed)
in_train = df[sampling_col].isin(index_train)
in_test = df[sampling_col].isin(index_test)
df_train = df[in_train]
df_test = df[in_test]

In [16]:
df_train.shape, df_test.shape

((614, 21), (62, 21))

In [17]:
# Keep only the columns used downstream
model_columns = ['ents','text','accept','start_char','end_char','label','answer']
# Shuffle the training rows reproducibly, then drop exact duplicate rows
df_train = df_train[model_columns].sample(frac=1, random_state=rng_seed).drop_duplicates()
df_test = df_test[model_columns].drop_duplicates()

In [18]:
df_train.head(2)

Unnamed: 0,ents,text,accept,start_char,end_char,label,answer
3113,Muhyiddin,Anwar was due to succeed then-prime minister M...,Q1060949,151,160,PERSON,accept
3262,Vitter,Fleming and fellow congressman Charles Boustan...,13645,95,101,PERSON,accept


In [19]:
df_test.head(2)

Unnamed: 0,ents,text,accept,start_char,end_char,label,answer
4,Corbyn,"The Welsh Labour leader, Carwyn Jones has reje...",Q291169,73,79,PERSON,accept
5,Corbyn,"The Welsh Labour leader, Carwyn Jones has reje...",Q291169,254,260,PERSON,accept


In [20]:
# Write the training split to CSV
df_train_dir='../assets/df_train.csv'
df_train.to_csv(df_train_dir)
# Write the test split to CSV
df_test_dir='../assets/df_test.csv'
df_test.to_csv(df_test_dir)

## Create `.spacy` corpus

In [21]:
nlp_model = 'en_core_web_lg'
# `exclude` takes component names; the single string "parser, tagger" is not the name
# of any component, so each component is listed as its own entry.
nlp = spacy.load(nlp_model, exclude=["parser", "tagger"])
# Output locations of the serialized training and test corpora
train_corpus = '../assets/el_train.spacy'
test_corpus = '../assets/el_test.spacy'

In [23]:
# Generate spaCy Docs to train/test model
gold_ids = []  # filled by make_doc with the KB id of every document it builds

train_docs = df_train.apply(make_doc, axis=1)
test_docs = df_test.apply(make_doc, axis=1)
train_docbin = DocBin()
test_docbin = DocBin()

# make_doc yields None for rows whose answer is not "accept"; those carry no gold
# entity and must not be handed to DocBin.add.
for docbin, docs in ((train_docbin, train_docs), (test_docbin, test_docs)):
    for doc in docs:
        if doc is not None:
            docbin.add(doc)

train_docbin.to_disk(train_corpus)
test_docbin.to_disk(test_corpus)

## Run training

## Package model

In [25]:
# Load the base pipeline and add an `entity_linker` component to it.
# NOTE(review): this rebinds `nlp`, which an earlier cell bound to the pipeline
# that make_doc uses for corpus creation.
el_model='en_core_web_lg'
nlp=spacy.load(el_model)
nlp.add_pipe('entity_linker')

## Validate model

In [27]:
# Load EL model
el_model='training/en_pipeline-0.0.0/en_pipeline/en_pipeline-0.0.0'
nlp=spacy.load(el_model)

# Load KB
kb_loc='../../2_kb_datasets/assets/kb_lg_model_2022_11_07/'

# Run the pipeline on a sample sentence to read off the width of the document vector,
# which is passed to the KB as its entity vector length
sentence='John Michael was born in Paris in 1992'
doc=nlp(sentence)
embedding_len=len(doc.vector)

kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=embedding_len)
kb.from_disk(kb_loc)

# Load KB dataset
dataset='full'# OR'open_sanctions'# OR 'lilsis'
kb_iteration='_2022_11_07'
kb_data=pd.read_csv(f'../../2_kb_datasets/assets/kb_entities_{dataset}{kb_iteration}.csv',index_col=0)

# Count number of duplicates per KB alias
kb_data['id']=kb_data['id'].astype(str)
# Name the key and the count column explicitly: the labels produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x, and renaming them by
# their 1.x names leaves the merge below without a usable key on 2.x.
alias_duplication = (
    kb_data['name']
    .value_counts()
    .rename_axis('name')
    .reset_index(name='duplicate_counts')
)
kb_data=kb_data.merge(alias_duplication, on='name')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [28]:
# Link every unique test text and collect [text, mention, predicted KB id] for PERSON entities
df_lst = []
texts = df_test['text'].unique()
for i, text in enumerate(texts, start=1):
    doc = nlp(text)
    df_lst.extend(
        [text, ent.text, ent.kb_id_]
        for ent in doc.ents
        if ent.label_ == 'PERSON'
    )
    # Progress indicator every 100 texts
    if i % 100 == 0:
        print(i)

In [29]:
# Collect the predictions into a frame and drop repeated (text, mention, id) rows
prediction_columns = ['text', 'ents', 'pred_qid']
df_predictions = pd.DataFrame(df_lst, columns=prediction_columns).drop_duplicates()

In [30]:
df_predictions.shape

(104, 3)

In [31]:
df_predictions

Unnamed: 0,text,ents,pred_qid
0,"The Welsh Labour leader, Carwyn Jones has reje...",Carwyn Jones,Q111840
1,"The Welsh Labour leader, Carwyn Jones has reje...",Corbyn,Q291169
2,“Liz Truss will be travelling the country wear...,Liz Truss,Q272201
3,“Liz Truss will be travelling the country wear...,Rishi,Q44274451
4,"The British chancellor, George Osborne, said T...",George Osborne,Q332493
...,...,...,...
104,"The Jouyets married in 2006, seven years after...",Brigitte,Q916162
105,"The Jouyets married in 2006, seven years after...",Anne-Claire Taittinger,311246
106,"The Jouyets married in 2006, seven years after...",Norman,Q332546
107,Defence minister Mark Lancaster wrote to McDon...,Mark Lancaster,Q750161


In [32]:
# Attach the annotators' gold information to the predictions; the outer join keeps
# mentions that appear on only one side
df_predictions = (
    df_test
    .merge(df_predictions, on=['text','ents'], how='outer')
    .drop_duplicates()
    .sort_values('text')
)

In [33]:
# Look up the KB record of each predicted id; predictions without a KB match are kept
kb_columns = ['id','name','desc','kb_origin', 'duplicate_counts']
df_predictions = df_predictions.merge(
    kb_data[kb_columns],
    left_on=['pred_qid'],
    right_on=['id'],
    how='left',
)

In [34]:
# Put the columns in reading order and give the mention/KB columns clearer names
ordered_columns = [
    'text', 'ents', 'accept', 'pred_qid', 'id', 'name', 'desc',
    'duplicate_counts', 'kb_origin', 'start_char', 'end_char',
]
renamed_columns = {'ents':'mention','id':'kb_id', 'name':'kb_name'}
df_predictions = df_predictions[ordered_columns].rename(columns=renamed_columns)

In [None]:
# Inspect predictions
df_predictions

In [None]:
# Rows where the predicted id equals the annotators' top choice
prediction_matches_gold = df_predictions['accept'] == df_predictions['pred_qid']
df_predictions[prediction_matches_gold]

In [None]:
# Rows where the mention text is identical to the name of the linked KB entry
mention_matches_alias = df_predictions['mention'] == df_predictions['kb_name']
df_predictions[mention_matches_alias]

In [None]:
# Rows linked to a different id than the top choice although the mention matches the KB name
prediction_differs_from_gold = df_predictions['accept'] != df_predictions['pred_qid']
mention_equals_kb_name = df_predictions['mention'] == df_predictions['kb_name']
df_predictions[prediction_differs_from_gold & mention_equals_kb_name]