# Prepare and run entity linking (EL) training

In [37]:
import pandas

In [38]:
# Read the annotation export; lines=True parses the file as JSON Lines
# (one JSON record per line).
df = pandas.read_json(
    path_or_buf='./data/el_session_1_clean.jsonl',
    lines=True,
)
# Preview the first five rows as a quick sanity check.
df.head(n=5)

Unnamed: 0,text,_input_hash,_task_hash,gu_url,spans,meta,_is_binary,options,config,_view_id,accept,answer,_timestamp,_annotator_id,_session_id,flagged
0,"The Welsh Labour leader, Carwyn Jones has reje...",2464972,-288309831,www.theguardian.com/politics/2016/apr/29/welsh...,"[{'start': 73, 'end': 79, 'text': 'Corbyn', 'r...",{'score': 1},True,"[{'id': 'NER_WrongType', 'text': 'Incorrect en...",{'choice_style': 'multiple'},blocks,[Q291169],accept,1666974436,el_session_1-Anna,el_session_1-Anna,
1,"The Welsh Labour leader, Carwyn Jones has reje...",2464972,1845531960,www.theguardian.com/politics/2016/apr/29/welsh...,"[{'start': 254, 'end': 260, 'text': 'Corbyn', ...",{'score': 1},True,"[{'id': 'NER_WrongType', 'text': 'Incorrect en...",{'choice_style': 'multiple'},blocks,[Q291169],accept,1666974444,el_session_1-Anna,el_session_1-Anna,
2,The Conservatives’ idea to make firms list the...,-679110122,1266616370,www.theguardian.com/politics/2016/oct/09/tory-...,"[{'start': 136, 'end': 149, 'text': 'David Cam...",{'score': 0.6887111677000001},True,"[{'id': 'NER_WrongType', 'text': 'Incorrect en...",{'choice_style': 'multiple'},blocks,[53756],accept,1666974461,el_session_1-Anna,el_session_1-Anna,
3,The review is being led by the Labour MP David...,1774082438,1860777600,www.theguardian.com/uk-news/2016/jul/19/metrop...,"[{'start': 72, 'end': 85, 'text': 'David Camer...",{'score': 1},True,"[{'id': 'NER_WrongType', 'text': 'Incorrect en...",{'choice_style': 'multiple'},blocks,[53756],accept,1666974465,el_session_1-Anna,el_session_1-Anna,
4,Politics Home has been doing a dogged job of l...,1698700430,-948326007,www.theguardian.com/politics/2017/apr/27/the-s...,"[{'start': 257, 'end': 268, 'text': 'Theresa M...",{'score': 0.6887141895000001},True,"[{'id': 'NER_WrongType', 'text': 'Incorrect en...",{'choice_style': 'multiple'},blocks,[Q264766],accept,1666974475,el_session_1-Anna,el_session_1-Anna,


## Make train and dev sets

In [39]:
from sklearn.model_selection import train_test_split

In [45]:
# Split the unique `_input_hash` values (not the individual rows) into two
# groups: 40% of the hashes are held out, and random_state pins the shuffle
# so the split is reproducible.
index_train, index_test = train_test_split(
    df['_input_hash'].unique(),
    random_state=14,
    test_size=0.4,
)

In [48]:
# Keep the rows whose `_input_hash` was assigned to each side of the split,
# so every row sharing a hash ends up in the same partition.
input_hashes = df['_input_hash']
df_train = df.loc[input_hashes.isin(index_train)]
df_test = df.loc[input_hashes.isin(index_test)]

In [51]:
# Display (rows, columns) of the train partition followed by the test partition.
tuple(partition.shape for partition in (df_train, df_test))

((389, 16), (266, 16))

## Create `.spacy` corpus

In [59]:
# Name of the spaCy pipeline that is passed to spacy.load further down.
nlp_model = "en_core_web_lg"

In [65]:
import typer
import json
from collections import Counter
from pathlib import Path
import spacy
from spacy.tokens import DocBin, Span

# Load the base pipeline without its parser and tagger components.
# Component names have to be passed as separate items: the comma-joined
# string "parser, tagger" is not the name of any single component, so it does
# not reliably exclude either of them.
nlp = spacy.load(nlp_model, exclude=["parser", "tagger"])
# Output locations of the serialized train / test corpora written later on.
train_corpus = 'data/el_train.spacy'
test_corpus = 'data/el_test.spacy'



In [64]:
docs = []
# KB ID of every example that was successfully converted, in processing order.
gold_ids = []


def make_doc(example):
    """Convert one annotated row into a spaCy ``Doc`` with a single linked entity.

    Only the first annotated span and the first accepted KB ID are used, i.e.
    we assume one annotated span per sentence and one KB ID per span.

    Parameters
    ----------
    example : mapping (e.g. a DataFrame row)
        Must provide ``"text"``, ``"answer"``, ``"accept"`` (list of KB IDs)
        and ``"spans"`` (list of dicts with ``"start"``, ``"end"``, ``"label"``).

    Returns
    -------
    spacy.tokens.Doc or None
        The tokenized text with ``doc.ents`` set to the linked entity and the
        whole text marked as one sentence. ``None`` is returned when the row
        was not accepted, has no accepted KB ID or no span, or when the span
        offsets do not line up with token boundaries.

    Side effects
    ------------
    Appends the KB ID to the module-level ``gold_ids`` list for every row that
    yields a ``Doc``.
    """
    if example["answer"] != "accept":
        return None

    accepted = example["accept"]
    spans = example["spans"]
    # Guard against rows without a usable choice or span instead of failing
    # with an IndexError / TypeError.
    if not isinstance(accepted, list) or not accepted:
        return None
    if not isinstance(spans, list) or not spans:
        return None

    # spaCy treats a non-string kb_id as a pre-computed hash, so make sure the
    # identifier is a string.
    # NOTE(review): some IDs in the data look purely numeric (no "Q" prefix) -
    # confirm they match the entity IDs used in the knowledge base.
    QID = str(accepted[0])
    span = spans[0]

    doc = nlp.make_doc(example["text"])
    entity = doc.char_span(
        span["start"],
        span["end"],
        label=span["label"],
        kb_id=QID,
    )
    if entity is None:
        # char_span yields None when the character offsets do not align with
        # token boundaries; assigning [None] to doc.ents would raise.
        return None

    gold_ids.append(QID)
    doc.ents = [entity]
    # Each example is treated as exactly one sentence: only the first token
    # opens a sentence.
    for token in doc:
        token.is_sent_start = token.i == 0
    return doc


# Rows that could not be converted come back as None; drop them so that only
# real Doc objects are passed on.
train_docs = df_train.apply(make_doc, axis=1).dropna()
test_docs = df_test.apply(make_doc, axis=1).dropna()

In [68]:
# Collect the converted docs into one DocBin per partition and write them to
# the .spacy corpus files.
train_docbin = DocBin()
test_docbin = DocBin()

for docbin, converted_docs in ((train_docbin, train_docs), (test_docbin, test_docs)):
    for doc in converted_docs:
        # make_doc returns None for rows it does not convert (e.g. answers
        # other than "accept"); DocBin.add cannot serialize None, so skip them.
        if doc is not None:
            docbin.add(doc)

train_docbin.to_disk(train_corpus)
test_docbin.to_disk(test_corpus)

## Run training

In [4]:
# Train via spaCy's command line using configs/nel.cfg; results go to ./training.
# The --paths.* options supply the train/dev corpora written above, the
# knowledge base and the base pipeline; -c names the Python file with the
# project's custom functions (scripts/custom_functions.py).
!python -m spacy train configs/nel.cfg --output training --paths.train data/el_train.spacy --paths.dev data/el_test.spacy \
    --paths.kb kb/kb_full_2022_10_26 --paths.base_nlp ../Assets/my_output/nlp_full_2022_10_14 -c scripts/custom_functions.py

[38;5;4mℹ Saving to output directory: training[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-11-04 09:36:10,069] [INFO] Set up nlp object from config
[2022-11-04 09:36:10,080] [INFO] Pipeline: ['sentencizer', 'ner', 'entity_linker']
[2022-11-04 09:36:10,086] [INFO] Created vocabulary
[2022-11-04 09:36:13,165] [INFO] Added vectors: ../Assets/my_output/nlp_full_2022_10_14
[2022-11-04 09:36:15,405] [INFO] Finished initializing nlp object
[2022-11-04 09:36:47,382] [INFO] Initialized pipeline components: ['entity_linker']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['sentencizer', 'ner', 'entity_linker'][0m
[38;5;4mℹ Frozen components: ['sentencizer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS ENTIT...  SENTS_F  SENTS_P  SENTS_R  ENTS_F  ENTS_P  ENTS_R  NEL_MICRO_F  NEL_MICRO_R  NEL_MICRO_P  SCORE 
---  ------  -------------  -------  -------  -------  ------  ------  ------  -----------  -----------  -----------  ------
  sentence_n