In [None]:
# ce notebook est une adaptation du notebook : https://github.com/explosion/projects/blob/master/nel-emerson/scripts/notebook_video.ipynb

In [None]:
import spacy
import csv
from pathlib import Path
from spacy.kb import KnowledgeBase
import os
import json
import random
from spacy.util import minibatch, compounding

In [None]:
# Read the source text and flatten it onto one line: each line loses its
# trailing newline and gets a single space appended instead.
with open("dorian.txt", "r", encoding="UTF-8") as fin:
    texte = "".join(raw_line.rstrip("\n") + " " for raw_line in fin)

In [None]:
# Load the "en_core_web_lg" spaCy pipeline and run it over the whole text
# held in `texte`; the resulting Doc is kept in `doc`.
nlp = spacy.load("en_core_web_lg")
doc = nlp(texte)

In [None]:
def load_entities(entities_loc=None):
    """Read the entity table and return its names and descriptions keyed by QID.

    Every non-blank CSV row must provide at least three comma-separated
    columns, in this order: identifier (QID), name, description. Extra columns
    are ignored. If the same QID appears more than once, the last row wins.

    Args:
        entities_loc: Path (``str`` or ``Path``) of the CSV file. Defaults to
            ``entities-dorian.csv`` in the current working directory.

    Returns:
        A ``(names, descriptions)`` tuple of dicts, both keyed by QID.

    Raises:
        IndexError: If a non-blank row has fewer than three columns.
    """
    if entities_loc is None:
        entities_loc = Path.cwd() / "entities-dorian.csv"
    names = dict()
    descriptions = dict()
    # newline="" is what the csv module documentation asks for, so that line
    # breaks inside quoted fields are parsed by the reader and not by open().
    with Path(entities_loc).open("r", encoding="utf8", newline="") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            qid = row[0]
            names[qid] = row[1]
            descriptions[qid] = row[2]
    return names, descriptions

In [None]:
# Load the entity table and echo every entry for a quick visual check.
name_dict, desc_dict = load_entities()
for QID, entity_name in name_dict.items():
    print(f"{QID}, name={entity_name}, desc={desc_dict[QID]}")

In [None]:
# Create an empty knowledge base sharing the pipeline's vocabulary. The entity
# vector length is read from the loaded vectors instead of being hard-coded,
# so it always matches the `Doc.vector` values stored as entity vectors.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=nlp.vocab.vectors_length)

In [None]:
# Register every entity in the KB, using the vector of its parsed description
# (`Doc.vector`) as the entity vector. The same fixed frequency value (342) is
# passed for every entity.
for qid, desc in desc_dict.items():
    kb.add_entity(entity=qid, entity_vector=nlp(desc).vector, freq=342)

In [None]:
# Each entity's own name is an alias that resolves to exactly that entity,
# with prior probability 1.
for qid in name_dict:
    kb.add_alias(alias=name_dict[qid], entities=[qid], probabilities=[1])

In [None]:
# Snapshot the QIDs as a list: later cells iterate over `qids`, and a plain
# list does not change if `name_dict` is modified afterwards.
qids = list(name_dict)

# The ambiguous alias "Dorian" may refer to any entity, each with the same
# prior. The prior is 0.1 as long as that keeps the total at or below 1 (up to
# ten entities); beyond that it is lowered to 1/len(qids), because the priors
# of one alias must not sum to more than 1.
shared_prior = min(0.1, 1.0 / len(qids)) if qids else 0.1
probs = [shared_prior for _ in qids]
kb.add_alias(alias="Dorian", entities=qids, probabilities=probs)

In [None]:
# Show which entities and aliases the knowledge base now holds.
kb_entities = kb.get_entity_strings()
kb_aliases = kb.get_alias_strings()
print(f"Entities in the KB: {kb_entities}")
print(f"Aliases in the KB: {kb_aliases}")

In [None]:
# Print the candidate entities the KB returns for full names as well as for
# bare surnames and first names. The label is built from the alias that is
# actually queried, so label and query cannot drift apart.
for alias in (
    "Lord Henry Wotton",
    "Sibyl Vane",
    "Victoria Wotton",
    "James Vane",
    "Wotton",
    "Vane",
    "Victoria",
    "Henry",
):
    candidates = [c.entity_ for c in kb.get_candidates(alias)]
    print(f"Candidates for '{alias}': {candidates}")

In [None]:
# Same lookup for two first names.
for alias in ("Basil", "Dorian"):
    candidates = [c.entity_ for c in kb.get_candidates(alias)]
    print(f"Candidates for '{alias}': {candidates}")

In [None]:
# Peek at the first record of the annotation file to see its structure.
json_loc = Path.cwd().joinpath("dorian.jsonl")
with json_loc.open("r", encoding="utf8") as jsonfile:
    print(jsonfile.readline())

In [None]:
# Build the dataset as a list of (text, {"links": {(start, end): {QID: 1.0}}})
# tuples, one per accepted annotation record in the JSONL file.
dataset = []

with json_loc.open("r", encoding="utf8") as jsonfile:
    for line in jsonfile:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        example = json.loads(line)
        # Only accepted records carry a usable gold link. Everything else is
        # skipped, so it can no longer be appended with the offset and QID
        # left over from the previous accepted record.
        if example["answer"] != "accept":
            continue
        # An accepted record without a chosen QID or without a span cannot be
        # turned into a link either.
        if not example.get("accept") or not example.get("spans"):
            continue
        text = example["text"]
        QID = example["accept"][0]
        first_span = example["spans"][0]
        offset = (first_span["start"], first_span["end"])
        links_dict = {QID: 1.0}
        dataset.append((text, {"links": {offset: links_dict}}))

In [None]:
# Display the first (text, annotation) pair as a sanity check.
dataset[0]

In [None]:
from collections import Counter

# Flatten the annotations into the list of gold QIDs (every link whose value
# is truthy), then show how many examples each entity has.
gold_ids = [
    link
    for _text, annot in dataset
    for links_dict in annot["links"].values()
    for link, value in links_dict.items()
    if value
]

print(Counter(gold_ids))

In [None]:
# Per-entity split sizes: the first TRAIN_PER_ENTITY examples of each entity
# go to the training set, the next TEST_PER_ENTITY to the test set. Entities
# with fewer examples simply contribute fewer.
TRAIN_PER_ENTITY = 8
TEST_PER_ENTITY = 2

# Examples are looked up in `dataset` by their position in `gold_ids`, so the
# two lists have to be index-aligned (one gold id per dataset entry).
assert len(gold_ids) == len(dataset), "gold_ids and dataset are not index-aligned"

# Seed Python's RNG so the shuffles below are repeatable from run to run.
random.seed(0)

train_dataset = []
test_dataset = []
for QID in qids:
    indices = [i for i, gold in enumerate(gold_ids) if gold == QID]
    train_indices = indices[:TRAIN_PER_ENTITY]
    test_indices = indices[TRAIN_PER_ENTITY:TRAIN_PER_ENTITY + TEST_PER_ENTITY]
    train_dataset.extend(dataset[i] for i in train_indices)
    test_dataset.extend(dataset[i] for i in test_indices)

random.shuffle(train_dataset)
random.shuffle(test_dataset)

In [None]:
# Run the pipeline over every training text once, up front, and pair each
# resulting Doc with its gold annotation. A comprehension is used so that no
# loop variable named `doc` overwrites the notebook-level `doc` that holds the
# parsed full text.
TRAIN_DOCS = [(nlp(text), annotation) for text, annotation in train_dataset]

In [None]:
# Remove a linker added by an earlier run of this cell, so that re-running it
# does not fail because the component name is already taken.
if "entity_linker" in nlp.pipe_names:
    nlp.remove_pipe("entity_linker")

# Create a fresh entity linker (configured with incl_prior=False), attach the
# knowledge base to it and append it as the last pipeline component.
entity_linker = nlp.create_pipe("entity_linker", config={"incl_prior": False})
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker, last=True)

In [None]:
# Train only the entity linker: every other pipeline component is disabled
# for the duration of the loop.
other_pipes = [name for name in nlp.pipe_names if name != "entity_linker"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(500):
        random.shuffle(TRAIN_DOCS)
        losses = {}
        # A new compounding schedule is created for every pass over the data.
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for batch in minibatch(TRAIN_DOCS, size=batch_sizes):
            docs, golds = zip(*batch)
            nlp.update(docs, golds, drop=0.2, losses=losses, sgd=optimizer)
        # Report the accumulated losses every 50th iteration.
        if not itn % 50:
            print(itn, "Losses", losses)
# Report the losses of the final iteration as well.
print(itn, "Losses", losses)