In [None]:
# ce notebook est une adaptation du notebook : https://github.com/explosion/projects/blob/master/nel-emerson/scripts/notebook_video.ipynb

In [None]:
import csv
from pathlib import Path
import spacy
from spacy.kb import KnowledgeBase
import random
from spacy.util import minibatch, compounding
import json

In [None]:
def load_entities(entities_loc=None):
    """Read the entity CSV and build two lookup tables keyed by entity ID.

    Every non-empty row must provide at least four columns, in the order
    ``acte, id, value, content``. The third column (``value``) is not used.
    Completely empty rows (e.g. a trailing blank line) are skipped.

    Args:
        entities_loc: Optional path (``str`` or ``Path``) of the CSV file.
            Defaults to ``loc-csv.csv`` in the current working directory.

    Returns:
        A tuple ``(actes, contents)`` of dicts mapping each ID (second
        column) to its acte (first column) and to its content (fourth
        column). If an ID occurs on several rows, the last row wins.

    Raises:
        IndexError: If a non-empty row has fewer than four columns.
        OSError: If the file cannot be opened.
    """
    if entities_loc is None:
        entities_loc = Path.cwd() / "loc-csv.csv"
    else:
        entities_loc = Path(entities_loc)

    actes = dict()
    contents = dict()

    # newline="" is the csv-module recommended way to open files so that
    # line breaks inside quoted fields are handled by the reader itself.
    with entities_loc.open("r", encoding="utf8", newline="") as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            if not row:
                # csv.reader yields [] for blank lines; nothing to index.
                continue
            acte, ide, content = row[0], row[1], row[3]
            actes[ide] = acte
            contents[ide] = content

    return actes, contents
    

In [None]:
# Load the pretrained spaCy pipeline that every later cell uses as `nlp`.
# NOTE(review): "en_core_web_lg" is an English model while the sample text
# in this notebook is Latin — confirm this choice is intentional.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Run the pipeline once on a short sample sentence; `doc` is recomputed
# later in the notebook after the entity linker has been trained.
text = "Amor vincit omnia."
doc = nlp(text)

In [None]:
# Load both lookup tables and list every entity as "ID, entity, acte".
actes_dict, contents_dict = load_entities()
for ID, acte in actes_dict.items():
    entity = contents_dict[ID]
    print(f"{ID}, entity={entity}, acte={acte}")

In [None]:
# Create an empty knowledge base sharing the pipeline's vocabulary.
# entity_vector_length has to agree with the length of the vectors passed to
# kb.add_entity below (nlp(content).vector) — presumably 300 for this model;
# TODO confirm.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

In [None]:
# Register every entity ID in the KB, using the document vector of its
# content string as the entity vector. All entities receive the same
# constant frequency (342).
for ide, content in contents_dict.items():
    kb.add_entity(entity=ide, entity_vector=nlp(content).vector, freq=342)

In [None]:
# Register each content string as an alias in the KB.
#
# add_alias is called exactly once per distinct alias string: calling it a
# second time for an alias that already exists does not extend the alias
# (spaCy warns and ignores the call), so entities sharing the same content
# string would otherwise be unreachable through kb.get_candidates.
# Entities sharing an alias get a uniform prior; a unique alias keeps a
# prior of 1, as before.
ids_by_alias = {}
for ide, content in contents_dict.items():
    ids_by_alias.setdefault(content, []).append(ide)

for alias, alias_ids in ids_by_alias.items():
    prior = 1 / len(alias_ids)
    kb.add_alias(
        alias=alias,
        entities=alias_ids,
        probabilities=[prior] * len(alias_ids),
    )

In [None]:
# Show which KB entities are candidates for one mention.
# The same variable feeds both the label and the lookup, so the printed
# label always names the string that was actually queried (the label used to
# say 'casulis' while the lookup used 'Casulis').
mention = "Casulis"
candidate_ids = [c.entity_ for c in kb.get_candidates(mention)]
print(f"Candidates for '{mention}': {candidate_ids}")

In [None]:
# Show which KB entities are candidates for one mention.
# The same variable feeds both the label and the lookup, so the printed
# label always names the string that was actually queried (the label used to
# say 'bitterensis' while the lookup used 'Bterensis').
# NOTE(review): 'Bterensis' may itself be a typo for the intended alias —
# confirm against the aliases present in the CSV.
mention = "Bterensis"
candidate_ids = [c.entity_ for c in kb.get_candidates(mention)]
print(f"Candidates for '{mention}': {candidate_ids}")

In [None]:
# Fetch the entity identifiers stored in the KB (presumably the IDs passed
# to kb.add_entity above) so their number can be checked in the next cell.
all_entities = kb.get_entity_strings()

In [None]:
# Display how many entities the KB returned.
len(all_entities)

In [None]:
# Show which KB entities are candidates for the mention below.
mention = "turonensium"
candidate_ids = [c.entity_ for c in kb.get_candidates(mention)]
print(f"Candidates for '{mention}': {candidate_ids}")

In [None]:
# Annotated examples, one JSON object per line; distributed alongside this
# notebook.
json_loc = Path.cwd() / "text.jsonl"

# Peek at the first record only (an empty file yields an empty string).
with json_loc.open("r", encoding="utf8") as jsonfile:
    line = next(jsonfile, "")

print(line)

In [None]:
# Build the list of (text, {"links": {(start, end): {ID: 1.0}}}) examples
# from the JSONL annotations. The i-th span of a record is paired with the
# i-th entry of its "accept" list; records without any span are dropped.
dataset = []

with json_loc.open("r", encoding="utf8") as jsonfile:
    for raw_line in jsonfile:
        record = json.loads(raw_line)
        text = record["text"]
        links = {
            (span["start"], span["end"]): {record["accept"][i]: 1.0}
            for i, span in enumerate(record["spans"])
        }
        if not links:
            continue
        dataset.append((text, {"links": links}))

In [None]:
# Display the first (text, annotations) example as a sanity check.
dataset[0]

In [None]:
from collections import Counter

# Flatten every gold link with a truthy value into one list of entity IDs,
# then show how often each ID is annotated.
gold_ids = [
    link
    for _text, annot in dataset
    for links_dict in annot["links"].values()
    for link, value in links_dict.items()
    if value
]

print(Counter(gold_ids))

In [None]:
# Split the annotated examples into a training and a test set: for every
# entity ID, the first 8 matching examples go to training and the next 2 to
# testing.
#
# Changed from the original notebook because we have no shared alias: we
# iterate over the entity IDs of the KB (the keys of contents_dict). The
# previous loop iterated over `ide`, a single ID string left over from an
# earlier cell, and therefore walked over its characters.
#
# Matching examples are looked up directly in `dataset`, so the indices stay
# valid even when one example carries several annotated spans. An example is
# assigned to at most one split, which keeps the test set disjoint from the
# training set.
train_dataset = []
test_dataset = []
assigned = set()  # dataset indices already placed in one of the splits
for QID in contents_dict:
    indices = [
        i
        for i, (_text, annot) in enumerate(dataset)
        if i not in assigned
        and any(links_dict.get(QID) for links_dict in annot["links"].values())
    ]
    train_dataset.extend(dataset[index] for index in indices[0:8])
    test_dataset.extend(dataset[index] for index in indices[8:10])
    assigned.update(indices[0:10])

random.shuffle(train_dataset)
random.shuffle(test_dataset)

In [None]:
# Run the pipeline over every training text up front, so that training works
# on (doc, annotation) pairs instead of raw strings.
TRAIN_DOCS = [(nlp(text), annotation) for text, annotation in train_dataset]

In [None]:
# Create the entity-linker component, attach the knowledge base built above
# and append it as the last component of the pipeline.
# "incl_prior": False presumably tells the linker not to rely on the alias
# prior probabilities stored in the KB — TODO confirm against the spaCy docs.
# NOTE(review): create_pipe / set_kb / add_pipe(component) look like the
# spaCy v2 API — confirm the installed spaCy version.
# NOTE(review): re-running this cell on the same `nlp` object adds the
# component a second time; restart the kernel before re-running.
entity_linker = nlp.create_pipe("entity_linker", config={"incl_prior": False})
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker, last=True)

In [None]:
# Train only the entity linker: every other pipeline component is disabled
# for the duration of the loop.
N_ITERATIONS = 500
REPORT_EVERY = 50

other_pipes = [name for name in nlp.pipe_names if name != "entity_linker"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(N_ITERATIONS):
        random.shuffle(TRAIN_DOCS)
        losses = {}
        # A fresh compounding schedule is created at every iteration, so the
        # batch size restarts at 4 each time.
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for batch in minibatch(TRAIN_DOCS, size=batch_sizes):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.2, losses=losses, sgd=optimizer)
        if itn % REPORT_EVERY == 0:
            print(itn, "Losses", losses)
# Report the losses of the final iteration as well.
print(itn, "Losses", losses)

In [None]:
# Apply the trained pipeline to the sample sentence and list each detected
# entity with its label and the KB ID predicted by the linker.
text = "Amor vincit omnia."
doc = nlp(text)
for entity in doc.ents:
    print(entity.text, entity.label_, entity.kb_id_)

In [None]:
# Compare the linker's predictions with the gold annotations on the test set.
# A prediction is printed for every detected entity whose character offsets
# match a gold-annotated span, instead of only for the hard-coded mention
# "Francorum", so that each gold link can be checked against its prediction.
for text, true_annot in test_dataset:
    print(text)
    print(f"Gold annotation: {true_annot}")
    doc = nlp(text)
    gold_spans = true_annot["links"]  # keyed by (start_char, end_char)
    for ent in doc.ents:
        if (ent.start_char, ent.end_char) in gold_spans:
            print(f"Prediction: {ent.text}, {ent.label_}, {ent.kb_id_}")
    print()