In [44]:
# %load_ext autoreload
import random
import datetime
import ujson as json
import csv
from pathlib import Path
from copy import deepcopy
import spacy
from spacy.gold import GoldParse, tags_to_entities
from spacy.scorer import Scorer
from spacy.util import minibatch, compounding, load_model_from_path
from spacy import displacy
from spacy.cli.train import print_progress
from spacy.tokenizer import Tokenizer

In [26]:
# Directory of the trained spaCy pipeline; passed to load_model_from_path().
model_path = "../models/plays-small-batches/epoch-834"
# JSON evaluation sample opened by load().
test_path = "../training/evaluation/fulltext-657.json"

def load(path=None):
    """Read an evaluation sample from a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON file. When omitted, the notebook-level
            `test_path` is used, which keeps existing `load()` calls working.

    Returns:
        The decoded JSON document. Callers in this notebook unpack it as a
        `(text, annotations)` pair.
    """
    if path is None:
        # Resolved at call time so re-assigning `test_path` in a cell is honoured.
        path = test_path
    with open(path, 'r', encoding="utf-8") as file:
        return json.loads(file.read())

In [27]:
def custom_tokenizer(nlp):
    """Build a Tokenizer for `nlp` with extra hyphen and period splitting rules.

    The language defaults are extended with:
      * "-" as an additional prefix and suffix;
      * a suffix pattern matching a "." that directly follows an uppercase
        ASCII letter, unless that letter is itself preceded by a "."
        (so "HARPAGON." loses its period, while the last period of "A.B."
        is kept attached).

    Args:
        nlp: Language object providing `Defaults.prefixes`,
            `Defaults.suffixes`, `Defaults.infixes`, `Defaults.token_match`
            and `vocab`.

    Returns:
        A `Tokenizer` bound to `nlp.vocab` that uses the extended rules.
    """
    # Raw string: in a normal literal "\." is an invalid escape sequence
    # (deprecated, and a SyntaxWarning on recent Python versions). The
    # resulting pattern text is unchanged.
    period_after_capital = r"(?<!\.[A-Z])(?<=[A-Z])\."

    # Concatenation builds new tuples, so the shared defaults are never mutated.
    prefixes = tuple(nlp.Defaults.prefixes) + ("-",)
    suffixes = tuple(nlp.Defaults.suffixes) + ("-", period_after_capital)

    prefix_re = spacy.util.compile_prefix_regex(prefixes)
    suffix_re = spacy.util.compile_suffix_regex(suffixes)
    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)

    return Tokenizer(
        nlp.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.Defaults.token_match,
    )

In [28]:
# Load the trained pipeline from `model_path`, then add a 'sentencizer'
# component to it.
nlp = load_model_from_path(model_path)
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [29]:
# Replace the loaded pipeline's tokenizer with the one built by custom_tokenizer().
nlp.tokenizer = custom_tokenizer(nlp)

In [30]:
# Entity label -> highlight colour. Passed to displaCy as the 'colors' option
# and also iterated as the list of labels to score, so key order matters.
colors = {
    # Speech attribution and stage directions
    "SPEAKER": "#9DD1F1",
    "M-SPEAKERS": "#508AA8",
    "COMMENT": "#F1FAEE",
    "HEAD-COMMENT": "#DAD5EE",
    # Structural separators (ACT and OTHER-SEP share one colour)
    "ACT": "#BA2222",
    "SCENE": "#E63946",
    "OTHER-SEP": "#BA2222",
    # Page furniture (all three share one colour)
    "HEADER": "#FFBA08",
    "FOOTER": "#FFBA08",
    "PAGE-NUMBER": "#FFBA08",
}

In [31]:
def actual():
    """Render the annotated entities of the evaluation sample with displaCy.

    Each `annotations['entities']` item is converted to the
    `{'start', 'end', 'label'}` dict used by displaCy's manual mode, and the
    result is displayed inline in the notebook.
    """
    text, annotations = load()

    entities = [
        {'start': span[0], 'end': span[1], 'label': span[2]}
        for span in annotations['entities']
    ]

    displacy.render(
        [{'text': text, 'ents': entities}],
        style="ent",
        manual=True,
        options={'colors': colors},
        jupyter=True,
    )

In [32]:
def prediction():
    """Run the pipeline on the evaluation text and render its entities inline."""
    sample = load()
    text = sample[0]  # the annotations in the second slot are not needed here

    predicted_doc = nlp(text)

    displacy.render(
        predicted_doc,
        style="ent",
        options={'colors': colors},
        jupyter=True,
    )

In [None]:
# Display the annotated entities loaded from the evaluation file.
actual()

In [33]:
# Display the entities the loaded pipeline predicts for the same text.
prediction()

In [42]:
class ScorerEntity(Scorer):
    """Scorer that accumulates NER counts for one entity label at a time."""

    def score_entity(self, tokens, gold, ent_label, verbose=False):
        """Update `self.ner` using only predicted entities labelled `ent_label`.

        Args:
            tokens: Predicted document; its `.ents` are the candidates.
            gold: Gold parse providing `orig_annot` and `cand_to_gold`.
            ent_label: Only predicted entities whose `label_` equals this
                value are considered.
            verbose: Accepted but not used by this method.

        Nothing is scored when any gold annotation ends in the tag '-'.
        A predicted entity whose first or last token has no gold alignment
        is counted directly as a false positive.
        """
        # Last field of every gold annotation (presumably the NER tag --
        # it is what tags_to_entities() is given).
        ner_tags = [annot[-1] for annot in gold.orig_annot]
        gold_ents = set(tags_to_entities(ner_tags))

        if '-' in ner_tags:
            return

        cand_ents = set()
        for ent in tokens.ents:
            if ent.label_ == ent_label:
                first = gold.cand_to_gold[ent.start]
                last = gold.cand_to_gold[ent.end - 1]
                if first is not None and last is not None:
                    cand_ents.add((ent.label_, first, last))
                else:
                    self.ner.fp += 1

        self.ner.score_set(cand_ents, gold_ents)
       

In [40]:
def evaluate():
    """Score the pipeline's predictions on the evaluation sample.

    Returns:
        The `Scorer` after scoring the predicted document against a
        `GoldParse` built from the sample's annotations.
    """
    sample_text, sample_annotations = load()

    # The gold parse is attached to a tokenized-only document; the prediction
    # comes from running the full pipeline on the same text.
    reference_doc = nlp.make_doc(sample_text)
    gold_parse = GoldParse(reference_doc, **sample_annotations)
    predicted_doc = nlp(sample_text)

    result = Scorer()
    result.score(predicted_doc, gold_parse)
    return result


def evaluate_by_ent():
    """Score the pipeline's predictions separately for every entity label.

    For each label a `GoldParse` holding only that label's annotated
    entities is built and compared, via `ScorerEntity.score_entity`, with the
    predicted entities carrying the same label.

    Returns:
        dict mapping label -> `ScorerEntity`. Every key of `colors` is
        present (in `colors` order), followed by any annotated label that is
        not in `colors`, in order of first appearance.
    """
    text, annotations = load()

    # Seed with the known labels so each of them is reported even when the
    # sample contains no entity of that kind.
    label_entities = {label: [] for label in colors}
    for annot in annotations['entities']:
        # setdefault: a label missing from the colour map gets its own bucket
        # instead of aborting the evaluation with a KeyError.
        label_entities.setdefault(annot[2], []).append(annot)

    doc = nlp.make_doc(text)
    prediction = nlp(text)

    scorers = {}
    for label, entities in label_entities.items():
        scorer = ScorerEntity()
        gold = GoldParse(doc, entities=entities)
        scorer.score_entity(prediction, gold, label)
        scorers[label] = scorer

    return scorers


def print_scorers(scorers):
    """Print a right-aligned table of entity precision, recall and F-score.

    Args:
        scorers: Mapping of label -> object whose `.scores` dict contains the
            keys 'ents_p', 'ents_r' and 'ents_f'.

    One header line is printed, then one line per label in mapping order,
    with each score rounded to two decimals.
    """
    metrics = ("ents_p", "ents_r", "ents_f")
    label_cell = "{:>12}"
    value_cell = "{:>8}"

    header = [label_cell.format("entity")]
    header.extend(value_cell.format(metric) for metric in metrics)
    print(*header)

    for label, scorer in scorers.items():
        row = [label_cell.format(label)]
        for metric in metrics:
            row.append(value_cell.format(round(scorer.scores[metric], 2)))
        print(*row)

In [36]:
# Overall scores of the pipeline on the evaluation sample.
scorer = evaluate()
print(scorer.scores)

{'uas': 0.0, 'las': 0.0, 'ents_p': 83.5430784123911, 'ents_r': 79.68605724838412, 'ents_f': 81.56899810964083, 'tags_acc': 0.0, 'token_acc': 100.0}


In [45]:
# Per-label entity precision / recall / F-score table.
scorers = evaluate_by_ent()
print_scorers(scorers)

      entity   ents_p   ents_r   ents_f
     SPEAKER    99.26    82.64    90.19
  M-SPEAKERS      0.0      0.0      0.0
     COMMENT     55.0    69.42    61.37
HEAD-COMMENT    98.89    80.18    88.56
         ACT    100.0    100.0    100.0
       SCENE      0.0      0.0      0.0
   OTHER-SEP      0.0      0.0      0.0
      HEADER      0.0      0.0      0.0
      FOOTER      0.0      0.0      0.0
 PAGE-NUMBER      0.0      0.0      0.0


In [36]:
# scorer = evaluate_orig(model_path,evaluation_data)
# %autoreload 2

# Load the sample and tokenize it in this cell so it no longer depends on
# `text`, `annotations` and `doc` left over from earlier kernel state.
text, annotations = load()
doc = nlp.make_doc(text)

# NOTE(review): `evaluate_orig` is not defined anywhere in the visible
# notebook; define or import it before running this cell on a fresh kernel.
scorer = evaluate_orig([[ text, annotations]])

# Character offset -> token index, for token starts and token ends.
starts = {token.idx: token.i for token in doc}
ends = {token.idx+len(token): token.i for token in doc}
# Hard-coded character span printed for manual inspection.
t = text[23549:23577]

print('t', t) #.text, t.idx, len(token))
#print('starts', starts)
#print('ends', ends)
for start_char, end_char, label in annotations['entities']:
    start_token = starts.get(start_char)
    end_token = ends.get(end_char)
    # Report every annotated span whose boundaries do not both fall on
    # token boundaries.
    if start_token is None or end_token is None:
        print("Tokenization failed start_char", start_char, "end_char", end_char, "start_token:", start_token, 'end_token:', end_token)
       

(5878, 'faisant', None, None, None, '-') -
(5879, 'encore', None, None, None, '-') -
(5880, 'la', None, None, None, '-') -
(5881, 'révérence.:Je', None, None, None, '-') -
(6278, 'HARPAGON.Ici', None, None, None, '-') -
(7526, 'à', None, None, None, '-') -
(7527, 'Élise):Après', None, None, None, '-') -
(7647, 'adressant', None, None, None, '-') -
(7648, 'la', None, None, None, '-') -
(7649, 'parole', None, None, None, '-') -
(7650, 'à', None, None, None, '-') -
(7651, 'Élise', None, None, None, '-') -
(7652, ',', None, None, None, '-') -
(7653, 'en', None, None, None, '-') -
(7654, "s'", None, None, None, '-') -
(7655, 'en', None, None, None, '-') -
(7656, 'allant', None, None, None, '-') -
(7657, 'du', None, None, None, '-') -
(7658, 'côté', None, None, None, '-') -
(7659, 'par', None, None, None, '-') -
(7660, 'où', None, None, None, '-') -
(7661, 'elle', None, None, None, '-') -
(7662, 'est', None, None, None, '-') -
(7663, 'sortie', None, None, None, '-') -
(7664, '.-', None, None

In [140]:
# Print the scores held by the current `scorer`.
# NOTE(review): `scorer` is assigned in more than one cell above, so the
# values shown here depend on which of those cells ran last.
print(scorer.scores)

{'uas': 0.0, 'las': 0.0, 'ents_p': 100.0, 'ents_r': 100.0, 'ents_f': 100.0, 'tags_acc': 0.0, 'token_acc': 100.0}
