In [9]:
# %load_ext autoreload
import os
import sys
# Put the notebook's parent directory on the import path (once), so that
# packages living one level up can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [10]:
import ujson as json
from pathlib import Path
from glob import glob
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import load_model_from_path
from lang.fr import model

In [39]:
# Directory of the trained pipeline handed to load_model_from_path().
model_path = '../models/plays-small-batches/epoch-878'
# Glob pattern for the training document(s) that get scored.
train_path = "../training/fulltext-475.json"
# Glob pattern for the held-out evaluation documents that get scored.
evaluation_path = "../training/evaluation/fulltext*.json"

In [16]:
def load_files(path_glob):
    """Load every JSON file matching a glob pattern.

    Args:
        path_glob: Glob pattern (as accepted by ``glob.glob``) selecting the
            files to read.

    Returns:
        A list with one dict per matched file, each holding:
            'path': the matched file path, as returned by ``glob``;
            'json': the decoded content of that file, as returned by
                ``load_file``.
        The list is empty when nothing matches the pattern.
    """
    # glob() gives no ordering guarantee; sort so repeated runs see the files
    # in the same order.
    files = sorted(glob(path_glob))

    # The original called an undefined `read_json`; `load_file` is the JSON
    # reader this notebook actually defines.
    return [
        {
            'path': filepath,
            'json': load_file(filepath),
        }
        for filepath in files
    ]

In [14]:
# Load the spaCy pipeline stored under `model_path`; `nlp` is the shared
# pipeline object used by evaluate() further down.
nlp = load_model_from_path(model_path)
# Hand the freshly loaded pipeline to the project's lang.fr.model.configure hook.
# NOTE(review): what configure() changes on `nlp` is not visible here — confirm.
model.configure(nlp)

In [17]:
def load_file(path):
    """Read the UTF-8 encoded file at ``path`` and return its decoded JSON.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever Python object the JSON document decodes to.
    """
    with open(path, mode='r', encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

def evaluate(path):
    """Score the pipeline's prediction for one annotated file.

    The file is read with ``load_file`` and unpacked into two values,
    ``text`` and ``annotations``; ``annotations`` is expanded as keyword
    arguments to ``GoldParse``.
    NOTE(review): the exact annotation keys are not visible here — confirm
    against the training-data files.

    Args:
        path: Location of the annotated JSON file.

    Returns:
        The ``scores`` attribute of a fresh ``Scorer`` after scoring the
        prediction of the module-level ``nlp`` pipeline against the gold
        annotations.
    """
    text, annotations = load_file(path)

    # Reference annotations are attached to a doc built with nlp.make_doc,
    # while the prediction comes from calling the pipeline on the same text.
    gold = GoldParse(nlp.make_doc(text), **annotations)
    prediction = nlp(text)

    scorer = Scorer()
    scorer.score(prediction, gold)
    return scorer.scores

In [40]:
# Score the training file(s) followed by every evaluation file.
paths = glob(train_path) + glob(evaluation_path)
for path in paths:
    scores = evaluate(path)

    if scores['ents_p'] == 0:
        # Zero entity precision: flag the file and move it aside by
        # appending an ".error" suffix to its name.
        status = "NOOK"
        os.rename(path, f"{path}.error")
    else:
        status = "OK"

    # One report line per file: status, path, then the three entity scores
    # rounded to two decimals.
    print(
        f"{status:<5}",
        f"{path:<40}",
        "ents_p:", round(scores['ents_p'], 2),
        "ents_r:", round(scores['ents_r'], 2),
        "ents_f:", round(scores['ents_f'], 2),
    )
    

NOOK  ../training/fulltext-475.json            ents_p: 0.0 ents_r: 0.0 ents_f: 0.0
