In [1]:
import random
import datetime
import ujson as json
import csv
from glob import glob
from pathlib import Path
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import read_json, minibatch, compounding, decaying, load_model_from_path
from spacy import displacy

In [2]:
# Directory that receives the per-epoch checkpoints, the final model and stats.csv.
output_path = Path("..", "models", "plays")
# Saved model that is loaded instead of the stock one when training is resumed.
model_path = Path("..", "models", "model-127")
# First epoch number of a resumed run; None starts a fresh run (e.g. 49).
resume_from_epoch = None

In [3]:
# glob() makes no promise about ordering (it depends on the file system),
# so sort the paths to load the example files in the same order on every run.
files = sorted(glob('../training/fulltext*.json'))
# One read_json() result per example file.
examples_data = [read_json(filepath) for filepath in files]

print("Loaded %d examples files" % len(examples_data))

Loaded 50 examples files


In [4]:
# Share of the example files that is held out for evaluation, in percent.
evaluation_pct = 20
# Fixed seed: the same input order always yields the same train/test split.
# Without it, a resumed run (resume_from_epoch set) would draw a different
# test set and could evaluate on texts the model was already trained on.
split_seed = 0

# Shuffle a copy with a dedicated RNG so that re-running this cell is
# idempotent and the global `random` state is left untouched.
shuffled_examples = list(examples_data)
random.Random(split_seed).shuffle(shuffled_examples)

split_at = round(len(shuffled_examples) * evaluation_pct / 100)

test_data = shuffled_examples[:split_at]
train_data = shuffled_examples[split_at:]

print("Defined %d train texts, %d test texts" % (len(train_data), len(test_data)))

Defined 40 train texts, 10 test texts


In [None]:
def evaluate(model_path, evaluation_data):
    """Score a saved model against annotated examples.

    Loads the model stored at ``model_path``, runs it on every text of
    ``evaluation_data`` and feeds each prediction together with its gold
    annotation to a ``Scorer``. The resulting ``scorer.scores`` are also
    written as JSON to ``model_path / 'accuracy.json'`` (overwriting any
    previous file).

    Args:
        model_path: directory of the saved model; must support the ``/``
            operator (e.g. a ``pathlib.Path``).
        evaluation_data: iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is a dict that is passed to ``GoldParse`` as
            keyword arguments.

    Returns:
        The populated ``Scorer``; its ``scores`` attribute holds the metrics.
    """
    nlp_loaded = load_model_from_path(model_path)

    scorer = Scorer()
    for (text, annotations) in evaluation_data:
        # The gold standard is attached to a tokenized-only doc ...
        doc = nlp_loaded.make_doc(text)
        gold = GoldParse(doc, **annotations)
        # ... while the prediction comes from running the loaded pipeline.
        prediction = nlp_loaded(text)

        scorer.score(prediction, gold)

    acc_loc = (model_path / 'accuracy.json')
    with acc_loc.open('w') as file_:
        file_.write(json.dumps(scorer.scores))

    # The former `del nlp_loaded` after this return was unreachable; the
    # local reference is dropped anyway when the function returns.
    return scorer

In [None]:
def save_model(model, path, optimizer):
    """Name ``model`` "plays" and write it to ``path``.

    The model is serialized while the optimizer's ``averages`` are swapped
    in through ``model.use_params``.
    """
    model.meta['name'] = "plays"
    averaged_params = optimizer.averages
    with model.use_params(averaged_params):
        model.to_disk(path)

In [7]:
# Pick the starting pipeline: the model saved at model_path when resuming,
# otherwise the stock "fr_core_news_sm" model.
# (Alternative kept from the original notebook: start from spacy.blank('fr')
#  and take the optimizer from nlp.begin_training().)
if resume_from_epoch is not None:
    nlp = load_model_from_path(model_path)
else:
    nlp = spacy.load("fr_core_news_sm")

# Both starting points get their optimizer from the pipeline's entity recognizer.
optimizer = nlp.entity.create_optimizer()

In [None]:
# Show which components the loaded pipeline contains.
print(nlp.pipe_names)
# Run the pipeline once on a short text; as the last expression of the cell
# the resulting doc is displayed.
nlp("Bonjour :")

In [9]:
# Reuse the pipeline's "ner" component when it has one; otherwise create a
# new one and append it as the last component.
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)

print(nlp.pipe_names)

['tagger', 'parser', 'ner']


In [10]:
# Entity labels registered on the entity recognizer.
LABELS = (
    'ACT',
    'SCENE',
    'OTHER-SEP',
    'SPEAKER',
    'M-SPEAKERS',
    'COMMENT',
    'HEAD-COMMENT',
)

# Labels are only added on a fresh run, not when resuming from a saved epoch.
if resume_from_epoch is None:
    for label in LABELS:
        ner.add_label(label)

In [None]:
stats_path = output_path / "stats.csv"
# The output directory may not exist yet; open() would fail without it.
output_path.mkdir(parents=True, exist_ok=True)

# A resumed run appends to the existing statistics, a fresh run starts a new
# file. The header row is written for a fresh file and also when a resumed
# run finds no (or an empty) statistics file.
resuming = resume_from_epoch is not None
write_header = not (resuming and stats_path.exists() and stats_path.stat().st_size > 0)

# newline="" is what the csv module requires of files handed to csv.writer,
# so that the writer controls the line endings itself.
stats_file = open(stats_path, "a" if resuming else "w", newline="")
stats_csv = csv.writer(stats_file)
if write_header:
    stats_csv.writerow(["epoch", "start_at", "end_at", "duration", "losses_ner", "ents_p", "ents_r", "ents_f"])

def write_stats(epoch, start_at, end_at, losses, scores):
    """Append one epoch's statistics to the module-level stats CSV.

    The row holds the epoch, its start and end, the elapsed time
    (``end_at - start_at``), ``losses['ner']`` and the ``ents_p`` /
    ``ents_r`` / ``ents_f`` entries of ``scores`` (``None`` for any that is
    missing). The file is flushed after every row.
    """
    duration = end_at - start_at
    row = [epoch, start_at, end_at, duration, losses['ner']]
    row.extend(scores.get(metric) for metric in ("ents_p", "ents_r", "ents_f"))

    stats_csv.writerow(row)
    # Push the row to disk right away so it is not lost in a buffer.
    stats_file.flush()

In [12]:
#for (text, annotations) in zip(*test_data):
# print(test_data[0][0])

In [13]:
training_start_at = datetime.datetime.now()
print("Start training at", training_start_at)

# Lowest NER loss seen so far in this run; None until the first epoch finished.
best_loss = None
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Dropout schedule from decaying(0.3, 0.15, 1e-4); one value is drawn per epoch.
dropout = decaying(0.3, 0.15, 1e-4)

try:
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for epoch in range(resume_from_epoch or 0, 500):
            start_at = datetime.datetime.now()
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch; the compounding
            # batch-size schedule is restarted for every epoch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))

            drop = next(dropout)
            for batch in batches:
                texts, annotations = zip(*batch)

                try:
                    nlp.update(texts, annotations, sgd=optimizer, drop=drop, losses=losses)
                except Exception:
                    # Report the offending batch, then let the error propagate.
                    # Only Exception is caught so that interrupting the kernel
                    # (KeyboardInterrupt) does not dump whole texts.
                    print("Problem with", texts, annotations)
                    raise

            # Every epoch is checkpointed to its own directory.
            epoch_model_path = output_path / ('epoch-%03d' % epoch)

            save_model(nlp, epoch_model_path,  optimizer=optimizer)

            if best_loss is None or losses['ner'] < best_loss:
                best_loss = losses['ner']

            # Evaluate on every fifth epoch and whenever this epoch's loss is
            # the best one so far; other epochs log empty scores.
            if epoch % 5 == 0 or best_loss == losses['ner']:
                scores = evaluate(epoch_model_path, test_data).scores
            else:
                scores = {}

            end_at = datetime.datetime.now()

            write_stats(epoch, start_at, end_at, losses, scores)

            print('Epoch %d, losses' % epoch, losses, scores, 'end_at:', end_at, 'duration:', end_at - start_at)
finally:
    # Close the statistics file even when training fails or is interrupted.
    stats_file.close()

training_end_at = datetime.datetime.now()
print('Training ended at', training_end_at, 'took', training_end_at - training_start_at)

Start training at 2019-01-01 09:55:56.968891
Epoch 0, losses {'ner': 17713.693603515625} {'uas': 0.0, 'las': 0.0, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2019-01-01 10:01:25.069720 duration: 0:05:28.099793
Epoch 1, losses {'ner': 11155.94056892395} {'uas': 0.0, 'las': 0.0, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2019-01-01 10:08:01.320909 duration: 0:06:36.248408
Epoch 2, losses {'ner': 10590.818182945251} {'uas': 0.0, 'las': 0.0, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2019-01-01 10:13:46.695568 duration: 0:05:45.364247
Epoch 3, losses {'ner': 10190.166845321655} {'uas': 0.0, 'las': 0.0, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2019-01-01 10:19:15.301327 duration: 0:05:28.604922
Epoch 4, losses {'ner': 9838.853479385376} {'uas': 0.0, 'las': 0.0, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0,

Epoch 49, losses {'ner': 9187.652970790863} {} end_at: 2019-01-01 19:07:00.428976 duration: 0:07:28.084741
Epoch 50, losses {'ner': 9174.564269542694} {'uas': 0.0, 'las': 0.0, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2019-01-01 19:16:02.574712 duration: 0:09:02.142373
Epoch 51, losses {'ner': 9191.962679862976} {} end_at: 2019-01-01 19:23:49.101289 duration: 0:07:46.509793
Epoch 52, losses {'ner': 9165.61578655243} {'uas': 0.0, 'las': 0.0, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2019-01-01 19:33:18.359111 duration: 0:09:29.256094
Epoch 53, losses {'ner': 9148.335360527039} {'uas': 0.0, 'las': 0.0, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2019-01-01 19:40:34.890631 duration: 0:07:16.529085
Epoch 54, losses {'ner': 9200.39971446991} {} end_at: 2019-01-01 19:49:50.570165 duration: 0:09:15.678608
Epoch 55, losses {'ner': 9194.024083137512} {'ua

KeyboardInterrupt: 

In [None]:
# Save the trained pipeline as the final model and score it on the held-out
# test texts. (Fixes the `optimzer` typo and the missing evaluation data
# argument, both of which made this cell fail.)
model_path = output_path / 'model-final'
save_model(nlp, model_path, optimizer=optimizer)
scorer = evaluate(model_path, test_data)
print("Accuracy: ", scorer.scores)

In [None]:
# Small set of (text, annotations) examples. Each entity is
# (start_char, end_char, label): character offsets into the text with an
# exclusive end, so text[start_char:end_char] is exactly the entity text.
# Labels must be ones the recognizer knows (multi-speaker lines use
# "M-SPEAKERS").
evaluation_data2 = [
    ("Acte Premier",  { 'entities': [(0, 12, "ACT")]}),
    ("scène 1",  { 'entities': [(0, 7, "SCENE")]}),
    ("LE GARAGISTE(, avec humeur.)\nJe suis ton père", { 'entities': [(0, 12, "SPEAKER"), (15, 27, "HEAD-COMMENT")] }),
    ("LUCIEN, étonné : c'est pas vrai", { 'entities': [(0, 6, "SPEAKER"), (8, 14, "HEAD-COMMENT")]}),
    ("LE PREMIER : eh si",  { 'entities': [(0, 10, "SPEAKER")]}),
    ("LES AUTRES\nnon de dieu",  { 'entities': [(0, 10, "SPEAKER")]}),
    # Offsets cover "LUCIEN et LUCIENNE" and "bien ensemble" without the
    # surrounding spaces.
    ("LUCIEN et LUCIENNE , bien ensemble : c'est nous tous",  { 'entities': [(0, 18, "M-SPEAKERS"), (21, 34, "HEAD-COMMENT")]}),
    ("RIDEAU", { 'entities': [(0, 6, "COMMENT")]})
]

# Score the model at model_path on the evaluation_data2 examples and print all
# metrics; evaluate() also rewrites accuracy.json inside model_path.
scorer = evaluate(model_path, evaluation_data2)
print("Accuracy: ", scorer.scores)

In [None]:
def render_evaluation(model_path, evaluation):
    """Show what the model saved at ``model_path`` finds in each text.

    ``evaluation`` is an iterable of ``(text, annotations)`` pairs; the
    annotations are ignored. A text with at least one predicted entity is
    rendered with displaCy's "ent" style, a text without any is printed as is.
    """
    loaded_pipeline = load_model_from_path(model_path)
    for (text, _) in evaluation:
        doc = loaded_pipeline(text)
        if len(doc.ents) > 0:
            displacy.render(doc, style="ent", jupyter=True)
        else:
            print(text)


render_evaluation(model_path, evaluation_data2)

In [None]:
def debug_evaluation(evaluation):
    """Print every entity the module-level ``nlp`` pipeline finds.

    For each predicted entity one line is printed with its label, start and
    end character offsets and text.

    Args:
        evaluation: iterable of ``(text, annotations)`` pairs; only the texts
            are used. An empty iterable prints nothing.
    """
    # Collect the texts directly; unpacking zip(*evaluation) raised a
    # ValueError when there were no examples at all.
    texts = [text for text, _ in evaluation]

    # Plain loops instead of a list comprehension that was only run for its
    # print() side effect.
    for doc in nlp.pipe(texts):
        for ent in doc.ents:
            print(
                "{:>12}".format(ent.label_),
                "{:<3}".format(ent.start_char),
                "->",
                "{:<3}".format(ent.end_char),
                ent.text)


debug_evaluation(evaluation_data2)