In [87]:
import random
import datetime
import ujson as json
import csv
from pathlib import Path
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import minibatch, compounding, load_model_from_path
from spacy import displacy
from spacy.cli.train import print_progress

In [18]:
model_name = "fr_core_news_md"
rehearsal_nlp = spacy.load(model_name)
output_path = Path("../models")

In [19]:
rehearsal_file = Path("../rehearsal/wikipedia.moliere.txt")
# rehearsal_file = Path("../rehearsal/combined.txt")
with open(rehearsal_file, 'r', encoding="utf-8") as file:
    text = file.read()
    rehearsal_source = rehearsal_nlp(text)

In [20]:
def rehearsal_texts(count):
    sents = list(rehearsal_source.sents)
    random_sentences = random.sample(sents, min(count, len(sents)))
    return list(map(lambda sent: sent.text.strip(), random_sentences))

In [56]:

def data_from_file(file_path):
    data = []
    with open(file_path, 'r', encoding="utf-8") as file:
        for line in file:
            if len(line.strip()) == 0:
                pass
            else:
                data.append(json.loads(line))

    return data


In [69]:
example_path = Path("../training/training-38.jsonl")
example_path = Path("../training/examples-201812278.jsonl")
evaluation_path = Path("../training/evaluation.jsonl")

example_data = data_from_file(example_path)
print("Loaded %d examples." % len(example_data))

evaluation_data = data_from_file(evaluation_path)
print("Loaded %d evaluation examples." % len(evaluation_data))

Loaded 34594 examples.
Loaded 7080 evaluation examples.


In [70]:
# evaluation_pct = 20

# random.shuffle(example_data)

# split_at = round(len(example_data) * evaluation_pct / 100)

# evaluation_data = example_data[:split_at]
# validation_data = example_data[split_at:]
validation_data = example_data

In [71]:
revision_data = []

revision_texts = rehearsal_texts(len(validation_data))
for doc in rehearsal_nlp.pipe(revision_texts):
    entities = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
  
    revision_data.append((doc.text, { 'entities': entities }))
    
print("Loaded %d random rehearsal sentences." % len(revision_data))

Loaded 1225 random rehearsal sentences.


In [72]:
train_data = revision_data + validation_data * 1
# train_data = revision_data + random.sample(example_data, 5)
print("Total train data contains %d examples (%d validation upsampled)." % (len(train_data), len(validation_data)))
print("Evaluation data contains %d examples" % len(evaluation_data))

Total train data contains 35819 examples (34594 validation upsampled).
Evaluation data contains 7080 examples


In [74]:
resume_from_it = 49

if resume_from_it is not None:
    nlp = load_model_from_path(output_path / ('model-%03d' % resume_from_it))
else:
    nlp = spacy.load(model_name)

In [75]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
# otherwise, get it, so we can add labels to it
else:
    ner = nlp.get_pipe('ner')

In [76]:
LABELS = ('ACT', 'SCENE', 'OTHER-SEP', 'SPEAKER', 'M-SPEAKERS', 'COMMENT', 'HEAD-COMMENT')
if resume_from_it is not None:
    for label in LABELS:
        ner.add_label(label)

In [77]:
optimizer = nlp.entity.create_optimizer()

In [85]:
stats_path = output_path / "stats.csv"
if resume_from_it is not None:
    stats_file = open(stats_path, "a")
    stats_csv = csv.writer(stats_file)
else:
    stats_file = open(stats_path, "w")
    stats_csv = csv.writer(stats_file)
    stats_csv.writerow(["epoch", "start_at", "end_at", "duration", "losses_ner", "ents_p", "ents_r", "ents_f"])

def write_stats(epoch, start_at, end_at, losses, scores):
    stats_csv.writerow([
        epoch,
        start_at, 
        end_at,
        end_at - start_at, 
        losses['ner'],
        scores.get("ents_p", None),
        scores.get("ents_r", None),
        scores.get("ents_f", None),
    ])
    
    stats_file.flush()

In [80]:
def evaluate(model_path, evaluation):
    nlp_loaded = load_model_from_path(model_path)
        
    scorer = Scorer()
    for (text, annotations) in evaluation:
        doc = nlp_loaded.make_doc(text)
        gold = GoldParse(doc, **annotations)
        prediction = nlp_loaded(text)

        scorer.score(prediction, gold)
    
    acc_loc = (model_path / 'accuracy.json')
    with acc_loc.open('w') as file_:
        file_.write(json.dumps(scorer.scores))
            
    return scorer

    del nlp_loaded


def save_model(model, path):
    with model.use_params(optimizer.averages):
        model.to_disk(path)

In [86]:
training_start_at = datetime.datetime.now()
print("Start training at", training_start_at)

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    for epoch in range(resume_from_it or 0, 201):
        start_at = datetime.datetime.now()
        random.shuffle(train_data)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
   

        for batch in batches:  
            texts, annotations = zip(*batch)

            try:
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            except:
                print("Problem with", texts, annotations)
                raise
                    
        epoch_model_path = output_path / ('model-%03d' % epoch)
        
        save_model(nlp, epoch_model_path)
        
        if epoch % 5 == 0:
            scores = evaluate(epoch_model_path, evaluation_data).scores
        else:
            scores = {}
        
        end_at = datetime.datetime.now()
        
        write_stats(epoch, start_at, end_at, losses, scores)
       
        print('Epoch %d, losses' % epoch, losses, scores, 'end_at:', end_at, 'duration:', end_at - start_at)
            
            
stats_file.close()
training_end_at = datetime.datetime.now()
print('Training ended at', training_end_at, 'took', training_end_at - training_start_at)

Start training at 2018-12-30 12:08:01.047060
Epoch 49, losses {'ner': 10.473067509879572} {} end_at: 2018-12-30 12:17:24.679044 duration: 0:09:23.631302
Epoch 50, losses {'ner': 8.474316215664377} {'uas': 0.0, 'las': 0.0, 'ents_p': 92.5900435879789, 'ents_r': 93.77323420074349, 'ents_f': 93.17788295047905, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2018-12-30 12:30:22.037485 duration: 0:12:57.357061
Epoch 51, losses {'ner': 7.353596541617581} {} end_at: 2018-12-30 12:40:08.423886 duration: 0:09:46.385710
Epoch 52, losses {'ner': 6.946924080998046} {} end_at: 2018-12-30 12:49:00.922499 duration: 0:08:52.498147
Epoch 53, losses {'ner': 7.011000112489917} {} end_at: 2018-12-30 12:57:50.598325 duration: 0:08:49.675177
Epoch 54, losses {'ner': 6.725677622729314} {} end_at: 2018-12-30 13:06:38.956537 duration: 0:08:48.357796
Epoch 55, losses {'ner': 6.709388064734361} {'uas': 0.0, 'las': 0.0, 'ents_p': 93.46330275229357, 'ents_r': 94.67936802973978, 'ents_f': 94.06740535549399, 'tags_acc':

Epoch 109, losses {'ner': 3.933046725733129} {} end_at: 2018-12-30 22:22:25.053526 duration: 0:11:04.231847
Epoch 110, losses {'ner': 4.294804543189605} {'uas': 0.0, 'las': 0.0, 'ents_p': 93.49704411095952, 'ents_r': 95.53903345724906, 'ents_f': 94.50700988278557, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2018-12-30 22:34:58.025399 duration: 0:12:32.971523
Epoch 111, losses {'ner': 3.9768068124920024} {} end_at: 2018-12-30 22:46:30.899558 duration: 0:11:32.873655
Epoch 112, losses {'ner': 3.6770493011478615} {} end_at: 2018-12-30 22:58:54.195371 duration: 0:12:23.295162
Epoch 113, losses {'ner': 4.0322246725655715} {} end_at: 2018-12-30 23:11:27.329447 duration: 0:12:33.133053
Epoch 114, losses {'ner': 4.096595093699839} {} end_at: 2018-12-30 23:22:22.515814 duration: 0:10:55.185538
Epoch 115, losses {'ner': 3.7638948533295418} {'uas': 0.0, 'las': 0.0, 'ents_p': 92.9283771532185, 'ents_r': 95.26022304832715, 'ents_f': 94.07985314364387, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2

Epoch 169, losses {'ner': 4.275872084941666} {} end_at: 2018-12-31 09:45:07.123860 duration: 0:12:37.410661
Epoch 170, losses {'ner': 3.532226822977235} {'uas': 0.0, 'las': 0.0, 'ents_p': 93.19945230488361, 'ents_r': 94.88847583643123, 'ents_f': 94.03638038222427, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 2018-12-31 09:59:34.854669 duration: 0:14:27.730102
Epoch 171, losses {'ner': 3.197744350510814} {} end_at: 2018-12-31 17:33:17.986234 duration: 7:33:43.129665
Epoch 172, losses {'ner': 3.4619128280714957} {} end_at: 2018-12-31 17:45:59.814813 duration: 0:12:41.827269
Epoch 173, losses {'ner': 3.709958743090594} {} end_at: 2018-12-31 17:59:50.646661 duration: 0:13:50.831022
Epoch 174, losses {'ner': 4.060656280560203} {} end_at: 2018-12-31 18:14:29.132071 duration: 0:14:38.483167
Epoch 175, losses {'ner': 4.362917160812601} {'uas': 0.0, 'las': 0.0, 'ents_p': 94.61697722567288, 'ents_r': 95.56226765799256, 'ents_f': 95.08727314761299, 'tags_acc': 0.0, 'token_acc': 100.0} end_at: 201

KeyboardInterrupt: 

In [59]:
model_path = output_path / 'model-final'
save_model(nlp, model_path)
scorer = evaluate(model_path, evaluation_data)
print("Accuracy: ", scorer.scores)

Accuracy:  {'uas': 0.0, 'las': 0.0, 'ents_p': 89.28894927536231, 'ents_r': 91.61245353159852, 'ents_f': 90.43577981651376, 'tags_acc': 0.0, 'token_acc': 100.0}


In [53]:
evaluation_data2 = [
    ("Acte Premier",  { 'entities': [(0, 12, "ACT")]}),
    ("scène 1",  { 'entities': [(0, 7, "SCENE")]}),
    ("LE GARAGISTE(, avec humeur.)\nJe suis ton père", { 'entities': [(0, 12, "SPEAKER"), (15, 27, "HEAD-COMMENT")] }),
    ("LUCIEN, étonné : c'est pas vrai", { 'entities': [(0, 6, "SPEAKER"), (8, 14, "HEAD-COMMENT")]}),
    ("LE PREMIER : eh si",  { 'entities': [(0, 10, "SPEAKER")]}),
    ("LES AUTRES\nnon de dieu",  { 'entities': [(0, 10, "SPEAKER")]}),
    ("LUCIEN et LUCIENNE , bien ensemble : c'est nous tous",  { 'entities': [(0, 19, "M-SPEAKER"), (22, 35, "HEAD-COMMENT")]}),
    ("RIDEAU", { 'entities': [(0, 6, "COMMENT")]})
]

# scorer = evaluate(model_path, evaluation_data2)
# print("Accuracy: ", scorer.scores)

Accuracy:  {'uas': 0.0, 'las': 0.0, 'ents_p': 100.0, 'ents_r': 100.0, 'ents_f': 100.0, 'tags_acc': 0.0, 'token_acc': 100.0}


In [51]:
def render_evaluation(model_path, evaluation):
    nlp_loaded = load_model_from_path(model_path)
    for (text, _) in evaluation:
        doc = nlp_loaded(text)
        if len(doc.ents) == 0:
            print(text)
        else:
            displacy.render(doc, style="ent", jupyter=True)

render_evaluation(model_path, evaluation_data2) 

In [49]:
def debug_evaluation(evaluation):
    text, _ = zip(*evaluation)
    
    docs = nlp.pipe(text)
    
    for doc in docs:
        [print(
            "{:>12}".format(e.label_),
            "{:<3}".format(e.start_char),
            "->",
            "{:<3}".format(e.end_char),
            e.text) for e in doc.ents]
debug_evaluation(evaluation_data2)

         ACT 0   -> 12  Acte Premier
       SCENE 0   -> 7   scène 1
     SPEAKER 0   -> 12  LE GARAGISTE
HEAD-COMMENT 15  -> 27  avec humeur.
     SPEAKER 0   -> 6   LUCIEN
HEAD-COMMENT 8   -> 14  étonné
     SPEAKER 0   -> 10  LE PREMIER
     SPEAKER 0   -> 10  LES AUTRES
  M-SPEAKERS 0   -> 18  LUCIEN et LUCIENNE
     COMMENT 0   -> 6   RIDEAU


[49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79]