In [1]:
import os
import sys
# Put the notebook's parent directory on the import path (presumably so that
# project-local packages living there can be imported), without adding it twice.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import random
import datetime
import ujson as json
import csv
from glob import glob
from pathlib import Path

import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import read_json, minibatch, compounding, decaying, load_model_from_path
from lang.fr import model

In [3]:
# Directory that holds everything this run writes: the per-epoch checkpoints
# (`epoch-NNN`), `stats.csv` and the final `model-final` model.
output_path = Path("../models/plays-small-batches")
# Epoch number at which to resume. When it is not None, the pipeline is loaded
# from the `epoch-(N-1)` checkpoint, the training loop starts at N and rows are
# appended to the existing stats file. Set to None to start a fresh run.
resume_from_epoch = 880

In [4]:
def load_files(path_glob, name):
    """Read every JSON file matching ``path_glob`` and return them shuffled.

    Args:
        path_glob: Glob pattern selecting the files to read.
        name: Human-readable label for the dataset, used only in the log line.

    Returns:
        A list with one ``read_json`` result per matching file, in random
        order. The list is empty when nothing matches.
    """
    documents = [read_json(filepath) for filepath in glob(path_glob)]

    # Shuffle in place so the caller does not depend on filesystem ordering.
    random.shuffle(documents)

    print("Loaded %d %s files" % (len(documents), name))

    return documents

In [8]:
# Glob patterns for the annotated documents: training set and held-out evaluation set.
train_path = '../training/fulltext*.json'
evaluation_path = '../training/evaluation/fulltext*.json'


def reload_data():
    """Re-read both datasets from disk.

    Returns:
        A ``(train_data, test_data)`` tuple of lists as produced by
        ``load_files``. The training set is loaded first, then the test set.
    """
    print("Reload data")
    # Tuple elements are evaluated left to right, so train is loaded before test.
    return (
        load_files(train_path, "train"),
        load_files(evaluation_path, "test"),
    )

In [16]:
def ensure_scorable():
    """Check every training and evaluation file against the current pipeline.

    Each file is scored on its own with the module-level ``nlp`` pipeline. A
    file whose entity precision (``ents_p``) is exactly 0 is reported as
    ``NOOK`` and renamed to ``<path>.error``; any other file is reported as
    ``OK``. One status line is printed per file.

    Side effects:
        Renames files on disk and prints to stdout.
    """
    for path in glob(train_path) + glob(evaluation_path):
        text, annotations = read_json(path)

        gold = GoldParse(nlp.make_doc(text), **annotations)
        prediction = nlp(text)

        # A fresh scorer per file, so the score reflects this document alone.
        scorer = Scorer()
        scorer.score(prediction, gold)

        scorable = scorer.scores['ents_p'] != 0
        status = "OK" if scorable else "NOOK"

        if not scorable:
            os.rename(path, f"{path}.error")

        print("{:<5}".format(status), "{:<40}".format(path))
    

In [10]:
def evaluate(model_path, evaluation_data):
    """Score a saved model against evaluation data and persist the scores.

    The pipeline is loaded from ``model_path``, passed through
    ``model.configure`` and run over every example. The accumulated
    ``scorer.scores`` are written as JSON to ``accuracy.json`` inside the
    model directory.

    Args:
        model_path: Directory of a saved model, as a ``pathlib.Path`` or a
            string.
        evaluation_data: Iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is a mapping expanded into ``GoldParse`` keyword
            arguments.

    Returns:
        The ``Scorer`` holding the accumulated scores.
    """
    # Accept plain strings too; the `/` joins below need a Path.
    model_path = Path(model_path)

    nlp_loaded = load_model_from_path(model_path)
    model.configure(nlp_loaded)

    scorer = Scorer()
    for text, annotations in evaluation_data:
        gold = GoldParse(nlp_loaded.make_doc(text), **annotations)
        scorer.score(nlp_loaded(text), gold)

    with (model_path / 'accuracy.json').open('w') as file_:
        file_.write(json.dumps(scorer.scores))

    # The former `del nlp_loaded` sat after the return and never ran; the local
    # reference is dropped when the function returns anyway.
    return scorer

In [11]:
def save_model(model, path, optimizer):
    """Write ``model`` to ``path`` using the optimizer's averaged parameters.

    Args:
        model: Pipeline to save. Its ``meta['name']`` is set to ``"plays"``
            as a side effect.
        path: Destination passed to ``model.to_disk``.
        optimizer: Object whose ``averages`` are applied, through
            ``model.use_params``, while the model is being written.
    """
    model.meta['name'] = "plays"
    averaged_params = optimizer.averages
    with model.use_params(averaged_params):
        model.to_disk(path)

In [12]:
if resume_from_epoch is not None:
    # Resuming: pick up the checkpoint written by the epoch just before the
    # one the training loop will start at.
    model_path = output_path / ('epoch-%03d' % (resume_from_epoch - 1))
    nlp = load_model_from_path(model_path)
else:
    # Fresh run: start from the pretrained small French pipeline.
    # (Alternative noted in an earlier revision: spacy.blank('fr') followed by
    # nlp.begin_training().)
    nlp = spacy.load("fr_core_news_sm")

# Apply the project-specific pipeline configuration, then build the optimizer
# from the entity recogniser; both are reused by the training loop.
model.configure(nlp)
optimizer = nlp.entity.create_optimizer()

In [None]:
# Reuse the pipeline's entity recogniser when it has one; otherwise create it
# and append it as the last pipe.
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)

print(nlp.pipe_names)

In [None]:
# Entity labels registered with the NER component before training.
LABELS = (
    'ACT',
    'SCENE',
    'OTHER-SEP',
    'SPEAKER',
    'M-SPEAKERS',
    'COMMENT',
    'HEAD-COMMENT',
    'PAGE-NUMBER',
    'FOOTER',
)

for label in LABELS:
    ner.add_label(label)

In [17]:
# Validate first: ensure_scorable() renames unscorable files to `*.error`, so
# they no longer match the globs that reload_data() reads.
ensure_scorable()
train_data, test_data = reload_data()

OK    ../training/fulltext-636.json           
OK    ../training/fulltext-sample.json        
OK    ../training/fulltext-616.json           
OK    ../training/fulltext-354.json           
OK    ../training/fulltext-641.json           
OK    ../training/fulltext-15.json            
OK    ../training/fulltext-355.json           
OK    ../training/fulltext-640.json           
OK    ../training/fulltext-617.json           
OK    ../training/fulltext-441.json           
OK    ../training/fulltext-380.json           
OK    ../training/fulltext-38.json            
OK    ../training/fulltext-379.json           
OK    ../training/fulltext-512.json           
OK    ../training/fulltext-318.json           
OK    ../training/fulltext-363.json           
OK    ../training/fulltext-22.json            
OK    ../training/fulltext-637.json           
OK    ../training/fulltext-515.json           
OK    ../training/fulltext-407.json           
OK    ../training/fulltext-610.json           
OK    ../trai

In [18]:
stats_path = output_path / "stats.csv"
STATS_COLUMNS = ["epoch", "start_at", "end_at", "duration", "losses_ner", "ents_p", "ents_r", "ents_f"]

# Make sure the output directory exists before the stats file is opened.
output_path.mkdir(parents=True, exist_ok=True)

_resuming = resume_from_epoch is not None
# A fresh run always starts a new file with a header. A resumed run appends,
# but still needs the header if the file is missing or empty.
_needs_header = (
    not _resuming
    or not stats_path.exists()
    or stats_path.stat().st_size == 0
)

# newline="" lets the csv module control line endings itself, as the csv
# documentation requires for files handed to csv.writer.
stats_file = open(stats_path, "a" if _resuming else "w", newline="")
stats_csv = csv.writer(stats_file)
if _needs_header:
    stats_csv.writerow(STATS_COLUMNS)


def write_stats(epoch, start_at, end_at, losses, scores):
    """Append one epoch's row to the stats CSV and flush it to disk.

    Args:
        epoch: Epoch number.
        start_at: ``datetime`` at which the epoch started.
        end_at: ``datetime`` at which the epoch ended.
        losses: Mapping that must contain the ``'ner'`` loss.
        scores: Mapping of evaluation scores. The ``ents_p``, ``ents_r`` and
            ``ents_f`` entries may be absent (for example, an empty dict when
            the epoch was not evaluated), in which case the cells are empty.

    Raises:
        KeyError: If ``losses`` has no ``'ner'`` entry.
    """
    stats_csv.writerow([
        epoch,
        start_at,
        end_at,
        end_at - start_at,
        losses['ner'],
        scores.get("ents_p"),
        scores.get("ents_r"),
        scores.get("ents_f"),
    ])

    # Flush after every row so the file is current while training is running.
    stats_file.flush()

In [None]:
training_start_at = datetime.datetime.now()
print("Start training at", training_start_at)

best_loss = None
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# One dropout value is drawn per epoch from this schedule.
# NOTE(review): the schedule is created afresh here, so a resumed run starts it
# again from its initial value rather than continuing where it left off.
dropout = decaying(0.4, 0.2, 1e-4)

try:
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for epoch in range(resume_from_epoch or 0, 2000):
            start_at = datetime.datetime.now()

            # Re-read the data periodically so files changed on disk are picked up.
            if epoch % 10 == 0:
                train_data, test_data = reload_data()

            random.shuffle(train_data)
            losses = {}
            # Batch up the examples using spaCy's minibatch. The compounding
            # size schedule is rebuilt every epoch.
            batches = minibatch(train_data, size=compounding(1.1, 8., 1.001))

            drop = next(dropout)
            for batch in batches:
                texts, annotations = zip(*batch)

                try:
                    nlp.update(texts, annotations, sgd=optimizer, drop=drop, losses=losses)
                except Exception:
                    # Name the offending batch, then let the error propagate.
                    # KeyboardInterrupt is deliberately not caught here, so
                    # stopping the kernel does not dump a whole batch.
                    print("Problem with", texts, annotations)
                    raise

            epoch_model_path = output_path / ('epoch-%03d' % epoch)

            save_model(nlp, epoch_model_path, optimizer=optimizer)

            if best_loss is None or losses['ner'] < best_loss:
                best_loss = losses['ner']

            # Evaluate every fifth epoch and whenever this epoch set a new best loss.
            if epoch % 5 == 0 or best_loss == losses['ner']:
                scores = evaluate(epoch_model_path, test_data).scores
            else:
                scores = {}

            end_at = datetime.datetime.now()

            write_stats(epoch, start_at, end_at, losses, scores)

            print(f'Epoch {epoch}')
finally:
    # Close the stats file even when training is interrupted or fails.
    stats_file.close()

training_end_at = datetime.datetime.now()
print('Training ended at', training_end_at, 'took', training_end_at - training_start_at)

Start training at 2019-01-08 01:31:09.729845
Reload data
Loaded 89 train files
Loaded 16 test files
Epoch %d


In [None]:
# Save the final pipeline and score it on the held-out evaluation set.
model_path = output_path / 'model-final'
# Fixed: `optimzer` was a typo that raised NameError.
save_model(nlp, model_path, optimizer=optimizer)
# Fixed: evaluate() requires the evaluation data as its second argument.
scorer = evaluate(model_path, test_data)
print("Accuracy: ", scorer.scores)