In [1]:
# This notebook trains a new NER model that recognizes additional entity types.
# The new entities are defined in the list:
# ['Method', 'Generic', 'Task', 'Material', 'Eval', 'Other'].
# The model takes as input annotated sentences extracted from PDF files that
# describe methods, architectures, and applications of Deep Learning. The
# sentences have been annotated using Brat (http://brat.nlplab.org/).
# Training uses the statistical models provided by spaCy (https://spacy.io/).
# The trained model can be saved in a user-defined folder for future use.
#
# This material is based upon work supported by the Defense Advanced Research
# Projects Agency (DARPA) under Agreement No. HR00111990010.

In [2]:
from __future__ import unicode_literals, print_function

import time
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse
from spacy.scorer import Scorer

from brat2spacy import create_training_data
from ner_utils import ner_eval, test_ner_model

In [3]:
# Entity labels that main() registers on the 'ner' pipeline component.
new_entities_list = [
    'Method',
    'Generic',
    'Task',
    'Material',
    'Eval',
    'Other',
]

In [4]:
# Notebook-level configuration. main() binds these values as the defaults of
# its keyword arguments when its cell is executed.

# Number of training iterations (epochs).
n_iter = 20

# Inputs: annotated training data and the texts used for testing.
input_dir = './Data/Abstracts-annotated/'
test_dir = './Data/TestData/'

# Outputs: the saved model and the directory handed to the test step.
model_dir = './Models/'
output_dir = './Output/'

In [5]:
def main(model=None, new_model_name='DCC_ent', input_dir=input_dir, saved_model_dir=model_dir, output_dir=output_dir, test_dir=test_dir, n_iter=n_iter):
    """Train a spaCy entity recognizer on Brat annotations, then test, evaluate and save it.

    The labels in ``new_entities_list`` are added to the pipeline's 'ner'
    component before training. The directory and epoch defaults are bound to
    the notebook-level configuration values when this cell is executed.

    Args:
        model: name or path of an existing spaCy model to extend; when None a
            blank English pipeline is created and trained from scratch.
        new_model_name: value stored in ``nlp.meta['name']`` before saving.
        input_dir: directory handed to ``create_training_data`` (the Brat
            annotations).
        saved_model_dir: directory the trained pipeline is written to; pass
            None to skip saving.
        output_dir: forwarded to ``test_ner_model`` together with ``test_dir``
            (presumably where test results are written -- confirm against
            ner_utils). The model itself is NOT saved here.
        test_dir: directory of test texts for ``test_ner_model``; pass None to
            skip that step.
        n_iter: number of training iterations (epochs).

    Returns:
        None. Progress, per-epoch losses, training time and the evaluation
        results are printed.
    """
    # Build the training set from the Brat annotations. Each item is unpacked
    # below as a (text, annotations) pair.
    training_data = create_training_data(input_dir)

    # Start from the user's model if one is given, otherwise from a blank pipeline.
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded existing model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("No model provided, created blank 'en' model")

    # Make sure the pipeline has an entity recognizer we can add labels to.
    # nlp.create_pipe works for built-ins that are registered with spaCy.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    # Register every new entity label with the recognizer.
    for label in new_entities_list:
        ner.add_label(label)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        # 'begin_training' re-initializes the models and would zero out the
        # existing entity types, so only create an optimizer here.
        optimizer = nlp.entity.create_optimizer()

    # Only the entity recognizer should be updated: temporarily disable every
    # other component so a loaded model's remaining pipes are left untouched.
    # For a blank pipeline this list is empty and nothing is disabled.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Train the recognizer and time the whole run.
    training_start_time = time.time()
    with nlp.disable_pipes(*other_pipes):
        for itn in range(n_iter):
            random.shuffle(training_data)
            losses = {}
            # Batch size compounds from 4 towards 32 within each epoch.
            batches = minibatch(training_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('iter:', itn)
            print('Losses', losses)

    training_end_time = time.time()
    print("training time: ", training_end_time-training_start_time)

    # Run the trained model on the test texts (skipped when no test
    # directory is provided).
    if test_dir is not None:
        test_ner_model(nlp, test_dir, output_dir)

    # Model evaluation against a small hand-written ground truth. Each span
    # is (start_char, end_char, label) with an exclusive end offset and must
    # match the sentence it belongs to.
    examples = [
        ('Deep learning is applied in many every day application with great success in object recognition.',
         [(0, 13, 'Method'), (77, 95, 'Task')]),
        ('Recurrent neural networks are used for forecasting and natural language processing.',
         [(0, 25, 'Method'), (39, 50, 'Task'), (55, 82, 'Task')]),
        # 'Convolutional neural networks' / 'object recognition' /
        # 'medical image processing'
        ('Convolutional neural networks are frequently used in object recognition and medical image processing.',
         [(0, 29, 'Method'), (53, 71, 'Task'), (76, 100, 'Task')])
    ]
    res = ner_eval(nlp, examples)
    print("\nModel evaluation results:")
    print(res)

    # Save the trained model (skipped when no directory is provided).
    if saved_model_dir is not None:
        model_path = Path(saved_model_dir)
        if not model_path.exists():
            # parents=True so a nested target such as 'a/b/Models' can be created.
            model_path.mkdir(parents=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(model_path)
        print("The model was saved to the directory: ", model_path)

In [6]:
# Train from a blank English pipeline using the notebook configuration.
main(
    model=None,
    new_model_name='DCC_ent',
    input_dir=input_dir,
    saved_model_dir=model_dir,
    output_dir=output_dir,
    test_dir=test_dir,
    n_iter=n_iter,
)

No model provided, created blank 'en' model
iter: 0
Losses {'ner': 3022.2243793369453}
iter: 1
Losses {'ner': 2244.108322228706}
iter: 2
Losses {'ner': 2311.565472467265}
iter: 3
Losses {'ner': 2243.270687968303}
iter: 4
Losses {'ner': 2149.488937510091}
iter: 5
Losses {'ner': 2156.46981529255}
iter: 6
Losses {'ner': 2162.117791154143}
iter: 7
Losses {'ner': 2103.718651425457}
iter: 8
Losses {'ner': 1953.7203311534977}
iter: 9
Losses {'ner': 1994.9417897222372}
iter: 10
Losses {'ner': 1896.2301712836077}
iter: 11
Losses {'ner': 1886.722394054865}
iter: 12
Losses {'ner': 1834.0932025137326}
iter: 13
Losses {'ner': 1763.5886323446719}
iter: 14
Losses {'ner': 1754.6374313799643}
iter: 15
Losses {'ner': 1718.9872628686987}
iter: 16
Losses {'ner': 1758.185118803317}
iter: 17
Losses {'ner': 1693.5768959849142}
iter: 18
Losses {'ner': 1725.0825236104897}
iter: 19
Losses {'ner': 1548.6570398850354}
training time:  242.86782002449036

Model evaluation results:
{'uas': 0.0, 'las': 0.0, 'ents_p':