In [1]:
# This notebook trains a new NER model that recognizes additional entity types.
# The new entities are defined in the list:
# ['Method', 'Generic', 'Task', 'Material', 'Eval', 'Other']
# The model is trained on annotated sentences extracted from PDF files that
# describe methods, architectures, and applications of Deep Learning. The
# sentences have been annotated using Brat (http://brat.nlplab.org/).
# Training uses the statistical models provided by spaCy
# (https://spacy.io/). The trained model can be saved in a user-defined folder
# for future use.
#
# This material is based upon work supported by Defense Advanced Research
# Projects Agency (DARPA) under Agreement No. HR00111990010


In [2]:
from __future__ import unicode_literals, print_function

import time
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse
from spacy.scorer import Scorer

from brat2spacy import *
from ner_model_eval import *
from test_dcc_entities import *

In [3]:
# Labels of the custom entity types that are added to the entity recognizer.
new_entities_list = [
    'Method',
    'Generic',
    'Task',
    'Material',
    'Eval',
    'Other',
]

In [4]:
# Data read by the notebook: annotated training data and the test texts.
input_dir = 'Data/Abstracts-annotated30/'
test_dir = 'Data/TestData'

# Directory where the trained model is written.
output_dir = 'Models/'

In [5]:
def main(model=None, new_model_name='DCC_ent', input_dir=input_dir, output_dir=output_dir, test_dir=test_dir, n_iter=50):
    """Set up the spaCy pipeline, train the entity recognizer, evaluate and save it.

    The entity labels that are added to the recognizer come from the
    module-level list ``new_entities_list``.

    Input -
      model: the name of an existing trained model; when None a blank 'en'
          model is created instead.
      new_model_name: the name of the new entity model (stored in nlp.meta).
      input_dir: directory with the Brat annotations, handed to
          create_training_data() to build the training set.
      output_dir: the path of the directory where the new trained model will
          be saved; when None the model is not saved.
      test_dir: directory handed to test_ner_model(); when None that step is
          skipped.
      n_iter: number of training iterations (epochs).
    Output -
      Nothing is returned. Progress, timing and evaluation results are
      printed, and the trained entity model is stored in output_dir.
    """
    # create the training data from the annotations produced by using Brat
    training_data = create_training_data(input_dir)

    # check if the user provides an existing language model
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded existing model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("No model provided, created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        # otherwise, get it, so we can add labels to it
        ner = nlp.get_pipe('ner')

    # add all new entities to the recognizer
    for label in new_entities_list:
        ner.add_label(label)

    # The training annotations only describe entities, so every other pipe of
    # a loaded model is switched off while training; otherwise nlp.update
    # would also update those components. The pipes are restored when the
    # `with` block exits. For a blank model the list is empty.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        if model is None:
            optimizer = nlp.begin_training()
        else:
            # Note that 'begin_training' initializes the models, so it'll zero out
            # existing entity types.
            optimizer = nlp.entity.create_optimizer()

        # start the training of the recognizer (and the time)
        training_start_time = time.time()
        for itn in range(n_iter):
            random.shuffle(training_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(training_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('iter:', itn)
            print('Losses', losses)

        training_end_time = time.time()
    print("training time: ", training_end_time-training_start_time)

    ############################
    # test the ner model on a set of text data taken from papers
    if test_dir is not None:
        test_ner_model(nlp, test_dir)

    ##########################
    # model evaluation
    #
    # define a set of examples that will be used as ground truth; every
    # (start, end, label) triple is a character span of its sentence
    examples = [
        ('Deep learning is applied in many every day application with great success in object recognition.',
         [(0, 13, 'Method'), (77, 95, 'Task')]),
        ('Recurrent neural networks are used for forecasting and natural language processing.',
         [(0, 25, 'Method'), (39, 50, 'Task'), (55, 82, 'Task')]),
        # spans: 'Convolutional neural networks', 'object recognition',
        # 'medical image processing'
        ('Convolutional neural networks are frequently used in object recognition and medical image processing.',
         [(0, 29, 'Method'), (53, 71, 'Task'), (76, 100, 'Task')])
    ]
    res = ner_eval(nlp, examples)
    print("\nModel evaluation results:")
    print(res)

    ############################################
    # save trained model
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # also create missing parent directories
            output_dir.mkdir(parents=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("The model was saved to the directory: ", output_dir)

        # To check the saved model, reload it with spacy.load(output_dir),
        # run it on a sample text and print ent.label_ / ent.text for each
        # entity in doc.ents.

In [7]:
# Train a fresh model (no existing model is loaded) for `nitr` epochs using
# the directories configured above.
nitr = 9
main(
    model=None,
    new_model_name='DCC_ent_nb',
    input_dir=input_dir,
    output_dir=output_dir,
    test_dir=test_dir,
    n_iter=nitr,
)

No model provided, created blank 'en' model
iter: 0
Losses {'ner': 108.21851252304975}
iter: 1
Losses {'ner': 85.76653762509036}
iter: 2
Losses {'ner': 81.27281272419792}
iter: 3
Losses {'ner': 74.9332996900288}
iter: 4
Losses {'ner': 74.14935253512775}
iter: 5
Losses {'ner': 69.09996159855076}
iter: 6
Losses {'ner': 59.436107283175836}
iter: 7
Losses {'ner': 52.70170359487369}
iter: 8
Losses {'ner': 51.998610874995855}
training time:  127.2059998512268

Entities detected in the text: 'non-local methods exploiting the self-similarity of natural signals have been well studied, for example in image analysis and restoration. existing approaches, however, rely on k-nearest neighbors (knn) matching in a ﬁxed feature space. the main hurdle in optimizing this feature space w. r. t. application performance is the non-differentiability of the knn selection rule. to overcome this, we propose a continuous deterministic relaxation of knn selection that maintains differentiability w. r. t. pairwise

(Figure 1). Finally, we can use RecurJac to evaluate the robustness of neural networks, by giving a certified lower bound within which no adversarial examples can be found.'
Method Deep neural networks 0 20
Generic major criticisms 89 105
Generic behavior 173 181
Method neural network 213 227
Generic input space 313 324
Generic critical step 433 446
Generic Jacobian matrix 487 502
Generic x within a 521 531
Task certain region 532 546
Method quantity 641 649
Generic Jacobian bound 679 693
Material know 727 731
Generic safe region 760 771
Generic input space 780 791
Generic robustness veriffcation 939 962
Method generative adversarial networks 975 1006
Method GANs 1008 1012
Generic training process 1044 1060
Generic Lipschitz constant 1143 1161
Method discriminator network 1169 1190
Generic 2017 1304 1308
Method neural networks 1369 1384
Generic Jacobian matrix 1390 1405
Generic Jacobian matrix 1447 1462
Generic generalization gap 1543 1561
Method Telgarsky 2017; Arora 1681 1702
Generic

Method b"b'under-estimation 367 387
Generic b"b'Jacobian matrix 410 429
Method b"b'neural network 512 530
Method b"b'computational cost 552 574
Generic b"b'Jacobian matrix 758 777
Generic ' 3970 802 808
Generic 2677 2683 810 819
Generic ' 2774 840 846
Generic b"b'Jacobian matrix 862 881
Material 2878 884 888
Generic b"b'stationary points 905 926

Entities detected in the text: 'b'Method' b'b"b\'Deep neural networks' 10 34
b'Method' b'b"b\'neural network' 52 70
b'Method' b'b"b\'critical step' 90 107
b'Generic' b'b"b\'Jacobian matrix' 128 147
b'Method' b'b"b\'discriminator network' 196 221
b'Generic' b'b"b\'Jacobian matrix' 242 261
b'Generic' b'b"b\'Jacobian matrix' 282 301
b'Method' b'b"b\'deep neural networks' 321 345
b'Method' b'1014 1036\n' 347 357
b'Method' b'b"b\'under-estimation' 367 387
b'Generic' b'b"b\'Jacobian matrix' 410 429
b'Method' b'b"b\'neural network' 512 530
b'Method' b'b"b\'computational cost' 552 574
b'Generic' b'b"b\'Jacobian matrix' 758 777
b'Generic' b"' 3970" 802