# Convert JSONL into list

In [10]:
import ujson
import spacy
from pathlib import Path

In [11]:
def read_jsonl(file_path):
    """Lazily parse a .jsonl file, one JSON document per line.
    file_path (unicode / Path): Location of the .jsonl file.
    YIELDS: The decoded JSON value of each line. Lines for which
        ujson.loads raises ValueError are skipped silently.
    """
    source = Path(file_path)
    with source.open('r', encoding='utf8') as handle:
        for raw_line in handle:
            try:
                # Best effort: one broken line must not abort the whole read.
                yield ujson.loads(raw_line.strip())
            except ValueError:
                pass

In [12]:
# def write_jsonl(file_path, lines):
#     """Create a .jsonl file and dump contents.
#     file_path (unicode / Path): The path to the output file.
#     lines (list): The JSON-serializable contents of each line.
#     """
#     data = [ujson.dumps(line, escape_forward_slashes=False) for line in lines]
#     Path(file_path).open('w', encoding='utf-8').write('\n'.join(data))

In [13]:
# Drain the lazy reader into a real list so the examples can be shuffled
# and iterated more than once.
list_of_train = []
path = read_jsonl("ner-train.jsonl")
list_of_train.extend(path)

# Training Process

In [14]:
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.
For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
Last tested with: v2.1.0
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding, decaying


# Training data
# main() unpacks every item as a (text, annotations) pair and reads
# annotations.get("entities"), using index 2 of each entity as its label.
# NOTE: this is the same list object as list_of_train (no copy is made), so
# the in-place random.shuffle(TRAIN_DATA) in main() reorders list_of_train too.
TRAIN_DATA = list_of_train



@plac.annotations(
    # plac reads each value as (help, kind, abbrev, type). A bare value such
    # as ("Final/") is only a parenthesised string and would be shown as help
    # text, not used as a default; defaults come from the signature below.
    model=("Name or path of an existing spaCy model; omit for a blank 'en' model",
           "positional", None, str),
    output_dir=("Directory that receives one 'model<N>' checkpoint per epoch",
                "positional", None, str),
    n_iter=("Maximum number of training epochs", "positional", None, int),
)
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    Trains the "ner" pipe on the module-level TRAIN_DATA with early stopping
    on the NER loss, then prints the predictions of the best available model
    for every training text. All progress output goes to "INFO-model.txt" in
    the current working directory.

    model (unicode): Name or path passed to spacy.load; None creates a blank
        'en' pipeline.
    output_dir (unicode / Path): Directory in which each epoch is saved as
        "model<epoch>". With None nothing is saved and the final in-memory
        model is used for the closing test.
    n_iter (int): Upper bound on the number of epochs; training stops earlier
        after 5 consecutive epochs whose loss is above the best one seen.
    """
    n_early_stopping = 5  # epochs without improvement tolerated before stopping
    best_loss = None
    best_iter = None  # tracked on every improvement so it always exists after the loop
    iter_since_best = 0

    # Explicit UTF-8 (as in read_jsonl) so non-ASCII entity text can always be
    # written, whatever the platform's default encoding is.
    with Path("INFO-model.txt").open("w", encoding="utf8") as text_file:
        if model is not None:
            nlp = spacy.load(model)  # load existing spaCy model
            print("Loaded model '%s'" % model, file=text_file)
        else:
            nlp = spacy.blank("en")  # create blank Language class
            print("Created blank 'en' model", file=text_file)

        # create the built-in pipeline component if the model lacks it,
        # otherwise fetch the existing one so labels can be added to it
        if "ner" not in nlp.pipe_names:
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner, last=True)
        else:
            ner = nlp.get_pipe("ner")

        # register every label that occurs in the training data
        for _, annotations in TRAIN_DATA:
            for ent in annotations.get("entities") or ():
                ner.add_label(ent[2])

        if output_dir is not None:
            output_dir = Path(output_dir)
            if not output_dir.exists():
                output_dir.mkdir(parents=True)

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        with nlp.disable_pipes(*other_pipes):  # only train NER
            # NOTE(review): spaCy's own v2.1 example appears to call
            # begin_training() only for a blank model, since it re-initialises
            # weights -- confirm that resetting a loaded model is intended.
            nlp.begin_training()
            # One dropout schedule for the whole run: re-creating it per epoch
            # would restart it at 0.5 every time and it would barely decay.
            dropout = decaying(0.5, 0.2, 0.0001)
            for itn in range(n_iter):
                random.shuffle(TRAIN_DATA)  # in place: reorders the shared list
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(TRAIN_DATA, size=compounding(1.0, 100.0, 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(
                        texts,  # batch of texts
                        annotations,  # batch of annotations
                        drop=next(dropout),  # dropout - make it harder to memorise data
                        losses=losses,
                    )
                print("Losses", losses, file=text_file)

                # save each epoch's model so the best one can be reloaded later
                if output_dir is not None:
                    output_epoch = output_dir / ("model%d" % itn)
                    nlp.to_disk(output_epoch)
                    print("Saved model to", output_epoch, file=text_file)

                # Early stopping: ties count as an improvement, as before.
                # .get() avoids a KeyError when there was nothing to train on.
                current_loss = losses.get("ner", 0.0)
                if best_loss is None or current_loss <= best_loss:
                    best_loss = current_loss
                    best_iter = itn
                    iter_since_best = 0
                else:
                    iter_since_best += 1

                if iter_since_best >= n_early_stopping:
                    print("Early stopping, best iteration is: {}".format(best_iter), file=text_file)
                    print("Best score = {}; Final iteration score = {}".format(best_loss, current_loss), file=text_file)
                    break

        # test the best model; checkpoints only exist when an output directory
        # was given and at least one epoch ran
        if best_iter is not None and output_dir is not None:
            output_best = output_dir / ("model%d" % best_iter)
            print("Loading from best model", output_best, file=text_file)
            nlp2 = spacy.load(output_best)
        else:
            print("No saved checkpoint available, testing the in-memory model", file=text_file)
            nlp2 = nlp
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents], file=text_file)
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc], file=text_file)

#         # save model to output directory
#         if output_dir is not None:
#             output_dir = Path(output_dir)
#             if not output_dir.exists():
#                 output_dir.mkdir()
#             nlp.to_disk(output_dir)
#             print("Saved model to", output_dir, file=text_file)

#             # test the saved model
#             print("Loading from", output_dir, file=text_file)
#             nlp2 = spacy.load(output_dir)
#             for text, _ in TRAIN_DATA:
#                 doc = nlp2(text)
#                 print("Entities", [(ent.text, ent.label_) for ent in doc.ents], file=text_file)
#                 print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc], file=text_file)
            
            
# if __name__ == "__main__":
#     plac.call(main)

In [None]:
# Fine-tune the Indonesian model for at most 1000 epochs, checkpoints in Final/.
main(model="id_ud-tag-dep-ner-1.0.0/", output_dir="Final/", n_iter=1000)