In [None]:
# jsonlines is used below to read the annotated dataset files (*.json1),
# which store one JSON object per line.
!pip install jsonlines

In [None]:
import jsonlines
import string 
import pandas as pd
import json
import glob
import random
import re

#Spacy imports
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy import displacy
from collections import Counter
from spacy.util import minibatch, compounding
from spacy.util import decaying

import warnings
import pickle
import plac
from pathlib import Path
from tqdm import tqdm

spacy.prefer_gpu()

In [None]:
# Read every *.json1 file in the working directory and collect all of its
# records (one JSON object per line) into a single flat list.
lst = []
for path in glob.glob("*.json1"):
    with jsonlines.open(path) as reader:
        lst.extend(reader)

In [None]:
# Build the dataset as (text, {'entities': [(start, end, tag), ...]}) pairs.
# The annotations are cleaned in further steps.
Dataset = []
for data in lst:
    raw_text = data['text']
    # strip() below removes leading whitespace, which moves every remaining
    # character to the left. Shift the annotation offsets by the same amount
    # so that text[start:end] still selects the annotated words.
    lead = len(raw_text) - len(raw_text.lstrip())
    ents = [
        (max(entity[0] - lead, 0), max(entity[1] - lead, 0), *entity[2:])
        for entity in data['labels']
    ]
    Dataset.append((raw_text.lower().strip(), {'entities': ents}))
random.shuffle(Dataset)
# NOTE(review): the split bounds are hard-coded; with fewer than 642 records
# test_dataset is empty. No random seed is set, so the split changes per run.
test_dataset = Dataset[641:741]
Dataset = Dataset[0:641]
print(len(Dataset))

In [None]:
#creating a dictionary of the annotations
def create_dict(x):
    dictionary = {}
    text = x[0]
    annot = x[1]
    index = annot['entities']
    for start,end,tag in index:
        word = text[start:end].lower().strip()
        dictionary[word]=tag
    return dictionary

In [None]:
# Merge the phrase -> tag mappings of every example into one lookup table.
# When a phrase appears in several examples, the tag seen last is kept.
dictionary = {}
for example in Dataset:
    dictionary.update(create_dict(example))

# Normalise the keys once more (lower-case, no surrounding whitespace).
dictionary = {phrase.strip().lower(): tag for phrase, tag in dictionary.items()}
len(dictionary)

In [None]:
# View over the annotated phrases collected in `dictionary`.
# NOTE(review): not referenced by any other visible cell.
dict_keys = dictionary.keys()
# dict_keys
# dictionary

In [None]:
# Dataset[0][0]

In [None]:
#Checking if overlapping annotations exist. For example "Judgement and udgement(wrongly annotated)" or "Supreme Court and"
def is_overlap(a, b):
    """Return True when the intervals ``a`` and ``b`` overlap.

    Both arguments are ``(start, end)`` pairs with an exclusive end, so
    intervals that merely touch, such as ``(1, 4)`` and ``(4, 6)``, do not
    overlap. The argument order does not matter.
    """
    first, second = sorted([a, b])
    # `first` starts no later than `second`; it may fully contain it ...
    contains = first[0] <= second[0] and first[1] >= second[1]
    # ... or run past the point where `second` begins.
    return contains or first[1] > second[0]

In [None]:
# assert is_overlap((1,9), (3,4)) == True
# assert is_overlap((3,4), (1,9)) == True
# assert is_overlap((1, 5), (3, 7)) == True
# assert is_overlap((4, 9), (1, 6)) == True
# assert is_overlap((1,6), (7,9)) == False
# assert is_overlap((1,4),(4,6)) == False

In [None]:
# dictionary

In [None]:
# [k for k in dictionary if len(k) == 44]

In [None]:
# from collections import Counter
# Counter(map(len, dictionary)).most_common(10)

In [None]:
# sorted(dictionary.items(), key=lambda x: len(x[0]), reverse=True)[:10]

In [None]:
import re

In [None]:
# Compile one literal (escaped) regex per annotated phrase, longest phrase
# first. Phrases shorter than 4 characters are dropped.
longest_first = sorted(dictionary.items(), key=lambda item: len(item[0]), reverse=True)
dictionary2 = {
    re.compile(re.escape(phrase)): tag
    for phrase, tag in longest_first
    if len(phrase) >= 4
}

In [None]:
#Script to re-annotate the dataset
from __future__ import unicode_literals, print_function
def entities_finder(document):
    """Re-annotate ``document`` using the compiled phrases in ``dictionary2``.

    Patterns are tried in the iteration order of ``dictionary2``. A match is
    kept only if its span does not overlap a span that was already accepted,
    so earlier patterns take precedence over later ones.

    Returns a list of ``(start, end, tag)`` tuples.
    """
    found = []
    claimed = set()  # spans already assigned to an entity
    for pattern, label in dictionary2.items():
        for hit in pattern.finditer(document):
            span = hit.span()
            if span in claimed or any(is_overlap(span, taken) for taken in claimed):
                continue
            found.append((span[0], span[1], label))
            claimed.add(span)
    return found

In [None]:
#To store in the format specified for Spacy
def formatting_func(text):
    """Pair ``text`` with its automatically found entities.

    Returns ``(text, {'entities': [(start, end, tag), ...]})``.
    """
    return (text, {'entities': entities_finder(text)})

In [None]:
# Re-annotate every text of the dataset, shortest text first.
by_length = sorted(Dataset, key=lambda x: len(x[0]))
cleaned_dataset = [formatting_func(example[0]) for example in tqdm(by_length)]
cleaned_dataset[0]

In [None]:
# len(sorted(Dataset, key=lambda x: len(x[0]),reverse=True)[3][0])

In [None]:
# len(sorted(Dataset, key=lambda x: len(x[0]),reverse=True)[0][0])

In [None]:
# len(cleaned_dataset[0][0])

In [None]:
# cleaned_dataset was built in order of text length; shuffle it in place
# before it is split into train and test sets.
# NOTE(review): no random seed is set, so the split differs between runs.
random.shuffle(cleaned_dataset)

In [None]:
# The first 500 re-annotated examples are used for training.
train_set = cleaned_dataset[:500]
len(train_set)

In [None]:
# Everything after the first 500 examples is held out for evaluation.
# NOTE(review): these labels were generated by entities_finder from phrases
# collected over all of Dataset, so they are automatic annotations, not the
# original hand-made ones.
test_set = cleaned_dataset[500:]
len(test_set)

In [None]:
def evaluate(ner_model, examples):
    """Score ``ner_model`` against annotated examples.

    ``examples`` is an iterable of ``(text, {'entities': [...]})`` pairs.
    Each text is run through the model and the prediction is compared with
    a GoldParse built from the given entities. Returns ``scorer.scores``.
    """
    scorer = Scorer()
    for text, annotations in examples:
        reference = GoldParse(ner_model.make_doc(text), entities=annotations['entities'])
        scorer.score(ner_model(text), reference)
    return scorer.scores

In [None]:
#!/usr/bin/env python
# coding: utf8

# Training additional entity types using spaCy

# Suppress every UserWarning from here on.
warnings.simplefilter("ignore", UserWarning)
# Entity labels that main() registers with the NER component.
LABEL = ['Authority', 'Action', 'Area', 'Party', 'Subject', 'SubjectElements'] #currently used annotations
def main(model=None, new_model_name='new_model', output_dir="/content/", n_iter=15):
    """Train the NER component on the module-level ``train_set`` and save it.

    Args:
        model: Name or path of an existing spaCy model to continue training.
            ``None`` (the default) starts from a blank English pipeline.
        new_model_name: Stored in ``nlp.meta['name']`` before saving.
        output_dir: Directory the trained pipeline is written to. Pass
            ``None`` to skip saving.
        n_iter: Number of passes over ``train_set``.

    Returns:
        The trained ``nlp`` pipeline.
    """
    max_batch_size = 16
    if len(train_set) < 500:
        max_batch_size /= 2

    # Honour the `model` argument: load the given pipeline, or start blank.
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    for label in LABEL:
        ner.add_label(label)   # Add new entity labels to entity recognizer

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(train_set)  # shuffles the module-level list in place
            losses = {}
            batches = minibatch(train_set, size=compounding(1, max_batch_size, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.5,
                           losses=losses)
            # These scores are computed on the training data itself.
            scores = evaluate(nlp, train_set)
            print("Scores ",scores)
            print('Losses', losses)

    # Save model
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create missing parent directories too; an existing directory is fine.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    return nlp
            
# Run training with the default arguments (15 iterations, saved under /content/).
prdnlp = main()

In [None]:
# Load the saved model and render its predictions for one held-out text.
nlp2 = spacy.load("/content")
doc = nlp2(test_dataset[5][0])
# `doc` has already been through the pipeline, so render it directly instead
# of running the model a second time on the same text.
displacy.render(doc, jupyter=True, style='ent')

In [None]:
# Evaluate the reloaded model on test_set, the part of the automatically
# re-annotated data that was not used for training.
examples = test_set
results = evaluate(nlp2, examples)
print(results)

In [None]:
# This checks for misalignments in the annotations (shown as '-'). spaCy only raises a UserWarning for these and ignores them,
# but too many misalignments will shrink your effective train set.

# from spacy.gold import biluo_tags_from_offsets
# for text, annot in train_set:
#     doc = nlp2.make_doc(text)
#     print(biluo_tags_from_offsets(doc, annot["entities"]))