### CR with Spacy - Example

In [1]:
import json, logging, sys
import pandas as pd

In [2]:
ner_file = "spaCy_data/se_ner_annotated.tsv"
df_data = pd.read_csv(ner_file,sep="\t",encoding="latin1").fillna(method='ffill')

In [3]:
df_data = df_data[['Word', 'Tag']]
df_data.to_csv('spaCy_data/spacy_ner.tsv', sep='\t',index=False)

In [4]:
# Convert .tsv file to dataturks json format. 

def tsv_to_json_format(input_path,output_path,unknown_label):
    
    try:
        f=open(input_path,'r') # input file
        fp=open(output_path, 'w') # output file
        data_dict={}
        annotations =[]
        label_dict={}
        s=''
        start=0
        for line in f:
            if line[0:len(line)-1]!='.\tO':
                word,entity=line.split('\t')
                s+=word+" "
                entity=entity[:len(entity)-1]
                if entity!=unknown_label:
                    if len(entity) != 1:
                        d={}
                        d['text']=word
                        d['start']=start
                        d['end']=start+len(word)-1  
                        try:
                            label_dict[entity].append(d)
                        except:
                            label_dict[entity]=[]
                            label_dict[entity].append(d) 
                start+=len(word)+1
            else:
                data_dict['content']=s
                s=''
                label_list=[]
                for ents in list(label_dict.keys()):
                    for i in range(len(label_dict[ents])):
                        if(label_dict[ents][i]['text']!=''):
                            l=[ents,label_dict[ents][i]]
                            for j in range(i+1,len(label_dict[ents])): 
                                if(label_dict[ents][i]['text']==label_dict[ents][j]['text']):  
                                    di={}
                                    di['start']=label_dict[ents][j]['start']
                                    di['end']=label_dict[ents][j]['end']
                                    di['text']=label_dict[ents][i]['text']
                                    l.append(di)
                                    label_dict[ents][j]['text']=''
                            label_list.append(l)                          
                            
                for entities in label_list:
                    label={}
                    label['label']=[entities[0]]
                    label['points']=entities[1:]
                    annotations.append(label)
                data_dict['annotation']=annotations
                annotations=[]
                json.dump(data_dict, fp)
                fp.write('\n')
                data_dict={}
                start=0
                label_dict={}
    except Exception as e:
        logging.exception("Unable to process file" + "\n" + "error = " + str(e))
        return None

tsv_to_json_format("spaCy_data/spacy_ner.tsv",'spaCy_data/se_ner_spacy.json','abc')

In [5]:
# Convert json file to spaCy format.
import plac
import logging
import argparse
import sys
import os
import json
import pickle

#@plac.annotations(input_file=("Input file", "option", "i", str), output_file=("Output file", "option", "o", str))

def to_spacy_format(input_file=None, output_file=None):
    try:
        training_data = []
        lines=[]
        with open(input_file, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                point = annotation['points'][0]
                labels = annotation['label']
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    entities.append((point['start'], point['end'] + 1 ,label))


            training_data.append((text, {"entities" : entities}))

        with open(output_file, 'wb') as fp:
            pickle.dump(training_data, fp)

    except Exception as e:
        logging.exception("Unable to process " + input_file + "\n" + "error = " + str(e))
        return None

to_spacy_format('spaCy_data/se_ner_spacy.json', 'spaCy_data/se_ner_spacy_new.json')


In [6]:
import spacy
nlp = spacy.blank('en')  # create blank Language class
ner = nlp.create_pipe('ner')

In [7]:
list(df_data.Tag.unique())

['O',
 'B-syscon',
 'B-grp',
 'B-seterm',
 'B-opcon',
 'I-opcon',
 'B-mea',
 'I-mea',
 'B-loc',
 'I-loc',
 'B-abb',
 'I-grp',
 'I-syscon',
 'B-cardinal',
 'B-org',
 'I-org',
 'B-event',
 'I-event',
 'I-seterm',
 'I-abb',
 'B-art']

In [8]:
#!/usr/bin/env python
# coding: utf8

# Training additional entity types using spaCy
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# New entity labels
# Specify the new entity labels which you want to add here
LABEL = list(df_data.Tag.unique())

# Loading training data 
with open ('spaCy_data/se_ner_spacy_new.json', 'rb') as fp:
    TRAIN_DATA = pickle.load(fp)

FULL_DATA = TRAIN_DATA
num_of_examples = len(FULL_DATA)
print(num_of_examples)

3606


In [9]:
TEST_DATA = FULL_DATA[int(num_of_examples*0.8):]
TRAIN_DATA = FULL_DATA[:int(num_of_examples*0.8)]

In [10]:
def train_2(model=None, new_model_name='spacy_cr', output_dir="spaCy_data", n_iter=25):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    """ADD MULTIPLE LABELS TO NER MODEL"""
    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)
            
    # test the trained model
    test_text = 'Acceptable Risk is the risk that is understood and agreed to by the program/project.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for i, ent in enumerate(doc.ents):
        print("Entity number %s is %s in text: '%s'" % (i, ent.label_, ent.text))

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

In [11]:
train_2()

Created blank 'en' model
{'ner': 9102.066369072516}
{'ner': 5363.53910399159}
{'ner': 4601.456692442025}
{'ner': 4290.486043255721}
{'ner': 3812.5983096510145}
{'ner': 3545.289167556115}
{'ner': 3283.787138477605}
{'ner': 3127.5881329122435}
{'ner': 2921.274833429352}
{'ner': 2838.875882211671}
{'ner': 2674.8092282525263}
{'ner': 2517.7672035375413}
{'ner': 2449.2215016705995}
{'ner': 2317.4224637877314}
{'ner': 2226.2623050444117}
{'ner': 2133.6803761870096}
{'ner': 2221.6246143530943}
{'ner': 2144.1063946379304}
{'ner': 1954.9057893707222}
{'ner': 2033.166076132395}
{'ner': 1875.9221412862328}
{'ner': 1934.8065849468621}
{'ner': 1739.6350626578906}
{'ner': 1794.2866389906062}
{'ner': 1628.1831157038791}
Entities in 'Acceptable Risk is the risk that is understood and agreed to by the program/project.'
Entity number 0 is B-mea in text: 'Acceptable'
Entity number 1 is I-mea in text: 'Risk'
Entity number 2 is B-mea in text: 'risk'
Entity number 3 is B-opcon in text: 'program'
Entity numb

In [14]:
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot['entities'])
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

# for spaCy's pretrained use 'en_core_web_sm' or 'en_core_web_lg'
#ner_model = spacy.load('spaCy_data') 
ner_model = spacy.load('en_core_web_lg') 
results = evaluate(ner_model, TEST_DATA)
print(results)

{'uas': 0.0, 'las': 0.0, 'las_per_type': {'nsubj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'aux': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'root': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'xcomp': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'acomp': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'prep': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'det': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'amod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'pobj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'compound': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'conj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'cc': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'attr': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'mark': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'advcl': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'dobj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'dep': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'advmod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'relcl': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'pcomp': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'acl': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'ccomp': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'nsubjpass': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'auxpass': {'p': 0.0, 'r':

In [15]:
ner_model = spacy.load('spaCy_data') 
results = evaluate(ner_model, TEST_DATA)
print(results)

{'uas': 0.0, 'las': 0.0, 'las_per_type': {'': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'ents_p': 88.06068601583114, 'ents_r': 90.4471544715447, 'ents_f': 89.2379679144385, 'ents_per_type': {'B-seterm': {'p': 91.38576779026218, 'r': 96.06299212598425, 'f': 93.66602687140114}, 'I-opcon': {'p': 87.65822784810126, 'r': 73.47480106100795, 'f': 79.94227994227995}, 'B-opcon': {'p': 86.85897435897436, 'r': 92.41261722080137, 'f': 89.54977282114828}, 'B-mea': {'p': 90.54054054054053, 'r': 97.57281553398059, 'f': 93.9252336448598}, 'B-syscon': {'p': 85.95238095238096, 'r': 92.32736572890026, 'f': 89.02589395807645}, 'B-cardinal': {'p': 100.0, 'r': 100.0, 'f': 100.0}, 'I-event': {'p': 96.7741935483871, 'r': 73.17073170731707, 'f': 83.33333333333334}, 'B-event': {'p': 93.65079365079364, 'r': 96.72131147540983, 'f': 95.16129032258064}, 'B-grp': {'p': 92.74193548387096, 'r': 95.0413223140496, 'f': 93.87755102040816}, 'I-seterm': {'p': 100.0, 'r': 75.0, 'f': 85.71428571428571}, 'I-syscon': {'p': 77.777777777

### References: 

https://towardsdatascience.com/custom-named-entity-recognition-using-spacy-7140ebbb3718
https://timkuhn.github.io/TextMining/spacy/ner/2018/01/24/spaCy_NER_Training.html