In [None]:
import os 
import pandas as pd
import numpy as np
import json
import spacy
from spacy.util import minibatch, compounding
from spacy.scorer import Scorer
from spacy.gold import GoldParse

In [None]:
#1)  Converting data from doccano jsonl annotation output

def convert_doccano_to_spacy(filepath):
    """Convert a doccano JSON Lines export into spaCy v2 NER training tuples.

    Every non-blank line of the file is parsed as one JSON object that must
    provide a ``'text'`` string and a ``'labels'`` list of
    ``[start, end, label]`` triples.

    Args:
        filepath: Path to the doccano export (one JSON document per line).

    Returns:
        A list of ``(text, {'entities': [(start, end, label), ...]})``
        tuples, one per record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``'text'`` or ``'labels'`` key.
    """
    training_data = []

    # Binary mode is kept on purpose: json.loads() accepts bytes and works
    # out the text encoding itself, so no encoding has to be guessed here.
    with open(filepath, 'rb') as fp:
        # Iterate lazily instead of loading every line with readlines().
        for record in fp:
            # Blank (or whitespace-only) lines are not JSON documents; skip
            # them instead of letting json.loads() abort the whole import.
            if not record.strip():
                continue

            read_record = json.loads(record)
            text = read_record['text']

            # spaCy expects every entity as a (start, end, label) tuple.
            entities = [(start, end, label)
                        for start, end, label in read_record['labels']]

            training_data.append((text, {'entities': entities}))

    return training_data


# 2) Convert the doccano exports into spaCy training tuples.
# Note: depending on the machine, the export may be saved with a .txt or a
# .json1 extension; adjust the file names below to match the files on disk.
# NOTE(review): the paths are absolute and machine-specific.
NER_data_20200223 = convert_doccano_to_spacy(r'E:\temporary_flownform_directory\docano_output\doccano_export_NER_20200223.txt')

NER_data_20200209 = convert_doccano_to_spacy(r'E:\temporary_flownform_directory\docano_output\doccano_export_NER_20200209.json1')


In [None]:
# Merge both annotation exports into one dataset and report how many
# annotated texts it holds.
NER_data_all = [*NER_data_20200223, *NER_data_20200209]
print(len(NER_data_all))

In [None]:
import random

# Fixed seed so the train/test split is identical on every run. Without it,
# re-running this cell reshuffles the data and examples a model was already
# trained on can end up in the test set.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

# Shuffle a copy so NER_data_all keeps its original order and this cell gives
# the same result no matter how often it is executed.
shuffled_data = list(NER_data_all)
random.Random(SPLIT_SEED).shuffle(shuffled_data)

# First 80% of the shuffled records for training, the remainder for testing.
split_index = int(len(shuffled_data) * TRAIN_FRACTION)
train_data = shuffled_data[:split_index]
test_data = shuffled_data[split_index:]

print(len(train_data))
print(len(test_data))


In [None]:
# train for demo purposes


def train_spacy(data,iterations, model = None):
    """Train (or fine-tune) a spaCy v2 NER component on annotated texts.

    Args:
        data: List of ``(text, {'entities': [(start, end, label), ...]})``
            tuples. The list is copied, so the caller's list is never
            reordered.
        iterations: Number of passes (epochs) over the training data.
        model: Name or path of an existing spaCy model to fine-tune, or
            ``None`` to start from a blank English pipeline.

    Returns:
        The trained ``nlp`` pipeline.
    """
    # Work on a copy: random.shuffle() below reorders the list in place and
    # must not reshuffle the caller's data as a side effect.
    train_data = list(data)

    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # empty English pipeline, no trained components
        print("Created blank 'en' model\n")

    # A blank pipeline has no NER component yet, so create one; otherwise
    # reuse the existing component (assumed to be registered as 'ner').
    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the annotations.
    for _, annotations in train_data:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # Only the NER component is trained; any other pipes of a loaded model
    # are switched off for the duration of the training.
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):
        # begin_training() (re-)initialises the weights of the enabled
        # pipes, so it is only used when the NER component was just created.
        # An existing NER keeps its weights and continues training.
        if ner_is_new:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()

        for itn in range(iterations):
            print("\nStatring iteration " + str(itn))

            # Shuffle so the model cannot pick up on the order of examples.
            random.shuffle(train_data)
            losses = {}
            counter = 0

            # One pass over the data per iteration, in batches whose size
            # grows from 4 towards 32. (The batch loop must not be nested in
            # a per-example loop, which would repeat the whole pass once for
            # every example.)
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses,
                )
                counter += len(batch)
                print("processing text {}/{}".format(counter, len(train_data)))

            print("losses", losses)

    return nlp


# Demo run: train on the first five training examples only.
subset_train_data = train_data[:5]

spacy_subset_demo_purposes = train_spacy(
    data=subset_train_data,
    iterations=3,
)

In [None]:
# evaluation for demo purposes

def evaluate(ner_model, examples):
    """Score a model's predictions against gold entity annotations.

    Args:
        ner_model: spaCy pipeline used both to tokenize the gold text and to
            produce the predictions.
        examples: Iterable of ``(text, {'entities': [...]})`` pairs.

    Returns:
        The ``scores`` of the spaCy ``Scorer`` after all examples were scored.
    """
    scorer = Scorer()
    for input_, annot in examples:
        # Gold standard: tokenized text plus the annotated entity offsets.
        gold = GoldParse(ner_model.make_doc(input_), entities=annot['entities'])
        # Prediction: the same text run through the full pipeline.
        prediction = ner_model(input_)
        scorer.score(prediction, gold)
    return scorer.scores


def custom_evaluation_script(annotated_test_data, ner_model, list_of_entities):
    """Compare annotated entities with the entities extracted by ``ner_model``.

    Args:
        annotated_test_data: List of ``(text, {'entities': [(start, end,
            label), ...]})`` tuples.
        ner_model: Trained spaCy pipeline to evaluate. It is used for both
            the extracted-entity table and the scorer metrics.
        list_of_entities: Entity labels to report; each label becomes one
            ``True_<label>`` and one ``NER_<label>`` column.

    Returns:
        A tuple ``(true_df, NER_df, ner_score_results)``. Both frames have
        one row per test text and one column per label, each cell holding the
        list of entity strings for that label. ``ner_score_results`` is the
        result of ``evaluate(ner_model, annotated_test_data)``.

    Raises:
        KeyError: If an annotated or predicted label is not in
            ``list_of_entities``.
    """
    true_columns = ['True_{}'.format(label) for label in list_of_entities]
    ner_columns = ['NER_{}'.format(label) for label in list_of_entities]

    # Rows are collected in plain lists and turned into DataFrames once at
    # the end; growing a frame row by row is quadratic and DataFrame.append
    # no longer exists in pandas 2.x.
    true_records = []
    ner_records = []

    for text, annotations in annotated_test_data:
        # 1) entity strings according to the doccano annotators
        true_row = {column: [] for column in true_columns}
        for start, end, label in annotations['entities']:
            true_row['True_{}'.format(label)].append(text[start:end])
        true_records.append(true_row)

        # 2) entity strings recognised by the model under evaluation
        #    (the ner_model argument, not a notebook-level global)
        ner_row = {column: [] for column in ner_columns}
        for ent in ner_model(text).ents:
            ner_row['NER_{}'.format(ent.label_)].append(ent.text)
        ner_records.append(ner_row)

    true_df = pd.DataFrame(true_records, columns=true_columns)
    NER_df = pd.DataFrame(ner_records, columns=ner_columns)

    # 3) use NER scorer for calculating metrics
    ner_score_results = evaluate(ner_model, annotated_test_data)

    return true_df, NER_df, ner_score_results





# Evaluate the demo model on the test split.
true, extracted, score_results = custom_evaluation_script(
    annotated_test_data=test_data,
    ner_model=spacy_subset_demo_purposes,
    list_of_entities=[
        'EMAIL',
        'GPE (countries, cities, states)',
        'PERSON',
        'PRICING',
        'SIZE',
        'Street Adress',
        'ZIP CODE',
    ],
)

# Display the scorer metrics as the cell output.
score_results

In [None]:
%%time
def train_spacy(data,iterations, model = None):
    """Train (or fine-tune) a spaCy v2 NER component on annotated texts.

    Args:
        data: List of ``(text, {'entities': [(start, end, label), ...]})``
            tuples. The list is copied, so the caller's list is never
            reordered.
        iterations: Number of passes (epochs) over the training data.
        model: Name or path of an existing spaCy model to fine-tune, or
            ``None`` to start from a blank English pipeline.

    Returns:
        The trained ``nlp`` pipeline.
    """
    # Work on a copy: random.shuffle() below reorders the list in place and
    # must not reshuffle the caller's data as a side effect.
    train_data = list(data)

    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # empty English pipeline, no trained components
        print("Created blank 'en' model\n")

    # A blank pipeline has no NER component yet, so create one; otherwise
    # reuse the existing component (assumed to be registered as 'ner').
    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the annotations.
    for _, annotations in train_data:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # Only the NER component is trained; any other pipes of a loaded model
    # are switched off for the duration of the training.
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):
        # begin_training() (re-)initialises the weights of the enabled
        # pipes, so it is only used when the NER component was just created.
        # An existing NER keeps its weights and continues training.
        if ner_is_new:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()

        for itn in range(iterations):
            print("\nStatring iteration " + str(itn))

            # Shuffle so the model cannot pick up on the order of examples.
            random.shuffle(train_data)
            losses = {}
            counter = 0

            # One pass over the data per iteration, in batches whose size
            # grows from 4 towards 32. (The batch loop must not be nested in
            # a per-example loop, which would repeat the whole pass once for
            # every example.)
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses,
                )
                counter += len(batch)
                print("processing text {}/{}".format(counter, len(train_data)))

            print("losses", losses)

    return nlp




In [None]:
# NOTE(review): the model name below says "300", but the slice keeps only the
# first five training examples - confirm which subset size is intended.
subset_train_data = train_data[:5]

# Small-subset model, three iterations.
spacy_subset_300_iterations_3 = train_spacy(
    data=subset_train_data,
    iterations=3,
)

# Full training split, three iterations.
spacy_subset_all_train_iterations_3 = train_spacy(
    data=train_data,
    iterations=3,
)

# Full training split, six iterations.
spacy_subset_all_train_iterations_6 = train_spacy(
    data=train_data,
    iterations=6,
)

In [None]:
# Save every trained model into its own sub-directory of the output folder.
# Writing all of them to the same directory would make each to_disk() call
# overwrite the previous one, leaving only the last model on disk.
NER_OUTPUT_DIR = r'E:\temporary_flownform_directory\ner_output'
os.makedirs(NER_OUTPUT_DIR, exist_ok=True)

_models_to_save = {
    'spacy_subset_all_train_iterations_6': spacy_subset_all_train_iterations_6,
    'spacy_subset_all_train_iterations_3': spacy_subset_all_train_iterations_3,
    'spacy_subset_300_iterations_3': spacy_subset_300_iterations_3,
}

for _model_name, _trained_model in _models_to_save.items():
    _trained_model.to_disk(os.path.join(NER_OUTPUT_DIR, _model_name))

In [None]:
def evaluate(ner_model, examples):
    """Score a model's predictions against gold entity annotations.

    Args:
        ner_model: spaCy pipeline used both to tokenize the gold text and to
            produce the predictions.
        examples: Iterable of ``(text, {'entities': [...]})`` pairs.

    Returns:
        The ``scores`` of the spaCy ``Scorer`` after all examples were scored.
    """
    scorer = Scorer()
    for input_, annot in examples:
        # Gold standard: tokenized text plus the annotated entity offsets.
        gold = GoldParse(ner_model.make_doc(input_), entities=annot['entities'])
        # Prediction: the same text run through the full pipeline.
        prediction = ner_model(input_)
        scorer.score(prediction, gold)
    return scorer.scores


def custom_evaluation_script(annotated_test_data, ner_model, list_of_entities):
    """Compare annotated entities with the entities extracted by ``ner_model``.

    Args:
        annotated_test_data: List of ``(text, {'entities': [(start, end,
            label), ...]})`` tuples.
        ner_model: Trained spaCy pipeline to evaluate. It is used for both
            the extracted-entity table and the scorer metrics.
        list_of_entities: Entity labels to report; each label becomes one
            ``True_<label>`` and one ``NER_<label>`` column.

    Returns:
        A tuple ``(true_df, NER_df, ner_score_results)``. Both frames have
        one row per test text and one column per label, each cell holding the
        list of entity strings for that label. ``ner_score_results`` is the
        result of ``evaluate(ner_model, annotated_test_data)``.

    Raises:
        KeyError: If an annotated or predicted label is not in
            ``list_of_entities``.
    """
    true_columns = ['True_{}'.format(label) for label in list_of_entities]
    ner_columns = ['NER_{}'.format(label) for label in list_of_entities]

    # Rows are collected in plain lists and turned into DataFrames once at
    # the end; growing a frame row by row is quadratic and DataFrame.append
    # no longer exists in pandas 2.x.
    true_records = []
    ner_records = []

    for text, annotations in annotated_test_data:
        # 1) entity strings according to the doccano annotators
        true_row = {column: [] for column in true_columns}
        for start, end, label in annotations['entities']:
            true_row['True_{}'.format(label)].append(text[start:end])
        true_records.append(true_row)

        # 2) entity strings recognised by the model under evaluation
        #    (the ner_model argument, not a notebook-level global)
        ner_row = {column: [] for column in ner_columns}
        for ent in ner_model(text).ents:
            ner_row['NER_{}'.format(ent.label_)].append(ent.text)
        ner_records.append(ner_row)

    true_df = pd.DataFrame(true_records, columns=true_columns)
    NER_df = pd.DataFrame(ner_records, columns=ner_columns)

    # 3) use NER scorer for calculating metrics
    ner_score_results = evaluate(ner_model, annotated_test_data)

    return true_df, NER_df, ner_score_results


In [None]:
# Evaluate the model trained for six iterations on the test split.
true, extracted, score_results = custom_evaluation_script(
    annotated_test_data=test_data,
    ner_model=spacy_subset_all_train_iterations_6,
    list_of_entities=[
        'EMAIL',
        'GPE (countries, cities, states)',
        'PERSON',
        'PRICING',
        'SIZE',
        'Street Adress',
        'ZIP CODE',
    ],
)

# Annotated and extracted entities side by side, one row per test text.
output_df = pd.concat([true, extracted], axis=1)

In [None]:
# Display the scorer metrics from the most recent evaluation call.
score_results

In [None]:
# Evaluate the model trained for three iterations on the test split.
true, extracted, score_results = custom_evaluation_script(
    annotated_test_data=test_data,
    ner_model=spacy_subset_all_train_iterations_3,
    list_of_entities=[
        'EMAIL',
        'GPE (countries, cities, states)',
        'PERSON',
        'PRICING',
        'SIZE',
        'Street Adress',
        'ZIP CODE',
    ],
)

# Annotated and extracted entities side by side, one row per test text.
output_df = pd.concat([true, extracted], axis=1)


In [None]:
# Display the scorer metrics from the most recent evaluation call.
score_results

In [None]:
# Evaluate the model trained on the small training subset on the test split.
true, extracted, score_results = custom_evaluation_script(
    annotated_test_data=test_data,
    ner_model=spacy_subset_300_iterations_3,
    list_of_entities=[
        'EMAIL',
        'GPE (countries, cities, states)',
        'PERSON',
        'PRICING',
        'SIZE',
        'Street Adress',
        'ZIP CODE',
    ],
)

# Annotated and extracted entities side by side, one row per test text.
output_df = pd.concat([true, extracted], axis=1)

In [None]:
# Display the scorer metrics from the most recent evaluation call.
score_results