In [1]:
import time
import re
import random

import json
import numpy as np
import pandas as pd

import spacy

In [2]:
# Silence warnings for the rest of the session so they do not clutter cell output.
# NOTE(review): `spacy.warnings` looks like the stdlib `warnings` module re-exported by
# spaCy; if so, this filter hides warnings from every library, not only spaCy - confirm.
spacy.warnings.filterwarnings("ignore")

In [6]:
# Read the CSV (path is relative to the working directory) into a DataFrame.
# NOTE(review): 'utf-8 sig' is not the documented codec spelling ('utf-8-sig'); it
# presumably resolves through Python's codec-name normalization - consider the canonical name.
df = pd.read_csv('./RO43-1000_groups_labeled.csv', encoding='utf-8 sig')

In [17]:
# Partition the rows on whether the 'tag' column is filled in: rows that carry
# a tag form the labeled set, rows without one are still unlabeled.
missing_tag = df['tag'].isna()

labeled_df = df.loc[~missing_tag]
unlabeled_df = df.loc[missing_tag]

In [19]:
# Sample sentence used as scratch input for the length check in the next cell.
# Note: the loop that builds train_data further down rebinds `text`.
text = 'We need to deliver it to Festy.'

In [20]:
# Character count of the sample sentence; the same len(text) value is what the
# train_data loop uses as the end offset of each entity span.
len(text)

31

In [21]:
# Turn every labeled row into one training example of the form
#   (text, {'entities': [(start, end, label)]})
# where the single span runs from offset 0 to len(text), i.e. the row's entire
# 'chars' string is annotated with the row's 'tag'.
train_data = []
for text, tag in zip(labeled_df['chars'], labeled_df['tag']):
    whole_text_span = (0, len(text), tag)
    train_data.append((text, {'entities': [whole_text_span]}))

In [23]:
def train_spacy(data, iterations):
    """Train a blank English spaCy pipeline that contains only an NER component.

    Written against the spaCy v2-style training API (``nlp.create_pipe``,
    ``nlp.begin_training`` and ``nlp.update(texts, annotations, ...)``).

    Args:
        data: Sequence of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose ``'entities'`` entry is a list of
            ``(start, end, label)`` tuples. The sequence is copied before
            shuffling, so the caller's object is never reordered.
        iterations: Number of passes to make over the training examples.

    Returns:
        The trained ``nlp`` object.
    """
    # Shallow copy: random.shuffle below works in place, and shuffling `data`
    # itself would silently reorder the caller's list.
    train_examples = list(data)

    nlp = spacy.blank('en')  # create blank Language class

    # Create the built-in NER component and add it to the pipeline, or reuse
    # it if it is already there, so `ner` is always bound below.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the training annotations.
    # Examples without an 'entities' entry contribute no labels.
    for _, annotations in train_examples:
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])

    # Disable every other pipe during training so that only NER is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn) + ".", end="")
            start = time.time()
            # New example order on every pass.
            random.shuffle(train_examples)
            losses = {}
            for text, annotations in train_examples:
                nlp.update(
                    [text],  # batch of texts (one example per update)
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)  # accumulates the loss across this pass
            end = time.time()
            print(" Loss: ", losses)
            print("Duration:", end - start)
    return nlp

In [24]:
# Number of shuffled passes train_spacy makes over train_data.
epochs = 100
# Trains a fresh blank pipeline each time this cell runs; the recorded output
# below shows roughly 40-110 seconds per pass.
nlp = train_spacy(train_data, epochs)

Statring iteration 0. Loss:  {'ner': 924.0723485709991}
Duration: 55.30208611488342
Statring iteration 1. Loss:  {'ner': 404.730376214795}
Duration: 63.33017563819885
Statring iteration 2. Loss:  {'ner': 166.1198701716154}
Duration: 72.5246992111206
Statring iteration 3. Loss:  {'ner': 75.41188966067477}
Duration: 92.79164481163025
Statring iteration 4. Loss:  {'ner': 170.29756092960858}
Duration: 111.9171290397644
Statring iteration 5. Loss:  {'ner': 256.74812736866534}
Duration: 111.9559211730957
Statring iteration 6. Loss:  {'ner': 245.0671411218572}
Duration: 111.90702080726624
Statring iteration 7. Loss:  {'ner': 95.93747757755943}
Duration: 112.33915734291077
Statring iteration 8. Loss:  {'ner': 127.03558780159537}
Duration: 111.97490286827087
Statring iteration 9. Loss:  {'ner': 36.11684017241342}
Duration: 70.90481543540955
Statring iteration 10. Loss:  {'ner': 83.33315625303472}
Duration: 40.52880930900574
Statring iteration 11. Loss:  {'ner': 10.17639698000949}
Duration: 40.4

In [25]:
# Save the trained pipeline to disk.
# NOTE(review): MODEL_PATH is an absolute path inside one user's Windows profile, so this
# cell only works on that machine - consider a project-relative or configurable base directory.
MODEL_PATH = 'C:/Users/gradi/Documents/projects/machine_learning/leaflet_data_extractor/spacy_models/group-labeling-v0.3'
nlp.to_disk(MODEL_PATH)