In [3]:
from __future__ import unicode_literals, print_function
import spacy
import en_core_web_sm
import en_core_web_md
# import en_core_web_lg
import random
from pathlib import Path
from spacy.util import minibatch, compounding
import tarfile
import xml.etree.ElementTree as ET
import os
from pandas import ExcelWriter
import pandas as pd
from sklearn.model_selection import train_test_split
import timeit
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import decaying


In [4]:
def get_batches(train_data, model_type):
    """Split ``train_data`` into minibatches with a compounding batch size.

    The size schedule starts at 1 and compounds by a factor of 1.001 up to a
    ceiling chosen by ``model_type`` ("tagger", "parser", "ner" or "textcat").
    The ceiling is halved when there are fewer than 1000 examples and halved
    once more when there are fewer than 500.

    Raises ``KeyError`` when ``model_type`` is not one of the four known keys.
    Returns whatever ``minibatch`` produces for that data and size schedule.
    """
    ceiling = {"tagger": 32, "parser": 16, "ner": 16, "textcat": 64}[model_type]
    n_examples = len(train_data)
    for threshold in (1000, 500):
        if n_examples < threshold:
            # Plain `/` division, so the numeric type of the halved ceiling is unchanged.
            ceiling = ceiling / 2
    return minibatch(train_data, size=compounding(1, ceiling, 1.001))
def transformDate(orginalData = None):
    """Convert annotated rows into (text, annotation) training pairs, DATE spans only.

    Parameters
    ----------
    orginalData : mapping-like with 'text' and 'tags' entries, or None
        ``orginalData['text']`` is iterated for the raw texts and
        ``orginalData['tags']`` for the matching tag strings. Each tag string
        is evaluated to an iterable of dicts carrying 'TYPE', 'start' and 'end'.

    Returns
    -------
    list of (text, {'entities': [(start, end, 'DATE'), ...]})
        One pair per row, with 'start'/'end' converted to int. Tags whose
        'TYPE' is not 'DATE' are dropped. An empty list when ``orginalData``
        is None (the declared default), instead of failing on ``None['text']``.
    """
    if orginalData is None:
        return []

    texts = list(orginalData['text'])
    annotations = []
    for raw_tags in orginalData['tags']:
        # NOTE(review): eval() executes the tag string as Python code. That is only
        # safe when the input file is trusted; consider a literal-only parser if the
        # stored format allows it.
        date_spans = [
            (int(tag['start']), int(tag['end']), 'DATE')
            for tag in eval(raw_tags)
            if tag['TYPE'] == 'DATE'
        ]
        annotations.append({'entities': date_spans})
    return list(zip(texts, annotations))
def findAllEntityTransform(data = None):
    """Collect the distinct entity labels present in transformed training pairs.

    ``data`` is iterated as (text, annotation) pairs; the label is taken from
    index 2 of every item in ``annotation['entities']``. Returns a set of labels.
    """
    return {
        span[2]
        for _text, annotation in data
        for span in annotation['entities']
    }

In [5]:
def _save_model(nlp, directory):
    """Write ``nlp`` to ``directory`` (a str path), creating it and missing parents first."""
    target = Path(directory)
    if not target.exists():
        target.mkdir(parents=True)
    nlp.to_disk(target)
    return target


def main(model=None, n_iter=3, output_dir=None):
    """Set up the pipeline and entity recognizer, and train the new entity.

    Parameters
    ----------
    model : str or None
        Name/path handed to ``spacy.load``. When None a blank "en" pipeline is
        created and training starts from scratch; otherwise training resumes.
    n_iter : int
        Number of passes over ``TRAIN_DATA``.
    output_dir : str or None
        Path prefix for saved models. The iteration number is appended to it,
        so ``'./out/model'`` produces ``./out/model0``, ``./out/model3``, ...
        Nothing is written when None.

    Notebook-level state used
    -------------------------
    Reads ``uniqueTag``, ``TRAIN_DATA`` and ``TEST_DATA``. Shuffles
    ``TRAIN_DATA`` in place each iteration. Appends one ``evaluate`` result per
    iteration to ``f1Train`` and ``f1Test``, and appends the texts of any batch
    whose update raised to ``textWithProblem``.
    """
    checkpoint_every = 3  # a checkpoint is written on iterations 0, 3, 6, ...

    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add the entity recognizer if the pipeline lacks one; otherwise reuse it
    # so the labels below are added to the existing component.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")
    for label in uniqueTag:
        ner.add_label(label)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    # Every other pipe is disabled while training so only NER is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    dropout = decaying(0.6, 0.2, 1e-4)
    with nlp.disable_pipes(*other_pipes):
        # One size schedule shared by all iterations, so it keeps compounding
        # across epochs instead of restarting at 1.
        sizes = compounding(1, 4, 1.001)
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for batch in minibatch(TRAIN_DATA, size=sizes):
                texts, annotations = zip(*batch)
                try:
                    nlp.update(texts, annotations, sgd=optimizer, drop=next(dropout), losses=losses)
                except Exception:
                    # Best effort: remember the offending batch and keep training.
                    # `Exception` (not a bare except) lets Ctrl-C interrupt the run.
                    textWithProblem.append(texts)
            print("Losses", losses)

            # Scores are recorded on every iteration, with or without output_dir,
            # so f1Train / f1Test have one entry per epoch.
            f1Train.append(evaluate(nlp, TRAIN_DATA, n_iter=itn))
            f1Test.append(evaluate(nlp, TEST_DATA, n_iter=itn))

            if output_dir is not None and itn % checkpoint_every == 0:
                print("Saved model to", _save_model(nlp, output_dir + str(itn)))

    # Save the final model with all pipes re-enabled. This always runs, even if a
    # directory of that name already exists (e.g. left over from an earlier run).
    if output_dir is not None:
        final_dir = _save_model(nlp, output_dir + str(n_iter - 1))
        f1Train.append(evaluate(nlp, TRAIN_DATA, n_iter=n_iter))
        f1Test.append(evaluate(nlp, TEST_DATA, n_iter=n_iter))
        print("Saved model to", final_dir)
        

def evaluate(ner_model, examples, n_iter=None):
    """Score a model's entity predictions against gold annotations.

    Parameters
    ----------
    ner_model : object offering ``make_doc(text)`` and callable as ``ner_model(text)``
        The pipeline under evaluation.
    examples : iterable of (text, annotation)
        ``annotation['entities']`` is passed to ``GoldParse`` as the gold spans.
    n_iter : hashable or None
        Key under which the F-score is returned. Optional, so that the function
        can also be called with just a model and examples.

    Returns
    -------
    dict
        ``{n_iter: scorer.scores['ents_f']}`` when ``n_iter`` is given, or the
        complete ``scorer.scores`` mapping when it is None.

    Examples that raise while being scored are skipped and appended to the
    notebook-level ``textWithProblem`` list as ``(text, annotation)``.
    """
    scorer = Scorer()
    for text, annotation in examples:
        try:
            # Gold parse is built on the tokenised raw text; the prediction comes
            # from running the full pipeline on the same text.
            gold = GoldParse(ner_model.make_doc(text), entities=annotation['entities'])
            scorer.score(ner_model(text), gold)
        except Exception:
            # Best effort: keep going, but remember what failed. `Exception`
            # (not a bare except) leaves KeyboardInterrupt usable.
            textWithProblem.append((text, annotation))

    scores = scorer.scores  # accumulated over all scored documents
    if n_iter is None:
        return scores
    return {n_iter: scores['ents_f']}

# Find every entity type present in the original (untransformed) data.
def findAllEntity(data = None):
    """Return the set of 'TYPE' values found across all rows of ``data['tags']``.

    Each entry of ``data['tags']`` is a string that is evaluated to an iterable
    of dicts; the 'TYPE' of every dict is collected.
    """
    labels = set()
    for tags in data['tags']:
        # NOTE(review): eval() runs each tag string as Python code; this is only
        # safe when the data comes from a trusted source.
        labels.update(tag['TYPE'] for tag in eval(tags))
    return labels

In [6]:
# Per-iteration score accumulators; main() appends evaluate() results to these.
f1Train = []
f1Test = []
# Read the three annotated spreadsheets and stack them row-wise into one frame.
df = pd.read_excel('./PythonExport.xlsx')
df_2006Train = pd.read_excel('./i2b2-2006Train.xlsx')
df_2006Test = pd.read_excel('./i2b2-2006Test.xlsx')
df = pd.concat([df, df_2006Test, df_2006Train], axis = 0)
# 90/10 split with a fixed seed so the partition is reproducible.
train, test = train_test_split(df, test_size=0.1, random_state = 42)
# Convert each split to (text, {'entities': [...]}) pairs, keeping DATE tags only.
transformedTrainDate = transformDate(train)
transformedTestDate = transformDate(test)
# Labels to register on the NER component; derived from the training split only.
uniqueTag = findAllEntityTransform(transformedTrainDate)
# Names that main() reads as notebook-level globals.
# Note: TRAIN_DATA is the same list object as transformedTrainDate, and main()
# shuffles it in place.
TRAIN_DATA = transformedTrainDate
TEST_DATA = transformedTestDate
# Inputs that raised during training or evaluation are appended here.
textWithProblem = []

In [None]:
# Train a blank 'en' pipeline (model is left at its default of None) for 50
# iterations; saved models go to './date/emptyDateModel' + <iteration number>.
main(n_iter = 50, output_dir='./date/emptyDateModel')

Created blank 'en' model
Losses {'ner': 72030.82142430753}
Saved model to date\emptyDateModel0
Losses {'ner': 11576.881575357014}
Losses {'ner': 2894.2421365093724}
Losses {'ner': 1733.8358518542775}
Saved model to date\emptyDateModel3
Losses {'ner': 1393.5690614163027}
Losses {'ner': 1201.4600139381425}


In [16]:
# Load a saved pipeline from ./model1/ and print its scores on the training
# pairs and on the held-out split.
# NOTE(review): main() in this notebook saves to output_dir + <iteration number>;
# './model1/' is presumably from a different run — confirm the path exists.
# NOTE(review): evaluate() is called here without n_iter. Check that the definition
# in scope allows that; the recorded output below is a full scores dict
# (ents_p / ents_r / ents_f / ents_per_type), not {n_iter: ents_f}.
ner_model = spacy.load('./model1/') # for spaCy's pretrained use 'en_core_web_sm'
resOriTrain = evaluate(ner_model, TRAIN_DATA)
print("train accuracy:")
print(resOriTrain)
# transformDate(test) rebuilds the same pairs that TEST_DATA was assigned from.
resOriTest = evaluate(ner_model, transformDate(test))
print("test accuracy:")
print(resOriTest)

train accuracy:
{'ents_p': 97.81285231116121, 'ents_r': 98.32275611967361, 'ents_f': 98.06714140386572, 'ents_per_type': {'DATE': {'p': 97.81285231116121, 'r': 98.32275611967361, 'f': 98.06714140386572}}}
test accuracy:
{'ents_p': 96.42248722316864, 'ents_r': 96.42248722316864, 'ents_f': 96.42248722316864, 'ents_per_type': {'DATE': {'p': 96.42248722316864, 'r': 96.42248722316864, 'f': 96.42248722316864}}}


In [None]:
# Same evaluation as the previous cell, for the pipeline saved in ./model2/.
# This rebinds ner_model, so later cells see whichever of the two ran last.
# NOTE(review): evaluate() is called here without n_iter — check that the
# definition in scope allows that. This cell has no execution count or recorded
# output, so it may never have been run in this session.
ner_model = spacy.load('./model2/') # for spaCy's pretrained use 'en_core_web_sm'
resOriTrain = evaluate(ner_model, TRAIN_DATA)
print("train accuracy:")
print(resOriTrain)
# transformDate(test) rebuilds the same pairs that TEST_DATA was assigned from.
resOriTest = evaluate(ner_model, transformDate(test))
print("test accuracy:")
print(resOriTest)

In [17]:
# How many inputs were recorded as raising during training and/or evaluation.
len(textWithProblem)

3

In [22]:
# Run the loaded model on one recorded problem input and render its entities.
# textWithProblem holds two shapes of entry: evaluate() appends (text, annotation),
# while main() appends the tuple of texts from the failing batch. In both cases
# [1][0] is a text string from the second recorded entry.
doc = ner_model(textWithProblem[1][0])
spacy.displacy.render(doc, style = 'ent')