In [1]:
from __future__ import unicode_literals, print_function
import spacy
import en_core_web_sm
import en_core_web_md
# import en_core_web_lg
import random
from pathlib import Path
from spacy.util import minibatch, compounding
import tarfile
import xml.etree.ElementTree as ET
import os
from pandas import ExcelWriter
import pandas as pd
from sklearn.model_selection import train_test_split
import timeit
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import decaying

In [28]:
def get_batches(train_data, model_type):
    """Split ``train_data`` into minibatches whose size grows as batches are drawn.

    The batch size starts at 1 and compounds by a factor of 1.001 up to a
    ceiling chosen by ``model_type``. That ceiling is halved when there are
    fewer than 1000 examples and halved once more when there are fewer than 500.

    Args:
        train_data: Sized collection of training examples (``len`` is called on it).
        model_type: One of "tagger", "parser", "ner" or "textcat"; any other
            key raises ``KeyError``.

    Returns:
        The object produced by ``minibatch`` for ``train_data`` with the
        compounding size schedule.
        NOTE(review): if ``minibatch`` yields lazily this result can be
        consumed only once -- confirm before reusing it across epochs.
    """
    ceiling = {"tagger": 32, "parser": 16, "ner": 16, "textcat": 64}[model_type]
    # Small corpora get smaller batches: one halving per threshold not reached.
    for threshold in (1000, 500):
        if len(train_data) < threshold:
            ceiling /= 2
    return minibatch(train_data, size=compounding(1, ceiling, 1.001))
def transformPerson(orginalData = None):
    """Convert raw rows into spaCy-style training pairs with PERSON entities only.

    Args:
        orginalData: Mapping-like object (e.g. a DataFrame) with a 'text'
            column of documents and a 'tags' column whose cells are strings
            that evaluate to a list of tag dicts carrying 'TYPE', 'start'
            and 'end' keys.

    Returns:
        A list of ``(text, {'entities': [(start, end, 'PERSON'), ...]})``
        tuples, one per row. Only tags typed 'DOCTOR' or 'PATIENT' are kept,
        both relabelled as 'PERSON'.
    """
    person_types = ('DOCTOR', 'PATIENT')
    texts = list(orginalData['text'])
    annotations = []
    for raw_tags in orginalData['tags']:
        # NOTE(review): eval() executes whatever expression the cell contains.
        # Only safe while the spreadsheets are trusted; consider
        # ast.literal_eval if the cells are always plain literals.
        spans = [
            (int(tag['start']), int(tag['end']), 'PERSON')
            for tag in eval(raw_tags)
            if tag['TYPE'] in person_types
        ]
        annotations.append({'entities': spans})
    return list(zip(texts, annotations))
def findAllEntityTransform(data = None):
    """Collect the distinct entity labels used in transformed training pairs.

    Args:
        data: Iterable of ``(text, annotations)`` pairs where
            ``annotations['entities']`` is a sequence of
            ``(start, end, label)`` tuples.

    Returns:
        A set containing every label (third tuple element) that occurs.
    """
    return {
        span[2]
        for _, annotations in data
        for span in annotations['entities']
    }
def findOneEntity(data = None, entType = None):
    """Print every raw tag whose 'TYPE' equals ``entType``.

    Args:
        data: Mapping-like object with a 'tags' column whose cells are
            strings that evaluate to a list of tag dicts.
        entType: The 'TYPE' value to look for.

    Returns:
        None; matching tag dicts are printed one per line, in row order.
    """
    for raw_tags in data['tags']:
        # NOTE(review): eval() runs arbitrary expressions from the cell text;
        # acceptable only for trusted input files.
        matching = (tag for tag in eval(raw_tags) if tag['TYPE'] == entType)
        for tag in matching:
            print(tag)
def evaluate(ner_model, examples, n_iter):
    """Score ``ner_model`` on ``examples`` and return its entity F-score.

    Args:
        ner_model: spaCy pipeline used both to tokenize the raw text
            (``make_doc``) and to predict entities.
        examples: Iterable of ``(text, annotations)`` pairs where
            ``annotations['entities']`` holds the gold entity spans.
        n_iter: Value used as the key of the returned dict (``main`` passes
            the epoch number).

    Returns:
        ``{n_iter: ents_f}``, where ``ents_f`` is read from a single
        ``Scorer`` that accumulated every example that could be scored.

    Side effects:
        Examples that raise while being scored are appended to the
        module-level ``textWithProblem`` list and skipped.
    """
    scorer = Scorer()
    for input_, annot in examples:
        try:
            # Tokenize only, so the gold annotations can be aligned to the raw text.
            doc_gold_text = ner_model.make_doc(input_)
            gold = GoldParse(doc_gold_text, entities=annot['entities'])
            # Full pipeline run: the resulting Doc carries the predicted entities.
            pred_value = ner_model(input_)
            scorer.score(pred_value, gold)
        except Exception:
            # Best-effort: keep going past bad examples, but do not swallow
            # KeyboardInterrupt/SystemExit the way a bare ``except:`` would.
            textWithProblem.append((input_, annot))
    return {n_iter: scorer.scores['ents_f']}

# Find all entity types in the original data
def findAllEntity(data = None):
    """Collect the distinct 'TYPE' values found in the raw tag column.

    Args:
        data: Mapping-like object with a 'tags' column whose cells are
            strings that evaluate to a list of tag dicts with a 'TYPE' key.

    Returns:
        A set of every 'TYPE' value that occurs.
    """
    # NOTE(review): eval() runs arbitrary expressions from the cell text;
    # acceptable only for trusted input files.
    return {tag['TYPE'] for tags in data['tags'] for tag in eval(tags)}

In [29]:
def _save_model(nlp, output_dir, suffix):
    """Write ``nlp`` to the directory named ``output_dir`` + ``suffix`` and return that path."""
    # String concatenation (not path joining) keeps the original naming scheme,
    # e.g. './emptyNameModel' + '3' -> './emptyNameModel3'.
    target = Path(str(output_dir) + str(suffix))
    if not target.exists():
        target.mkdir(parents=True)
    nlp.to_disk(target)
    print("Saved model to", target)
    return target


def _train_epoch(nlp, optimizer, dropout):
    """Shuffle TRAIN_DATA, run one pass of updates over it, and return the losses dict."""
    random.shuffle(TRAIN_DATA)
    losses = {}
    # Batches must be rebuilt for every epoch: the batch iterator is exhausted
    # after one pass, so reusing it would make later epochs train on nothing.
    for batch in get_batches(train_data=TRAIN_DATA, model_type='ner'):
        texts, annotations = zip(*batch)
        try:
            nlp.update(texts, annotations, sgd=optimizer, drop=next(dropout), losses=losses)
        except Exception:
            # Best-effort: remember the failing batch and continue training.
            textWithProblem.append(texts)
    return losses


def main(model=None, n_iter=3, output_dir=None):
    """Set up the pipeline and entity recognizer, and train the new entity.

    Reads the module-level ``TRAIN_DATA``, ``TEST_DATA`` and ``uniqueTag`` and
    appends to the module-level ``f1Train``, ``f1Test`` and ``textWithProblem``.

    Args:
        model: Name or path of an existing spaCy model to continue training.
            When None, a blank English pipeline is created.
        n_iter: Number of training epochs.
        output_dir: Path prefix for saved models. A checkpoint is written to
            ``output_dir + str(epoch)`` on every third epoch (0, 3, 6, ...)
            and the final model to ``output_dir + str(n_iter)``. When None,
            nothing is written to disk.

    Returns:
        None. After every epoch the train/test F-scores returned by
        ``evaluate`` are appended to ``f1Train`` / ``f1Test``; one more pair
        keyed by ``n_iter`` is appended when the final model is saved.
    """
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add the entity recognizer if it is missing, otherwise fetch it so that
    # labels can be registered on it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")
    for LABEL in uniqueTag:
        ner.add_label(LABEL)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    # Every pipe except NER is disabled while training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # One dropout schedule shared by all epochs; advanced once per batch.
    dropout = decaying(0.6, 0.2, 1e-4)

    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            losses = _train_epoch(nlp, optimizer, dropout)
            print("Losses", losses)
            # Periodic checkpoint; output_dir itself is never rebound, so the
            # prefix stays usable for later checkpoints and the final save.
            if output_dir is not None and itn % 3 == 0:
                _save_model(nlp, output_dir, itn)
            f1Train.append(evaluate(nlp, TRAIN_DATA, n_iter=itn))
            f1Test.append(evaluate(nlp, TEST_DATA, n_iter=itn))

    # Save the final model (with all pipes re-enabled) to the output directory.
    if output_dir is not None:
        _save_model(nlp, output_dir, n_iter)
        f1Train.append(evaluate(nlp, TRAIN_DATA, n_iter=n_iter))
        f1Test.append(evaluate(nlp, TEST_DATA, n_iter=n_iter))



In [None]:
# Score records: main() appends the dicts returned by evaluate() to these lists.
f1Train = []
f1Test = []
# Load the three annotated spreadsheets (paths are relative to the working directory).
df = pd.read_excel('./PythonExport.xlsx')
df_2006Train = pd.read_excel('./i2b2-2006Train.xlsx')
df_2006Test = pd.read_excel('./i2b2-2006Test.xlsx')
# Stack all rows into a single frame; note this rebinds `df` to the combined data.
df = pd.concat([df, df_2006Test, df_2006Train], axis = 0)
# Hold out 10% of the rows for testing; random_state fixes the split.
train, test = train_test_split(df, test_size=0.1, random_state = 42)
# Convert each split to (text, {'entities': [...]}) pairs; transformPerson keeps
# only DOCTOR/PATIENT tags and relabels them as PERSON.
transformedTrainPerson = transformPerson(train)
transformedTestPerson = transformPerson(test)
# Labels present in the training split; main() registers each one on the NER pipe.
uniqueTag = findAllEntityTransform(transformedTrainPerson)
# Module-level names read by main(). TRAIN_DATA is the same list object as
# transformedTrainPerson, so main()'s in-place shuffle reorders both.
TRAIN_DATA = transformedTrainPerson
TEST_DATA = transformedTestPerson
# Examples/batches that raise during training or evaluation are collected here.
textWithProblem = []

In [None]:
# Train from a blank English pipeline (no `model` argument) for 3 iterations,
# using './emptyNameModel' as the output path prefix.
main(n_iter = 3, output_dir='./emptyNameModel')