In [1]:
from __future__ import unicode_literals, print_function

import ast
import os
import random
import tarfile
import timeit
import xml.etree.ElementTree as ET
from pathlib import Path

import en_core_web_sm
import en_core_web_md
# import en_core_web_lg
import pandas as pd
import spacy
from pandas import ExcelWriter
from sklearn.model_selection import train_test_split
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import minibatch, compounding
from spacy.util import decaying

In [12]:
def get_batches(train_data, model_type):
    """Split ``train_data`` into minibatches with a compounding batch size.

    The batch-size schedule is ``compounding(1, ceiling, 1.001)``, where the
    ceiling depends on ``model_type`` and is halved when there are fewer than
    1000 examples, then halved again when there are fewer than 500.

    Args:
        train_data: sized collection of training examples.
        model_type: one of "tagger", "parser", "ner" or "textcat".

    Returns:
        The object returned by ``minibatch`` for ``train_data``.
        NOTE(review): in spaCy 2.x this is presumably a one-shot generator,
        so it must be rebuilt for every pass over the data -- confirm.

    Raises:
        KeyError: if ``model_type`` is not one of the four known keys.
    """
    ceiling = {"tagger": 32, "parser": 16, "ner": 16, "textcat": 64}[model_type]
    n_examples = len(train_data)
    # Smaller corpora get a smaller maximum batch size.
    for threshold in (1000, 500):
        if n_examples < threshold:
            ceiling = ceiling / 2
    size_schedule = compounding(1, ceiling, 1.001)
    return minibatch(train_data, size=size_schedule)
def transformName(orginalData = None):
    """Build spaCy-style NER training pairs that keep only person names.

    Every tag whose 'TYPE' is 'DOCTOR' or 'PATIENT' becomes a
    ``(start, end, 'NAME')`` span; all other tags are dropped.

    Args:
        orginalData: mapping or DataFrame with a 'text' column and a 'tags'
            column. Each 'tags' cell is the string form of a list of dicts
            carrying 'start', 'end' and 'TYPE' keys.

    Returns:
        A list of ``(text, {'entities': [(start, end, 'NAME'), ...]})``
        tuples, one per row. An empty list when ``orginalData`` is None.

    Raises:
        ValueError / SyntaxError: if a 'tags' cell is not a plain Python
            literal.
    """
    if orginalData is None:
        return []

    name_types = ('DOCTOR', 'PATIENT')
    texts = list(orginalData['text'])
    transformTags = []
    for tags in orginalData['tags']:
        # The cell text comes from a spreadsheet, so parse it as a literal
        # only; eval() would execute whatever code the cell contained.
        spans = [
            (int(tag['start']), int(tag['end']), 'NAME')
            for tag in ast.literal_eval(tags)
            if tag['TYPE'] in name_types
        ]
        transformTags.append({'entities': spans})
    return list(zip(texts, transformTags))
def findAllEntityTransform(data = None):
    """Return the set of entity labels used anywhere in ``data``.

    ``data`` is iterated as ``(text, annotations)`` pairs; the third item of
    every tuple under ``annotations['entities']`` is taken as the label.
    """
    return {
        span[2]
        for _text, annotations in data
        for span in annotations['entities']
    }
def findOneEntity(data = None, entType = None):
    """Print every tag in ``data['tags']`` whose 'TYPE' equals ``entType``.

    Args:
        data: mapping or DataFrame with a 'tags' column. Each cell is the
            string form of a list of tag dicts.
        entType: the 'TYPE' value to look for, e.g. 'PATIENT'.

    Returns:
        None. Matching tag dicts are written to stdout, one per line.

    Raises:
        ValueError / SyntaxError: if a 'tags' cell is not a plain Python
            literal.
    """
    for tags in data['tags']:
        # Parse the spreadsheet cell as a literal; eval() would run any code
        # that happened to be stored in it.
        for tag in ast.literal_eval(tags):
            if tag['TYPE'] == entType:
                print(tag)
def evaluate(ner_model, examples, n_iter):
    """Score ``ner_model`` on ``examples`` and return its entity F-score.

    Args:
        ner_model: spaCy pipeline; used both to tokenize the raw text
            (``make_doc``) and to predict entities (called on the text).
        examples: iterable of ``(text, {'entities': [...]})`` pairs.
        n_iter: value used as the single key of the returned dict.

    Returns:
        ``{n_iter: scorer.scores['ents_f']}``, accumulated over every example
        that could be scored.

    Side effects:
        Examples that raise while being scored are appended, as
        ``(text, annotations)``, to the notebook-level list
        ``textWithProblem`` and otherwise skipped.
    """
    scorer = Scorer()
    for input_, annot in examples:
        try:
            gold_doc = ner_model.make_doc(input_)  # tokenized raw text, no predictions
            gold = GoldParse(gold_doc, entities=annot['entities'])  # attach the gold spans
            predicted = ner_model(input_)  # Doc carrying the predicted entities
            scorer.score(predicted, gold)
        except Exception:
            # Best-effort: record the failing example and keep going. Only
            # Exception is caught, so Ctrl-C / SystemExit still interrupt.
            textWithProblem.append((input_, annot))
    return {n_iter: scorer.scores['ents_f']}

# Find all entity types present in the original (untransformed) data.
def findAllEntity(data = None):
    """Return the set of distinct 'TYPE' values across ``data['tags']``.

    Args:
        data: mapping or DataFrame with a 'tags' column. Each cell is the
            string form of a list of tag dicts that carry a 'TYPE' key.

    Returns:
        A set of the 'TYPE' strings found.

    Raises:
        ValueError / SyntaxError: if a 'tags' cell is not a plain Python
            literal.
    """
    # literal_eval only parses literals; eval() would execute whatever code
    # a spreadsheet cell contained.
    return {
        tag['TYPE']
        for tags in data['tags']
        for tag in ast.literal_eval(tags)
    }

In [2]:
def _save_pipeline(nlp, output_dir, suffix):
    """Write ``nlp`` to the directory ``<output_dir><suffix>`` and return that path."""
    target = Path(str(output_dir) + str(suffix))
    if not target.exists():
        target.mkdir(parents=True)
    nlp.to_disk(target)
    return target


def main(model=None, n_iter=3, output_dir=None):
    """Set up the pipeline and entity recognizer, and train the new entity.

    Reads the notebook-level globals ``uniqueTag`` (labels to register),
    ``TRAIN_DATA`` and ``TEST_DATA`` (lists of ``(text, annotations)`` pairs),
    and appends to ``f1Train``, ``f1Test`` and ``textWithProblem``.
    ``TRAIN_DATA`` is shuffled in place at the start of every epoch.

    Args:
        model: name or path of an existing spaCy model to continue training;
            None starts from a blank English pipeline.
        n_iter: number of passes over ``TRAIN_DATA``.
        output_dir: base path for saved models. Every third epoch
            (``itn % 3 == 0``) is written to ``<output_dir><itn>`` and the
            final model to ``<output_dir><n_iter>``. None disables saving.

    Returns:
        None.
    """
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add the entity recognizer if the pipeline lacks one; otherwise fetch it
    # so the labels can be registered on it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")
    for LABEL in uniqueTag:
        ner.add_label(LABEL)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    # Every pipe except NER is disabled while training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    dropout = decaying(0.6, 0.2, 1e-4)
    with nlp.disable_pipes(*other_pipes):
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # Batches are rebuilt each epoch: the object returned by
            # get_batches is consumed by one pass, so reusing it would leave
            # later epochs with nothing to train on.
            for batch in get_batches(train_data=TRAIN_DATA, model_type='ner'):
                texts, annotations = zip(*batch)
                try:
                    nlp.update(texts, annotations, sgd=optimizer, drop=next(dropout), losses=losses)
                except Exception:
                    # Best-effort: remember the failing batch and carry on.
                    textWithProblem.append(texts)
            print("Losses", losses)

            if itn % 3 == 0 and output_dir is not None:
                # NOTE(review): this runs while the other pipes are disabled,
                # so the checkpoint may contain only the NER pipe -- confirm
                # that is intended.
                checkpoint_dir = _save_pipeline(nlp, output_dir, itn)
                print("Saved model to", checkpoint_dir)

            f1Train.append(evaluate(nlp, TRAIN_DATA, n_iter = itn))
            f1Test.append(evaluate(nlp, TEST_DATA, n_iter = itn))

    # Save the final model to the output directory.
    if output_dir is not None:
        final_dir = _save_pipeline(nlp, output_dir, n_iter)
        f1Train.append(evaluate(nlp, TRAIN_DATA, n_iter = n_iter))
        f1Test.append(evaluate(nlp, TEST_DATA, n_iter = n_iter))
        print("Saved model to", final_dir)



In [None]:
# Score histories; main() appends one evaluate() result per evaluation.
f1Train = []
f1Test = []
# Load the three annotated spreadsheets and stack them row-wise.
df = pd.read_excel('./PythonExport.xlsx')
df_2006Train = pd.read_excel('./i2b2-2006Train.xlsx')
df_2006Test = pd.read_excel('./i2b2-2006Test.xlsx')
# NOTE(review): no ignore_index here, so each sheet keeps its own row labels
# and index values repeat in the combined frame.
df = pd.concat([df, df_2006Test, df_2006Train], axis = 0)
# Hold out 10% of the rows for testing; random_state fixes the split.
train, test = train_test_split(df, test_size=0.1, random_state = 42)
# NOTE(review): transformDate is not defined in any visible cell (only
# transformName is). Presumably it came from a deleted or earlier cell --
# confirm, otherwise this raises NameError on a fresh kernel.
transformedTrainDate = transformDate(train)
transformedTestDate = transformDate(test)
# Labels that main() registers on the NER pipe.
uniqueTag = findAllEntityTransform(transformedTrainDate)
TRAIN_DATA = transformedTrainDate
TEST_DATA = transformedTestDate
# Inputs that raised inside main() / evaluate() are collected here.
textWithProblem = []

In [7]:
# Load the three spreadsheets separately so each can be inspected on its own.
# NOTE: this rebinds `df` to the PythonExport sheet alone.
df = pd.read_excel('./PythonExport.xlsx')
df_2006Train = pd.read_excel('./i2b2-2006Train.xlsx')
df_2006Test = pd.read_excel('./i2b2-2006Test.xlsx')

In [8]:
# Row count of df (the PythonExport sheet loaded in the cell above).
len(df)

790

In [9]:
# Row count of the i2b2-2006Train sheet.
len(df_2006Train)

668

In [10]:
# Row count of the i2b2-2006Test sheet.
len(df_2006Test)

220

In [22]:
# Print every tag of TYPE 'PATIENT' found in the i2b2-2006Train sheet.
findOneEntity(data = df_2006Train, entType='PATIENT')

{'start': '275', 'end': '280', 'text': 'Blind', 'TYPE': 'PATIENT'}
{'start': '242', 'end': '263', 'text': 'GIRRESNET , DIEDREO A', 'TYPE': 'PATIENT'}
{'start': '155', 'end': '168', 'text': 'LYSSFUST , NY', 'TYPE': 'PATIENT'}
{'start': '132', 'end': '149', 'text': 'GLASS , NELLE KKE', 'TYPE': 'PATIENT'}
{'start': '1036', 'end': '1041', 'text': 'Glass', 'TYPE': 'PATIENT'}
{'start': '1394', 'end': '1399', 'text': 'Glass', 'TYPE': 'PATIENT'}
{'start': '1394', 'end': '1399', 'text': 'Glass', 'TYPE': 'PATIENT'}
{'start': '193', 'end': '215', 'text': 'MERVBENWIERST , BIE S.', 'TYPE': 'PATIENT'}
{'start': '140', 'end': '154', 'text': 'HUHTLAND , CIE', 'TYPE': 'PATIENT'}
{'start': '280', 'end': '294', 'text': 'HUHTLAND , CIE', 'TYPE': 'PATIENT'}
{'start': '422', 'end': '435', 'text': 'Tlandpiernshi', 'TYPE': 'PATIENT'}
{'start': '1282', 'end': '1295', 'text': 'Tlandpiernshi', 'TYPE': 'PATIENT'}
{'start': '2441', 'end': '2454', 'text': 'Tlandpiernshi', 'TYPE': 'PATIENT'}
{'start': '140', 'end': 

{'start': '431', 'end': '435', 'text': 'Cast', 'TYPE': 'PATIENT'}
{'start': '183', 'end': '195', 'text': 'REEF , SANTA', 'TYPE': 'PATIENT'}
{'start': '153', 'end': '174', 'text': 'KOTELEEBDILS , TOZA S', 'TYPE': 'PATIENT'}
{'start': '6574', 'end': '6586', 'text': 'Koteleebdils', 'TYPE': 'PATIENT'}
{'start': '357', 'end': '361', 'text': 'Noun', 'TYPE': 'PATIENT'}
{'start': '147', 'end': '168', 'text': 'FALMDREPSGREND , LINE', 'TYPE': 'PATIENT'}
{'start': '294', 'end': '315', 'text': 'FALMDREPSGREND , LINE', 'TYPE': 'PATIENT'}
{'start': '195', 'end': '199', 'text': 'Rule', 'TYPE': 'PATIENT'}
{'start': '6462', 'end': '6466', 'text': 'Rule', 'TYPE': 'PATIENT'}
{'start': '182', 'end': '204', 'text': 'MESQUOWNFETH , LIBRENT', 'TYPE': 'PATIENT'}
{'start': '8674', 'end': '8679', 'text': 'Hauth', 'TYPE': 'PATIENT'}
{'start': '8674', 'end': '8679', 'text': 'Hauth', 'TYPE': 'PATIENT'}
{'start': '287', 'end': '292', 'text': 'Stone', 'TYPE': 'PATIENT'}
{'start': '4189', 'end': '4194', 'text': 'Ston

{'start': '5183', 'end': '5188', 'text': 'Loose', 'TYPE': 'PATIENT'}
{'start': '130', 'end': '144', 'text': 'GLORENC , NILD', 'TYPE': 'PATIENT'}
{'start': '133', 'end': '153', 'text': 'SLEMNEAVKOTE , PAUCI', 'TYPE': 'PATIENT'}
{'start': '139', 'end': '156', 'text': 'CAPKOTESKOC , KER', 'TYPE': 'PATIENT'}
{'start': '282', 'end': '299', 'text': 'CAPKOTESKOC , KER', 'TYPE': 'PATIENT'}
{'start': '175', 'end': '184', 'text': 'GACH , CH', 'TYPE': 'PATIENT'}
{'start': '1120', 'end': '1124', 'text': 'Gach', 'TYPE': 'PATIENT'}
{'start': '133', 'end': '147', 'text': 'DOSE , LEYRU K', 'TYPE': 'PATIENT'}
{'start': '642', 'end': '646', 'text': 'Dose', 'TYPE': 'PATIENT'}
{'start': '141', 'end': '165', 'text': 'MERVBENWIERST , EZOSAN B', 'TYPE': 'PATIENT'}
{'start': '140', 'end': '154', 'text': 'KOTEGLO , IESA', 'TYPE': 'PATIENT'}
{'start': '280', 'end': '294', 'text': 'KOTEGLO , IESA', 'TYPE': 'PATIENT'}
{'start': '140', 'end': '160', 'text': 'KOTE , LYFRANKLAPALM', 'TYPE': 'PATIENT'}
{'start': '286

{'start': '196', 'end': '205', 'text': 'A , MIFER', 'TYPE': 'PATIENT'}
{'start': '155', 'end': '176', 'text': 'SPIELBOBETREAU , EYRI', 'TYPE': 'PATIENT'}
{'start': '189', 'end': '211', 'text': 'RIDLFRIELS , BRANTN M.', 'TYPE': 'PATIENT'}
{'start': '174', 'end': '197', 'text': 'ROHEESCFOOKSKOTE , RUMA', 'TYPE': 'PATIENT'}
{'start': '171', 'end': '195', 'text': 'TREAUTHORECHIRD , JUATHA', 'TYPE': 'PATIENT'}
{'start': '136', 'end': '158', 'text': 'JESCSHUFFSUBE , CRIA S', 'TYPE': 'PATIENT'}
{'start': '155', 'end': '174', 'text': 'FLICHFENK , VINNETA', 'TYPE': 'PATIENT'}
{'start': '155', 'end': '169', 'text': 'ALEEB , LINEES', 'TYPE': 'PATIENT'}
{'start': '155', 'end': '176', 'text': 'HALPENUSCLINKE , SEPH', 'TYPE': 'PATIENT'}
{'start': '145', 'end': '174', 'text': 'KAYSFREIERMVEQUEDUHE , THAATA', 'TYPE': 'PATIENT'}
{'start': '300', 'end': '329', 'text': 'KAYSFREIERMVEQUEDUHE , THAATA', 'TYPE': 'PATIENT'}
{'start': '179', 'end': '200', 'text': 'SHUFFKOTE , TILDEZY R', 'TYPE': 'PATIENT'}
{'

{'start': '151', 'end': '166', 'text': 'SAWTHORE , DIRI', 'TYPE': 'PATIENT'}
{'start': '154', 'end': '170', 'text': 'CHIRDVOLN , IENA', 'TYPE': 'PATIENT'}
{'start': '153', 'end': '181', 'text': 'PIERNHEAGLEFREIERM , NANIROL', 'TYPE': 'PATIENT'}
{'start': '177', 'end': '194', 'text': 'DUOSSPALD , BRINI', 'TYPE': 'PATIENT'}
{'start': '140', 'end': '149', 'text': 'SAP , A A', 'TYPE': 'PATIENT'}
{'start': '174', 'end': '193', 'text': 'AGNEAGURNE , TOFRAN', 'TYPE': 'PATIENT'}
{'start': '172', 'end': '193', 'text': 'JESCBULLSSUBE , SHALL', 'TYPE': 'PATIENT'}
{'start': '144', 'end': '169', 'text': 'FYFEFREIERM , OLECARDRAND', 'TYPE': 'PATIENT'}
{'start': '295', 'end': '320', 'text': 'FYFEFREIERM , OLECARDRAND', 'TYPE': 'PATIENT'}
{'start': '189', 'end': '204', 'text': 'KAYSKUDZ , NARO', 'TYPE': 'PATIENT'}
{'start': '173', 'end': '202', 'text': 'STREUSPERBREEF , HIREENGLEN E', 'TYPE': 'PATIENT'}
{'start': '472', 'end': '486', 'text': 'Streusperbreef', 'TYPE': 'PATIENT'}
{'start': '157', 'end':

{'start': '178', 'end': '204', 'text': 'LUDZHAYTHTONERUDES , MASRA', 'TYPE': 'PATIENT'}
{'start': '173', 'end': '196', 'text': 'FEDDNOORT , RIKEREUGE L', 'TYPE': 'PATIENT'}
{'start': '145', 'end': '165', 'text': 'ROUCHEDUHETLAND , RA', 'TYPE': 'PATIENT'}
{'start': '291', 'end': '311', 'text': 'ROUCHEDUHETLAND , RA', 'TYPE': 'PATIENT'}
{'start': '156', 'end': '172', 'text': 'ASHA KA BEALYARB', 'TYPE': 'PATIENT'}
{'start': '146', 'end': '158', 'text': 'RHALT , CAYO', 'TYPE': 'PATIENT'}
{'start': '284', 'end': '296', 'text': 'RHALT , CAYO', 'TYPE': 'PATIENT'}
{'start': '146', 'end': '164', 'text': 'FREIERMLEEB , USNA', 'TYPE': 'PATIENT'}
{'start': '290', 'end': '308', 'text': 'FREIERMLEEB , USNA', 'TYPE': 'PATIENT'}
{'start': '172', 'end': '190', 'text': 'LARDES , ANIECA S.', 'TYPE': 'PATIENT'}
{'start': '214', 'end': '218', 'text': 'SINT', 'TYPE': 'PATIENT'}
{'start': '157', 'end': '181', 'text': 'BEATHEWIERSTJESC , JOWIL', 'TYPE': 'PATIENT'}
{'start': '155', 'end': '175', 'text': 'MOFLE

{'start': '154', 'end': '172', 'text': 'DARLSTELLPAL CHINA', 'TYPE': 'PATIENT'}
{'start': '155', 'end': '166', 'text': 'BETLA CHECK', 'TYPE': 'PATIENT'}
{'start': '157', 'end': '174', 'text': 'KOORSKOTE , RETCO', 'TYPE': 'PATIENT'}
{'start': '157', 'end': '176', 'text': 'SMANTVERGE , TAWNEN', 'TYPE': 'PATIENT'}
{'start': '156', 'end': '178', 'text': 'SWANKEGIERNILL , LAJAC', 'TYPE': 'PATIENT'}
{'start': '192', 'end': '206', 'text': 'OBEFYFE , TREE', 'TYPE': 'PATIENT'}
{'start': '155', 'end': '170', 'text': 'GUILDSELC , TAT', 'TYPE': 'PATIENT'}
{'start': '171', 'end': '197', 'text': 'CLOZLARDAFREIERM , IEDEARC', 'TYPE': 'PATIENT'}
{'start': '156', 'end': '186', 'text': 'SCOSTGACMARV , HELAVEHARBRASTY', 'TYPE': 'PATIENT'}
{'start': '190', 'end': '217', 'text': 'ZUKMAVLARDES , CINEQUAMEAAA', 'TYPE': 'PATIENT'}
{'start': '155', 'end': '176', 'text': 'FEDDNOORT , RIKEREUGE', 'TYPE': 'PATIENT'}
{'start': '156', 'end': '180', 'text': 'SHUFFBELBBREUN , LYSSESG', 'TYPE': 'PATIENT'}
{'start': '1

In [19]:
# Distinct entity TYPE values present in the i2b2-2006Test sheet.
findAllEntity(df_2006Test)

{'AGE', 'DATE', 'DOCTOR', 'HOSPITAL', 'ID', 'LOCATION', 'PATIENT', 'PHONE'}