In [36]:
# imports 
import spacy
from pathlib import Path
from spacy.tokenizer import Tokenizer
from utils import files_for_year
from utils import read_files_for_year
import logging
import json

In [2]:
# Load the pretrained spaCy pipeline 'en_core_web_sm'; its default entity
# recognizer is the baseline tried out in the cells that follow.
nlp = spacy.load('en_core_web_sm')

In [61]:
# Full text of the report used to try out the default NER model.
PATH = Path('Full_text/Acton.1893.b19783280.txt')
# Path.read_text() opens, reads and closes the file in one step, so no file
# handle is left dangling the way a bare open(PATH).read() leaves one.
acton_1893 = PATH.read_text()
# Run the whole report through the pipeline; entities are exposed on doc.ents.
doc = nlp(acton_1893)

# Try the default NER model from spacy

In [62]:
def print_ents(ent_number=20, document=None):
    """Print the text and label of the first ``ent_number`` entities.

    Args:
        ent_number: Maximum number of entities to print. Zero or a negative
            value prints nothing.
        document: Object exposing an ``ents`` iterable whose items have
            ``text`` and ``label_`` attributes. When omitted, the
            notebook-level ``doc`` variable is used, so existing
            ``print_ents(n)`` calls keep working.
    """
    source = doc if document is None else document
    for index, ent in enumerate(source.ents):
        # Stop as soon as the limit is reached instead of walking every
        # remaining entity without printing anything.
        if index >= ent_number:
            break
        print(ent.text, ent.label_)
# Show the first 30 entities found in `doc`.
print_ents(30)

I. A.R.332 PERSON
BS X ORG
ACTION LOCAL BOARD ORG
THE Medical Officer ORG
the year 1893 DATE
the ACTON VALE PRINTING PRESS ORG
Acton Vale GPE
W. ACT PERSON
ACTON LOCAL BOARD ORG
THE Medical Officer ORG
the year 1893 DATE
the ACTON VALE PRINTING PRESS ORG
Acton Vale GPE
W. Acton Local Board ORG
Officer of Health ORG
the year 1893 DATE
the Acton Local Board ORG
Acton GPE
the year 1893 DATE
first ORDINAL
Acton GPE
Sanitary Science PERSON
1884 DATE
1885 & 1886 DATE
1885 DATE
1885 DATE
1886 1887 DATE
1888 DATE
1889 DATE
1890 DATE


This gives us a good starting point, but we might also want to recognise some additional entities.

## Creating a new entity type for diseases



## Prepare some data for annotation 


### Use the Dataturks website to create annotations

I created a small [training set](https://dataturks.com/projects/davanstrien/MOH%201895%20full%20text/export). I only used one report to see how the model might work (probably not very well). If we want to do this properly, we will also need to create a validation set and think more about whether we want our model to perform well for only one entity type or for multiple entity types.

Convert the annotations into spaCy format using the script provided by [Dataturks](https://dataturks.com/help/dataturks-ner-json-to-spacy-train.php).

In [37]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export into spaCy-style training tuples.

    The export is read as JSON Lines: one JSON object per line with a
    ``content`` string and an ``annotation`` list. Each annotation carries a
    ``label`` (a single label or a list of labels) and a ``points`` list, of
    which only the first point is used.

    Args:
        dataturks_JSON_FilePath: Path (``str`` or path-like) of the export.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples
        with ``end`` exclusive, or ``None`` if the file could not be read or
        parsed (the error is logged with its traceback).
    """
    try:
        training_data = []
        # JSON is UTF-8 by specification; being explicit keeps the character
        # offsets stable regardless of the platform's default encoding.
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                data = json.loads(line)
                text = data['content']
                entities = []
                # An item exported without annotations has "annotation": null;
                # treat it as a text with no entities instead of failing.
                for annotation in data['annotation'] or []:
                    # only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # dataturks indices are both inclusive [start, end]
                        # but spacy is not [start, end)
                        entities.append((point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        # str() keeps the log call itself from raising for path-like arguments.
        logging.exception("Unable to process " + str(dataturks_JSON_FilePath) + "\n" + "error = " + str(e))
        return None

In [38]:
# Convert the Dataturks export into (text, {"entities": [...]}) tuples.
annotations = convert_dataturks_to_spacy('ner_tags.json')
# The converter logs the problem and returns None when it fails; stop here
# rather than letting the failure surface later inside the training loop.
assert annotations, "No training examples could be read from 'ner_tags.json'"

# Train our model on our annotations

In [57]:
from __future__ import unicode_literals, print_function
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# Label of the new entity type that train_ner adds to the entity recognizer.
LABEL = 'Disease'
# Default value of train_ner's `output_dir` argument.
OUTDIR = Path('models/')
# Training data: the tuples produced from the Dataturks export. This is the
# same list object as `annotations`, not a copy.
# Note: If you're using an existing model, make sure to mix in examples of
# other entity types that spaCy correctly recognized before. Otherwise, your
# model might learn the new type, but "forget" what it previously knew.
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
TRAIN_DATA = annotations


def train_ner(model=None, new_model_name='disease', output_dir=OUTDIR, n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # test the trained model
    test_text = ' I have included deaths certified as from Diarrhoea alone, or in combination with some other cause of ill defined nature; and also deaths certified as from Epidemic Enteritis, Zymotic Enteritis, Epidemic Diarrhoea, Summer Diarrhoea, Dysentery and Dysenteric diarrhoea, Choleraic Diarrhoea, Cholera, Cholera Nostras, (in the absence of Asiatic Cholera), 10 According to the previous methods of classification I could have shown a marked diminution in the number of deaths from Diarrhoea, but the method as now suggested is more satisfactory.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path('output_dir')
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)


In [59]:
# Train our model on our training data 
train_ner()

Created blank 'en' model
Losses {'ner': 1387.8690509796143}
Losses {'ner': 398.28905057907104}
Losses {'ner': 106.6355249478338}
Losses {'ner': 63.06028253759854}
Losses {'ner': 70.9119262383499}
Losses {'ner': 57.94399209912462}
Losses {'ner': 111.4920801589833}
Losses {'ner': 71.88125707020441}
Losses {'ner': 85.45086969149997}
Losses {'ner': 77.80551897608075}
Entities in ' I have included deaths certified as from Diarrhoea alone, or in combination with some other cause of ill defined nature; and also deaths certified as from Epidemic Enteritis, Zymotic Enteritis, Epidemic Diarrhoea, Summer Diarrhoea, Dysentery and Dysenteric diarrhoea, Choleraic Diarrhoea, Cholera, Cholera Nostras, (in the absence of Asiatic Cholera), 10 According to the previous methods of classification I could have shown a marked diminution in the number of deaths from Diarrhoea, but the method as now suggested is more satisfactory.'
Disease Diarrhoea
Disease Epidemic Enteritis,
Disease Zymotic Enteritis
Disease

In [58]:
# Load our new NER model from the 'output_dir' directory, where the trained
# model was saved.
output_dir = Path('output_dir')
nlp2 = spacy.load(output_dir)


## Test the model

Testing here is done 'by eye'. We need to test the model properly with a validation set to get a proper metric for how well it is performing on our data.

In [56]:

# Read the Bethnal Green 1894 report; Path.read_text() closes the file once it
# has been read, unlike a bare open(...).read().
bethnal_green = Path('Full_text/BethnalGreen.1894.b17997689.txt').read_text()
# Run the reloaded model over the report and list every entity it tags.
doc3 = nlp2(bethnal_green)
for ent in doc3.ents:
    print(ent.label_, ent.text)

Disease DURING THE
Disease GEORGE PADDOCK
Disease Membranous Croup
Disease ) 23
Disease Whooping Cough
Disease Small Pox
Disease Hospitals
Disease Respiratory Diseases
Disease Inhabited Houses
Disease Births,
Disease Deaths,
Disease and Marriages
Disease Annual
Disease Deaths
Disease Street List
Disease James Westminster
Disease Hampstead
Disease Plumstead
Disease St. George
Disease Old
Disease Strand
Disease Southwark
Disease St. Luke
Disease 4
Disease St. George
Disease these
Disease Poplar,
Disease St.
Disease Saviour
Disease St. George
Disease Southwark
Disease Whitechapel
Disease Diarrhoea
Disease 328
Disease Pancras
Disease Dietic Diseases
Disease 23
Disease 0.07
Disease 0.01
Disease 2
Disease VII.—
Disease 0.11 0.02
Disease Deaths Registered
Disease Metropolitan Sanitary
Disease Bethnal Green,
Disease Diseases
Disease , Ages,
Disease 65 &
Disease Membranous Croup
Disease Measles
Disease Whooping Cough
Disease Diarrhoea
Disease Rheumatic Fever
Disease 2
Disease Deaths
Disease Sma