<a href="https://colab.research.google.com/github/ferran9908/CustomNamedEntityRecognition/blob/master/SpacyNoteBook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import spacy
from spacy.matcher import PhraseMatcher
import plac
import random
from pathlib import Path

In [0]:
# Get a phrase's character offsets
# Utility function
def offsetter(lbl, doc, matchedItem):
    """Convert a token-level match into a character-level entity tuple.

    Args:
        lbl: Entity label to attach to the match.
        doc: Document the match was found in. Slicing it by token index
            must yield a span exposing ``start_char`` and ``end_char``
            (as spaCy ``Doc`` objects do).
        matchedItem: Match tuple whose items 1 and 2 are the start and end
            token indices of the match.

    Returns:
        ``(start_char, end_char, lbl)``.
    """
    span = doc[matchedItem[1]:matchedItem[2]]
    # Take the offsets from the span itself. Rebuilding them from string
    # lengths plus a fixed "+1" is wrong when the match starts the document
    # or is not preceded by exactly one space.
    return (span.start_char, span.end_char, lbl)

In [0]:
# Load the 'en' pipeline and make sure it has an entity recognizer.
nlp = spacy.load('en')

if 'ner' in nlp.pipe_names:
    # Reuse the recognizer that ships with the loaded pipeline.
    ner = nlp.get_pipe('ner')
else:
    # No recognizer yet: create one and append it to the pipeline.
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)

In [0]:
# Convert text file contents to a list of phrases (one per line)
diseaseList = []
with open('diseaseList.txt', 'r') as fileObj:
    for line in fileObj:
        # Strip only the line terminator: slicing with [:-1] would chop the
        # last character of a final line that has no trailing newline.
        phrase = line.rstrip('\n')
        # Blank lines carry no phrase to match, so leave them out.
        if phrase:
            diseaseList.append(phrase)

In [0]:
label = 'DISEASE'
matcher = PhraseMatcher(nlp.vocab)

# Register every phrase from diseaseList as a pattern under the one label.
# map() is lazy, so each phrase is still parsed right before it is added.
for phrase_doc in map(nlp, diseaseList):
    matcher.add(label, None, phrase_doc)

In [0]:
# Test cell: run the matcher over one hand-written sentence and display
# the raw matches as the cell output.
sample_text = "Someone has the Abdominal aortic aneurysm and Thrush in men"
one = nlp(sample_text)
matches = matcher(one)

matches


In [0]:
# Gather training data
# Each example is (text, {"entities": [(start_char, end_char, label), ...]}),
# with the entities taken from the phrase matcher's hits on that text.
to_train_ents = []
with open('diseases.txt') as fileObj:
    while True:
        # Strip only the line terminator: slicing with [:-1] would chop the
        # last character of a final line that has no trailing newline.
        line = fileObj.readline().rstrip('\n')
        mnlp_line = nlp(line)
        matches = matcher(mnlp_line)
        res = [offsetter(label, mnlp_line, x) for x in matches]
        to_train_ents.append((line, dict(entities=res)))
        # NOTE(review): as before, reading stops at the first empty line
        # (a blank line or end of file), and that empty line is still
        # recorded as an example with no entities. The number of collected
        # examples is deliberately left as it was, because a later cell
        # deletes an entry from to_train_ents by fixed index.
        if not line:
            break

In [0]:
# NOTE(review): hard-coded index. Presumably this drops one example that was
# found to be unusable for training; confirm against the current
# diseases.txt, because the entry at this position changes (or the statement
# raises IndexError) whenever that file changes.
del to_train_ents[1890]

In [0]:
#!/usr/bin/env python
# coding: utf8
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.
For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
Last tested with: v2.1.0
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# training data
# TRAIN_DATA = [
#     ("The patient is suffering from Anaemia", {"entities": [(30, 37, "DISEASE")]}),
#     ("The patient has Pneumonia and Fever", {"entities": [(16, 25, "DISEASE"), (31, 36, "DISEASE")]}),
#     ('the patient is suffering from Abdominal aortic aneurysm', {'entities': [(30, 55, 'DISEASE')]})
# ]


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=50):
    """Load the model, set up the pipeline and train the entity recognizer.

    Training examples are read from the module-level ``to_train_ents`` list,
    which is shuffled in place on every iteration.

    Args:
        model: Name of an existing spaCy model to load. A blank 'en'
            pipeline is created when this is None.
        output_dir: Directory to save the trained pipeline to. When None,
            the pipeline is saved to 'modelTrained', relative to the
            working directory.
        n_iter: Number of passes over the training data.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add every label that occurs in the training annotations
    for _, annotations in to_train_ents:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(to_train_ents)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(to_train_ents, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Iteration:", itn + 1)
            print("Losses", losses)

    # test the trained model on the first few training texts
    for text, _ in to_train_ents[:20]:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # Save the trained pipeline. Honour the output_dir option when it is
    # given; otherwise keep the 'modelTrained' location that this function
    # has always written to.
    if output_dir is None:
        save_path = 'modelTrained'
    else:
        save_path = Path(output_dir)
        if not save_path.exists():
            save_path.mkdir(parents=True)
    nlp.to_disk(save_path)
    print("Saved model to", save_path)


if __name__ == "__main__":
    main()

    # NOTE(review): the sample output below shows PERSON/LOC entities, while
    # the only label defined for training in this notebook is DISEASE.
    # Presumably it is left over from the upstream spaCy example, so do not
    # expect this output from a run here.
    # Expected output:
    # Entities [('Shaka Khan', 'PERSON')]
    # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
    # ('Khan', 'PERSON', 1), ('?', '', 2)]
    # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
    # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
    # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]

In [0]:
!pwd

In [0]:
# NOTE(review): `nlp` here is the notebook-level pipeline loaded from 'en'
# in an earlier cell. The pipeline trained inside main() is a local variable
# of that function and is saved separately by main() itself. Confirm that
# this additional save is intended.
nlp.to_disk('/content/model')

In [0]:
# Sample sentences to run through the reloaded model.
train = [
    "It looks like the patient has a high fever",
    "this looks like a case of Cellulitis",
]

In [0]:
# Reload the pipeline that main() saved and print the entities and per-token
# tags it produces for each sentence in `train`.
# main() writes 'modelTrained' relative to the working directory, so load it
# from the same relative location instead of a hard-coded '/content' prefix
# that only resolves when the working directory is /content.
nlp2 = spacy.load(Path('modelTrained'))
for text in train:
    doc = nlp2(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

In [0]:
!pwd

In [0]:
!zip -r trainedModel modelTrained