In [24]:
import spacy
from spacy.matcher import PhraseMatcher
import plac
import random
from pathlib import Path

In [25]:
# Get Phrase's Indexes
# Utility Function
def offsetter(lbl,doc,matchedItem):
    o_one = len(str(doc[0:matchedItem[1]]))
    subDoc = doc[matchedItem[1]:matchedItem[2]]
    o_two = o_one + len(str(subDoc))
    return (o_one+1,o_two+1,label)

In [26]:
nlp = spacy.load('en')

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

In [27]:
# Convert Text File contents to list
diseaseList = []
with open('diseaseList.txt','r') as fileObj:
    for line in fileObj:
        diseaseList.append(line[:-1])

In [28]:
label = 'DISEASE'
matcher = PhraseMatcher(nlp.vocab)

for i in diseaseList:
    matcher.add(label,None,nlp(i))

In [29]:
# Test Cell
one = nlp("Someone has the Abdominal aortic aneurysm and Thrush in men")
matches = matcher(one)

matches


[(9255184837977538312, 3, 6), (9255184837977538312, 7, 10)]

In [30]:
# Gather Training Data
res = []
to_train_ents = []
with open('diseases.txt') as fileObj:
    line = True
    while line:
        line = fileObj.readline()
        line = line[:-1]
        mnlp_line = nlp(line)
        matches = matcher(mnlp_line)
        res = [offsetter(label,mnlp_line,x) for x in matches]
        to_train_ents.append((line,dict(entities=res)))

In [33]:
del to_train_ents[1890]

In [22]:
optimizer = nlp.begin_training()

# Get all other Pipes
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Disable Other pipes
with nlp.disable_pipes(*other_pipes):
    for itn in range(2):
        print(itn)
        losses = {}
        random.shuffle(to_train_ents)
        for item in to_train_ents:
            print("Iteration: " + str(itn+1))
            print(item)
            nlp.update([item[0]],[item[1]],sgd=optimizer,drop=1,losses=losses)

0
Iteration: 1
('this looks like a case of Psoriatic arthritis', {'entities': [(26, 45, 'DISEASE')]})


KeyError: "[E022] Could not find a transition with the name 'B-DISEASE' in the NER model."

In [None]:
# Save Model to Disk
nlp.to_disk('Model')

In [None]:
# Test Model
nlp = spacy.load('en')
sentence = "this looks like a case of Lymphoedema"
docs = nlp(sentence)
docs

In [None]:
for ent in docs.ents: 
    print(ent.text, ent.start_char, ent.end_char, ent.label_) 

In [3]:
nlp = spacy.blank('en')

In [10]:
# From Documentation in GitHub
#!/usr/bin/env python
# coding: utf8
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.
For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
Last tested with: v2.1.0
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# training data
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
#     if model is not None:
#         nlp = spacy.load(model)  # load existing spaCy model
#         print("Loaded model '%s'" % model)
#     else:
#         nlp = spacy.blank("en")  # create blank Language class
#         print("Created blank 'en' model")
        
    print("MODEL INITIALIZED")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
        
    print("CREATED PIPELINE")

    # add labels
    for _, annotations in to_train_ents:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
            
    print("Added LABELS")

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(to_train_ents)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(to_train_ents, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in to_train_ents:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in to_train_ents:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


if __name__ == "__main__":
    main()
#     plac.call(main)
    # Expected output:
    # Entities [('Shaka Khan', 'PERSON')]
    # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
    # ('Khan', 'PERSON', 1), ('?', '', 2)]
    # Entities [('London', 'LOC'), ('Berlin', 'LOC')]tttt
    # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
    # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]

MODEL INITIALIZED
CREATED PIPELINE
Added LABELS
Losses {'ner': 4857.89730116765}
Losses {'ner': 4832.575065853102}
Losses {'ner': 4938.336039163635}
Losses {'ner': 4945.355514277998}
Losses {'ner': 5016.464647641723}
Losses {'ner': 5050.953552430338}
Losses {'ner': 5076.28055046292}
Losses {'ner': 5061.624060106382}
Losses {'ner': 5144.226772095143}
Losses {'ner': 5067.465040393731}
Losses {'ner': 5037.756024842716}
Losses {'ner': 5298.970778463208}
Losses {'ner': 5301.740285147746}
Losses {'ner': 5244.270046461005}
Losses {'ner': 4854.728194535324}
Losses {'ner': 5179.372884972885}
Losses {'ner': 5092.115091820892}


KeyboardInterrupt: 

In [3]:
!ls
!unzip trainedModel.zip

README.md                            diseases.txt
SpacyNotebook - Implementation.ipynb trainedModel.zip
diseaseList.txt
Archive:  trainedModel.zip
   creating: modelTrained/
  inflating: modelTrained/meta.json  
  inflating: modelTrained/tokenizer  
   creating: modelTrained/vocab/
  inflating: modelTrained/vocab/vectors  
  inflating: modelTrained/vocab/strings.json  
 extracting: modelTrained/vocab/key2row  
  inflating: modelTrained/vocab/lexemes.bin  
   creating: modelTrained/ner/
  inflating: modelTrained/ner/cfg    
  inflating: modelTrained/ner/model  
  inflating: modelTrained/ner/moves  
