In [12]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The car was navigating to the Oxford Street")
    # text containing locality entity "Oxford Street"

# The issue with pretrained model, doesn't recognize Oxford Street as Location entity
for ent in doc.ents:
    print(ent.text)
    print(ent.label_)
"""
    the Oxford Street
    ORG
"""

the Oxford Street
ORG


## Training Data Preparation

In [22]:
import spacy
from spacy.training.example import Example

nlp = spacy.load("en_core_web_sm")

doc1 = nlp.make_doc("Patient was prescribed Aspirin")
doc2 = nlp.make_doc("Bill Gates visited SFO Airport")

annotation1 = {"entities": [(24, 31, "MEDICINE")]}
annotation2 = {"entities": [(0, 10, "PERSON"), (19, 29, "LOC")]}

example1 = Example.from_dict(doc1, annotation1)
example2 = Example.from_dict(doc2, annotation2)

print(example1.to_dict())
"""
{
'doc_annotation': {
    'cats': {}, 
    'entities': ['O', 'O', 'O', 'O'], 
    'spans': {}, 
    'links': {}
    },
'token_annotation': {
    'ORTH': ['Patient', 'was', 'prescribed', 'Aspirin'], 
    'SPACY': [True, True, True, False], 
    'TAG': ['', '', '', ''], 
    'LEMMA': ['', '', '', ''], 
    'POS': ['', '', '', ''], 
    'MORPH': ['', '', '', ''], 
    'HEAD': [0, 1, 2, 3], 
    'DEP': ['', '', '', ''], 
    'SENT_START': [1, 0, 0, 0]
    }
}
"""

{'doc_annotation': {'cats': {}, 'entities': ['O', 'O', 'O', 'O'], 'spans': {}, 'links': {}}, 'token_annotation': {'ORTH': ['Patient', 'was', 'prescribed', 'Aspirin'], 'SPACY': [True, True, True, False], 'TAG': ['', '', '', ''], 'LEMMA': ['', '', '', ''], 'POS': ['', '', '', ''], 'MORPH': ['', '', '', ''], 'HEAD': [0, 1, 2, 3], 'DEP': ['', '', '', ''], 'SENT_START': [1, 0, 0, 0]}}


## Training with spacy

In [None]:
import spacy
import random
from spacy.training import Example

# Load small English model
nlp = spacy.load("en_core_web_sm")

# Get the NER pipeline
ner = nlp.get_pipe("ner")

# Example training data (you can add more sentences here)
training_data = [
    ("Barack Obama visited Microsoft headquarters in Seattle on Monday.", {
        "entities": [
            (0, 12, "PERSON"),      # Barack Obama
            (21, 30, "ORG"),        # Microsoft
            (47, 54, "LOC"),        # Seattle
            (58, 64, "DATE")        # Monday
        ]
    }),
    ("Bill Gates founded Microsoft.", {
        "entities": [
            (0, 10, "PERSON"),
            (19, 28, "ORG")
        ]
    }),
    ("Elon Musk is the CEO of SpaceX.", {
        "entities": [
            (0, 9, "PERSON"),
            (27, 33, "ORG")
        ]
    })
]

# Add new labels to the NER
for _, annotations in training_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Disable other pipes except NER
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):  # only train NER
    # Create optimizer
    optimizer = nlp.resume_training()

    # Number of iterations
    epochs = 20

    for i in range(epochs):
        losses = {}
        random.shuffle(training_data)
        for text, annotation in training_data:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotation)
            nlp.update([example], sgd=optimizer, losses=losses)
        print(f"Epoch {i+1}/{epochs} - Losses: {losses}")

# Save model to disk
nlp.to_disk("custom_ner")

print("Training complete! Model saved as 'custom_ner'")


In [24]:
import random 
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")

# Example training data
#   list of tuples of the form
#   2 values in tuples - 
#      sentence string
#      entities dictionary
"""
    ("Sentence", {
        "entities": [(start, end, "label), ...]
    })

"""
training_data = [
    ("Barack Obama visited Microsoft headquarters in Seattle on Monday.", {
        "entities": [
            (0, 12, "PERSON"),      # Barack Obama
            (21, 30, "ORG"),        # Microsoft
            (47, 54, "LOC"),        # Seattle
            (58, 64, "DATE")        # Monday
        ]
    }),
    ("Bill Gates founded Microsoft.", {
        "entities": [
            (0, 10, "PERSON"),
            (19, 28, "ORG")
        ]
    }),
    ("Elon Musk is the CEO of SpaceX.", {
        "entities": [
            (0, 9, "PERSON"),
            (27, 33, "ORG")
        ]
    })
]

# Getting the ner pipe
ner = nlp.get_pipe("ner")

# Disabling Other Pipes
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
nlp.disable_pipes(*other_pipes)

# Creating Optimizer
optmizer = nlp.resume_training()

# Updating model weights
losses = {}
epochs = 20

for i in range(epochs):
    random.shuffle(training_data)
    for text, annotation in training_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotation)
        nlp.update([example], sgd = optmizer, losses = losses)




In [None]:
import spacy
from spacy.training.example import Example
import random

# Load pre-trained model
nlp = spacy.load("en_core_web_sm")

# Disable other components except NER
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    # Sample training data
    TRAIN_DATA = [
        {
            "sentence": "Patient was prescribed Aspirin.",
            "entities": [(24, 31, "MEDICINE")]
        }
    ]
    examples = []
    for record in TRAIN_DATA:
        doc = nlp.make_doc(record["sentence"])
        annotations = {"entities": record["entities"]}
        examples.append(Example.from_dict(doc, annotations))

    # Create optimizer
    optimizer = nlp.create_optimizer()
    losses = {}

    # Train for 5 epochs
    for epoch in range(5):
        random.shuffle(examples)
        nlp.update(examples, sgd=optimizer, losses=losses)
        print(f"Epoch {epoch+1}, Losses: {losses}")
# Output (example):
# Epoch 1, Losses: {'ner': 0.8}
# Epoch 2, Losses: {'ner': 0.5}
# Epoch 3, Losses: {'ner': 0.3}
# Epoch 4, Losses: {'ner': 0.2}
# Epoch 5, Losses: {'ner': 0.1}

# Save NER component
ner = nlp.get_pipe("ner")
ner.to_disk("ner_model")

# Load trained NER component
nlp2 = spacy.load("en_core_web_sm")
ner2 = nlp2.create_pipe("ner")
ner2.from_disk("ner_model")
nlp2.add_pipe(ner2, name="ner")

# Inference
doc = nlp2("Aspirin was given to the patient.")
for ent in doc.ents:
    print(ent.text, ent.label_)
# Output:
# Aspirin MEDICINE

## Customizing Spacy Model DataCamp example

In [None]:
# Append a tuple of (entities text, entities label) if Jumbo is in the entity
target_entities = []
for doc in documents:
  target_entities.extend([(ent.text, ent.label_) for ent in doc.ents if "Jumbo" in ent.text])
print(target_entities)

# Append True to the correct_labels list if the entity label is `PRODUCT`
correct_labels = []
for ent in target_entities:
  if target_entities[1] == "PRODUCT":
    correct_labels.append(True)
  else:
    correct_labels.append(False)
print(correct_labels)




text = "A patient with chest pain had hyperthyroidism."
entity_1 = "chest pain"
entity_2 = "hyperthyroidism"

# Store annotated data information in the correct format
annotated_data = {"sentence": text, "entities": [{"label": "SYMPTOM", "value": entity_1}, {"label": "DISEASE", "value": entity_2}]}

# Extract start and end characters of each entity
entity_1_start_char = text.find(entity_1)
entity_1_end_char = entity_1_start_char + len(entity_1)
entity_2_start_char = text.find(entity_2)
entity_2_end_char = entity_2_start_char + len(entity_2)

# Store the same input information in the proper format for training
training_data = [(text, {"entities": [(entity_1_start_char,entity_1_end_char,"SYMPTOM"), 
                                      (entity_2_start_char,entity_2_end_char,"DISEASE")]})]
print(training_data)




example_text = 'A patient with chest pain had hyperthyroidism.'
training_data = [(example_text, {'entities': [(15, 25, 'SYMPTOM'), (30, 45, 'DISEASE')]})]

all_examples = []
# Iterate through text and annotations and convert text to a Doc container
for text, annotations in training_data:
  doc = nlp(text)
  
  # Create an Example object from the doc contianer and annotations
  example_sentence = Example.from_dict(doc, annotations)
  print(example_sentence.to_dict(), "\n")
  
  # Append the Example object to the list of all examples
  all_examples.append(example_sentence)
  
print("Number of formatted training data: ", len(all_examples))





nlp = spacy.load("en_core_web_sm")

# Disable all pipeline components of  except `ner`
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
nlp.disable_pipes(*other_pipes)

# Convert a text and its annotations to the correct format usable for training
doc = nlp.make_doc(text)
example = Example.from_dict(doc, annotations)
print("Example object for training: \n", example.to_dict())






nlp = spacy.load("en_core_web_sm")
print("Before training: ", [(ent.text, ent.label_) for ent in nlp(test).ents])
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
nlp.disable_pipes(*other_pipes)
optimizer = nlp.create_optimizer()

# Shuffle training data and the dataset using random package per epoch
for i in range(epochs):
  random.shuffle(training_data)
  for text, annotations in training_data:
    doc = nlp.make_doc(text)
    # Update nlp model after setting sgd argument to optimizer
    example = Example.from_dict(doc, annotations)
    nlp.update([example], sgd = optimizer)
print("After training: ", [(ent.text, ent.label_) for ent in nlp(test).ents])







# Load a blank English model, add NER component, add given labels to the ner pipeline
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
for ent in labels:
    ner.add_label(ent)

# Disable other pipeline components, complete training loop and run training loop
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
nlp.disable_pipes(*other_pipes)
losses = {}
optimizer = nlp.begin_training()
for text, annotation in training_data:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotation)
    nlp.update([example], sgd=optimizer, losses=losses)
    print(losses)
