In [2]:
import random

import spacy
from spacy.training.example import Example

# Load the CoNLL-2003 dataset
TRAIN_DATA = spacy.datasets.conll2003.read_training("path/to/train.txt")


AttributeError: module 'spacy' has no attribute 'datasets'

In [None]:
nlp = spacy.load("en_core_web_sm")

# Add a new entity label to the model
ner = nlp.get_pipe("ner")
ner.add_label("MY_ENTITY")


In [None]:
from spacy.util import minibatch, compounding

# Define hyperparameters
n_iter = 100
dropout = 0.5
batch_size = 32
learn_rate = 0.001

# Get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Only train NER
with nlp.disable_pipes(*other_pipes):
    # Initialize optimizer and loss function
    optimizer = nlp.begin_training()
    loss_function = nlp.entity.create_loss(ner)

# Iterate over training data
for i in range(n_iter):
    losses = {}
    # Create batches of data using spaCy's minibatch function
    batches = minibatch(TRAIN_DATA, size=compounding(batch_size, 32, 1.001))
    for batch in batches:
        # Extract the text and annotations from the batch
        texts, annotations = zip(*batch)
        examples = []
        # Convert the text and annotations into Example objects
        for i in range(len(texts)):
            examples.append(Example.from_dict(nlp.make_doc(texts[i]), annotations[i]))
        # Update the NER model with the examples
        nlp.update(examples, sgd=optimizer, drop=dropout, losses=losses)

        # Print the loss every 10 iterations
        if i % 10 == 0:
            print(f"Loss at iteration {i}: {losses['ner']}")

# Save the trained model
nlp.to_disk("path/to/model")


In [None]:
# Load the test dataset
TEST_DATA = spacy.datasets.conll2003.read_data("path/to/test.txt")

# Disable other pipes to only use NER in the pipeline
with nlp.select_pipes(enable=["ner"]):
    # Evaluate the model
    scorer = nlp.evaluate(TEST_DATA)

# Print the scores
print(scorer.scores)
