In [1]:
!pip install "datasets<3.0.0" seqeval
!python -m spacy download en_core_web_sm

Collecting datasets<3.0.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets<3.0.0)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.0
    Uninstalling fsspec-2025.3.0:
      Successfully uninstalled fsspec-2025.3.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
[31mERROR: pip's d

In [2]:
import spacy
import numpy as np
from datasets import load_dataset
from spacy.tokens import Doc

# 1. Load Real-World Data (CoNLL-2003: News Articles)
print("Loading CoNLL-2003 dataset...")
# With datasets < 3.0.0, we use trust_remote_code=True to allow the CoNLL script to run
dataset = load_dataset("conll2003", trust_remote_code=True)

# Use the 'test' split for evaluation
test_data = dataset['test']

# Check a sample
print("\nSample Data (Tokens):", test_data[0]['tokens'])
print("Sample Labels (IDs):", test_data[0]['ner_tags'])


Loading CoNLL-2003 dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]


Sample Data (Tokens): ['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']
Sample Labels (IDs): [0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]


In [3]:
# 2. Load Pre-trained NER System (spaCy)
nlp = spacy.load("en_core_web_sm")

# Map CoNLL numerical IDs to Label Names (e.g., 1 -> B-PER)
label_list = dataset['train'].features['ner_tags'].feature.names

# Lists to store True Labels and Predicted Labels
true_labels = []
pred_labels = []

print("Running NER on test set... (This may take a moment)")

for item in test_data:
    # Get the tokens and true tags from the dataset
    tokens = item['tokens']
    ner_ids = item['ner_tags']

    # Create a spaCy Doc using the pre-existing tokens (prevents tokenization mismatch)
    doc = Doc(nlp.vocab, words=tokens)

    # Run the NER component only
    nlp.get_pipe("ner")(doc)

    # Extract True Labels (convert IDs to strings like 'B-ORG')
    t_labels = [label_list[i] for i in ner_ids]
    true_labels.append(t_labels)

    # Extract Predicted Labels
    # We initialize all as 'O' (Outside/No Entity)
    p_labels = ['O'] * len(tokens)

    # Fill in predictions from spaCy
    for ent in doc.ents:
        # ent.start and ent.end are token indices
        p_labels[ent.start] = f"B-{ent.label_}"
        for i in range(ent.start + 1, ent.end):
            p_labels[i] = f"I-{ent.label_}"

    pred_labels.append(p_labels)

print("Prediction complete.")

Running NER on test set... (This may take a moment)
Prediction complete.


In [4]:
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

# 3. Calculate Metrics
print("\n--- Model Performance Metrics ---\n")

# Calculate overall metrics
accuracy = accuracy_score(true_labels, pred_labels)
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

print("\n--- Detailed Classification Report ---\n")
print(classification_report(true_labels, pred_labels))


--- Model Performance Metrics ---

Accuracy:  0.7108
Precision: 0.0646
Recall:    0.0970
F1-Score:  0.0776

--- Detailed Classification Report ---



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    CARDINAL       0.00      0.00      0.00         0
        DATE       0.00      0.00      0.00         0
       EVENT       0.00      0.00      0.00         0
         FAC       0.00      0.00      0.00         0
         GPE       0.00      0.00      0.00         0
    LANGUAGE       0.00      0.00      0.00         0
         LAW       0.00      0.00      0.00         0
         LOC       0.56      0.02      0.04      1668
        MISC       0.00      0.00      0.00       702
       MONEY       0.00      0.00      0.00         0
        NORP       0.00      0.00      0.00         0
     ORDINAL       0.00      0.00      0.00         0
         ORG       0.43      0.31      0.36      1661
         PER       0.00      0.00      0.00      1617
     PERCENT       0.00      0.00      0.00         0
      PERSON       0.00      0.00      0.00         0
     PRODUCT       0.00      0.00      0.00         0
    QUANTITY       0.00    