In [1]:
import spacy
from spacy.training.example import Example
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import warnings

In [3]:
nlp = spacy.load("en_core_web_sm")

print(f"Model loaded: {nlp.meta['name']}")
print("Pipeline components:", nlp.pipe_names)

Model loaded: core_web_sm
Pipeline components: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [28]:
validation_data = [
    (
        "by selling apples U.K. earns $123.50 million per year. that contributes 7% GDP of U.K.",
        {"entities": [(17, 21, "GPE"), (28, 43, "MONEY"), (63, 65, "PERCENT"), (73, 77, "GPE")]}
    ),
    (
        "Elon Musk leads Tesla and SpaceX from Texas.",
        {"entities": [(0, 9, "PERSON"), (16, 21, "ORG"), (26, 32, "ORG"), (38, 43, "GPE")]}
    ),
    (
        "The Olympics will be held in Paris in 2024.",
        {"entities": [(4, 12, "EVENT"), (29, 34, "GPE"), (38, 42, "DATE")]}
    ),
    (
        "Google Cloud announced new AI tools on Monday.",
        {"entities": [(0, 12, "ORG"), (39, 45, "DATE")]}
    ),
    (
        "Amazon shares fell by 5% in New York trading.",
        {"entities": [(0, 6, "ORG"), (22, 24, "PERCENT"), (28, 36, "GPE")]}
    )
]

print(f"Dataset prepared with {len(validation_data)} examples.")

Dataset prepared with 5 examples.


In [29]:
sample_text = validation_data[0][0]
doc = nlp(sample_text)

print(f"Text: {sample_text}\n")
print(f"{'Entity':<15} {'Label':<10}")
print("-" * 22)

for ent in doc.ents:
    print(f"{ent.text:<15} {ent.label_:<10}")

Text: by selling apples U.K. earns $123.50 million per year. that contributes 7% GDP of U.K.

Entity          Label     
----------------------
U.K.            GPE       
$123.50 million MONEY     
7%              PERCENT   
U.K.            GPE       


In [30]:
def get_token_labels(data, nlp_model):
    y_true = []
    y_pred = []

    for text, annotations in data:
        doc = nlp_model(text)

        # Create an Example object to handle the "Gold Standard" alignment
        # This maps the character offsets to the actual tokens
        example = Example.from_dict(doc, annotations)

        # Extract ground truth tags (aligned to the tokenization)
        # 0 = No entity, other integers map to entity IDs
        true_tags = [token.ent_type_ if token.ent_type_ else "O" for token in example.reference]

        # Extract predicted tags
        pred_tags = [token.ent_type_ if token.ent_type_ else "O" for token in doc]

        y_true.extend(true_tags)
        y_pred.extend(pred_tags)

    return y_true, y_pred

# Run the function
y_true, y_pred = get_token_labels(validation_data, nlp)

print(f"Total tokens evaluated: {len(y_true)}")
print(f"Sample True: {y_true[:10]}")
print(f"Sample Pred: {y_pred[:10]}")

Total tokens evaluated: 57
Sample True: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sample Pred: ['O', 'O', 'O', 'GPE', 'O', 'MONEY', 'MONEY', 'MONEY', 'O', 'O']




In [31]:
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)

print("--- Overall Performance ---")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

print("\n------------- Detailed Report by Entity Type ------------")
report = classification_report(y_true, y_pred, zero_division=0)
print(report)

--- Overall Performance ---
Accuracy:  0.7895
Precision: 0.8937
Recall:    0.7895
F1 Score:  0.8043

------------- Detailed Report by Entity Type ------------
              precision    recall  f1-score   support

        DATE       1.00      1.00      1.00         2
       EVENT       1.00      1.00      1.00         1
         GPE       0.57      1.00      0.73         4
       MONEY       0.00      0.00      0.00         0
           O       0.94      0.80      0.87        41
         ORG       1.00      0.20      0.33         5
     PERCENT       0.50      1.00      0.67         2
      PERSON       0.50      1.00      0.67         2

    accuracy                           0.79        57
   macro avg       0.69      0.75      0.66        57
weighted avg       0.89      0.79      0.80        57

