In [5]:
import spacy
from spacy.training import Example
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report


In [7]:

# Load SpaCy's small English model
nlp = spacy.load("en_core_web_lg")

# Function to convert IOB2 tagged sentence to SpaCy's format
def convert_iob_to_spacy_format(sentence, tags):
    entities = []
    start = None
    for i, (word, tag) in enumerate(zip(sentence.split(), tags)):
        if tag.startswith("B-"):  # Beginning of an entity
            if start is not None:  # Close previous entity
                entities.append((start, i, entity_label))  # (start, end, label)
            start = i
            entity_label = tag[2:]  # Extract entity label
        elif tag.startswith("I-") and start is not None:
            continue  # Inside an entity
        else:  # Not an entity
            if start is not None:  # Close previous entity
                entities.append((start, i, entity_label))
                start = None
    # Close last entity if exists
    if start is not None:
        entities.append((start, len(sentence.split()), entity_label))

    return {"entities": entities}

# Function to evaluate NER performance
def evaluate_ner(sentence, tags):
    # Convert IOB tags to SpaCy format
    annotations = convert_iob_to_spacy_format(sentence, tags)
    example = Example.from_dict(nlp.make_doc(sentence), annotations)

    # Get predictions
    doc = nlp(sentence)

    # Extract true and predicted entities
    true_entities = set((start, end, label) for start, end, label in annotations["entities"])
    predicted_entities = set((ent.start_char, ent.end_char, ent.label_) for ent in doc.ents)

    # Entity level evaluation
    tp = len(true_entities & predicted_entities)  # True positives
    fp = len(predicted_entities - true_entities)  # False positives
    fn = len(true_entities - predicted_entities)  # False negatives

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    print(f"Entity-level scores: Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

    # Token level evaluation
    y_true = []
    y_pred = []
    
    for token in doc:
        label = "O"  # Default to outside
        for start, end, entity_label in annotations["entities"]:
            if start <= token.i < end:
                label = "B-" + entity_label if token.i == start else "I-" + entity_label
                break
        y_true.append(label)
        y_pred.append(token.ent_iob_ + '-' + token.ent_type_)

    # Calculate token-level metrics
    precision_token = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall_token = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1_token = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    print(f"Token-level scores: Precision: {precision_token:.4f}, Recall: {recall_token:.4f}, F1-Score: {f1_token:.4f}")
    print("\nClassification Report:\n", classification_report(y_true, y_pred))



In [12]:
# Example IOB2 tagged sentence
sentence = "India"
tags = ["B-GPE"]

# Evaluate NER
evaluate_ner(sentence, tags)


Entity-level scores: Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
Token-level scores: Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000

Classification Report:
               precision    recall  f1-score   support

       B-GPE       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



