In [3]:
def evaluate_ner(gold_spans, predicted_spans):
    """
    Evaluate NER span predictions using strict and overlapping measures.

    Strict measure: a predicted span is correct only if an identical
    (start, end) span exists in the gold standard.

    Overlapping measure: a predicted span counts towards precision if it
    overlaps at least one gold span, and a gold span counts towards recall
    if it is overlapped by at least one predicted span. Counting the two
    sides separately keeps both scores within [0, 1] even when several
    predictions hit the same gold span or one prediction covers several
    gold spans.

    Two spans overlap when max(start) < min(end), so spans that merely
    touch, e.g. (0, 5) and (5, 10), do not overlap.

    Duplicate spans within either input are counted once.

    Args:
        gold_spans (list of tuples): Gold standard spans, each represented as (start, end).
        predicted_spans (list of tuples): Predicted spans, each represented as (start, end).

    Returns:
        dict: A dictionary with keys "strict" and "overlapping", each mapping
        to a dict with "precision", "recall" and "f1". A score whose
        denominator is zero is reported as 0.0.
    """
    def overlap(span1, span2):
        """Check if two spans overlap."""
        return max(span1[0], span2[0]) < min(span1[1], span2[1])

    def calculate_metrics(matched_pred, matched_gold, total_gold, total_pred):
        """Calculate precision, recall, and F1 from match counts."""
        precision = matched_pred / total_pred if total_pred > 0 else 0.0
        recall = matched_gold / total_gold if total_gold > 0 else 0.0
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        return {"precision": precision, "recall": recall, "f1": f1}

    # Deduplicate so that numerators and denominators count the same things.
    gold = set(gold_spans)
    predicted = set(predicted_spans)

    # Strict measure: exact matches are the same set from both sides.
    exact = gold & predicted

    # Overlapping measure: matched predictions drive precision, matched gold
    # spans drive recall.
    matched_predictions = {
        pred for pred in predicted
        if any(overlap(pred, g) for g in gold)
    }
    matched_gold = {
        g for g in gold
        if any(overlap(pred, g) for pred in predicted)
    }

    return {
        "strict": calculate_metrics(
            len(exact), len(exact), len(gold), len(predicted)
        ),
        "overlapping": calculate_metrics(
            len(matched_predictions), len(matched_gold), len(gold), len(predicted)
        ),
    }

# Example usage: the third prediction is truncated relative to its gold span,
# so it fails the strict measure but still counts under the overlapping one.
gold_spans = [
    (0, 5),
    (10, 15),
    (20, 25),
]
predicted_spans = [
    (0, 5),
    (10, 15),
    (20, 23),
]

results = evaluate_ner(gold_spans, predicted_spans)
for label, key in (
    ("Strict Measures:", "strict"),
    ("Overlapping Measures:", "overlapping"),
):
    print(label, results[key])


Strict Measures: {'precision': 0.6666666666666666, 'recall': 0.6666666666666666, 'f1': 0.6666666666666666}
Overlapping Measures: {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
