In [38]:
import json
import numpy as np
from pathlib import Path
from pydantic import BaseModel, field_validator
from typing import List

from sklearn.metrics import (
    jaccard_score,
    hamming_loss,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
)
from ollama import chat

For this experment we will benchmark against many differnt models

In [39]:
class Entity(BaseModel):
    entity: str
    types: str
    
    @field_validator('entity')
    @classmethod
    def clean_entity(cls, v):
        # Take only the text before the comma if there is one
        if isinstance(v, str) and "," in v:
            return v.split(",")[0].strip()
        return v

class Result(BaseModel):
    text: str
    entities: List[Entity]

class LLMResult(BaseModel):
    interval: str
    organization: str
    money: str
    date: str
    phone: str
    address: str
    person: str
    faculty: str
    payment_method: str
    email: str
    gift_type: str
    frequency: str
    distribution: str


labels = ["Interval", "Organization", "Money", "Date", "Phone", "Address", "Person", "Faculty", "PaymentMethod", "Email", "Gift Type", "Frequency", "Distribution"]

In [40]:
test_set_path = Path("results.jsonl")

results = []

with open(test_set_path, "r") as f:
    for line in f:
        raw_data = json.loads(line)
        try:
            result = Result(**raw_data)
            results.append(result)

            filtered_entities = [entity for entity in result.entities if entity.types in labels]
            
            result.entities = filtered_entities
            results.append(result)
        except Exception as e:
            print(f"Error parsing line: {e}")
            print(f"Problematic data: {raw_data}")

In [41]:
models = [
    "qwen2.5:3b",
    "deepseek-r1:1.5b",
    "gemma2:2b",
    "llama3.2:latest",
]

In [42]:
def prompt(model: str):
    llm_results = []
    for result in results:
        response = chat(
            messages=[
                {"role": "system", "content":
            """
            You are an AI-powered email parsing tool designed to extract donation-related information with high precision and consistency.

            ### **Extraction Guidelines:**
            - **Currency Detection:** Identify donation amounts in multiple currencies (e.g., $, €, £, ¥).  
            - **Name Recognition:** Extract full names and first names.  
            - **Interval Parsing:** Detect donation frequencies, including variations like:
            - "monthly" / "month"
            - "yearly" / "annual" / "per year"
            - "one-time" / "single" / "once"  
            - **Faculty Identification:** Extract faculty or department names (e.g., "Computer Science", "Price Faculty of Engineering").  
            - **Payment Method Recognition:** Identify payment methods (e.g., "credit card", "check", "bank transfer", "cash").  
            - **Gift Type Recognition:** Extract gift types (e.g., "one-time gift", "recurring gift", "pledge", "payment for a pledge", "payment for a recurring gift").  
            - **Distribution Identification:** Recognize how the donation is allocated (e.g., "scholarship fund", "research fund", "general fund"). 
            - **Email Extraction:** Identify email addresses.

            ### **Parsing Considerations:**
            - Extract information **regardless of its position in the email** (subject, body, signature).  
            - Perform **case-insensitive matching** for all keywords.  
            - Handle **variations in amount formatting** (e.g., "$50", "50 USD", "50.00").  

            ### **Error Handling:**
            - If multiple conflicting values exist, prioritize in this order:  
            1. Most recent mention in the email.  
            2. Most explicitly stated value.  
            3. Full amounts over partial mentions.  
            - If a field **cannot be confidently extracted**, return `null` instead of an empty string or an uncertain response.  

            ### **Extraction Precision:**
            - Aim for **90%+ accuracy** in extracting donation-related details.  
            - **Do not guess**—if a value is unclear, return `null`.  
            - Ensure all outputs strictly adhere to the JSON format without additional explanations.  

                        ### **Output Rules:**
            - If a field cannot be confidently identified, return null.
            - Do not use empty strings ("") or vague statements like "not specified" or "unknown".
            - If the email contains no donation-related information, return an empty JSON object: {}.
                ```json
                {}
                ```
            - Do not return any additional text, explanations, or formatting outside the JSON response.
            
            ### **Output Format:**
            You must always return the extracted data in the **following JSON format**, with **no additional text, comments, or explanations**:  

            ```json
            {
            "interval": "[Extracted Interval or null]",
            "organization": "[Extracted Organization or null]",
            "money": "[Extracted Money or null]",
            "date": "[Extracted Date or null]",
            "phone": "[Extracted Phone or null]",
            "address": "[Extracted Address or null]",
            "person": "[Extracted Person or null]",
            "faculty": "[Extracted Faculty or null]",
            "payment_method": "[Extracted Payment Method or null]",
            "email": "[Extracted Email or null]",
            "gift_type": "[Extracted Gift Type or null]",
            "frequency": "[Extracted Frequency or null]",
            "distribution": "[Extracted Distribution or null]"
            }
            """},
                {"role": "user", "content": result.text}
            ],
            model=model,
            format=LLMResult.model_json_schema(),
            options={
                "temperature": 0.0,
            }
        )
        
        llm_results.append(LLMResult.model_validate_json(response.message.content))
    
    return llm_results

In [43]:
model_results = {}

for model in models:
    model_results[model] = prompt(model)

In [44]:
def llm_result_to_entities(llm_result):
    entities = []
    for label in labels:
        value = getattr(llm_result, label.lower(), None)
        if (
            value
            and value not in ["null", "", "unknown", "not specified", "n/a", "not"]
            and "unknown" not in value.lower()
        ):
            entities.append(Entity(entity=value, types=label))
    return entities


model_ner_results = {}
for model in models:
    model_ner_results[model] = [Result(text="", entities=llm_result_to_entities(llm_result)) for llm_result in model_results[model]]

In [45]:
labels = sorted(labels)

def sort_entities_by_type(results):
    for result in results:
        result.entities.sort(key=lambda x: x.types)
    return results


def multi_hot_encode(entities, labels):
    """Convert a list of entities to a multi-hot encoded vector."""
    vector = np.zeros(len(labels), dtype=int)
    for entity in entities:
        if entity.types in labels:
            vector[labels.index(entity.types)] = 1
    return vector


def hamming_score(y_true, y_pred):
    """Calculate Hamming Score as 1 - Hamming Loss."""
    return 1 - hamming_loss(y_true, y_pred)

In [46]:
model_scores = {}

for model, ner_results in model_ner_results.items():
    # Ensure sorting for consistency
    sorted_results = sort_entities_by_type(results)  # Ground truth
    sorted_ner_results = sort_entities_by_type(ner_results)  # Model predictions

    # Multi-hot encode ground truth and predictions
    y_true = np.array([multi_hot_encode(gt.entities, labels) for gt in sorted_results])
    y_pred = np.array(
        [multi_hot_encode(pred.entities, labels) for pred in sorted_ner_results]
    )

    # Compute Jaccard and Hamming scores
    jaccard_macro = jaccard_score(y_true, y_pred, average="macro")
    jaccard_micro = jaccard_score(y_true, y_pred, average="micro")
    jaccard_samples = jaccard_score(y_true, y_pred, average="samples")
    hamming_macro = hamming_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_samples = f1_score(y_true, y_pred, average='samples')

    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    precision_micro = precision_score(y_true, y_pred, average='micro', zero_division=0)
    precision_samples = precision_score(y_true, y_pred, average='samples', zero_division=0)

    recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
    recall_micro = recall_score(y_true, y_pred, average='micro', zero_division=0)
    recall_samples = recall_score(y_true, y_pred, average='samples', zero_division=0)

    accuracy = accuracy_score(y_true, y_pred)


    # Store results
    model_scores[model] = {
        "Jaccard Score (Macro)": jaccard_macro,
        "Jaccard Score (Micro)": jaccard_micro,
        "Jaccard Score (Samples)": jaccard_samples,
        "Hamming Score (Macro)": hamming_macro,
        "F1 Score (Macro)": f1_macro,
        "F1 Score (Micro)": f1_micro,
        "F1 Score (Samples)": f1_samples,
        "Precision (Macro)": precision_macro,
        "Precision (Micro)": precision_micro,
        "Precision (Samples)": precision_samples,
        "Recall (Macro)": recall_macro,
        "Recall (Micro)": recall_micro,
        "Recall (Samples)": recall_samples,
        "Accuracy": accuracy,
    }

# Print results
for model, scores in model_scores.items():
    print(f"Model: {model}")
    for metric, score in scores.items():
        print(f"  {metric}: {score:.4f}")
    print()

Model: qwen2.5:3b
  Jaccard Score (Macro): 0.5763
  Jaccard Score (Micro): 0.6013
  Jaccard Score (Samples): 0.6200
  Hamming Score (Macro): 0.7685
  F1 Score (Macro): 0.6537
  F1 Score (Micro): 0.7510
  F1 Score (Samples): 0.7467
  Precision (Macro): 0.5794
  Precision (Micro): 0.6262
  Precision (Samples): 0.6429
  Recall (Macro): 0.8143
  Recall (Micro): 0.9380
  Recall (Samples): 0.9259
  Accuracy: 0.0700

Model: deepseek-r1:1.5b
  Jaccard Score (Macro): 0.3061
  Jaccard Score (Micro): 0.3766
  Jaccard Score (Samples): 0.3595
  Hamming Score (Macro): 0.5562
  F1 Score (Macro): 0.4250
  F1 Score (Micro): 0.5471
  F1 Score (Samples): 0.4942
  Precision (Macro): 0.3628
  Precision (Micro): 0.4411
  Precision (Samples): 0.4363
  Recall (Macro): 0.6424
  Recall (Micro): 0.7200
  Recall (Samples): 0.7330
  Accuracy: 0.0000

Model: gemma2:2b
  Jaccard Score (Macro): 0.5776
  Jaccard Score (Micro): 0.6070
  Jaccard Score (Samples): 0.6235
  Hamming Score (Macro): 0.7812
  F1 Score (Macro):