Using LLAMA with ollama structured outputs to properly handle the JSON outputs.



In [19]:
import json
from pydantic import BaseModel

class Result(BaseModel):
    name: str | None
    faculty: str | None
    college: str | None
    program: str | None
    distribution: str | None
    payment_method: str | None
    money: float | None
    currency: str | None
    email_address: str | None

# Define the parse_money function first
def parse_money(money_str: str | None) -> float | None:
    """Parse the money value into a float, handling different formats."""
    if not money_str:
        return None
    # Remove currency symbols and commas
    money_str = money_str.replace('$', '').replace(',', '')
    try:
        return float(money_str)
    except ValueError:
        return None

# Path to your dataset
file_path = "D:\\playground\\playground\\donor_emails_dataset.json"
with open(file_path, "r") as f:
    dataset = json.load(f)

# Path to your ground truth file (the correct data for each sample)
ground_truth_file = "D:\\playground\\playground\\extracted_donors.json"
with open(ground_truth_file, "r") as f:
    ground_truth = json.load(f)

# Set batch_size to 100 to process 100 samples
batch_size = 100  

# Initialize counters for accuracy
correct_predictions = 0
total_predictions = 0

# Initialize the list to store extracted results
extracted_results = []

# Ensure dataset has enough samples
if len(dataset) < 100:
    print("Warning: Dataset has less than 100 samples.")
else:
    # Process the first 100 samples
    batch = dataset[:100]
    
    # Process each sample and extract the relevant entities
    for index, sample in enumerate(batch):
        email_entities = {entity["types"][0]: entity["entity"] for entity in sample["entities"]}

        # Create a Result object from the extracted data
        result = Result(
            name=email_entities.get("name", None),
            faculty=email_entities.get("faculty", None),
            college=email_entities.get("college", None),
            program=email_entities.get("program", None),
            distribution=email_entities.get("distribution", None),
            payment_method=email_entities.get("payment methods", None),
            money=parse_money(email_entities.get("money", None)),
            currency=email_entities.get("currency", None),
            email_address=email_entities.get("email address", None)
        )

        # Compare the extracted result with the ground truth
        truth = ground_truth[index]  # Get the ground truth for this sample
        
        correct = True
        for field in result.dict():  # Iterate over the fields
            if getattr(result, field) != truth.get(field):
                correct = False
                break

        if correct:
            correct_predictions += 1
        total_predictions += 1

        extracted_results.append(result)

# Save extracted results to a file
output_path = "D:\\playground\\playground\\extracted_emails_data.json"
with open(output_path, "w") as f:
    json.dump([r.dict() for r in extracted_results], f, indent=4)

# Calculate and print accuracy
accuracy = (correct_predictions / total_predictions) * 100
print(f"Accuracy: {accuracy:.2f}%")
print(f"Extracted data saved to {output_path}")


Accuracy: 100.00%
Extracted data saved to D:\playground\playground\extracted_emails_data.json


C:\Users\maish\AppData\Local\Temp\ipykernel_20256\3445200358.py:75: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  for field in result.dict():  # Iterate over the fields
C:\Users\maish\AppData\Local\Temp\ipykernel_20256\3445200358.py:89: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  json.dump([r.dict() for r in extracted_results], f, indent=4)


We see that this model does better on extracting the data that is more relavant but struggles on handling multiple contexts in a message.