This example uses the default spaCy English model to perform named entity recognition.

In [5]:
import spacy
from spacy import displacy

spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

Now, spaCy is set up with a CPU-bound model that prioritizes efficiency over accuracy to help keep costs down. In theory, this could be deployed on host machines without any issues.



In [6]:
import spacy
import json
from pydantic import BaseModel

# Load SpaCy model
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

# Define the Result class for consistency
class Result(BaseModel):
    name: str | None
    faculty: str | None
    college: str | None
    program: str | None
    distribution: str | None
    payment_method: str | None
    money: float | None
    currency: str | None
    email_address: str | None

# Path to your dataset and ground truth file
email_file_path = "D:\\playground\\playground\\donor_emails_dataset.json"
ground_truth_file_path = "D:\\playground\\playground\\extracted_emails_data.json"

# Load the dataset and ground truth
with open(email_file_path, "r") as f:
    emails = json.load(f)

with open(ground_truth_file_path, "r") as f:
    ground_truth = json.load(f)

# Set batch_size to 100
batch_size = 100
extracted_results = []

# Initialize accuracy counters
correct_predictions = 0
total_predictions = 0

# Loop through 100 samples (ensure you have enough samples)
for sample, truth in zip(emails[:batch_size], ground_truth[:batch_size]):
    # Process the text through SpaCy
    doc = nlp(sample['text'])

    # Extract entities from the text
    email_entities = {ent.label_: ent.text for ent in doc.ents}

    # Create a Result object from the extracted data
    result = Result(
        name=email_entities.get("PERSON", None),
        faculty=email_entities.get("ORG", None),
        college=email_entities.get("ORG", None),
        program=email_entities.get("PRODUCT", None),  # Adjust if necessary
        distribution=email_entities.get("MONEY", None),  # Adjust if necessary
        payment_method=email_entities.get("MONEY", None),  # Adjust if necessary
        money=None,  # Add money parsing logic if needed
        currency=None,  # Add currency logic if needed
        email_address=None,  # Add email extraction logic if needed
    )

    # Compare extracted entities with ground truth
    correct_predictions += sum(
        1 for key in result.dict().keys() if result.dict()[key] == truth.get(key)
    )
    total_predictions += len(result.dict())

    # Append the extracted result
    extracted_results.append(result)

# Calculate accuracy
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

# Save extracted results to a file
output_path = "D:\\playground\\playground\\extracted_spacy_results.json"
with open(output_path, "w") as f:
    json.dump([r.dict() for r in extracted_results], f, indent=4)

print(f"Accuracy: {accuracy:.2f}")
print(f"Extracted data saved to {output_path}")


C:\Users\maish\AppData\Local\Temp\ipykernel_15748\287748665.py:63: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  1 for key in result.dict().keys() if result.dict()[key] == truth.get(key)
C:\Users\maish\AppData\Local\Temp\ipykernel_15748\287748665.py:65: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  total_predictions += len(result.dict())


Accuracy: 0.12
Extracted data saved to D:\playground\playground\extracted_spacy_results.json


C:\Users\maish\AppData\Local\Temp\ipykernel_15748\287748665.py:76: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  json.dump([r.dict() for r in extracted_results], f, indent=4)


Looking at the results out of the box, they are not ideal. The model can properly classify currency amounts, but it struggles to infer more context, such as allocations for scholarships or departments. Therefore, fine-tuning the models will be necessary to improve accuracy.