This example uses the default GLiNER model to perform named entity recognition.



In [5]:
import json

from gliner import GLiNER

# Load the dataset
file_path = "C:\\Users\\vonbr\\Documents\\playground\\donor_emails_dataset.json"
with open(file_path, "r") as f:
    dataset = json.load(f)

#load GLiNER model
model = GLiNER.from_pretrained("urchade/gliner_large-v2.1")

labels = ["Name", "Faculty","College","Program","Distribution","PaymentMethod","Money", "Currency","Email"]
correct = 0
total = 0
missed = 0
false_positives = 0
for sample in dataset:
    text = sample["text"]
    ground_truth_entities = {ent["entity"]: ent["types"][0] for ent in sample["entities"]}
    # Run entity extraction
    extracted_entities = model.predict_entities(text, labels, threshold=0.5)
    extracted_set = {ent["text"]: ent["label"] for ent in extracted_entities}
    # Compare extracted entities with ground truth
    for entity, label in ground_truth_entities.items():
        total += 1
        if entity in extracted_set and extracted_set[entity] == label:
            correct += 1
        else:
            missed += 1
    # Count false positives
    for entity in extracted_set:
        if entity not in ground_truth_entities:
            false_positives += 1
# Accuracy calculation
accuracy = correct / total if total > 0 else 0
precision = correct / (correct + false_positives) if (correct + false_positives) > 0 else 0
recall = correct / total if total > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f"Total Entities: {total}, Correct: {correct}, Missed: {missed}, False Positives: {false_positives}")
print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1 Score: {f1_score:.2%}")
correct = 0
total = 0
missed = 0
for sample in dataset:
    text = sample["text"]
    ground_truth_entities = {ent["entity"]: ent["types"][0] for ent in sample["entities"]}
    # Run entity extraction
    extracted_entities = model.predict_entities(text, labels, threshold=0.5)
    extracted_set = {ent["text"]: ent["label"] for ent in extracted_entities}
    # Compare extracted entities with ground truth
    for entity, label in ground_truth_entities.items():
        total += 1
        if entity in extracted_set and extracted_set[entity] == label:
            correct += 1
        else:
            print(f"label: {label}, entity: {entity}, expected: {ground_truth_entities[entity]}, extracted: {extracted_set.get(entity)}")
            missed += 1
# Accuracy calculation
accuracy = correct / total if total > 0 else 0
print(f"Total Entities: {total}, Correct: {correct}, Missed: {missed}")
print(f"Accuracy: {accuracy:.2%}")

Fetching 4 files: 100%|██████████| 4/4 [00:00<?, ?it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Total Entities: 1000, Correct: 621, Missed: 379, False Positives: 218
Accuracy: 62.10%
Precision: 74.02%
Recall: 62.10%
F1 Score: 67.54%
label: Name, entity: Professor Lee, expected: Name, extracted: Faculty
label: Faculty, entity: Department of Computer Science, expected: Faculty, extracted: College
label: College, entity: Maple University, expected: College, extracted: None
label: Program, entity: Actuarial Mathematics - Business, expected: Program, extracted: None
label: Email, entity: sarah.johnson@donor.net, expected: Email, extracted: None
label: Name, entity: Dr. Taylor, expected: Name, extracted: Faculty
label: Faculty, entity: Department of Computer Science, expected: Faculty, extracted: College
label: Program, entity: Mechanical Engineering, expected: Program, extracted: None
label: Email, entity: david.brown@email.com, expected: Email, extracted: None
label: Program, entity: Environmental Studies, expected: Program, extracted: None
label: Email, entity: sarah.johnson@donor.n

We see that this model does a good job at extracting monitary, department, gift type and persons but fails when extracting intervals.