This example uses the default GLiNER model to perform named entity recognition.



In [68]:
import json
import numpy as np
from pathlib import Path
from pydantic import BaseModel, field_validator
from typing import List
from sklearn.metrics import jaccard_score, hamming_loss
from gliner import GLiNER

In [69]:
class Entity(BaseModel):
    entity: str
    types: str
    
    @field_validator('entity')
    @classmethod
    def clean_entity(cls, v):
        # Take only the text before the comma if there is one
        if isinstance(v, str) and "," in v:
            return v.split(",")[0].strip()
        return v

class Result(BaseModel):
    text: str
    entities: List[Entity]

labels = ["Interval", "Organization", "Money", "Date", "Phone", "Address", "Person", "Faculty", "PaymentMethod", "Email", "Gift Type", "Frequency", "Distribution"]

In [70]:
test_set_path = Path("results.jsonl")

results = []

with open(test_set_path, "r") as f:
    for line in f:
        raw_data = json.loads(line)
        try:
            result = Result(**raw_data)
            results.append(result)

            filtered_entities = [entity for entity in result.entities if entity.types in labels]
            
            result.entities = filtered_entities
            results.append(result)
        except Exception as e:
            print(f"Error parsing line: {e}")
            print(f"Problematic data: {raw_data}")

In [77]:
model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")

ner_results = []
for result in results:
    text = result.text
    entities = model.predict_entities(text, labels)
    classified = [Entity(entity=entity["text"], types=entity["label"]) for entity in entities]
    ner_results.append(Result(text=text, entities=classified))

    print(result.dict())
    print(ner_results[-1].dict())
    print()

Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
C:\Users\vonbr\AppData\Local\Temp\ipykernel_3448\4027223018.py:10: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  print(result.dict())
C:\Users\vonbr\AppData\Local\Temp\ipykernel_3448\4027223018.py:11: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  print(ner_results[-1].dict())


{'text': "Dear University of Manitoba Development Office,\r\nI am delighted to be supporting your cause through a one-time gift of $20,000.\r\nI would like this donation to go towards the Computer Science Department's Fellowship Program. Please let me know if this is feasible and how we can make it happen. \r\nThank you for your time and consideration.\r\nSincerely,\r\nMichael Roberts.", 'entities': [{'entity': '$20', 'types': 'Money'}, {'entity': 'University of Manitoba Development Office', 'types': 'Organization'}]}
{'text': "Dear University of Manitoba Development Office,\r\nI am delighted to be supporting your cause through a one-time gift of $20,000.\r\nI would like this donation to go towards the Computer Science Department's Fellowship Program. Please let me know if this is feasible and how we can make it happen. \r\nThank you for your time and consideration.\r\nSincerely,\r\nMichael Roberts.", 'entities': [{'entity': 'University of Manitoba Development Office', 'types': 'Organi

In [79]:
labels = sorted(labels)

def sort_entities_by_type(results):
    for result in results:
        result.entities.sort(key=lambda x: x.types)
    return results

def multi_hot_encode(entities, labels):
    vector = np.zeros(len(labels), dtype=int)
    for entity in entities:
        if entity.types in labels:
            vector[labels.index(entity.types)] = 1
    return vector

def hamming_score(y_true, y_pred):
    return 1 - hamming_loss(y_true, y_pred)

results = sort_entities_by_type(results)
ner_results = sort_entities_by_type(ner_results)

y_true = np.array([multi_hot_encode(gt.entities, labels) for gt in results])
y_pred = np.array([multi_hot_encode(pred.entities, labels) for pred in ner_results])

jaccard_macro = jaccard_score(y_true, y_pred, average='macro')
jaccard_micro = jaccard_score(y_true, y_pred, average='micro')
jaccard_samples = jaccard_score(y_true, y_pred, average='samples')
hamming_macro = hamming_score(y_true, y_pred)

# Print results
print(f"Jaccard Score (Macro): {jaccard_macro:.4f}")
print(f"Jaccard Score (Micro): {jaccard_micro:.4f}")
print(f"Jaccard Score (Samples): {jaccard_samples:.4f}")
print(f"Hamming Score (Macro): {hamming_macro:.4f}")

Jaccard Score (Macro): 0.5657
Jaccard Score (Micro): 0.5919
Jaccard Score (Samples): 0.6163
Hamming Score (Macro): 0.8069
