In [1]:
%%capture
!pip install gliner protobuf==3.20.3
!pip install accelerate
!pip install --upgrade transformers accelerate
!pip install seqeval

In [None]:
from datasets import load_dataset

# Fetch the evaluation corpus by its dataset identifier.
dataset = load_dataset("nqdhocai/ner-covid19-test")
# The examples are read from the split named "train"; the rest of this
# notebook treats them purely as held-out test data.
test_data = dataset['train']

README.md:   0%|          | 0.00/437 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/150 [00:00<?, ? examples/s]

# Evaluate GLiNER

In [None]:
import re
import torch
import pandas as pd
from tqdm import tqdm
from collections import Counter, defaultdict
from sklearn.metrics import classification_report

from gliner import GLiNER

from seqeval.metrics import classification_report as seqeval_report


# =========================================================
# CONFIG
# =========================================================
# Number of texts handed to the model per call inside gliner_batch_infer.
BATCH_SIZE = 64
# Score cut-off forwarded to GLiNER as its `threshold` argument.
THRESHOLD = 0.5

# Entity types that are evaluated. The same names are passed to GLiNER as the
# labels to extract and are used to filter gold tags and predicted spans.
TARGET_LABELS = [
    "PATIENT_ID", "NAME", "AGE", "GENDER", "JOB",
    "LOCATION", "ORGANIZATION", "SYMPTOM_AND_DISEASE", "TRANSPORTATION", "DATE"
]

# Checkpoint identifier given to GLiNER.from_pretrained.
MODEL_PATH = "urchade/gliner_multi-v2.1"
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the checkpoint and move it to DEVICE. gliner_batch_infer reads this
# module-level `model`.
model = GLiNER.from_pretrained(MODEL_PATH).to(DEVICE)

# =========================================================
# TAG CLEANING (BIO -> clean label) + alignment util
# =========================================================
def clean_original_tags(tags, id2label=None):
    """
    Convert gold tags to prefix-free labels restricted to TARGET_LABELS.

    tags: list[str] of BIO tags, or a list of integer tag ids.
    id2label: mapping from tag id to tag string; required when `tags` holds ids.
    Return: list[str] where every element is "O" or a label from
            TARGET_LABELS (the B-/I- prefix is removed).
    Raises: ValueError when `tags` is not made of strings and `id2label` is None.
    """
    if len(tags) == 0:
        return []

    # The type of the first element decides whether ids must be decoded.
    first = tags[0]
    if not isinstance(first, str):
        if id2label is None:
            raise ValueError("tags l√† s·ªë nh∆∞ng b·∫°n ch∆∞a cung c·∫•p id2label.")
        tags = [id2label[int(tag_id)] for tag_id in tags]

    def _to_clean(tag):
        # Missing tags count as outside-of-entity.
        if tag is None or tag == "O":
            return "O"
        # Keep whatever follows the first hyphen ("B-LOCATION" -> "LOCATION").
        label = tag.split("-", 1)[1] if "-" in tag else tag
        # Labels outside the evaluated set are folded into "O".
        return label if label in TARGET_LABELS else "O"

    return [_to_clean(tag) for tag in tags]


def spans_to_token_tags(tokens, pred_spans, target_labels):
    """
    Project character-level predicted spans onto tokens.

    tokens: list[str]
    pred_spans: list[dict] with keys "start", "end", "label"; the offsets are
                character positions in the text obtained by joining `tokens`
                with single spaces.
    target_labels: collection of labels to keep; other spans are ignored.
    Return: list[str], one clean tag ("O" or a label) per token. A token gets a
            span's label when their character ranges overlap; when several
            spans overlap a token, the span that comes last in `pred_spans` wins.
    """
    # Character extent (start, end-exclusive) of every token, with one
    # separator character assumed between consecutive tokens.
    offsets = []
    start = 0
    for tok in tokens:
        stop = start + len(tok)
        offsets.append((start, stop))
        start = stop + 1

    tags = ["O" for _ in tokens]
    for span in pred_spans:
        label = span.get("label")
        if label in target_labels:
            span_start = int(span["start"])
            span_end = int(span["end"])
            for idx, (tok_start, tok_end) in enumerate(offsets):
                # Two half-open ranges intersect iff the larger start lies
                # strictly before the smaller end.
                overlaps = max(tok_start, span_start) < min(tok_end, span_end)
                if overlaps:
                    tags[idx] = label
    return tags


# =========================================================
# BIO -> spans (entity-level gold) ƒë·ªÉ l√†m entity-level eval
# =========================================================
def bio_to_spans(tokens, bio_tags):
    """
    Group a tag sequence into token-index entity spans.

    tokens: list[str] (only its length is used)
    bio_tags: list[str] in BIO form (B-XXX / I-XXX / O) or prefix-free form
              (XXX / O); None entries are treated as "O" and surrounding
              whitespace is stripped.
    Return: list of (start_token, end_token_exclusive, label).

    A prefixed tag opens a span whatever its prefix is, and the span is
    extended over the following "I-<label>" tags. A prefix-free tag opens a
    span that is extended over identical following tags.
    """
    limit = min(len(tokens), len(bio_tags))
    normalized = ["O" if tag is None else tag.strip() for tag in bio_tags[:limit]]

    spans = []
    pos = 0
    while pos < limit:
        tag = normalized[pos]
        if tag == "O":
            pos += 1
            continue

        if "-" in tag:
            # Prefixed tag: only "I-<label>" continues the entity.
            label = tag.split("-", 1)[1]
            continuation = f"I-{label}"
        else:
            # Prefix-free tag: the same label continues the entity.
            label = tag
            continuation = tag

        end = pos + 1
        while end < limit and normalized[end] == continuation:
            end += 1

        spans.append((pos, end, label))
        pos = end

    return spans


def pred_token_tags_to_spans(tokens, clean_tags):
    """
    Collapse prefix-free token tags into contiguous spans.

    tokens: list[str] (only its length is used)
    clean_tags: list[str] whose elements are "O" or a label.
    Return: list of (start_token, end_token_exclusive, label), one per maximal
            run of identical non-"O" tags. Only the first
            min(len(tokens), len(clean_tags)) positions are considered.
    """
    limit = min(len(tokens), len(clean_tags))
    spans = []

    # Track the run currently being scanned; "O" means "not inside an entity".
    run_start = None
    run_label = "O"
    for idx in range(limit):
        label = clean_tags[idx]
        if label == run_label:
            continue
        # The label changed: close the previous run if it was an entity.
        if run_label != "O":
            spans.append((run_start, idx, run_label))
        run_start, run_label = idx, label

    # Close a run that reaches the end of the sequence.
    if run_label != "O":
        spans.append((run_start, limit, run_label))
    return spans


def spans_to_bio(tokens, spans):
    """
    Render token-index spans as a BIO tag sequence.

    tokens: list[str] (only its length is used)
    spans: iterable of (start, end_exclusive, label) token indices.
    Return: list[str] of len(tokens) tags. Spans that are empty or fall
            outside the sequence are ignored; later spans overwrite earlier
            ones where they overlap.
    """
    size = len(tokens)
    bio = ["O"] * size
    for start, end, label in spans:
        in_bounds = 0 <= start < end <= size
        if not in_bounds:
            continue
        bio[start] = f"B-{label}"
        # Every remaining position of the span is an inside tag.
        bio[start + 1:end] = [f"I-{label}"] * (end - start - 1)
    return bio


# =========================================================
#  GLiNER inference in batches
# =========================================================
def gliner_batch_infer(texts, labels, batch_size=32, threshold=0.5):
    """
    Run the module-level GLiNER `model` over `texts` in consecutive chunks.

    texts: list[str] to annotate.
    labels: entity labels forwarded to `model.inference`.
    batch_size: number of texts per `model.inference` call; must be >= 1.
    threshold: score cut-off forwarded to `model.inference`.
    Return: list built by concatenating the result of every call, in input
            order (presumably one list of span dicts per text -- confirm
            against the installed GLiNER version).
    Raises: ValueError when `batch_size` is smaller than 1. Without this check
            a negative value would produce an empty range and silently return
            no predictions.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    all_out = []
    for start in tqdm(range(0, len(texts), batch_size), desc="🚀 GLiNER inference"):
        batch = texts[start:start + batch_size]
        all_out.extend(model.inference(batch, labels=labels, threshold=threshold))
    return all_out


# =========================================================
# predict -> report -> error analysis
# =========================================================
# Word lists of every example, and the whitespace-joined text given to GLiNER.
all_tokens = [row["words"] for row in test_data]
all_texts = [" ".join(toks) for toks in all_tokens]

# Mapping from integer tag id to BIO tag string. It is only consulted when the
# dataset stores tags as integers; leave it as None for string tags.
ID2LABEL_GOLD = None

gold_bio = []    # per example: BIO tag strings
gold_clean = []  # per example: prefix-free labels restricted to TARGET_LABELS
for row in test_data:
    tags = row["tags"]
    # Only non-empty, non-string tag lists need decoding. An example without
    # any tags is kept as an empty sequence instead of being mistaken for
    # integer-encoded tags.
    if len(tags) > 0 and not isinstance(tags[0], str):
        if ID2LABEL_GOLD is None:
            raise ValueError("ch∆∞a set ID2LABEL_GOLD")
        tags = [ID2LABEL_GOLD[int(x)] for x in tags]
    gold_bio.append(tags)
    gold_clean.append(clean_original_tags(tags, id2label=None))

# Span predictions for every text, in the same order as all_texts.
batch_preds = gliner_batch_infer(
    all_texts,
    TARGET_LABELS,
    batch_size=BATCH_SIZE,
    threshold=THRESHOLD,
)

# Project the character-level spans onto tokens: one clean tag per token.
pred_clean = [
    spans_to_token_tags(toks, pred_spans, TARGET_LABELS)
    for toks, pred_spans in zip(all_tokens, batch_preds)
]

# -------------------------
# TOKEN-LEVEL REPORT
# -------------------------
# Flatten gold and predicted token labels into two parallel lists. Pairing
# with zip stops at the shorter sequence, so both sides always stay aligned.
y_true_tok = []
y_pred_tok = []
for tclean, pclean in zip(gold_clean, pred_clean):
    for gold_label, pred_label in zip(tclean, pclean):
        y_true_tok.append(gold_label)
        y_pred_tok.append(pred_label)

# Report every label that occurs on either side, leaving out "O".
labels = sorted((set(y_true_tok) | set(y_pred_tok)) - {"O"})

print(classification_report(y_true_tok, y_pred_tok, labels=labels, zero_division=0, digits=4))

# -------------------------
# ENTITY-LEVEL REPORT via seqeval
# -------------------------
# Build BIO sequences for seqeval. Gold tags are round-tripped through spans
# so that both sides are rendered by the same spans_to_bio routine.
y_true_seq = []
y_pred_seq = []
for toks, bio_tags, pclean in zip(all_tokens, gold_bio, pred_clean):
    y_true_seq.append(spans_to_bio(toks, bio_to_spans(toks, bio_tags)))
    y_pred_seq.append(spans_to_bio(toks, pred_token_tags_to_spans(toks, pclean)))

rule = "=" * 70
print("\n" + rule)
print("ENTITY-LEVEL REPORT")
print(rule)
print(seqeval_report(y_true_seq, y_pred_seq, digits=4))


# =========================================================
# ERROR ANALYSIS
# =========================================================
def is_year(tok: str) -> bool:
    """Return True when `tok` is exactly four digits starting with 19 or 20."""
    return re.fullmatch(r"(19|20)\d{2}", tok) is not None

def has_disease_variant(tok: str) -> bool:
    """Return True when the lower-cased token contains "covid" or "sars"."""
    lowered = tok.lower()
    return any(marker in lowered for marker in ("covid", "sars"))

def is_time_piece(tok: str) -> bool:
    """
    Return True when `tok` looks like a fragment of a clock-time expression.

    A token qualifies when it is ":" or "h", one of the Vietnamese time words
    giờ (hour), sáng (morning), chiều (afternoon), tối (evening), đêm (night),
    or a number of one or two digits.
    """
    # The Vietnamese words are spelled with their real characters; a
    # mis-encoded spelling can never equal a token taken from the corpus.
    if tok in (":", "h", "giờ", "sáng", "chiều", "tối", "đêm"):
        return True
    return re.fullmatch(r"\d{1,2}", tok) is not None

def guess_error_type(tokens, t_true, t_pred, idx):
    """
    Assign a coarse, heuristic error category to one token.

    tokens: list[str] of the sentence.
    t_true / t_pred: prefix-free gold and predicted labels of token `idx`.
    Return: a category name; "FN_*" when an entity token was predicted "O",
            "FP_*" when an "O" token was predicted as an entity,
            "Confusion_*" when both are entities with different labels, and
            "Correct" when the labels agree.
    """
    tok = tokens[idx]
    gold_is_entity = t_true != "O"
    pred_is_entity = t_pred != "O"

    # False negative: the model missed an entity token.
    if gold_is_entity and not pred_is_entity:
        if t_true == "DATE" and is_time_piece(tok):
            return "FN_format_DATE"
        if t_true == "AGE" and is_year(tok):
            return "FN_format_AGE_year"
        if t_true == "SYMPTOM_AND_DISEASE" and has_disease_variant(tok):
            return "FN_normalization_disease_variant"
        # Short tokens inside these entity types are attributed to boundary errors.
        if t_true in ("ORGANIZATION", "LOCATION", "TRANSPORTATION") and len(tok) <= 4:
            return "FN_boundary_or_segmentation"
        return "FN_other"

    # False positive: the model tagged a token that carries no gold entity.
    if pred_is_entity and not gold_is_entity:
        if t_pred == "TRANSPORTATION" and re.search(r"\d", tok):
            return "FP_id_or_plate_like"
        return "FP_other"

    # Both sides are entities but the labels disagree.
    if t_true != t_pred:
        neighbour_pairs = (
            ("TRANSPORTATION", "LOCATION"),
            ("ORGANIZATION", "LOCATION"),
            ("LOCATION", "ORGANIZATION"),
        )
        if (t_true, t_pred) in neighbour_pairs:
            return "Confusion_LOC_ORG_TRANS"
        return "Confusion_other"

    return "Correct"


def build_error_analysis(tokens_list, true_clean_list, pred_clean_list, max_examples_per_type=8):
    """
    Summarise token-level disagreements between gold and predicted labels.

    tokens_list: list[list[str]], the words of every sentence.
    true_clean_list / pred_clean_list: prefix-free gold and predicted labels,
        parallel to `tokens_list`. Each sentence is compared only up to the
        shortest of its three sequences.
    max_examples_per_type: maximum number of examples stored per error type.

    Return: (confusion_df, err_type_df, fn_df, fp_df, examples)
        confusion_df: columns true / pred / count, one row per mismatching pair.
        err_type_df:  columns error_type / count, from guess_error_type.
        fn_df:        columns label / FN_count (gold entity predicted as "O").
        fp_df:        columns label / FP_count (gold "O" predicted as entity).
        examples:     defaultdict mapping error type -> list of example dicts.
        All frames are sorted by their count column in descending order and
        keep their columns even when there is nothing to report.
    """
    confusion = Counter()
    err_type_counter = Counter()
    fn_by_label = Counter()
    fp_by_label = Counter()
    examples = defaultdict(list)

    for toks, tclean, pclean in zip(tokens_list, true_clean_list, pred_clean_list):
        L = min(len(toks), len(tclean), len(pclean))
        for i in range(L):
            t = tclean[i]
            p = pclean[i]
            if t == p:
                continue

            confusion[(t, p)] += 1

            if t != "O" and p == "O":
                fn_by_label[t] += 1
            if t == "O" and p != "O":
                fp_by_label[p] += 1

            et = guess_error_type(toks, t, p, i)
            err_type_counter[et] += 1

            if len(examples[et]) < max_examples_per_type:
                examples[et].append({
                    "text": " ".join(toks),
                    "token": toks[i],
                    "true": t,
                    "pred": p,
                    "pos": i
                })

    # Columns are always declared explicitly: a frame built from an empty list
    # would otherwise have no "count" column and sort_values would raise
    # KeyError whenever the predictions contain no mismatch.
    confusion_df = pd.DataFrame(
        [{"true": pair[0], "pred": pair[1], "count": n} for pair, n in confusion.items()],
        columns=["true", "pred", "count"],
    ).sort_values("count", ascending=False)
    err_type_df = pd.DataFrame(
        [{"error_type": name, "count": n} for name, n in err_type_counter.items()],
        columns=["error_type", "count"],
    ).sort_values("count", ascending=False)
    fn_df = pd.DataFrame(
        list(fn_by_label.items()), columns=["label", "FN_count"]
    ).sort_values("FN_count", ascending=False)
    fp_df = pd.DataFrame(
        list(fp_by_label.items()), columns=["label", "FP_count"]
    ).sort_values("FP_count", ascending=False)

    return confusion_df, err_type_df, fn_df, fp_df, examples


confusion_df, err_type_df, fn_df, fp_df, err_examples = build_error_analysis(all_tokens, gold_clean, pred_clean)

# Every section is printed the same way: a ruled heading followed by the
# table without its index. Only the confusion table is limited to 25 rows.
_rule = "=" * 70
_sections = (
    ("ERROR ANALYSIS ‚Äî Top Confusion Pairs", confusion_df.head(25)),
    ("ERROR ANALYSIS ‚Äî Error Type Distribution", err_type_df),
    ("ERROR ANALYSIS ‚Äî False Negatives by Label (missed entities)", fn_df),
    ("ERROR ANALYSIS ‚Äî False Positives by Label (spurious entities)", fp_df),
)
for _heading, _table in _sections:
    print("\n" + _rule)
    print(_heading)
    print(_rule)
    print(_table.to_string(index=False))

2025-12-21 09:19:45.414004: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766308785.588241      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766308785.635087      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

gliner_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

üöÄ GLiNER inference:   0%|          | 0/3 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
üöÄ GLiNER inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:02<00:00,  1.31it/s]



üìä TOKEN-LEVEL CLASSIFICATION REPORT (clean labels, O removed)
                     precision    recall  f1-score   support

                AGE     0.5222    0.8393    0.6438        56
               DATE     0.8333    0.9223    0.8756       103
             GENDER     0.8214    0.7931    0.8070        29
                JOB     0.9455    0.2241    0.3624       232
           LOCATION     0.9229    0.5870    0.7176       632
               NAME     0.7349    0.8551    0.7905       214
       ORGANIZATION     0.4569    0.8971    0.6055       136
         PATIENT_ID     0.5854    0.4848    0.5304        99
SYMPTOM_AND_DISEASE     0.9444    0.6733    0.7861       101
     TRANSPORTATION     0.8543    0.5890    0.6973       219

          micro avg     0.7536    0.6249    0.6833      1821
          macro avg     0.7621    0.6865    0.6816      1821
       weighted avg     0.8245    0.6249    0.6718      1821


üìä ENTITY-LEVEL REPORT (seqeval, BIO)
                     precision    re

# Evaluate the standard NER model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

from seqeval.metrics import classification_report as seqeval_report
from sklearn.metrics import classification_report


# =========================================================
# CONFIG
# =========================================================
# NOTE: MODEL_PATH, DEVICE, BATCH_SIZE and `model` are rebound here. After this
# cell has run, gliner_batch_infer (defined in the GLiNER section) would read
# this token-classification model, so re-run the GLiNER config before reusing it.
MODEL_PATH = "nqdhocai/vihealthbert-ner-v1"  
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Number of sentences sent through the model per forward pass.
BATCH_SIZE = 128  

# =========================================================
# LOAD MODEL + TOKENIZER
# =========================================================
# A fast tokenizer is requested; whether one was actually obtained is read
# back into `is_fast` below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH).to(DEVICE)
# Switch to inference mode.
model.eval()

# Label vocabulary stored in the model config; used to decode predicted ids
# (and integer gold tags, if the dataset provides them).
id2label = model.config.id2label
label2id = model.config.label2id

# Selects the alignment strategy in predict_and_align_batch: word_ids() for a
# fast tokenizer, per-word sub-token counts otherwise.
is_fast = getattr(tokenizer, "is_fast", False)



# =========================================================
# normalize gold tags 
# =========================================================
def normalize_true_tags(example_tags, id2label):
    """
    Return the gold tags of one example as a list of tag strings.

    example_tags: list of tag strings or of integer tag ids.
    id2label: mapping from tag id to tag string, used only for integer tags.
    String tags are stripped of surrounding whitespace; ids are decoded
    through `id2label`. An empty input gives an empty list.
    """
    if not example_tags:
        return []

    # The first element decides how the whole sequence is interpreted.
    already_strings = isinstance(example_tags[0], str)
    if already_strings:
        return [tag.strip() for tag in example_tags]
    return [id2label[int(tag_id)] for tag_id in example_tags]


def bio_to_clean(tags_bio):
    """
    Drop the BIO prefix from every tag.

    tags_bio: iterable of tags such as "B-LOC", "I-LOC", "O" or None.
    Return: list[str] where "O" and None become "O", prefixed tags keep the
            part after the first hyphen, and other tags are kept unchanged.
    """
    def _strip_prefix(tag):
        if tag is None or tag == "O":
            return "O"
        return tag.split("-", 1)[1] if "-" in tag else tag

    return [_strip_prefix(tag) for tag in tags_bio]


# =========================================================
# ALIGNMENT CORE
# =========================================================
# word -> number of sub-tokens, filled lazily by word_to_subtoken_len.
_word_tokenlen_cache = {}

def word_to_subtoken_len(word: str) -> int:
    """
    Number of sub-tokens the tokenizer produces for a single word.

    Used by the slow-tokenizer alignment path. The word is encoded on its own
    without special tokens, and the result is memoised per word. A word that
    encodes to nothing is counted as one sub-token.
    """
    try:
        return _word_tokenlen_cache[word]
    except KeyError:
        pass

    input_ids = tokenizer(word, add_special_tokens=False).get("input_ids", [])
    # Never report zero, so that every word advances the alignment cursor.
    length = len(input_ids) or 1
    _word_tokenlen_cache[word] = length
    return length


def _align_with_word_ids(enc, pred_ids, n_sentences):
    """Fast-tokenizer alignment: label each word with the prediction at its first sub-token."""
    batch_preds = []
    for bi in range(n_sentences):
        prev_wid = None
        word_preds = []
        for ti, wid in enumerate(enc.word_ids(batch_index=bi)):
            # None marks positions that belong to no word.
            if wid is None:
                continue
            # Only the first sub-token of a word contributes a label.
            if wid != prev_wid:
                word_preds.append(id2label[int(pred_ids[bi][ti])])
                prev_wid = wid
        batch_preds.append(word_preds)
    return batch_preds


def _align_with_subtoken_counts(batch_words, input_ids, pred_ids):
    """Slow-tokenizer alignment: walk the non-special positions using per-word sub-token counts."""
    special_ids = set()
    for attr in ("cls_token_id", "sep_token_id", "pad_token_id"):
        value = getattr(tokenizer, attr, None)
        if value is not None:
            special_ids.add(int(value))

    batch_preds = []
    for bi, words in enumerate(batch_words):
        # Positions in the encoded row that hold real sub-tokens.
        valid_token_positions = [
            ti for ti, tid in enumerate(input_ids[bi]) if int(tid) not in special_ids
        ]

        lens = [word_to_subtoken_len(w) for w in words]
        # Never read past what the encoder produced or what the words account for.
        usable = min(len(valid_token_positions), sum(lens))

        word_preds = []
        token_cursor = 0
        for n_subtokens in lens:
            if token_cursor >= usable:
                break
            tok_pos = valid_token_positions[token_cursor]
            word_preds.append(id2label[int(pred_ids[bi][tok_pos])])
            token_cursor += n_subtokens

        batch_preds.append(word_preds)
    return batch_preds


@torch.no_grad()
def predict_and_align_batch(batch_words):
    """
    Predict one BIO tag per word for a batch of pre-split sentences.

    batch_words: list[list[str]]
    Return: list[list[str]] of predicted tags at word level. A returned list
            can be shorter than its sentence (for example after truncation);
            callers are expected to trim to the common length.

    Two alignment strategies are supported, chosen by the module-level
    `is_fast` flag:
      - fast tokenizer: the words are encoded as-is and word_ids() identifies
        the first sub-token of each word;
      - slow tokenizer: the space-joined sentence is encoded and the position
        of each word is derived from word_to_subtoken_len. This assumes that
        encoding the words one by one yields the same sub-token counts as
        encoding the joined sentence -- a heuristic, not a guarantee.
    """
    if is_fast:
        enc = tokenizer(
            batch_words,
            truncation=True,
            is_split_into_words=True,
            padding=True,
            return_tensors="pt"
        ).to(DEVICE)

        outputs = model(**enc)
        pred_ids = torch.argmax(outputs.logits, dim=-1).detach().cpu().numpy()
        return _align_with_word_ids(enc, pred_ids, len(batch_words))

    batch_texts = [" ".join(ws) for ws in batch_words]
    enc = tokenizer(
        batch_texts,
        truncation=True,
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model(**enc)
    pred_ids = torch.argmax(outputs.logits, dim=-1).detach().cpu().numpy()
    input_ids = enc["input_ids"].detach().cpu().numpy()
    return _align_with_subtoken_counts(batch_words, input_ids, pred_ids)


# =========================================================
# ERROR TAXONOMY
# =========================================================
def is_year(tok: str) -> bool:
    """Tell whether `tok` is a four-digit number beginning with 19 or 20."""
    matched = re.fullmatch(r"(19|20)\d{2}", tok)
    return matched is not None

def has_disease_variant(tok: str) -> bool:
    """Return True when the lower-cased token contains "covid" or "sars"."""
    t = tok.lower()
    # No separate "sarscov" test is needed: every token that contains
    # "sarscov" also contains "sars".
    return ("covid" in t) or ("sars" in t)

def is_time_piece(tok: str) -> bool:
    """
    Return True when `tok` looks like a fragment of a clock-time expression.

    A token qualifies when it is ":" or "h", one of the Vietnamese time words
    giờ (hour), sáng (morning), chiều (afternoon), tối (evening), đêm (night),
    phút (minute), or a number of one or two digits.
    """
    # The Vietnamese words are spelled with their real characters; a
    # mis-encoded spelling can never equal a token taken from the corpus.
    if tok in (":", "h", "giờ", "sáng", "chiều", "tối", "đêm", "phút"):
        return True
    return re.fullmatch(r"\d{1,2}", tok) is not None

def guess_error_type(tokens, true_clean, pred_clean, idx):
    """
    Assign a coarse, heuristic error category to one token.

    tokens: list[str] of the sentence.
    true_clean / pred_clean: prefix-free gold and predicted labels of token `idx`.
    Return: a category name; "FN_*" when an entity token was predicted "O",
            "FP_*" when an "O" token was predicted as an entity,
            "Confusion_*" when both are entities with different labels, and
            "Correct" when the labels agree.
    """
    tok = tokens[idx]
    gold_is_entity = true_clean != "O"
    pred_is_entity = pred_clean != "O"

    # False negative: the model missed an entity token.
    if gold_is_entity and not pred_is_entity:
        if true_clean == "DATE" and is_time_piece(tok):
            return "FN_format_DATE"
        if true_clean == "AGE" and is_year(tok):
            return "FN_format_AGE_year"
        if true_clean == "SYMPTOM_AND_DISEASE" and has_disease_variant(tok):
            return "FN_normalization_disease_variant"
        # Short tokens inside these entity types are attributed to boundary errors.
        short_token_types = ("ORGANIZATION", "LOCATION", "TRANSPORTATION", "NAME")
        if true_clean in short_token_types and len(tok) <= 4:
            return "FN_boundary_or_segmentation"
        return "FN_other"

    # False positive: the model tagged a token that carries no gold entity.
    if pred_is_entity and not gold_is_entity:
        numeric_types = ("DATE", "TRANSPORTATION", "PATIENT_ID", "AGE")
        if re.search(r"\d", tok) and pred_clean in numeric_types:
            return "FP_numeric_like"
        return "FP_other"

    # Both sides are entities but the labels disagree.
    if true_clean != pred_clean:
        neighbour_pairs = (
            ("TRANSPORTATION", "LOCATION"),
            ("ORGANIZATION", "LOCATION"),
            ("LOCATION", "ORGANIZATION"),
            ("NAME", "ORGANIZATION"),
            ("NAME", "LOCATION"),
        )
        if (true_clean, pred_clean) in neighbour_pairs:
            return "Confusion_semantic_neighbor"
        return "Confusion_other"

    return "Correct"


def build_error_analysis(tokens_list, true_clean_list, pred_clean_list, max_examples_per_type=8):
    """
    Summarise token-level disagreements between gold and predicted labels.

    tokens_list: list[list[str]], the words of every sentence.
    true_clean_list / pred_clean_list: prefix-free gold and predicted labels,
        parallel to `tokens_list`. Each sentence is compared only up to the
        shortest of its three sequences.
    max_examples_per_type: maximum number of examples stored per error type.

    Return: (confusion_df, err_type_df, fn_df, fp_df, examples)
        confusion_df: columns true / pred / count, one row per mismatching pair.
        err_type_df:  columns error_type / count, from guess_error_type.
        fn_df:        columns label / FN_count (gold entity predicted as "O").
        fp_df:        columns label / FP_count (gold "O" predicted as entity).
        examples:     defaultdict mapping error type -> list of example dicts.
        All frames are sorted by their count column in descending order and
        keep their columns even when there is nothing to report.
    """
    confusion = Counter()
    err_type_counter = Counter()
    fn_by_label = Counter()
    fp_by_label = Counter()
    examples = defaultdict(list)

    for toks, tclean, pclean in zip(tokens_list, true_clean_list, pred_clean_list):
        L = min(len(toks), len(tclean), len(pclean))
        for i in range(L):
            t = tclean[i]
            p = pclean[i]
            if t == p:
                continue

            confusion[(t, p)] += 1
            if t != "O" and p == "O":
                fn_by_label[t] += 1
            if t == "O" and p != "O":
                fp_by_label[p] += 1

            et = guess_error_type(toks, t, p, i)
            err_type_counter[et] += 1

            if len(examples[et]) < max_examples_per_type:
                examples[et].append({
                    "text": " ".join(toks),
                    "token": toks[i],
                    "true": t,
                    "pred": p,
                    "pos": i
                })

    # Columns are always declared explicitly: a frame built from an empty list
    # would otherwise have no "count" column and sort_values would raise
    # KeyError whenever the predictions contain no mismatch.
    confusion_df = pd.DataFrame(
        [{"true": pair[0], "pred": pair[1], "count": n} for pair, n in confusion.items()],
        columns=["true", "pred", "count"],
    ).sort_values("count", ascending=False)
    err_type_df = pd.DataFrame(
        [{"error_type": name, "count": n} for name, n in err_type_counter.items()],
        columns=["error_type", "count"],
    ).sort_values("count", ascending=False)
    fn_df = pd.DataFrame(
        list(fn_by_label.items()), columns=["label", "FN_count"]
    ).sort_values("FN_count", ascending=False)
    fp_df = pd.DataFrame(
        list(fp_by_label.items()), columns=["label", "FP_count"]
    ).sort_values("FP_count", ascending=False)

    return confusion_df, err_type_df, fn_df, fp_df, examples


# =========================================================
# RUN EVAL
# =========================================================
# Per-sentence results accumulated over the whole run.
y_true_bio_sent, y_pred_bio_sent = [], []                   # BIO tags, for seqeval
tokens_sent, true_clean_sent, pred_clean_sent = [], [], []  # prefix-free labels
error_samples = []                                          # at most 20 sentences with mismatches

# Sentences waiting to be sent through the model.
batch_words, batch_gold = [], []

def flush_batch():
    """
    Predict the pending batch, record its results and empty the pending lists.

    Reads and clears the module-level `batch_words` / `batch_gold` and appends
    to the module-level result lists. Does nothing when no sentence is pending.
    Words, gold tags and predictions of a sentence are trimmed to their common
    length, so tokens for which no prediction was returned are left out of
    the evaluation.
    """
    if not batch_words:
        return

    preds_bio = predict_and_align_batch(batch_words)

    for sent_words, sent_gold, sent_pred in zip(batch_words, batch_gold, preds_bio):
        L = min(len(sent_words), len(sent_gold), len(sent_pred))
        sent_words = sent_words[:L]
        sent_gold = sent_gold[:L]
        sent_pred = sent_pred[:L]

        y_true_bio_sent.append(sent_gold)
        y_pred_bio_sent.append(sent_pred)

        tc = bio_to_clean(sent_gold)
        pc = bio_to_clean(sent_pred)

        tokens_sent.append(sent_words)
        true_clean_sent.append(tc)
        pred_clean_sent.append(pc)

        mismatches = [
            {"token": w, "true": t, "pred": p}
            for w, t, p in zip(sent_words, tc, pc)
            if t != p
        ]
        if mismatches and len(error_samples) < 20:
            error_samples.append({"text": " ".join(sent_words), "errors": mismatches[:60]})

    # Empty the pending lists in place; no `global` rebinding is required.
    batch_words.clear()
    batch_gold.clear()


for ex in tqdm(test_data, desc="Loop"):
    # Values are appended directly rather than bound to module-level names
    # such as `words` or `gold_bio`, so the per-corpus `gold_bio` list built
    # in the GLiNER section is not overwritten by a single sentence.
    batch_words.append(ex["words"])
    # Integer gold tags, if any, are decoded with the model's own id2label --
    # this presumes the dataset and the model share one tag vocabulary.
    batch_gold.append(normalize_true_tags(ex["tags"], id2label))

    if len(batch_words) >= BATCH_SIZE:
        flush_batch()

# Process the final, possibly partial batch.
flush_batch()


# =========================================================
# REPORTS
# =========================================================
_rule = "=" * 70

print("\n" + _rule)
print("ENTITY-LEVEL REPORT")
print(_rule)
print(seqeval_report(y_true_bio_sent, y_pred_bio_sent, digits=4))

# Flatten the per-sentence prefix-free labels for the token-level report.
y_true_tok = []
for sent in true_clean_sent:
    y_true_tok.extend(sent)
y_pred_tok = []
for sent in pred_clean_sent:
    y_pred_tok.extend(sent)

# Report every label that occurs on either side, leaving out "O".
labels = sorted((set(y_true_tok) | set(y_pred_tok)) - {"O"})

print("\n" + _rule)
print("TOKEN-LEVEL REPORT")
print(_rule)
print(classification_report(y_true_tok, y_pred_tok, labels=labels, zero_division=0, digits=4))

confusion_df, err_type_df, fn_df, fp_df, err_examples = build_error_analysis(
    tokens_sent, true_clean_sent, pred_clean_sent
)

# Every section is printed the same way: a ruled heading followed by the
# table without its index. Only the confusion table is limited to 25 rows.
_sections = (
    ("ERROR ANALYSIS ‚Äî Top Confusion Pairs", confusion_df.head(25)),
    ("ERROR ANALYSIS ‚Äî Error Type Distribution", err_type_df),
    ("ERROR ANALYSIS ‚Äî False Negatives by Label", fn_df),
    ("ERROR ANALYSIS ‚Äî False Positives by Label", fp_df),
)
for _heading, _table in _sections:
    print("\n" + _rule)
    print(_heading)
    print(_rule)
    print(_table.to_string(index=False))


⚡ Loading model nqdhocai/vihealthbert-ner-v1 on cuda ...
✅ Tokenizer fast = False
✅ Loaded test_data: 150 samples
🚀 Evaluating on 150 samples ...


Loop:   0%|          | 0/150 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Loop: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 150/150 [00:00<00:00, 541.08it/s]


üìä ENTITY-LEVEL REPORT (seqeval, BIO)
                     precision    recall  f1-score   support

                AGE     0.9762    0.8542    0.9111        48
               DATE     0.9333    0.8750    0.9032        32
             GENDER     1.0000    0.9655    0.9825        29
                JOB     0.3214    0.1098    0.1636        82
           LOCATION     0.6417    0.8000    0.7122       150
               NAME     0.0529    0.1538    0.0787        65
       ORGANIZATION     0.6579    0.7143    0.6849        35
         PATIENT_ID     0.7895    0.3093    0.4444        97
SYMPTOM_AND_DISEASE     0.9767    0.8750    0.9231        48
     TRANSPORTATION     0.1059    0.1286    0.1161        70

          micro avg     0.4831    0.5213    0.5015       656
          macro avg     0.6456    0.5785    0.5920       656
       weighted avg     0.5879    0.5213    0.5275       656


üìä TOKEN-LEVEL REPORT (clean labels, O removed)
                     precision    recall  f1-score 




# Eval SLM