In [1]:
!pip install datasets



In [2]:
from random import shuffle
from math import ceil

import torch
import torch.nn as nn

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import datasets

from tqdm.auto import tqdm

from collections import defaultdict
from urllib import request
import json
import pandas as pd

In [3]:
def parse_conllu_using_pandas(block):
    records = []
    for line in block.splitlines():
        if not line.startswith('#'):
            records.append(line.strip().split('\t'))
    return pd.DataFrame.from_records(
        records,
        columns=['ID', 'FORM', 'TAG', 'Misc1', 'Misc2'])

In [4]:
def tokens_to_labels(df):
    return (
        df.FORM.tolist(),
        df.TAG.tolist()
    )

In [5]:
PREFIX = "https://raw.githubusercontent.com/UniversalNER/"
DATA_URLS = {
    "en_ewt": {
        "train": "UNER_English-EWT/master/en_ewt-ud-train.iob2",
        "dev": "UNER_English-EWT/master/en_ewt-ud-dev.iob2",
        "test": "UNER_English-EWT/master/en_ewt-ud-test.iob2"
    },
    "en_pud": {
        "test": "UNER_English-PUD/master/en_pud-ud-test.iob2"
    }
}

In [6]:
# en_ewt is the main train-dev-test split
# en_pud is the OOD test set
data_dict = defaultdict(dict)
for corpus, split_dict in DATA_URLS.items():
    for split, url_suffix in split_dict.items():
        url = PREFIX + url_suffix
        with request.urlopen(url) as response:
            txt = response.read().decode('utf-8')
            data_frames = map(parse_conllu_using_pandas,
                              txt.split('\n\n'))
            token_label_alignments = list(map(tokens_to_labels,
                                              data_frames))
            data_dict[corpus][split] = token_label_alignments

In [7]:
# Saving the data so that you don't have to redownload it each time.
with open('ner_data_dict.json', 'w', encoding='utf-8') as out:
    json.dump(data_dict, out, indent=2, ensure_ascii=False)


In [7]:
!pip install -U bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [7]:
!pip install -U bitsandbytes



In [8]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [9]:
from google.colab import userdata
access_token = userdata.get('HF_TOKEN')

model_id = "meta-llama/Llama-3.1-8B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto", token=access_token)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [10]:
def icl_input_formatter_factory(tokenizer, icl_examples):
    system_prompt = (
        "You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt "
        " Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O "
        f"The input examples are as follows: {icl_examples}"
    )
    def format_input(text):
        return tokenizer.apply_chat_template(
            [
                {
                    "role": "system",
                    "content":
                    [
                        {
                            # Can also be an image if the model is multimodal.
                            "type": "text",
                            "text": system_prompt
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"Input: {text}"
                        }
                    ]
                }
            ],
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        )
    return format_input

In [11]:
ner_tag_processor = icl_input_formatter_factory(
    tokenizer,
    '''Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.
Input: John is nice. Output: B-PER, O, O.
Input: Apple is huge. Output: B-ORG, O, O.
Input: Harry Kane is good. Output: B-PER, I-PER, O, O''')

In [12]:
ner_prompt = ner_tag_processor('New York is misty.')


In [14]:
inputs = {k: v.cuda() for k, v in ner_prompt.items()}

In [15]:
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=250, do_sample=True, top_k=3)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [17]:
print(tokenizer.decode(output[0], skip_special_tokens=True))

system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: Apple is huge. Output: B-ORG, O, O.\nInput: Harry Kane is good. Output: B-PER, I-PER, O, O'}]user

[{'type': 'text', 'text': 'Input: New York is misty.'}]assistant

B-LOC, I-LOC, O, O


In [16]:
!pip install seqeval pandas


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=9b89743efaf4923bfadb70e9b899914e2696e26700e43fb74093e215a7d4d294
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [24]:
def extract_labels_from_output(text, original_tokens):
    """
    Robust label extraction with:
    - Multiple output format handling
    - Subword token alignment
    - Fallback strategies
    """
    import re

    # 1. Try multiple extraction patterns
    patterns = [
        r'Output:\s*((?:[BILO]-[A-Z]+,?\s*)+)',  # Standard format
        r'Labels?:?\s*((?:[BILO]-[A-Z]+,?\s*)+)',  # Alternative formats
        r'\[([BILO]-[A-Z]+(?:,\s*[BILO]-[A-Z]+)*)\]'  # Bracket-enclosed
    ]

    raw_output = None
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            raw_output = match.group(1).strip()
            break

    # 2. Fallback if no patterns match
    if not raw_output:
        if 'O' in text:  # Last resort for malformed outputs
            return ['O'] * len(original_tokens)
        else:
            return ['O'] * len(original_tokens)

    # 3. Clean and validate tags
    valid_tags = {'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'O'}
    cleaned = []
    for tag in re.split(r'[,\s]+', raw_output):
        tag = tag.strip().upper()
        if tag in valid_tags:
            cleaned.append(tag)
        else:
            cleaned.append('O')  # Replace invalid tags with O

    # 4. Handle subword tokenization alignment
    if len(cleaned) != len(original_tokens):
        # Use word IDs for proper alignment if available
        encoded = tokenizer(original_tokens, is_split_into_words=True, return_offsets_mapping=True)
        word_ids = encoded.word_ids()

        aligned = []
        current_word = None
        for word_id in word_ids:
            if word_id is None:  # Special tokens
                continue
            if word_id != current_word:
                aligned.append(cleaned[word_id] if word_id < len(cleaned) else 'O')
                current_word = word_id
        cleaned = aligned[:len(original_tokens)]  # Final safety trim

    # 5. Length validation and padding
    if len(cleaned) > len(original_tokens):
        return cleaned[:len(original_tokens)]
    elif len(cleaned) < len(original_tokens):
        return cleaned + ['O'] * (len(original_tokens) - len(cleaned))

    return cleaned


In [22]:
from seqeval.metrics import classification_report
import numpy as np
import random
from collections import defaultdict

def bio_to_spans(labels):
    """Convert BIO tags to (start, end, label) spans"""
    spans = []
    current_label = None
    start_idx = -1

    for i, tag in enumerate(labels):
        if tag.startswith('B-'):
            if current_label is not None:
                spans.append((start_idx, i-1, current_label))
            current_label = tag[2:]
            start_idx = i
        elif tag.startswith('I-'):
            if current_label != tag[2:]:
                if current_label is not None:
                    spans.append((start_idx, i-1, current_label))
                current_label = tag[2:] if tag != 'O' else None
                start_idx = i if current_label else -1
        else:
            if current_label is not None:
                spans.append((start_idx, i-1, current_label))
                current_label = None
                start_idx = -1
    if current_label is not None:
        spans.append((start_idx, len(labels)-1, current_label))
    return spans

def calculate_span_metrics(true_spans, pred_spans):
    """Calculate precision/recall/F1 for spans"""
    # Labelled metrics (exact match)
    labelled_tp = len(set(true_spans) & set(pred_spans))
    labelled_fp = len(set(pred_spans) - set(true_spans))
    labelled_fn = len(set(true_spans) - set(pred_spans))

    # Unlabelled metrics (boundary only)
    true_boundaries = set((s,e) for s,e,_ in true_spans)
    pred_boundaries = set((s,e) for s,e,_ in pred_spans)
    unlabelled_tp = len(true_boundaries & pred_boundaries)
    unlabelled_fp = len(pred_boundaries - true_boundaries)
    unlabelled_fn = len(true_boundaries - pred_boundaries)

    def safe_divide(numerator, denominator):
        return numerator / denominator if denominator else 0

    return {
        'labelled': {
            'precision': safe_divide(labelled_tp, labelled_tp + labelled_fp),
            'recall': safe_divide(labelled_tp, labelled_tp + labelled_fn),
            'f1': safe_divide(2 * labelled_tp, (labelled_tp + labelled_fp + labelled_tp + labelled_fn))
        },
        'unlabelled': {
            'precision': safe_divide(unlabelled_tp, unlabelled_tp + unlabelled_fp),
            'recall': safe_divide(unlabelled_tp, unlabelled_tp + unlabelled_fn),
            'f1': safe_divide(2 * unlabelled_tp, (unlabelled_tp + unlabelled_fp + unlabelled_tp + unlabelled_fn))
        }
    }

def evaluate_model(test_data, num_examples=100, debug_limit=3):
    """
    Evaluates the model with span-level metrics added.

    Returns:
        Dictionary with:
        - token_level: seqeval classification report
        - span_level: precision/recall/f1 for labelled and unlabelled spans
        - macro_f1: average across entity types
    """
    np.random.seed(42)
    torch.manual_seed(42)

    eval_data = random.sample(list(test_data), min(num_examples, len(test_data)))

    all_true = []
    all_pred = []
    all_true_spans = []
    all_pred_spans = []

    for i, (tokens, true_labels) in enumerate(tqdm(eval_data, desc="Evaluating")):
        text_input = " ".join(tokens)

        try:
            # Format input
            inputs = ner_tag_processor(text_input)
            inputs = {k: v.cuda() for k, v in inputs.items()}

            # Generate
            with torch.inference_mode():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=len(tokens) * 3,
                    do_sample=False,
                    temperature=0.01
                )

            # Decode
            decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
            pred_labels = extract_labels_from_output(decoded, tokens)

            # Align lengths
            if len(pred_labels) != len(true_labels):
                pred_labels = pred_labels[:len(true_labels)]
                if len(pred_labels) < len(true_labels):
                    pred_labels += ['O'] * (len(true_labels) - len(pred_labels))

            # Store for metrics
            all_true.append(true_labels)
            all_pred.append(pred_labels)

            # Convert to spans
            true_spans = bio_to_spans(true_labels)
            pred_spans = bio_to_spans(pred_labels)

            all_true_spans.extend(true_spans)
            all_pred_spans.extend(pred_spans)

            # Debug output
            if i < debug_limit:
                print("\n=== DEBUG SAMPLE ===")
                print("Sentence       :", text_input)
                print("True Labels    :", true_labels)
                print("Predicted Labels:", pred_labels)
                print("Decoded Output :", decoded)
                print("True Spans     :", true_spans)
                print("Predicted Spans:", pred_spans)

        except Exception as e:
            print(f"Error processing example {i}: {e}")
            continue

    # Calculate metrics
    token_report = classification_report(all_true, all_pred, output_dict=True, zero_division=0)
    span_metrics = calculate_span_metrics(all_true_spans, all_pred_spans)

    return {
        'token_level': token_report,
        'span_level': span_metrics,
        'macro_f1': token_report['macro avg']['f1-score']
    }

# Rest of your existing code remains unchanged...


In [25]:
results = evaluate_model(test_data)
print("Token-Level F1:", results['token_level']['macro avg']['f1-score'])
print("Labelled Span F1:", results['span_level']['labelled']['f1'])
print("Unlabelled Span F1:", results['span_level']['unlabelled']['f1'])


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : Thanks -
True Labels    : ['O', 'O']
Predicted Labels: ['B-LOC', 'I-LOC']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: Apple is huge. Output: B-ORG, O, O.\nInput: Harry Kane is good. Output: B-PER, I-PER, O, O'}]user

[{'type': 'text', 'text': 'Input: Thanks -'}]assistant

B-PER, O
True Spans     : []
Predicted Spans: [(0, 1, 'LOC')]


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : Best regards ,
True Labels    : ['O', 'O', 'O']
Predicted Labels: ['B-LOC', 'I-LOC', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: Apple is huge. Output: B-ORG, O, O.\nInput: Harry Kane is good. Output: B-PER, I-PER, O, O'}]user

[{'type': 'text', 'text': 'Input: Best regards,'}]assistant

B-PER, O
True Spans     : []
Predicted Spans: [(0, 1, 'LOC')]


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : the bast cab in minneapolis
True Labels    : ['O', 'O', 'O', 'O', 'B-LOC']
Predicted Labels: ['B-LOC', 'I-LOC', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: Apple is huge. Output: B-ORG, O, O.\nInput: Harry Kane is good. Output: B-PER, I-PER, O, O'}]user

[{'type': 'text', 'text': 'Input: the bast cab in minneapolis'}]assistant

B-LOC, I-LOC, O, O
True Spans     : [(4, 4, 'LOC')]
Predicted Spans: [(0, 1, 'LOC')]


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Token-Level F1: 0.0
Labelled Span F1: 0.04
Unlabelled Span F1: 0.125


In [26]:
test_data = data_dict['en_pud']['test']
results = evaluate_model(test_data)
print("Token-Level F1:", results['token_level']['macro avg']['f1-score'])
print("Labelled Span F1:", results['span_level']['labelled']['f1'])
print("Unlabelled Span F1:", results['span_level']['unlabelled']['f1'])

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : He then returned to Kirriemuir .
True Labels    : ['O', 'O', 'O', 'O', 'B-LOC', 'O']
Predicted Labels: ['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: Apple is huge. Output: B-ORG, O, O.\nInput: Harry Kane is good. Output: B-PER, I-PER, O, O'}]user

[{'type': 'text', 'text': 'Input: He then returned to Kirriemuir.'}]assistant

B-PER, O, O, B-LOC
True Spans     : [(4, 4, 'LOC')]
Predicted Spans: [(0, 1, 'LOC')]


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : In the corner is a girl in a headscarf and jeans who looks so unassuming I think it ’s another assistant .
True Labels    : ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: Apple is huge. Output: B-ORG, O, O.\nInput: Harry Kane is good. Output: B-PER, I-PER, O, O'}]user

[{'type': 'text',

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : A third - party majority is needed , specifically the votes of 367 MPs ( out of 550 ) , whereas 330 votes are required in order to trigger a referendum .
True Labels    : ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Token-Level F1: 0.0
Labelled Span F1: 0.0
Unlabelled Span F1: 0.037037037037037035


In [18]:
from seqeval.metrics import classification_report, f1_score
import numpy as np
import random

def evaluate_model(test_data, num_examples=100, debug_limit=3):
    """
    Evaluates the model with span-level metrics added.

    Returns:
        Dictionary with:
        - token_level: seqeval classification report
        - span_level: precision/recall/f1 for labelled and unlabelled spans
        - macro_f1: average across entity types
    """
    np.random.seed(42)
    torch.manual_seed(42)

    eval_data = random.sample(list(test_data), min(num_examples, len(test_data)))

    # Initialize collections
    all_true = []
    all_pred = []
    all_true_spans = []
    all_pred_spans = []
    all_true_unlabelled = []
    all_pred_unlabelled = []

    for i, (tokens, true_labels) in enumerate(tqdm(eval_data, desc="Evaluating")):
        text_input = " ".join(tokens)

        try:
            # ... [existing processing code] ...

            # After getting pred_labels:
            all_true.append(true_labels)
            all_pred.append(pred_labels)

            # Extract spans for metrics
            true_spans = extract_spans(true_labels)
            pred_spans = extract_spans(pred_labels)

            all_true_spans.append(true_spans)
            all_pred_spans.append(pred_spans)

            # Unlabelled spans (positions only)
            all_true_unlabelled.append([(s,e) for s,e,_ in true_spans])
            all_pred_unlabelled.append([(s,e) for s,e,_ in pred_spans])

            # ... [existing debug code] ...

        except Exception as e:
            print(f"Error processing example {i}: {e}")
            continue

    # Calculate metrics
    token_report = classification_report(all_true, all_pred, output_dict=True, zero_division=0)

    return {
        'token_level': token_report,
        'span_level': {
            'labelled': calculate_span_metrics(all_true_spans, all_pred_spans),
            'unlabelled': calculate_span_metrics(all_true_unlabelled, all_pred_unlabelled)
        },
        'macro_f1': token_report['macro avg']['f1-score']
    }

def calculate_span_metrics(true, pred):
    """Calculate precision/recall/f1 for spans"""
    # Flatten to (doc_idx, start, end, *label) tuples
    true_set = set((i, s, e) + (tuple(lbl.split(',')) if isinstance(lbl, list) else (lbl,))
                   for i, spans in enumerate(true)
                   for span in spans
                   for s,e,*lbl in [span])

    pred_set = set((i, s, e) + (tuple(lbl.split(',')) if isinstance(lbl, list) else (lbl,))
                   for i, spans in enumerate(pred)
                   for span in spans
                   for s,e,*lbl in [span])

    tp = len(true_set & pred_set)
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {'precision': precision, 'recall': recall, 'f1': f1}

def extract_spans(tags):
    """Convert BIO tags to (start, end, label) spans"""
    spans = []
    current_start = None
    current_label = None

    for i, tag in enumerate(tags):
        if tag.startswith('B-'):
            if current_start is not None:
                spans.append((current_start, i-1, current_label))
            current_start = i
            current_label = tag[2:]  # Strip B- prefix
        elif tag.startswith('I-'):
            if current_label != tag[2:]:
                if current_start is not None:
                    spans.append((current_start, i-1, current_label))
                current_start = i
                current_label = tag[2:]
        else:
            if current_start is not None:
                spans.append((current_start, i-1, current_label))
                current_start = None
                current_label = None

    if current_start is not None:
        spans.append((current_start, len(tags)-1, current_label))

    return spans


In [20]:
test_data = data_dict['en_ewt']['test']

In [21]:
results = evaluate_model(test_data)
print("Token-Level Metrics:", results['token_level'])
print("Labelled Span F1:", results['span_level']['labelled']['f1'])
print("Macro F1:", results['macro_f1'])


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

Error processing example 0: name 'pred_labels' is not defined
Error processing example 1: name 'pred_labels' is not defined
Error processing example 2: name 'pred_labels' is not defined
Error processing example 3: name 'pred_labels' is not defined
Error processing example 4: name 'pred_labels' is not defined
Error processing example 5: name 'pred_labels' is not defined
Error processing example 6: name 'pred_labels' is not defined
Error processing example 7: name 'pred_labels' is not defined
Error processing example 8: name 'pred_labels' is not defined
Error processing example 9: name 'pred_labels' is not defined
Error processing example 10: name 'pred_labels' is not defined
Error processing example 11: name 'pred_labels' is not defined
Error processing example 12: name 'pred_labels' is not defined
Error processing example 13: name 'pred_labels' is not defined
Error processing example 14: name 'pred_labels' is not defined
Error processing example 15: name 'pred_labels' is not defined
Er

ValueError: Found input variables with inconsistent numbers of samples:
[32, 13, 9, 6, 19, 9, 40, 1, 21, 9, 8, 46, 2, 3, 1, 13, 5, 9, 17, 13, 21, 27, 1, 17, 3, 7, 5, 42, 4, 5, 21, 49, 15, 12, 7, 3, 2, 11, 1, 15, 17, 11, 6, 2, 5, 18, 11, 24, 18, 6, 7, 2, 3, 6, 7, 16, 8, 3, 35, 8, 3, 5, 13, 1, 9, 9, 13, 30, 24, 1, 9, 14, 13, 41, 3, 15, 1, 7, 15, 3, 22, 7, 10, 23, 8, 7, 4, 4, 6, 1, 20, 34, 26, 8, 11, 10, 7, 2, 10, 15]
[]

In [44]:
test_string = "Input: London is foggy. Output: B-LOC, O, O."
print(extract_labels_from_output(test_string, ["London", "is", "foggy"]))


['B-LOC', 'O', 'O']


In [45]:
from seqeval.metrics import classification_report
import numpy as np
import random

def evaluate_model(test_data, num_examples=100, debug_limit=3):
    """
    Evaluates the model on a given test set.

    Args:
        test_data: List of (tokens, labels) pairs
        num_examples: Number of examples to evaluate
        debug_limit: Number of examples to log for debugging

    Returns:
        SeqEval classification report string
    """
    np.random.seed(42)
    torch.manual_seed(42)

    eval_data = random.sample(list(test_data), min(num_examples, len(test_data)))

    all_true = []
    all_pred = []

    for i, (tokens, true_labels) in enumerate(tqdm(eval_data, desc="Evaluating")):
        text_input = " ".join(tokens)

        try:
            # Format input
            inputs = ner_tag_processor(text_input)
            inputs = {k: v.cuda() for k, v in inputs.items()}

            # Generate
            with torch.inference_mode():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=len(tokens) * 3,
                    do_sample=False,
                    temperature=0.01
                )

            # Decode
            decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
            pred_labels = extract_labels_from_output(decoded, tokens)

            # Align lengths
            if len(pred_labels) != len(true_labels):
                pred_labels = pred_labels[:len(true_labels)]
                if len(pred_labels) < len(true_labels):
                    pred_labels += ['O'] * (len(true_labels) - len(pred_labels))

            all_true.append(true_labels)
            all_pred.append(pred_labels)

            # Debug output
            if i < debug_limit:
                print("\n=== DEBUG SAMPLE ===")
                print("Sentence       :", text_input)
                print("True Labels    :", true_labels)
                print("Predicted Labels:", pred_labels)
                print("Decoded Output :", decoded)

        except Exception as e:
            print(f"Error processing example {i}: {e}")
            continue

    return classification_report(all_true, all_pred, zero_division=0)


def extract_labels_from_output(text, original_tokens):
    """
    Extracts BIO labels from model output text, robust to formatting issues.

    Args:
        text: raw decoded string from model
        original_tokens: list of original sentence tokens

    Returns:
        List of predicted labels, one per token
    """
    import re

    match = re.search(r'Output:\s*(.*?)($|Input:)', text, re.DOTALL)
    if not match:
        return ['O'] * len(original_tokens)

    raw_output = match.group(1)
    labels = re.split(r'[,\s]+', raw_output.strip())

    valid_tags = {'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'O'}
    cleaned = [l if l in valid_tags else 'O' for l in labels]

    # Fix length
    if len(cleaned) > len(original_tokens):
        cleaned = cleaned[:len(original_tokens)]
    elif len(cleaned) < len(original_tokens):
        cleaned += ['O'] * (len(original_tokens) - len(cleaned))

    return cleaned


In [46]:
test_set = data_dict['en_ewt']['test']
print(evaluate_model(test_set, num_examples=50))  # Try more if it works


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : In Wei 's art , a logician could see mathematical logic , a cellist could see a ripple of nine bass notes , a country girl could see straw , and Harry Potter could see two accompanied green blazes .
True Labels    : ['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER,

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : Cossor Ali , 24 , London E17
True Labels    : ['B-PER', 'I-PER', 'O', 'O', 'O', 'B-LOC', 'O']
Predicted Labels: ['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: Apple is huge. Output: B-ORG, O, O.\nInput: Harry Kane is good. Output: B-PER, I-PER, O, O'}]user

[{'type': 'text', 'text': 'Input: Cossor Ali, 24, London E17'}]assistant

B-PER, O, O, B-LOC, I-LOC


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : Job ID : J12746KM
True Labels    : ['O', 'O', 'O', 'O']
Predicted Labels: ['B-LOC', 'I-LOC', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: Apple is huge. Output: B-ORG, O, O.\nInput: Harry Kane is good. Output: B-PER, I-PER, O, O'}]user

[{'type': 'text', 'text': 'Input: Job ID : J12746KM'}]assistant

B-LOC, O, O, O, O,


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

              precision    recall  f1-score   support

         LOC       0.00      0.00      0.00        11
         ORG       0.00      0.00      0.00         7
         PER       0.00      0.00      0.00         9

   micro avg       0.00      0.00      0.00        27
   macro avg       0.00      0.00      0.00        27
weighted avg       0.00      0.00      0.00        27



In [50]:
test_set = data_dict['en_pud']['test']
print(evaluate_model(test_set, num_examples=50))  # Try more if it works


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : It included three tracks cowritten with Rafferty 's brother Jim , also a singer - songwriter , who had been signed to Decca Records in the 1970s .
True Labels    : ['O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O']
Predicted Labels: ['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : However this was reversed when Julian was killed in battle in 363 .
True Labels    : ['O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: Apple is huge. Output: B-ORG, O, O.\nInput: Harry Kane is good. Output: B-PER, I-PER, O, O'}]user

[{'type': 'text', 'text': 'Input: However this was reversed when Julian was killed in battle in 363.'}]assistant

B-LOC, O, O, O, O, O, O, O, 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : Golden Age Spanish or Early Modern Spanish is the variant of the language that constitutes the transition from Medieval Spanish to Modern Spanish .
True Labels    : ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, O The input examples are as follows: Input: New York is beautiful. Output: B-LOC, I-LOC, O, O.\nInput: John is nice. Output: B-PER, O, O.\nInput: Apple is huge. Output: B-ORG, O, O.\nInput: Harry Kane 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Error processing example 26: `max_new_tokens` must be greater than 0, but is 0.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

              precision    recall  f1-score   support

         LOC       0.02      0.06      0.03        18
         ORG       0.00      0.00      0.00         8
         PER       0.00      0.00      0.00        22

   micro avg       0.02      0.02      0.02        48
   macro avg       0.01      0.02      0.01        48
weighted avg       0.01      0.02      0.01        48

