In [1]:
!pip install datasets



In [2]:
from random import shuffle
from math import ceil

import torch
import torch.nn as nn

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import datasets

from tqdm.auto import tqdm

from collections import defaultdict
from urllib import request
import json
import pandas as pd

In [3]:
def parse_conllu_using_pandas(block):
    records = []
    for line in block.splitlines():
        if not line.startswith('#'):
            records.append(line.strip().split('\t'))
    return pd.DataFrame.from_records(
        records,
        columns=['ID', 'FORM', 'TAG', 'Misc1', 'Misc2'])

In [4]:
def tokens_to_labels(df):
    return (
        df.FORM.tolist(),
        df.TAG.tolist()
    )

In [6]:
PREFIX = "https://raw.githubusercontent.com/UniversalNER/"
DATA_URLS = {
    "en_ewt": {
        "train": "UNER_English-EWT/master/en_ewt-ud-train.iob2",
        "dev": "UNER_English-EWT/master/en_ewt-ud-dev.iob2",
        "test": "UNER_English-EWT/master/en_ewt-ud-test.iob2"
    },
    "en_pud": {
        "test": "UNER_English-PUD/master/en_pud-ud-test.iob2"
    }
}

In [7]:
# en_ewt is the main train-dev-test split
# en_pud is the OOD test set
data_dict = defaultdict(dict)
for corpus, split_dict in DATA_URLS.items():
    for split, url_suffix in split_dict.items():
        url = PREFIX + url_suffix
        with request.urlopen(url) as response:
            txt = response.read().decode('utf-8')
            data_frames = map(parse_conllu_using_pandas,
                              txt.split('\n\n'))
            token_label_alignments = list(map(tokens_to_labels,
                                              data_frames))
            data_dict[corpus][split] = token_label_alignments

In [8]:
# Saving the data so that you don't have to redownload it each time.
with open('ner_data_dict.json', 'w', encoding='utf-8') as out:
    json.dump(data_dict, out, indent=2, ensure_ascii=False)


In [14]:
!pip install -U bitsandbytes accelerate



In [9]:
!pip install -U bitsandbytes



In [10]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [11]:
from google.colab import userdata
access_token = userdata.get('HF_TOKEN')

model_id = "meta-llama/Llama-3.1-8B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto", token=access_token)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [13]:
def icl_input_formatter_factory(tokenizer, icl_examples):
    system_prompt = (
        "You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt "
        " Do not give an explaination. Valid tags: B, I, O "
        f"The input examples are as follows: {icl_examples}"
    )
    def format_input(text):
        return tokenizer.apply_chat_template(
            [
                {
                    "role": "system",
                    "content":
                    [
                        {
                            # Can also be an image if the model is multimodal.
                            "type": "text",
                            "text": system_prompt
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"Input: {text}"
                        }
                    ]
                }
            ],
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        )
    return format_input

In [14]:
ner_tag_processor = icl_input_formatter_factory(
    tokenizer,
    '''Input: New York is beautiful. Output: B, I, O, O.
Input: John is nice. Output: B, O, O.
Input: Apple is huge. Output: B, O, O.
Input: Harry Kane is good. Output: B, I, O, O''')

In [15]:
ner_prompt = ner_tag_processor('New York is misty.')


In [16]:
inputs = {k: v.cuda() for k, v in ner_prompt.items()}

In [17]:
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=250, do_sample=True, top_k=3)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [18]:
print(tokenizer.decode(output[0], skip_special_tokens=True))

system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B, I, O The input examples are as follows: Input: New York is beautiful. Output: B, I, O, O.\nInput: John is nice. Output: B, O, O.\nInput: Apple is huge. Output: B, O, O.\nInput: Harry Kane is good. Output: B, I, O, O'}]user

[{'type': 'text', 'text': 'Input: New York is misty.'}]assistant

B, O, I, O


In [19]:
!pip install seqeval pandas


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=869d47133b5207beb27f7e56bff1273ad06991fba295e44e926865481b6062b1
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [20]:
from seqeval.metrics import classification_report
import numpy as np
import random

def evaluate_model(test_data, num_examples=100):
    # Set random seed for reproducibility
    np.random.seed(42)
    torch.manual_seed(42)

    # Randomly sample evaluation examples if needed
    eval_data = random.sample(list(test_data), min(num_examples, len(test_data)))

    all_true = []
    all_pred = []

    for sentence_data in tqdm(eval_data, desc="Evaluating"):
        tokens, true_labels = sentence_data
        text_input = " ".join(tokens)

        # Generate prediction
        inputs = ner_tag_processor(text_input)
        inputs = {k: v.cuda() for k, v in inputs.items()}

        with torch.inference_mode():
            outputs = model.generate(**inputs,
                                   max_new_tokens=len(tokens)*3,  # Allow 3x token count for safety
                                   do_sample=False,  # Better reproducibility
                                   temperature=0.01)

        # Decode and clean output
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred_labels = extract_labels_from_output(decoded, tokens)

        # Handle mismatched lengths
        if len(pred_labels) != len(true_labels):
            pred_labels = pred_labels[:len(true_labels)]  # Truncate if too long
            if len(pred_labels) < len(true_labels):  # Pad with O if too short
                pred_labels += ['O']*(len(true_labels)-len(pred_labels))

        all_true.append(true_labels)
        all_pred.append(pred_labels)

    # Generate evaluation report
    return classification_report(all_true, all_pred, zero_division=0)
    print("\n=== DEBUG SAMPLE ===")
    print("Input Sentence: ", text_input)
    print("True Labels   : ", true_labels)
    print("Decoded Output: ", decoded)
    print("Pred Labels   : ", pred_labels)


def extract_labels_from_output(text, original_tokens):
    """Extract labels from model output with validation"""
    # Find the output section
    output_start = text.find("Output:") + len("Output:")
    output_text = text[output_start:].split("Input:")[0].strip()

    # Split and clean labels
    labels = [l.strip() for l in output_text.split(",")]

    # Validate labels
    valid_tags = {'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'O'}
    return [l if l in valid_tags else 'O' for l in labels]

# Usage with your data_dict
test_set = data_dict['en_ewt']['test'][:10]
evaluate_model(test_set, num_examples=50)  # Start with 50 examples for quick test


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [None]:
test_string = "Input: London is foggy. Output: B-LOC, O, O."
print(extract_labels_from_output(test_string, ["London", "is", "foggy"]))


['B-LOC', 'O', 'O']


In [21]:
from seqeval.metrics import classification_report
import numpy as np
import random

def evaluate_model(test_data, num_examples=100, debug_limit=3):
    """
    Evaluates the model on a given test set.

    Args:
        test_data: List of (tokens, labels) pairs
        num_examples: Number of examples to evaluate
        debug_limit: Number of examples to log for debugging

    Returns:
        SeqEval classification report string
    """
    np.random.seed(42)
    torch.manual_seed(42)

    eval_data = random.sample(list(test_data), min(num_examples, len(test_data)))

    all_true = []
    all_pred = []

    for i, (tokens, true_labels) in enumerate(tqdm(eval_data, desc="Evaluating")):
        text_input = " ".join(tokens)

        try:
            # Format input
            inputs = ner_tag_processor(text_input)
            inputs = {k: v.cuda() for k, v in inputs.items()}

            # Generate
            with torch.inference_mode():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=len(tokens) * 3,
                    do_sample=False,
                    temperature=0.01
                )

            # Decode
            decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
            pred_labels = extract_labels_from_output(decoded, tokens)

            # Align lengths
            if len(pred_labels) != len(true_labels):
                pred_labels = pred_labels[:len(true_labels)]
                if len(pred_labels) < len(true_labels):
                    pred_labels += ['O'] * (len(true_labels) - len(pred_labels))

            all_true.append(true_labels)
            all_pred.append(pred_labels)

            # Debug output
            if i < debug_limit:
                print("\n=== DEBUG SAMPLE ===")
                print("Sentence       :", text_input)
                print("True Labels    :", true_labels)
                print("Predicted Labels:", pred_labels)
                print("Decoded Output :", decoded)

        except Exception as e:
            print(f"Error processing example {i}: {e}")
            continue

    return classification_report(all_true, all_pred, zero_division=0)


def extract_labels_from_output(text, original_tokens):
    """
    Extracts BIO labels from model output text, robust to formatting issues.

    Args:
        text: raw decoded string from model
        original_tokens: list of original sentence tokens

    Returns:
        List of predicted labels, one per token
    """
    import re

    match = re.search(r'Output:\s*(.*?)($|Input:)', text, re.DOTALL)
    if not match:
        return ['O'] * len(original_tokens)

    raw_output = match.group(1)
    labels = re.split(r'[,\s]+', raw_output.strip())

    valid_tags = {'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'O'}
    cleaned = [l if l in valid_tags else 'O' for l in labels]

    # Fix length
    if len(cleaned) > len(original_tokens):
        cleaned = cleaned[:len(original_tokens)]
    elif len(cleaned) < len(original_tokens):
        cleaned += ['O'] * (len(original_tokens) - len(cleaned))

    return cleaned


In [22]:
test_set = data_dict['en_ewt']['test']
print(evaluate_model(test_set, num_examples=50))  # Try more if it works


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : i do n't think so .
True Labels    : ['O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B, I, O The input examples are as follows: Input: New York is beautiful. Output: B, I, O, O.\nInput: John is nice. Output: B, O, O.\nInput: Apple is huge. Output: B, O, O.\nInput: Harry Kane is good. Output: B, I, O, O'}]user

[{'type': 'text', 'text': "Input: i don't think so."}]assistant

B, O, O, O, O, O.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : very reasonable prices .
True Labels    : ['O', 'O', 'O', 'O']
Predicted Labels: ['O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B, I, O The input examples are as follows: Input: New York is beautiful. Output: B, I, O, O.\nInput: John is nice. Output: B, O, O.\nInput: Apple is huge. Output: B, O, O.\nInput: Harry Kane is good. Output: B, I, O, O'}]user

[{'type': 'text', 'text': 'Input: very reasonable prices.'}]assistant

B, I, O, O


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : John Key , and no
True Labels    : ['B-PER', 'I-PER', 'O', 'O', 'O']
Predicted Labels: ['O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B, I, O The input examples are as follows: Input: New York is beautiful. Output: B, I, O, O.\nInput: John is nice. Output: B, O, O.\nInput: Apple is huge. Output: B, O, O.\nInput: Harry Kane is good. Output: B, I, O, O'}]user

[{'type': 'text', 'text': 'Input: John Key, and no'}]assistant

B, O, O, O


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

              precision    recall  f1-score   support

         LOC       0.00      0.00      0.00        11
         ORG       0.00      0.00      0.00         5
         PER       0.00      0.00      0.00         9

   micro avg       0.00      0.00      0.00        25
   macro avg       0.00      0.00      0.00        25
weighted avg       0.00      0.00      0.00        25



In [23]:
test_set = data_dict['en_pud']['test']
print(evaluate_model(test_set, num_examples=50))  # Try more if it works


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : She has also been charged with trying to kill her two - year - old daughter .
True Labels    : ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B, I, O The input examples are as follows: Input: New York is beautiful. Output: B, I, O, O.\nInput: John is nice. Output: B, O, O.\nInput: Apple is huge. Output: B, O, O.\nInput: Harry Kane is good. Output: B, I, O, O'}]user

[{'type': 'text', 'text': 'Input: She has also been charged with trying to kill her two - year - old daughter.'}]assistant

B, O, O, O, O, O, O, O, O, O, O, O, O, O

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : The last Olympic Games are believed to have been held in 393 .
True Labels    : ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B, I, O The input examples are as follows: Input: New York is beautiful. Output: B, I, O, O.\nInput: John is nice. Output: B, O, O.\nInput: Apple is huge. Output: B, O, O.\nInput: Harry Kane is good. Output: B, I, O, O'}]user

[{'type': 'text', 'text': 'Input: The last Olympic Games are believed to have been held in 393.'}]assistant

B, O, O, O, O, O, O, O, O


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



=== DEBUG SAMPLE ===
Sentence       : Using an original technique , Shen successfully dredged the canal and demonstrated the formidable value of the silt gathered as a fertilizer .
True Labels    : ['O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Decoded Output : system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

[{'type': 'text', 'text': 'You are an expert NER annotator. Return ONLY comma-separated BIO tags for each token. Tag only the tokens present in the prompt  Do not give an explaination. Valid tags: B, I, O The input examples are as follows: Input: New York is beautiful. Output: B, I, O, O.\nInput: John is nice. Output: B, O, O.\nInput: Apple is huge. Output: B, O, O.\nInput: Harry Kane is good. Output: B, I, O, O'}]user

[{'type': 'text', 'text': 'Input: U

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

              precision    recall  f1-score   support

         LOC       0.00      0.00      0.00        28
         ORG       0.00      0.00      0.00         3
         PER       0.00      0.00      0.00        13

   micro avg       0.00      0.00      0.00        44
   macro avg       0.00      0.00      0.00        44
weighted avg       0.00      0.00      0.00        44

