### Set Up

In [7]:
import pandas as pd
import json

with open("../week-41/test_question.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

df_test = pd.DataFrame(test_data["questions"])

In [8]:
# other imports
import nltk
nltk.download('punkt_tab')      
nltk.download('wordnet')    
nltk.download('omw-1.4') 
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification, AutoModelForSequenceClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorForSeq2Seq
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter, defaultdict
from datasets import Dataset, load_dataset
import torch
import re
from tqdm import tqdm
import numpy as np

import torch.nn.functional as F

import os, gc
import evaluate

from evaluate import load
from peft import LoraConfig, get_peft_model, TaskType

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sarene/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sarene/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/sarene/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/sarene/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [9]:
device = "mps"
# device = "cpu"

lang_codes = {
    "pt": "por_Latn",
    "hi": "hin_Deva",
    "ja": "jpn_Jpan"
}

#### System instructions 

Objective: You are tasked with creating a
dataset containing 10 items. Each item must ad-
here to the following specifications:

• Question: the question text written in the
specified language.

• Answer: provided in both the language of
the question and in English.

• Context Paragraph (English): a short
English paragraph that provides background
information relevant to the question.
Content Requirements:

• The dataset must include a mix of easy, hard,
and unanswerable questions.

• Do not repeat context paragraphs.

>Formatting Requirements: <br>
>{                                                   <br>
>  "question": "\<question text\>",<br>
>  "context": "\<english context paragraph\>",<br>
>  "lang": "\<language of question\>",<br>
>  "answerable": "\<true if the question is answerable given the context\>" <br>
>  "answer_start": "",<br>
>  "answer": "\<if answerable, answer in english\>",<br>
>  "answer_inlang": "\<answer translated into the question language\>" <br>
>}

## Part 1: Rule-based Classifier 
Week 36

In [10]:
lemmatizer = WordNetLemmatizer()

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

model.to(device)

def translate_to_en(texts, src_lang):
    tokenizer.src_lang = lang_codes[src_lang]
    outputs = []
    
    for i in tqdm(range(0, len(texts), 8), desc=f"Translating {src_lang}"):
        batch = texts[i:i+8]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        
        bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")
        
        out = model.generate(**inputs, forced_bos_token_id=bos_token_id)
        outputs.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    
    return outputs

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    tokens = text.split()
    stopwords = {"the","a","an","this","that","those",      # articles
    "is","was","has","have","had","be",                     # verbs
    "been","do","does","did","are","were",
    "go","went","goes",                   
    "in","on","at","of","to","for","with",                  # prepositions
    "into","from","above","below","before","after",
    "and","or","but",                                       # conjunctions
    "there","their","its","it",                             # possessive/pronouns
    "who","what","when","where","why","how","which"         # question words
    }
    return set(lemmatizer.lemmatize(t) for t in tokens if t not in stopwords)

def predict_answerable(question, context):
    a = preprocess(question)
    b = preprocess(context)

    return 1 if len(a & b) / len(a) >= 0.5 else 0


results = {}

for lang in df_test["lang"].unique():
    subset = df_test[df_test["lang"] == lang].copy()
    
    subset["question_en"] = translate_to_en(subset["question"].tolist(), lang)
    subset["context_en"] = translate_to_en(subset["context"].tolist(), lang)
    
    subset["pred"] = [predict_answerable(q, c) for q, c in tqdm(zip(subset["question_en"], subset["context_en"]), total=len(subset), desc=f"Predicting {lang}")]
    
    acc = accuracy_score(subset["answerable"], subset["pred"])
    prec, rec, f1, _ = precision_recall_fscore_support(subset["answerable"], subset["pred"], average="binary")
    
    results[lang] = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

print("Performance by language:")
for lang, metrics in results.items():
    print(f"\nLanguage: {lang}")
    print(f"Accuracy : {metrics['accuracy']:.4f}")
    print(f"F1 Score : {metrics['f1']:.4f}")

Translating pt: 100%|██████████| 2/2 [00:01<00:00,  1.73it/s]
Translating pt: 100%|██████████| 2/2 [00:05<00:00,  2.94s/it]
Predicting pt: 100%|██████████| 10/10 [00:01<00:00,  8.54it/s]
Translating hi: 100%|██████████| 2/2 [00:00<00:00,  3.20it/s]
Translating hi: 100%|██████████| 2/2 [00:04<00:00,  2.11s/it]
Predicting hi: 100%|██████████| 10/10 [00:00<00:00, 9339.35it/s]
Translating ja: 100%|██████████| 2/2 [00:00<00:00,  2.92it/s]
Translating ja: 100%|██████████| 2/2 [00:05<00:00,  2.52s/it]
Predicting ja: 100%|██████████| 10/10 [00:00<00:00, 8224.13it/s]

Performance by language:

Language: pt
Accuracy : 0.8000
F1 Score : 0.8889

Language: hi
Accuracy : 0.8000
F1 Score : 0.8889

Language: ja
Accuracy : 0.6000
F1 Score : 0.7143





## Part 3: Trained Answerability Classifier

Week 38

In [11]:
has_labels = "answerable" in df_test.columns

model_checkpoint = "chungimungi/week-38-multilingual-distilbert-all"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    tokenized_input = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=512,
        padding="max_length"
    )
    if has_labels:
        tokenized_input["labels"] = [int(ans) for ans in examples["answerable"]]
    return tokenized_input

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

dataset_test = Dataset.from_pandas(df_test)
tokenized_test = dataset_test.map(preprocess_function, batched=True)

args = TrainingArguments(
    output_dir="./results_eval",
    per_device_eval_batch_size=8,
    dataloader_drop_last=False
)

trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics
)

languages = df_test["lang"].unique()

results_by_lang = {}

for lang in languages:
    df_lang = df_test[df_test["lang"] == lang]
    dataset_lang = Dataset.from_pandas(df_lang)
    tokenized_lang = dataset_lang.map(preprocess_function, batched=True)

    predictions = trainer.predict(tokenized_lang)
    pred_labels = np.argmax(predictions.predictions, axis=1)
    true_labels = predictions.label_ids

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary')
    acc = accuracy_score(true_labels, pred_labels)

    results_by_lang[lang] = {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

for lang, metrics in results_by_lang.items():
    print(f"\nLanguage: {lang}")
    print(f"Accuracy : {metrics['accuracy']:.4f}")
    print(f"F1 Score : {metrics['f1']:.4f}")

predictions = trainer.predict(tokenized_test)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

df_compare = pd.DataFrame({
    "Question": df_test["question"],
    "Predicted": pred_labels,
    "Actual": true_labels
})

print(df_compare.head(30).to_string(index=False))


Map: 100%|██████████| 30/30 [00:00<00:00, 3300.26 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 2748.74 examples/s]


Map: 100%|██████████| 10/10 [00:00<00:00, 2496.16 examples/s]


Map: 100%|██████████| 10/10 [00:00<00:00, 2637.76 examples/s]





Language: pt
Accuracy : 0.8000
F1 Score : 0.8889

Language: hi
Accuracy : 0.8000
F1 Score : 0.8889

Language: ja
Accuracy : 0.8000
F1 Score : 0.8889


                                             Question  Predicted  Actual
                             Quem pintou a Mona Lisa?          1       1
                        Qual é a capital de Portugal?          1       0
    Em que ano o homem pisou a Lua pela primeira vez?          1       1
Qual é o elemento mais abundante na crosta terrestre?          1       1
                         Onde se originou o Carnaval?          1       1
               Qual é a camada mais externa da Terra?          1       1
              Qual é a cidade mais populosa do mundo?          1       1
            Qual é a distância média da Terra ao Sol?          1       1
                           Como funcionam as vacinas?          1       1
         Qual é o nome do rio mais longo de Portugal?          1       0
                              ताजमहल किसने बनवाया था?          1       1
                   भारत के प्रथम प्रधानमंत्री कौन थे?          1       1
              मानव शरीर की सबसे बड़ी हड्डी कौन सी ह

## Part 4: Open QA
Week 39

In [12]:
torch.manual_seed(42)

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

model_checkpoint = "chungimungi/mbart-te-qc"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

max_length = 512

def format_prompt_qc(question, context, lang):
    """
    Improved prompt with clear structure and task instruction.
    Uses delimiters and explicit task framing.
    """
    if lang == "pt":
        prompt = (
            f"Answer the following question based on the given context. "
            f"Provide a concise and accurate answer in Portuguese.\n\n"
            f"Question: {question}\n\n"
            f"Context: {context}\n\n"
            f"Answer:"
        )
    elif lang == "hi":
        prompt = (
            f"Answer the following question based on the given context. "
            f"Provide a concise and accurate answer in Hindi.\n\n"
            f"Question: {question}\n\n"
            f"Context: {context}\n\n"
            f"Answer:"
        )
    else: # lang == ja
        prompt = (
            f"Answer the following question based on the given context. "
            f"Provide a concise and accurate answer in Japanese.\n\n"
            f"Question: {question}\n\n"
            f"Context: {context}\n\n"
            f"Answer:"
        )
    return prompt

def preprocess_function(examples):
    inputs = []
    targets = []
    for q, c, a_inlang, a_en, l in zip(examples["question"], examples["context"], examples["answer_inlang"], examples["answer"], examples["lang"]):
        input_text = format_prompt_qc(q, c, l)
        target_text = a_inlang if (a_inlang is not None and len(a_inlang) > 0) else (a_en if a_en is not None else "")
        inputs.append(input_text)
        targets.append(target_text)
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding=False)
    labels = tokenizer(targets, max_length=max_length, truncation=True, padding=False)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def eval_dataset(df_test, lang_code):
    torch.cuda.empty_cache()
    gc.collect()

    dataset = Dataset.from_pandas(df_test)
    model.eval()

    gen_kwargs = dict(
        max_new_tokens=100,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        num_beams=8,
        repetition_penalty=1.6,
        length_penalty=1.5,
    )

    answerable_predictions = []
    answerable_references = []
    unanswerable_predictions = []
    unanswerable_references = []
    with torch.no_grad():
        for i, ex in enumerate(dataset):            
            prompt = format_prompt_qc(ex["question"], ex["context"], ex["lang"])
            input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length).input_ids.to(model.device)
            outputs = model.generate(input_ids, **gen_kwargs)
            pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            if ex["answerable"]:
                if ex["answer_inlang"] and len(ex["answer_inlang"]) > 0:
                    ref_text = ex["answer_inlang"]
                else:
                    ref_text = ex["answer"]
                answerable_predictions.append(pred_text.strip())
                answerable_references.append(ref_text.strip())
            else:
                unanswerable_predictions.append(pred_text.strip())
                unanswerable_references.append("")

    answerable_rouge = rouge.compute(predictions=answerable_predictions, references=answerable_references, use_stemmer=True) if answerable_predictions else {"rouge1": 0, "rouge2": 0, "rougeL": 0}
    try:
        answerable_bleu = bleu.compute(predictions=answerable_predictions, references=[[r] for r in answerable_references]) if answerable_predictions else {"bleu": 0}
    except ZeroDivisionError:
        answerable_bleu = {"bleu": 0}
    unanswerable_rouge = rouge.compute(predictions=unanswerable_predictions, references=unanswerable_references, use_stemmer=True) if unanswerable_predictions else {"rouge1": 0, "rouge2": 0, "rougeL": 0}
    try:
        unanswerable_bleu = bleu.compute(predictions=unanswerable_predictions, references=[[r] for r in unanswerable_references]) if unanswerable_predictions else {"bleu": 0}
    except ZeroDivisionError:
        unanswerable_bleu = {"bleu": 0}
    all_predictions = answerable_predictions + unanswerable_predictions
    all_references = answerable_references + unanswerable_references
    overall_rouge = rouge.compute(predictions=all_predictions, references=all_references, use_stemmer=True) if all_predictions else {"rouge1": 0, "rouge2": 0, "rougeL": 0}
    try:
        overall_bleu = bleu.compute(predictions=all_predictions, references=[[r] for r in all_references]) if all_predictions else {"bleu": 0}
    except ZeroDivisionError:
        overall_bleu = {"bleu": 0}
    results = {
        "answerable_rouge1": round(answerable_rouge.get("rouge1", 0), 4),
        "answerable_rouge2": round(answerable_rouge.get("rouge2", 0), 4),
        "answerable_rougeL": round(answerable_rouge.get("rougeL", 0), 4),
        "answerable_bleu": round(answerable_bleu.get("bleu", 0), 4),
        "unanswerable_rouge1": round(unanswerable_rouge.get("rouge1", 0), 4),
        "unanswerable_rouge2": round(unanswerable_rouge.get("rouge2", 0), 4),
        "unanswerable_rougeL": round(unanswerable_rouge.get("rougeL", 0), 4),
        "unanswerable_bleu": round(unanswerable_bleu.get("bleu", 0), 4),
        "overall_rouge1": round(overall_rouge.get("rouge1", 0), 4),
        "overall_rouge2": round(overall_rouge.get("rouge2", 0), 4),
        "overall_rougeL": round(overall_rouge.get("rougeL", 0), 4),
        "overall_bleu": round(overall_bleu.get("bleu", 0), 4),
        "answerable_count": len(answerable_predictions),
        "unanswerable_count" : len(unanswerable_predictions)
    }

    return results

# def prompt_fn_qc(example):
#     return format_prompt_qc(example["question"], example["context"])

languages = df_test["lang"].unique()

results_by_lang = {}

print("\n" + "="*80)
print("Evaluating Model 1: Question in Language + English Context → Answer in Language")

languages = df_test["lang"].unique()
results_by_lang = {}

for lang in languages:
    df_lang = df_test[df_test["lang"] == lang]
    results_by_lang[lang] = eval_dataset(df_lang, lang)


for lang, results in results_by_lang.items():
    print(f"\nLanguage: {lang}")
    print(f"Answerable Examples: {results['answerable_count']}")
    print(f"Unanswerable Examples: {results['unanswerable_count']}")
    
    print("\nAnswerable Performance:")
    print(f"  ROUGE-1: {results['answerable_rouge1']}")
    print(f"  ROUGE-2: {results['answerable_rouge2']}")
    print(f"  ROUGE-L: {results['answerable_rougeL']}")
    print(f"  BLEU: {results['answerable_bleu']}")

    print("\nUnanswerable Performance:")
    print(f"  ROUGE-1: {results['unanswerable_rouge1']}")
    print(f"  ROUGE-2: {results['unanswerable_rouge2']}")
    print(f"  ROUGE-L: {results['unanswerable_rougeL']}")
    print(f"  BLEU: {results['unanswerable_bleu']}")

    print("\nOverall Performance:")
    print(f"  ROUGE-1: {results['overall_rouge1']}")
    print(f"  ROUGE-2: {results['overall_rouge2']}")
    print(f"  ROUGE-L: {results['overall_rougeL']}")
    print(f"  BLEU: {results['overall_bleu']}")
    
print("="*80)



Evaluating Model 1: Question in Language + English Context → Answer in Language

Language: pt
Answerable Examples: 8
Unanswerable Examples: 2

Answerable Performance:
  ROUGE-1: 0.1667
  ROUGE-2: 0.15
  ROUGE-L: 0.125
  BLEU: 0.0

Unanswerable Performance:
  ROUGE-1: 0.0
  ROUGE-2: 0.0
  ROUGE-L: 0.0
  BLEU: 0

Overall Performance:
  ROUGE-1: 0.1333
  ROUGE-2: 0.12
  ROUGE-L: 0.1333
  BLEU: 0.0

Language: hi
Answerable Examples: 8
Unanswerable Examples: 2

Answerable Performance:
  ROUGE-1: 0.2
  ROUGE-2: 0.1667
  ROUGE-L: 0.2
  BLEU: 0.0

Unanswerable Performance:
  ROUGE-1: 0.0
  ROUGE-2: 0.0
  ROUGE-L: 0.0
  BLEU: 0

Overall Performance:
  ROUGE-1: 0.16
  ROUGE-2: 0.1333
  ROUGE-L: 0.16
  BLEU: 0.0

Language: ja
Answerable Examples: 8
Unanswerable Examples: 2

Answerable Performance:
  ROUGE-1: 0.2083
  ROUGE-2: 0.0
  ROUGE-L: 0.2083
  BLEU: 0.0

Unanswerable Performance:
  ROUGE-1: 0.0
  ROUGE-2: 0.0
  ROUGE-L: 0.0
  BLEU: 0

Overall Performance:
  ROUGE-1: 0.1667
  ROUGE-2: 0.0
 

## Part 5: Sequence Labeler 

Week 40

In [14]:
import warnings
warnings.filterwarnings("ignore")

test_dataset = Dataset.from_pandas(df_test)

model_checkpoint = "chungimungi/week-40-multilingual-distilbert-sequence-label"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

label_list = ["O", "ANS"]
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}

max_length = 384
doc_stride = 128
pad_on_right = tokenizer.padding_side == "right"

def create_token_labels(examples):
    examples["question"] = [q.lstrip() for q in examples["question"]]
    tokenized = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized["overflow_to_sample_mapping"]
    offsets_mapping = tokenized["offset_mapping"]

    labels = []
    langs = []
    for i, offsets in enumerate(offsets_mapping):
        sequence_ids = tokenized.sequence_ids(i)
        sample_idx = sample_mapping[i]
        context = examples["context"][sample_idx]
        answer = (examples.get("answer", [""])[sample_idx] or "").strip()
        answerable = examples.get("answerable", [True])[sample_idx]

        if not answerable or not answer:
            answer_start, answer_end = -1, -1
        else:
            match = re.search(re.escape(answer), context, flags=re.IGNORECASE)
            if match:
                answer_start, answer_end = match.start(), match.end()
            else:
                answer_start, answer_end = -1, -1

        langs.append(examples.get("lang", ["unk"])[sample_idx])

        example_labels = []
        context_id = 1 if pad_on_right else 0

        for idx, offset in enumerate(offsets):
            if sequence_ids[idx] is None:
                example_labels.append(-100)
            elif sequence_ids[idx] != context_id:
                example_labels.append(-100)
            else:
                if answer_start == -1 or offset is None:
                    example_labels.append(label_to_id["O"])
                else:
                    start, end = offset
                    if start >= answer_end or end <= answer_start:
                        example_labels.append(label_to_id["O"])
                    else:
                        example_labels.append(label_to_id["ANS"])
        labels.append(example_labels)

    tokenized["labels"] = labels
    tokenized["lang"] = langs
    tokenized["offset_mapping"] = offsets_mapping
    return tokenized

def extract_answer_from_predictions(input_ids, predictions, offset_mapping, sequence_ids, context_id=1):
    spans = []
    start = None
    for i, (pred, sid) in enumerate(zip(predictions, sequence_ids)):
        if sid == context_id and pred == label_to_id["ANS"]:
            if start is None:
                start = i
        else:
            if start is not None:
                spans.append((start, i - 1))
                start = None
    if start is not None:
        spans.append((start, len(predictions) - 1))
    best_text = ""
    best_len = -1
    for s, e in spans:
        if offset_mapping[s] is not None and offset_mapping[e] is not None:
            tokens = input_ids[s:e+1]
            text = tokenizer.decode(tokens, skip_special_tokens=True).strip()
            if len(text) > best_len:
                best_text = text
                best_len = len(text)
    return best_text

def compute_token_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    true_labels, pred_labels = [], []

    for p, l in zip(preds, labels):
        for pi, li in zip(p, l):
            if li != -100:
                true_labels.append(li)
                pred_labels.append(pi)
    
    acc = accuracy_score(true_labels, pred_labels)

    return {
        "accuracy": acc
    }

def compute_f1(trainer, eval_dataset, original_dataset):
    predictions = trainer.predict(eval_dataset)
    pred_logits = predictions.predictions
    pred_labels = np.argmax(pred_logits, axis=-1)

    f1_scores = []

    sample_to_predictions = defaultdict(list)

    for i in range(len(eval_dataset)):
        input_ids = eval_dataset[i]["input_ids"]
        offset_mapping = eval_dataset[i]["offset_mapping"]

        sequence_ids = [
            0 if idx < input_ids.index(tokenizer.sep_token_id)
            else 1 if idx > input_ids.index(tokenizer.sep_token_id)
            else None
            for idx in range(len(input_ids))
        ]

        pred_answer = extract_answer_from_predictions(
            input_ids, pred_labels[i], offset_mapping, sequence_ids, 
            context_id=1 if pad_on_right else 0
        )
        sample_to_predictions[i].append(pred_answer)

    for sample_idx, pred_answers in sample_to_predictions.items():
        true_answer = original_dataset[sample_idx]["answer"].strip().lower()
        
        best_f1 = 0.0
        for pred_answer in pred_answers:
            pred_answer = pred_answer.strip().lower()
            if pred_answer == "":
                f1_scores.append(0.0)
                continue
            true_chars = set(range(len(true_answer)))
            pred_chars = set(range(len(pred_answer)))
            common = min(len(true_chars), len(pred_chars))
            precision = common / len(pred_chars) if pred_chars else 0
            recall = common / len(true_chars) if true_chars else 0
            f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
            if f1 > best_f1:
                best_f1 = f1
        f1_scores.append(best_f1)

    avg_f1 = np.mean(f1_scores) if f1_scores else 0.0
    return avg_f1, len(f1_scores)

tokenized_test = test_dataset.map(
    create_token_labels, 
    batched=True, 
    remove_columns=test_dataset.column_names
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model, 
    tokenizer=tokenizer,
    data_collator=data_collator, 
    compute_metrics=compute_token_metrics
)

print("Running evaluation...")
token_metrics = trainer.evaluate(tokenized_test)
f1_score, f1_count = compute_f1(trainer, tokenized_test, test_dataset)

print(f"Evaluating per language...")
results_by_lang = {}

for lang in languages:
    print(f"\n{'='*65}")
    print(f"Evaluating on {lang.lower()}")
    
    lang_data = test_dataset.filter(lambda ex: ex["lang"] == lang)
    tokenized_lang_data = tokenized_test.filter(lambda ex: ex["lang"] == lang)
    token_metrics = trainer.evaluate(tokenized_lang_data)
    f1_score, f1_count = compute_f1(trainer, tokenized_lang_data, lang_data)
    
    results_by_lang[lang] = {
        **token_metrics,
        "f1": f1_score,
        "f1_count": f1_count
    }
    
    print(f"Results for {lang.lower()}:")
    print(f"Token-level Accuracy:  {token_metrics['eval_accuracy']:.4f}")
    print(f"F1 Score:              {f1_score:.4f} (n={f1_count})")
    print(f"{'='*65}\n")

print(f"\n{'='*50}")
print("CROSS-LANGUAGE COMPARISON")
print(f"{'='*50}")
print(f"{'Language':<12} {'Acc':<8}  {'F1':<8}")
print(f"{'-'*50}")
lang_names = {"pt": "Portuguese", "hi": "Hindi", "ja": "Japanese"}
for lang in languages:
    metrics = results_by_lang[lang]
    print(f"{lang.upper():<12}"
        f"{metrics['eval_accuracy']:.4f}   "
        f"{metrics['f1']:.4f}")
print(f"{'='*50}")

Map: 100%|██████████| 30/30 [00:00<00:00, 2932.53 examples/s]


Running evaluation...


Evaluating per language...

Evaluating on pt


Filter: 100%|██████████| 30/30 [00:00<00:00, 18867.76 examples/s]
Filter: 100%|██████████| 30/30 [00:00<00:00, 1958.61 examples/s]


Results for pt:
Token-level Accuracy:  0.8775
F1 Score:              0.5394 (n=10)


Evaluating on hi


Filter: 100%|██████████| 30/30 [00:00<00:00, 16582.65 examples/s]
Filter: 100%|██████████| 30/30 [00:00<00:00, 1917.28 examples/s]


Results for hi:
Token-level Accuracy:  0.8975
F1 Score:              0.6450 (n=10)


Evaluating on ja


Filter: 100%|██████████| 30/30 [00:00<00:00, 17647.84 examples/s]
Filter: 100%|██████████| 30/30 [00:00<00:00, 1930.49 examples/s]


Results for ja:
Token-level Accuracy:  0.8440
F1 Score:              0.5310 (n=10)


CROSS-LANGUAGE COMPARISON
Language     Acc       F1      
--------------------------------------------------
PT          0.8775   0.5394
HI          0.8975   0.6450
JA          0.8440   0.5310
