# This notebook was run on an RTX 4090 (24 GB VRAM)

In [2]:
import sys
# Show which Python interpreter backs this kernel; the install cells below
# invoke pip through this same executable.
print(sys.executable)

/venv/main/bin/python


In [3]:
# Install the data-handling dependencies with the kernel's own interpreter
# (sys.executable) so the packages land in the environment this notebook runs in.
!{sys.executable} -m pip install pandas datasets

[0m

In [4]:
import os
import pandas as p

In [5]:
from datasets import load_dataset

# Load the three ChartQA splits in a single call; the result is unpacked in the
# same order as the names given in `split`.
train, val, test = load_dataset("HuggingFaceM4/ChartQA", split=["train", "val", "test"])



In [6]:
# Display the test split summary (column names and row count).
test

Dataset({
    features: ['image', 'query', 'label', 'human_or_machine'],
    num_rows: 2500
})

## Benchmark QwenVL before finetuning

In [7]:
# Install the model-side dependencies with the kernel's own interpreter
# (sys.executable) so they are importable from this notebook.
!{sys.executable} -m pip install transformers accelerate pillow sentencepiece

[0m

In [8]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# Checkpoint used for the pre-finetuning baseline.
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# The processor builds the chat prompt and the image/text tensors used by infer_one.
processor = AutoProcessor.from_pretrained(model_id)
# Weights are loaded in bfloat16; device_map="auto" presumably lets accelerate
# place them on the available GPU -- TODO confirm placement on the target machine.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="auto"
)
# eval() returns the module itself, so as the last expression of the cell it
# also prints the model architecture.
model.eval()


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


Loading weights:   0%|          | 0/729 [00:00<?, ?it/s]

Qwen2_5_VLForConditionalGeneration(
  (model): Qwen2_5_VLModel(
    (visual): Qwen2_5_VisionTransformerPretrainedModel(
      (patch_embed): Qwen2_5_VisionPatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-31): 32 x Qwen2_5_VLVisionBlock(
          (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
          (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
          (attn): Qwen2_5_VLVisionAttention(
            (qkv): Linear(in_features=1280, out_features=3840, bias=True)
            (proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (mlp): Qwen2_5_VLMLP(
            (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
            (up_proj): Linear(in_features=1280, out_features=3420, bias=True)
            (down_proj): Linear(in_features=3420, out_features=1280, bias=True)
            (act_fn): SiLUAc

In [9]:
def infer_one(sample, max_new_tokens=64):
    """Answer one ChartQA sample with the loaded vision-language model.

    Args:
        sample: Mapping with an ``"image"`` entry (the chart) and a ``"query"``
            entry (the question text).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The model's answer as a string. Only the newly generated tokens are
        decoded, so the echoed prompt (system text, question, role markers)
        is not part of the result.
    """
    img = sample["image"]
    question = sample["query"]

    # Single-turn conversation: the chart image followed by the question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": question},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = processor(
        text=text,
        images=img,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )

    # generate() returns prompt tokens followed by the continuation. Decoding the
    # whole sequence would put the question back into the "prediction", and any
    # number in the question would then be picked up by downstream number
    # extraction. Keep only the tokens produced after the prompt.
    prompt_len = inputs["input_ids"].shape[1]
    answer_ids = output[0][prompt_len:]

    return processor.decode(answer_ids, skip_special_tokens=True).strip()

In [10]:
# from tqdm import tqdm

# test_subset = test.select(range(10))

# results = []
# for sample in tqdm(test_subset, desc="Benchmarking", total=len(test_subset)):
#     pred = infer_one(sample)
#     results.append({
#         "pred": pred,
#         "gt": sample["label"]
#     })


In [11]:
import re

def extract_number(text):
    """Return the first number found in ``text`` as a float, or ``None``.

    Recognises an optional leading minus sign, an optional decimal part and
    thousands separators written as groups of exactly three digits
    ("1,234" -> 1234.0). A comma followed by anything else (e.g. "3, 4") is
    not treated as a separator.

    Args:
        text: String to scan. ``None`` or any non-string value yields ``None``.

    Returns:
        The first numeric value as a float, or ``None`` when there is none.
    """
    if not isinstance(text, str):
        return None

    # Integer part is either comma-grouped thousands (not followed by a stray
    # digit) or a plain digit run; an optional fractional part follows.
    match = re.search(r"-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)\.?\d*", text)
    if match is None:
        return None

    # Every string the pattern can match is a valid float literal once the
    # separators are removed, so no exception handling is needed here.
    return float(match.group().replace(",", ""))

In [12]:
import json
from tqdm import tqdm
import torch

out_path = "chartqa_qwenvl_test_predictions.jsonl"

torch.cuda.empty_cache()
model.eval()

# Gradients are disabled only for the duration of this run. Switching them off
# globally with torch.set_grad_enabled(False) would stay in effect for every
# later cell and break any training step executed afterwards in this kernel.
with torch.no_grad(), open(out_path, "w", encoding="utf-8") as f:
    for idx, sample in enumerate(
        tqdm(test, desc="Infer test set")  # Change the dataset here: `test` runs the full split; pass a subset such as test.select(range(10)) for a quick partial run.
    ):
        pred_text = infer_one(sample)

        record = {
            "idx": idx,
            "query": sample["query"],
            "pred_text": pred_text,
            "pred_num": extract_number(pred_text),
            "gts": sample["label"],                 # list
            "human_or_machine": sample["human_or_machine"]
        }

        f.write(json.dumps(record, ensure_ascii=False) + "\n")
        # The full run takes over an hour; flush each record so an interrupted
        # run still leaves every finished prediction on disk.
        f.flush()

        # Optional: release cached GPU memory periodically to keep the run
        # from slowing down over time.
        if idx % 50 == 0:
            torch.cuda.empty_cache()

Infer test set: 100%|██████████| 2500/2500 [1:11:15<00:00,  1.71s/it]


In [None]:
import re

def clean_prediction(pred_text: str):
    """Strip the echoed chat prompt from a decoded generation.

    A full decoded sequence looks like
    ``system\\n<system prompt>\\nuser\\n<question>\\nassistant\\n<answer>``.
    The answer is whatever follows the last ``assistant`` role line. Splitting
    on the bare word "assistant" is not enough: the default system prompt
    ("You are a helpful assistant.") contains that word, so the first
    occurrence is inside the prompt and the question would remain in the
    "cleaned" text.

    Args:
        pred_text: Decoded model output, with or without the echoed prompt.
            ``None`` is treated as an empty prediction.

    Returns:
        The answer text with surrounding whitespace removed.
    """
    if pred_text is None:
        return ""

    # The role marker is the word "assistant" alone on its own line. Prefixing
    # a newline lets the same marker match when the text starts with the role line.
    _, marker, answer = ("\n" + pred_text).rpartition("\nassistant\n")
    if marker:
        pred_text = answer

    return pred_text.strip()

def extract_number(text):
    """Return the first number found in ``text`` as a float, or ``None``.

    Recognises an optional leading minus sign, an optional decimal part and
    thousands separators written as groups of exactly three digits
    ("1,234" -> 1234.0). A comma followed by anything else (e.g. "3, 4") is
    not treated as a separator.

    Args:
        text: String to scan. ``None`` or any non-string value yields ``None``
            instead of raising.

    Returns:
        The first numeric value as a float, or ``None`` when there is none.
    """
    if not isinstance(text, str):
        return None

    # Integer part is either comma-grouped thousands (not followed by a stray
    # digit) or a plain digit run; an optional fractional part follows.
    match = re.search(r"-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)\.?\d*", text)
    if match is None:
        return None

    # Every string the pattern can match is a valid float literal once the
    # separators are removed, so no exception handling is needed here.
    return float(match.group().replace(",", ""))

def get_answer_type(gt: str):
    """Classify a ground-truth answer as "boolean", "numeric" or "string".

    Args:
        gt: Ground-truth answer text; surrounding whitespace and letter case
            are ignored.

    Returns:
        "boolean" for yes/no, "numeric" when the text parses as a finite
        number, otherwise "string".
    """
    import math  # local import so this cell does not depend on another cell's imports

    gt = gt.strip().lower()

    if gt in ("yes", "no"):
        return "boolean"

    try:
        value = float(gt)
    except ValueError:
        return "string"

    # float() also accepts "nan", "inf" and "infinity". Such labels are words,
    # not chart values, and a NaN/inf target could never match numerically.
    return "numeric" if math.isfinite(value) else "string"

def evaluate_sample(pred_text, gt):
    """Decide whether one prediction matches one ground-truth answer.

    The comparison depends on the ground-truth type:

    * boolean: the first standalone "yes"/"no" word in the prediction must
      equal the ground truth. Whole-word matching keeps "know", "not" or
      "eyes" from being read as an answer.
    * numeric: the first number in the prediction must equal the target or lie
      within 5% relative error of it. A target of 0 requires an exact match.
    * string: the ground truth must occur in the prediction, ignoring case.

    Args:
        pred_text: Raw decoded model output (may be ``None``).
        gt: A single ground-truth answer string.

    Returns:
        ``True`` when the prediction is accepted, otherwise ``False``.
    """
    cleaned = clean_prediction(pred_text)
    pred = cleaned.lower()
    # Normalise the target the same way the type detection does, so a label
    # such as " Yes " is compared as "yes".
    gt_norm = gt.strip().lower()
    answer_type = get_answer_type(gt)

    # YES / NO
    if answer_type == "boolean":
        verdict = re.search(r"\b(yes|no)\b", pred)
        return verdict is not None and verdict.group(1) == gt_norm

    # NUMERIC
    if answer_type == "numeric":
        pred_num = extract_number(cleaned)
        if pred_num is None:
            return False

        gt_num = float(gt_norm)

        # exact match
        if pred_num == gt_num:
            return True

        # A relative error is undefined for a zero target.
        if gt_num == 0:
            return False

        # relaxed (5% relative error). Dividing by max(1.0, |gt|) would turn
        # this into a 0.05 absolute tolerance for |gt| < 1 and accept, e.g.,
        # 0.14 for a target of 0.1.
        return abs(pred_num - gt_num) / abs(gt_num) <= 0.05

    # STRING
    return gt_norm in pred

In [None]:
import json

def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed objects.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        The decoded objects in file order. Lines that are empty or contain
        only whitespace are skipped.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]

In [None]:
def evaluate_record(record):
    """Report whether a stored prediction matches any of its reference answers.

    Args:
        record: Mapping with a "pred_text" entry (the model output) and a
            "gts" entry (a list of acceptable ground-truth strings).

    Returns:
        ``True`` as soon as one reference answer is accepted by
        ``evaluate_sample``; ``False`` when none is (or the list is empty).
    """
    prediction = record["pred_text"]
    references = record["gts"]  # list
    return any(evaluate_sample(prediction, answer) for answer in references)
def benchmark_jsonl(path):
    """Score a JSONL predictions file.

    Args:
        path: File readable by ``load_jsonl``; each record needs the
            "pred_text" and "gts" entries used by ``evaluate_record``.

    Returns:
        A ``(accuracy, stats)`` pair. ``accuracy`` is the fraction of correct
        records (0.0 for an empty file). ``stats`` maps each answer type to
        its own ``{"correct", "total"}`` counters.
    """
    records = load_jsonl(path)

    # Optional breakdown per answer type.
    stats = {
        kind: {"correct": 0, "total": 0}
        for kind in ("boolean", "numeric", "string")
    }

    n_correct = 0
    for rec in records:
        hit = evaluate_record(rec)

        # The bucket is chosen from the first ground-truth answer of the record.
        bucket = stats[get_answer_type(rec["gts"][0])]
        bucket["total"] += 1
        if hit:
            bucket["correct"] += 1
            n_correct += 1

    n_total = len(records)
    acc = n_correct / n_total if n_total > 0 else 0.0
    return acc, stats

In [None]:
# Score the file produced by the inference cell. That cell writes to
# "chartqa_qwenvl_test_predictions.jsonl"; pointing at any other name would
# fail with FileNotFoundError or silently score a stale file.
PREDICTIONS_PATH = "chartqa_qwenvl_test_predictions.jsonl"

acc, stats = benchmark_jsonl(PREDICTIONS_PATH)

print(f"Overall Accuracy: {acc:.4f}")

# Per-answer-type breakdown; types with no samples are skipped to avoid a
# division by zero.
for k, v in stats.items():
    if v["total"] > 0:
        print(f"{k}: {v['correct']}/{v['total']} = {v['correct']/v['total']:.4f}")
