- dataset: HuggingFaceM4/ChartQA
- model: Qwen/Qwen2.5-VL-3B-Instruct  
(Run in the `fine-tuning` conda environment on an RTX 5090 server. PyTorch was installed with `pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128`.)

In [1]:
from datasets import load_dataset
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
from trl import SFTConfig, SFTTrainer
from qwen_vl_utils import process_vision_info
from peft import LoraConfig, PeftModel

import transformers
transformers.logging.set_verbosity_info()

# pip install torchvision pillow datasets trl transformers qwen-vl-utils peft wandb

In [2]:
import wandb
# Alternative kept for reference: initialise W&B in disabled mode.
# wandb.init(mode="disabled")
# Start a W&B run in the "chartqa-finetuning" project; the training configs
# below set report_to="wandb".
wandb.init(project="chartqa-finetuning")

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true&ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ryu5090/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhagyeong929[0m ([33mhagyeong929-kyonggi-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# System message placed in the "system" turn of every sample.
system_message = "You are a chart analysis model that extracts precise answers from charts and graphs."

# Prompt that scored highest in evaluate_prompts.py.
# `{question}` is filled in per sample via str.format.
prompt = """Based on the chart image, answer the question.
Question: {question}

Provide only the answer (number or short text), no explanation.
Answer:"""

In [6]:
def format_data(sample, prompt_template):
    """Turn one ChartQA record into a three-turn chat sample for training.

    The system turn carries the module-level ``system_message``; the user turn
    holds the chart image followed by ``prompt_template`` filled in with the
    record's ``query``; the assistant turn is the first entry of ``label``.

    Returns:
        ``{"messages": [system_turn, user_turn, assistant_turn]}``.
    """
    def text_part(text):
        # Every textual content entry shares this shape.
        return {"type": "text", "text": text}

    chart = sample["image"]
    system_turn = {"role": "system", "content": [text_part(system_message)]}
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": chart},
            text_part(prompt_template.format(question=sample["query"])),
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [text_part(sample["label"][0])],
    }
    return {"messages": [system_turn, user_turn, assistant_turn]}

In [7]:
# Load the dataset and pull out its three splits
dataset = load_dataset("HuggingFaceM4/ChartQA")
train_dataset, val_dataset, test_dataset = (
    dataset[split_key] for split_key in ("train", "val", "test")
)

# Report the size of each split
for _title, _split in (
    ("Train", train_dataset),
    ("Val", val_dataset),
    ("Test", test_dataset),
):
    print(f"{_title}: {len(_split)} samples")

# Inspect one raw sample
print("\nSample data:")
print(train_dataset[0])

Train: 28299 samples
Val: 1920 samples
Test: 2500 samples

Sample data:
{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=422x359 at 0x73D0D31B7BC0>, 'query': 'Is the value of Favorable 38 in 2015?', 'label': ['Yes'], 'human_or_machine': 0}


In [8]:
# Load the model and its processor
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
model = AutoModelForImageTextToText.from_pretrained(
    model_id, device_map="auto", dtype=torch.bfloat16
)

# min_pixels and max_pixels are pinned to the same value (batch stability).
_pixel_budget = 512 * 28 * 28
processor = AutoProcessor.from_pretrained(
    model_id, min_pixels=_pixel_budget, max_pixels=_pixel_budget
)

loading configuration file config.json from cache at /home/ryu5090/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-3B-Instruct/snapshots/66285546d2b821cf421d4f5eb2576359d3770cd3/config.json
Model config Qwen2_5_VLConfig {
  "architectures": [
    "Qwen2_5_VLForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "image_token_id": 151655,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 128000,
  "max_window_layers": 70,
  "model_type": "qwen2_5_vl",
  "num_attention_heads": 16,
  "num_hidden_layers": 36,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "mrope_section": [
      16,
      24,
      24
    ],
    "rope_type": "default",
    "type": "default"
  },
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "text_config": {
    "architectures": [
      "Qwen2_5_VLForConditionalGeneration

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file generation_config.json from cache at /home/ryu5090/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-3B-Instruct/snapshots/66285546d2b821cf421d4f5eb2576359d3770cd3/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "repetition_penalty": 1.05,
  "temperature": 1e-06
}

Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-VL-3B-Instruct.
loading configuration file preprocessor_config.json from cache at /home/ryu5090/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-3B-Instruct/snapshots/66285546d2b821cf421d4f5eb2576359d3770cd3/preprocessor_config.json
loading configuration file preprocessor_config.json from cache at /home/ryu5090/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-3B-Instruct/snapshots/66285546d2b821cf421d4f5eb2576359d3770cd3/preprocessor_config.json
The image processor of type `Qwen2VLImageProcessor` is now 

In [9]:
def relaxed_match(output, expected):
    """ChartQA relaxed accuracy: exact text match, or numbers within ±5%.

    Args:
        output: model answer as a string.
        expected: reference answer as a string.

    Returns:
        True when the stripped strings are equal ignoring case, or when both
        parse as numbers (after removing "," and "%") and the output lies
        within 5% of the expected value. An expected value of 0 requires the
        output to be exactly 0. False otherwise.
    """
    output, expected = output.strip(), expected.strip()
    if output.lower() == expected.lower():
        return True
    try:
        out_num = float(output.replace(",", "").replace("%", ""))
        exp_num = float(expected.replace(",", "").replace("%", ""))
    except ValueError:
        # At least one side is not numeric, so only the text comparison applies.
        return False
    if exp_num == 0:
        # A relative tolerance is meaningless around zero.
        return out_num == 0
    return abs(out_num - exp_num) <= abs(exp_num) * 0.05


def evaluate_model(model, processor, test_samples, num_samples=100):
    """Measure relaxed-match accuracy on the first ``num_samples`` test samples.

    Args:
        model: generative model exposing ``eval()``, ``generate()`` and ``device``.
        processor: processor used to build the chat prompt, encode the inputs
            and decode the generated ids.
        test_samples: sliceable sequence of samples in the ``format_data``
            layout (system turn, user turn with the image first, assistant
            turn holding the expected answer).
        num_samples: upper bound on the number of samples to evaluate.

    Returns:
        Fraction of evaluated samples whose generated answer passes
        ``relaxed_match``; 0.0 when there is nothing to evaluate.
    """
    model.eval()

    # Clamp to [0, len]: a negative count would slice from the end and an
    # empty evaluation set would divide by zero below.
    total = max(0, min(num_samples, len(test_samples)))
    if total == 0:
        return 0.0

    correct = 0
    for i, sample in enumerate(test_samples[:total]):
        # Inference uses only the system + user turns.
        messages = sample["messages"][:2]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # The image is the first content entry of the user turn.
        image = sample["messages"][1]["content"][0]["image"]

        # Prepare model inputs.
        inputs = processor(text=[text], images=[image], return_tensors="pt").to(model.device)

        # Generate.
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=50)

        # Decode only the newly generated tokens (drop the prompt prefix).
        output = processor.batch_decode(
            generated_ids[:, inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        )[0].strip()

        # Compare with the reference answer (relaxed match).
        expected = sample["messages"][2]["content"][0]["text"].strip()
        if relaxed_match(output, expected):
            correct += 1

        if (i + 1) % 20 == 0:
            print(f"Evaluated {i + 1}/{total} samples, current accuracy: {correct/(i+1):.2%}")

    return correct / total

In [10]:
# Build the chat-formatted train/val/test sets with the best-scoring prompt
train_formatted, val_formatted, test_formatted = (
    [format_data(row, prompt) for row in split]
    for split in (train_dataset, val_dataset, test_dataset)
)

for _title, _rows in (
    ("Train", train_formatted),
    ("Validation", val_formatted),
    ("Test", test_formatted),
):
    print(f"{_title} formatted: {len(_rows)} samples")

Train formatted: 28299 samples
Validation formatted: 1920 samples
Test formatted: 2500 samples


In [11]:
# Define collate_fn
def collate_fn(examples):
    """Collate chat examples (text + image) into a padded training batch.

    Args:
        examples: list of dicts, each with a "messages" conversation in the
            layout produced by ``format_data``.

    Returns:
        The batch produced by the module-level ``processor`` with an added
        "labels" tensor. Labels are a copy of ``input_ids`` in which padding
        tokens, vision placeholder tokens and every token up to and including
        the first assistant header are replaced by -100 (the ignore index used
        here). If no assistant header is found in a sequence, only the
        padding/vision masking applies to it.
    """
    ignore_index = -100
    tokenizer = processor.tokenizer

    # 1. Text preprocessing: render each conversation with the chat template.
    texts = [processor.apply_chat_template(ex["messages"], tokenize=False) for ex in examples]

    # 2. Extract the image inputs of each conversation.
    image_inputs = [process_vision_info(ex["messages"])[0] for ex in examples]

    # 3. Tokenize the text and encode the images.
    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)

    # 4. Labels start as a copy of the input ids.
    labels = batch["input_ids"].clone()

    # 5. Exclude padding tokens from the loss.
    labels[labels == tokenizer.pad_token_id] = ignore_index

    # 6. Exclude vision tokens from the loss. The ids are looked up from the
    #    tokenizer instead of being hard-coded (151652, 151653, 151655).
    vision_token_ids = tokenizer.convert_tokens_to_ids(
        ["<|vision_start|>", "<|vision_end|>", "<|image_pad|>"]
    )
    for token_id in vision_token_ids:
        if token_id is not None:
            labels[labels == token_id] = ignore_index

    # 7. Mask everything before the assistant response, so only the answer is
    #    trained on. Locate the "<|im_start|>assistant\n" token sequence.
    assistant_start_tokens = tokenizer.encode(
        "<|im_start|>assistant\n", add_special_tokens=False
    )
    marker_len = len(assistant_start_tokens)

    for i, input_ids in enumerate(batch["input_ids"]):
        input_ids_list = input_ids.tolist()

        # Find the first occurrence of the assistant header.
        for j in range(len(input_ids_list) - marker_len + 1):
            if input_ids_list[j:j + marker_len] == assistant_start_tokens:
                # The response starts right after the header; mask all before it.
                response_start = j + marker_len
                labels[i, :response_start] = ignore_index
                break

    batch["labels"] = labels
    return batch


In [12]:
# Full fine-tuning configuration
full_ft_args = SFTConfig(
    output_dir="qwen25vl-chartqa-full-ft",  # the evaluation cell later loads the model from this directory
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # 4 per device x 4 accumulation steps = total train batch size 16 in the run log
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="epoch",
    bf16=True,
    learning_rate=2e-5,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    remove_unused_columns=False,  # presumably needed so the "messages" field reaches the custom collator - TODO confirm
    dataset_kwargs={"skip_prepare_dataset": True},  # presumably skips trainer-side dataset preparation; batches are built by collate_fn - TODO confirm
    # report_to=None,
    report_to="wandb"
)

PyTorch: setting up devices


In [None]:
# Create the full fine-tuning trainer and train
full_ft_trainer = SFTTrainer(
    model=model,
    args=full_ft_args,
    train_dataset=train_formatted,
    eval_dataset=val_formatted,
    data_collator=collate_fn,
    processing_class=processor.tokenizer,
)

# Start training
full_ft_trainer.train()

# Save the model (written to full_ft_args.output_dir)
full_ft_trainer.save_model()

# The trainer only knows the tokenizer, so also save the full processor
# (image preprocessing config, chat template) to make the checkpoint
# directory loadable on its own.
processor.save_pretrained(full_ft_args.output_dir)

You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
***** Running training *****
  Num examples = 28,299
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 1,769
  Number of trainable parameters = 3,754,622,976
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
200,0.2346,0.324176
400,0.2206,0.303575



***** Running Evaluation *****
  Num examples = 1920
  Batch size = 4

***** Running Evaluation *****
  Num examples = 1920
  Batch size = 4


## LoRA fine-tuning

In [None]:
# Reload a fresh base model for LoRA.
# First drop the references to the fully fine-tuned model and its trainer
# (which holds the optimizer state) so that memory can be reclaimed before a
# second copy of the model is loaded. Later cells reload the full-FT model
# from disk, so nothing downstream needs these objects.
import gc

full_ft_trainer = None
model = None
gc.collect()
torch.cuda.empty_cache()

model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.bfloat16,
)

# LoRA configuration: adapters on the attention and MLP projection layers
peft_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.05,
    r=32,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"],
    task_type="CAUSAL_LM",
)

In [None]:
# LoRA fine-tuning configuration
lora_args = SFTConfig(
    output_dir="qwen25vl-chartqa-lora",  # the evaluation cell later loads the adapter from this directory
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="epoch",
    bf16=True,
    learning_rate=1e-4,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    # The plain "constant" schedule ignores warmup_ratio; use the warmup
    # variant so the configured 3% warmup actually takes effect.
    lr_scheduler_type="constant_with_warmup",
    remove_unused_columns=False,
    dataset_kwargs={"skip_prepare_dataset": True},
    report_to="wandb"
)

In [None]:
# Create the LoRA fine-tuning trainer and train
lora_trainer = SFTTrainer(
    model=model,
    args=lora_args,
    train_dataset=train_formatted,
    eval_dataset=val_formatted,
    data_collator=collate_fn,
    peft_config=peft_config,
    processing_class=processor.tokenizer
)

# Start training
lora_trainer.train()

# Save the model (written to lora_args.output_dir)
lora_trainer.save_model()

# The trainer only knows the tokenizer, so also save the full processor
# (image preprocessing config, chat template) next to the adapter.
processor.save_pretrained(lora_args.output_dir)

## Evaluation

In [None]:
# Number of test samples each model is evaluated on (passed as num_samples to evaluate_model).
NUM_SAMPLES = 1000

In [None]:
# Evaluate the base model
print("Loading Base model...")
base_model_eval = AutoModelForImageTextToText.from_pretrained(
    model_id, 
    device_map="auto", 
    dtype=torch.bfloat16
)

# Report progress and the result the same way the other evaluation cells do.
print("Evaluating Base model...")
base_accuracy = evaluate_model(base_model_eval, processor, test_formatted, num_samples=NUM_SAMPLES)
print(f"\nBase model accuracy: {base_accuracy:.2%}")

In [None]:
# Evaluate the fully fine-tuned model
print("Loading Full FT model...")
_full_ft_load_options = {"device_map": "auto", "dtype": torch.bfloat16}
full_ft_model = AutoModelForImageTextToText.from_pretrained(
    "qwen25vl-chartqa-full-ft", **_full_ft_load_options
)

print("Evaluating Full FT model...")
full_ft_accuracy = evaluate_model(
    full_ft_model,
    processor,
    test_formatted,
    num_samples=NUM_SAMPLES,
)
print(f"\nFull FT model accuracy: {full_ft_accuracy:.2%}")

In [None]:
# Evaluate the LoRA model
print("Loading LoRA model...")
_lora_base_load_options = {"device_map": "auto", "dtype": torch.bfloat16}
base_model = AutoModelForImageTextToText.from_pretrained(model_id, **_lora_base_load_options)
# Attach the adapter stored in the LoRA output directory to the fresh base model.
lora_model = PeftModel.from_pretrained(base_model, "qwen25vl-chartqa-lora")

print("Evaluating LoRA model...")
lora_accuracy = evaluate_model(
    lora_model,
    processor,
    test_formatted,
    num_samples=NUM_SAMPLES,
)
print(f"\nLoRA model accuracy: {lora_accuracy:.2%}")

In [None]:
# Print the results as a markdown table
print("\n## Results Summary\n")
print("| Model | Accuracy |")
print("|-------|----------|")
for _model_name, _accuracy in (
    ("Base", base_accuracy),
    ("Full FT", full_ft_accuracy),
    ("LoRA", lora_accuracy),
):
    print(f"| {_model_name} | {_accuracy:.2%} |")