In [1]:
import os
import torch
from datasets import load_dataset
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

from trl import (
    SFTConfig,
    SFTTrainer,
)

In [2]:
os.environ["ACCELERATE_CONFIG_FILE"] = "/home/eoeldroal/WorkPlace/Culture/accelerate_ds_zero3.yaml"  # ← 너의 YAML 절대경로

In [3]:
MODEL_ID = "NCSOFT/VARCO-VISION-2.0-14B"
DATASET = "HuggingFaceH4/llava-instruct-mix-vsft"
OUTPUT_DIR = "runs/varco-14b-sft"

In [19]:
################
# Dataset
################
dataset = load_dataset(DATASET)


Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]

In [20]:
processor = AutoProcessor.from_pretrained(MODEL_ID)

In [37]:
def convert_message_format(messages):
    """메시지 형식을 VLM 표준으로 변환"""
    converted = []
    for message in messages:
        converted_message = {
            "role": message["role"],
            "content": []
        }
        for content in message["content"]:
            if content["type"] == "text" and content["text"]:
                converted_message["content"].append({
                    "type": "text", 
                    "text": content["text"].strip()
                })
            elif content["type"] == "image":
                converted_message["content"].append({"type": "image"})
        
        if converted_message["content"]:
            converted.append(converted_message)
    return converted

In [38]:
def preprocess_messages(example):
    example["messages"] = convert_message_format(example["messages"])
    return example

dataset = dataset.map(preprocess_messages, desc="Converting message format")


Converting message format:   0%|          | 0/259155 [00:00<?, ? examples/s]

Converting message format:   0%|          | 0/13640 [00:00<?, ? examples/s]

In [39]:

# 기존 collate_fn 그대로 사용
def collate_fn(examples):
    # Get the texts and images, and apply the chat template
    texts = [processor.apply_chat_template(example["messages"], tokenize=False) for example in examples]
    images = [example["images"][0] for example in examples]

    # Tokenize the texts and process the images
    batch = processor(images=images, text=texts, return_tensors="pt", padding=True)

    # The labels are the input_ids, and we mask the padding tokens in the loss computation
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    batch["labels"] = labels

    return batch

In [45]:
# 메시지 변환 테스트
sample = dataset["train"][0]
converted = convert_message_format(sample["messages"])

print("변환 후 첫 번째 메시지:")
print(converted[0])

# apply_chat_template 테스트
try:
    text = processor.apply_chat_template(converted, tokenize=False)
    print("✅ 템플릿 적용 성공!")
except Exception as e:
    print(f"❌ 아직 문제 있음: {e}")

변환 후 첫 번째 메시지:
{'role': 'user', 'content': [{'type': 'text', 'text': 'Who wrote this book?'}, {'type': 'image'}]}
✅ 템플릿 적용 성공!


In [14]:
args = SFTConfig(
    output_dir=OUTPUT_DIR,
    bf16=True, fp16=False,
    learning_rate=6e-6, weight_decay=0.1,
    chat_template_path="HuggingFaceTB/SmolLM3-3B",

    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,

    max_steps = 16,
    logging_steps=1,
    save_strategy="no",
    
    max_length=32768,
    remove_unused_columns=False,
    dataset_kwargs={"skip_prepare_dataset": True},

    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    ddp_find_unused_parameters=False,

    optim="adamw_torch_fused",
    report_to="none",
)

In [15]:
eval_split = "validation" if "validation" in collated_data else ("test" if "test" in collated_data else None)

In [None]:
def train_main() :
    ################
    # Model, Tokenizer & Processor
    ################

    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        attn_implementation="sdpa",
        device_map=None,                # ← DeepSpeed/Accelerate가 배치
    )
    model.config.use_cache = False     # grad_ckpt와 충돌 방지
    # tok = processor.tokenizer
    # # (1) pad/eos 보장 (LLaMA/Qwen 계열은 pad 없는 경우 흔함)
    # if tok.pad_token is None:
    #     tok.pad_token = tok.eos_token
    #     tok.pad_token_id = tok.eos_token_id
    # tok.padding_side = "right"

    # # (2) TRL 호환을 위해 Processor에 토큰 관련 속성/메서드 "위임"
    # #     - SFTTrainer가 processing_class.convert_tokens_to_ids(...)를 호출하므로, 토크나이저 메서드로 연결

    # #! 아래가 바로 병목점이다.
    # #! qwen 3 에 없는 토큰들을 SFT 단계에서 요구함.
    # if not hasattr(processor, "convert_tokens_to_ids"):
    #     processor.convert_tokens_to_ids = tok.convert_tokens_to_ids
    # processor.pad_token = tok.pad_token
    # processor.eos_token = tok.eos_token
    # processor.pad_token_id = tok.pad_token_id           # ← 추가
    # processor.eos_token_id = tok.eos_token_id           # ← 추가


    # # (3) 모델 config에도 pad_token_id 반영
    # if getattr(model.config, "pad_token_id", None) is None:
    #     model.config.pad_token_id = tok.pad_token_id
    
    ################
    # Training
    ################
    trainer = SFTTrainer(
        model=model,
        args=args,
        data_collator=collate_fn,
        train_dataset=dataset["train"],
        eval_dataset=(dataset[eval_split] if eval_split else None),
        processing_class=processor,
    )
    
    trainer.train()
    
    trainer.save_model(OUTPUT_DIR)
    

In [17]:
from accelerate import notebook_launcher
notebook_launcher(train_main, num_processes=4)

Launching training on 4 CUDAs.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

ChildFailedError: 
============================================================
train_main FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-08-20_22:13:43
  host      : sv3-a100.tail184f17.ts.net
  rank      : 3 (local_rank: 3)
  exitcode  : 1 (pid: 151981)
  error_file: /tmp/torchelastic_q8z7okcu/none_f6yz99xe/attempt_0/3/error.json
  traceback : Traceback (most recent call last):
    File "/home/eoeldroal/miniconda3/envs/Culture/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
      return f(*args, **kwargs)
    File "/tmp/ipykernel_146922/754687456.py", line 40, in train_main
      train_dataset=dataset["train"],
    File "/home/eoeldroal/miniconda3/envs/Culture/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 2858, in __getitem__
      return Column(self, key)
    File "/home/eoeldroal/miniconda3/envs/Culture/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 656, in __init__
      raise ValueError(f"Column '{column_name}' doesn't exist.")
  ValueError: Column 'train' doesn't exist.
  
============================================================