### Installation

In [3]:
# Mount Google Drive; later cells read the dataset archive from it and write
# checkpoints / LoRA adapters under its MyDrive folder.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

MessageError: Error: credential propagation was unsuccessful

In [None]:
# Unzip the dataset archive from Google Drive into the Colab workspace.
import os
import zipfile

DATASET_ARCHIVE = "/content/drive/MyDrive/dataset_oxe.zip"
EXTRACT_ROOT = "/content"

with zipfile.ZipFile(DATASET_ARCHIVE, 'r') as zip_ref:
    zip_ref.extractall(EXTRACT_ROOT)

# Set dataset path.
# The data-prep cell reads "dataset_oxe/train" and "dataset_oxe/val" under
# /content, so the advertised base path has to name that same folder.
DATASET_BASE_PATH = os.path.join(EXTRACT_ROOT, "dataset_oxe")

# Only report success when the expected folder is really there after extraction;
# otherwise fail here with a clear message instead of later during data loading.
if not os.path.isdir(DATASET_BASE_PATH):
    raise FileNotFoundError(
        f"Expected dataset folder not found after extraction: {DATASET_BASE_PATH}"
    )

print(f"‚úÖ Dataset ready at: {DATASET_BASE_PATH}")

In [None]:
import wandb

# Login to WandB
print("üîë Login to Weights & Biases")
print("Get your API key from: https://wandb.ai/authorize")

# wandb.login() reports whether an API key ended up configured, so the success
# message is only printed when that is actually the case.
if wandb.login():
    print("\n‚úÖ WandB login successful!")
else:
    print("\nWARNING: WandB login did not complete - runs will not be logged until it does.")

# Project configuration
# The project and run names are identifiers and are left as they were; only the
# free-text notes are corrected to name the checkpoint family that is loaded below.
WANDB_PROJECT = "qwen2-vl-behaviortree"
WANDB_RUN_NAME = "qwen2-8b-bt-finetune-v1"  # Change for each run
WANDB_NOTES = "Fine-tuning Qwen3-VL 8B on BehaviorTree dataset with LoRA"

In [None]:
%%capture
# Dependency installation cell; the %%capture magic above suppresses the pip output.
import os, re
# Colab is detected by looking for "COLAB_" in the concatenated environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Reduce torch.__version__ to its leading "major.minor" part to pick an xformers pin.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    # torch 2.9 and 2.8 get dedicated xformers pins; any other version falls back to 0.0.29.post3.
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps installs the listed packages without resolving their own dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The two pins below sit at top level, so they are applied in both branches.
!pip install transformers==4.57.1
!pip install --no-deps trl==0.22.2

### Unsloth

In [None]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# Reference list of pre-quantized 4-bit checkpoints (4x faster downloading + no OOMs).
# It is informational only: the checkpoint that is loaded is BASE_MODEL_NAME below.
fourbit_models = [
    # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-2409-bnb-4bit",
    # Pixtral base model
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",

    # Qwen2 VL support
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    # Any Llava variant works!
    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
]  # More models at https://huggingface.co/unsloth

# Checkpoint fine-tuned in this notebook.
BASE_MODEL_NAME = "unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit"

model, tokenizer = FastVisionModel.from_pretrained(
    BASE_MODEL_NAME,
    # 4-bit loading reduces memory use; set to False for 16-bit LoRA.
    load_in_4bit = True,
    # True or "unsloth" for long context.
    use_gradient_checkpointing = "unsloth",
)

We now add LoRA adapters for parameter-efficient finetuning - this lets us train only about 1% of all parameters.

**[NEW]** We also support finetuning ONLY the vision part of the model, or ONLY the language part. Or you can select both! You can also select to finetune the attention or the MLP layers!

In [None]:
# LoRA adapter settings, gathered in one mapping and unpacked into the call.
lora_settings = dict(
    # Which parts of the network receive adapters (set a flag to False to skip that part).
    finetune_vision_layers = True,
    finetune_language_layers = True,
    finetune_attention_modules = True,
    finetune_mlp_modules = True,

    # Rank: the larger, the higher the accuracy, but it might overfit.
    r = 16,
    # Recommended alpha == r at least.
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    # Rank-stabilized LoRA and LoftQ are supported but not enabled here.
    use_rslora = False,
    loftq_config = None,
    # target_modules = "all-linear" is optional now; a list can be given if needed.
)

model = FastVisionModel.get_peft_model(model, **lora_settings)

<a name="Data"></a>
### Data Prep

In [None]:
from datasets import load_dataset
from PIL import Image
import os

# Work from /content so the relative JSONL paths below resolve.
os.chdir('/content')

# Load dataset: both files are read with split="train", including the validation file.
train_dataset_raw, val_dataset_raw = (
    load_dataset("json", data_files=jsonl_path, split="train")
    for jsonl_path in ("dataset_oxe/train/data.jsonl", "dataset_oxe/val/data.jsonl")
)

# ========================================
# QWEN3-VL FIX: convert sample format
# ========================================
def convert_for_qwen3(example, base_path):
    """Rebuild one raw sample as a chat example with the image loaded in memory.

    The original note on this function states that Qwen3-VL requires a specific
    format for the image placeholder; the rebuilt user turn therefore lists the
    image first and the text second.

    Args:
        example: Raw record. Its first message (user) is read as
            content[0]["text"] (prompt) and content[1]["image"] (image path
            relative to ``base_path``); its second message (assistant) is read
            as content[0]["text"] (target).
        base_path: Directory the relative image path is joined onto.

    Returns:
        dict with a "messages" list holding the user turn (image, then text)
        and the assistant turn (text only).
    """
    user_content = example["messages"][0]["content"]

    # Load image. The context manager releases the file handle right away
    # instead of leaving that to Pillow / garbage collection; convert() returns
    # a separate in-memory copy, so it stays usable after the file is closed.
    img_path = os.path.join(base_path, user_content[1]["image"])
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")

    # Get texts
    user_text = user_content[0]["text"]
    assistant_text = example["messages"][1]["content"][0]["text"]

    # Content order for the converted sample: image FIRST, text AFTER.
    new_example = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": user_text}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": assistant_text}
                ]
            }
        ]
    }

    return new_example

# Build plain Python lists of converted samples (no Arrow serialization issues).
print("Converting dataset for Qwen3-VL...")
train_dataset = [
    convert_for_qwen3(raw_sample, "/content/dataset_oxe/train")
    for raw_sample in train_dataset_raw
]
val_dataset = [
    convert_for_qwen3(raw_sample, "/content/dataset_oxe/val")
    for raw_sample in val_dataset_raw
]

# Report how many samples each split ended up with.
print(f"‚úÖ Dataset converted!")
for split_label, split_samples in (("Train", train_dataset), ("Val", val_dataset)):
    print(f"   {split_label}: {len(split_samples)} samples")


In [None]:
# Show first sample
print("="*60)
print("FIRST TRAINING SAMPLE")
print("="*60)

sample = train_dataset[0]
messages = sample["messages"]

# Show user message: walk every content part and describe it by its type.
user_msg = messages[0]
print("\nüìù USER MESSAGE:")
for content in user_msg["content"]:
    if content["type"] == "text":
        print(f"\nText:\n{content['text'][:300]}...")  # First 300 chars
    elif content["type"] == "image":
        print(f"\nImage: {type(content['image'])} - Size: {content['image'].size}")

# Show assistant message
assistant_msg = messages[1]
print("\nü§ñ ASSISTANT MESSAGE (Target BT):")
bt_xml = assistant_msg["content"][0]["text"]
print(f"\n{bt_xml[:500]}...")  # First 500 chars

# Display image
print("\nüñºÔ∏è FRAME IMAGE:")
# Converted samples store the image at content[0] and the text at content[1], so
# the previous hard-coded content[1]["image"] lookup raised KeyError. Selecting
# the part by its "type" does not depend on the position at all.
frame_image = next(
    (part["image"] for part in user_msg["content"] if part["type"] == "image"),
    None,
)
if frame_image is None:
    print("(no image part found in the user message)")
else:
    display(frame_image.resize((400, 300)))


To format the dataset, all vision finetuning tasks should be formatted as follows:

```python
[
{ "role": "user",
  "content": [{"type": "text",  "text": Q}, {"type": "image", "image": image} ]
},
{ "role": "assistant",
  "content": [{"type": "text",  "text": A} ]
},
]
```

Before doing any finetuning, let's first see what the model outputs for a sample training example!

In [None]:
# Switch the model to inference mode for the pre-finetuning baseline.
FastVisionModel.for_inference(model)

# Take sample (converted samples hold the image at content[0], the text at content[1]).
sample = train_dataset[2]
image = sample["messages"][0]["content"][0]["image"]
user_text = sample["messages"][0]["content"][1]["text"]
ground_truth = sample["messages"][1]["content"][0]["text"]

# Preview
print("Image:")
display(image)
# The instruction line is extracted before formatting: a backslash inside an
# f-string expression is a SyntaxError before Python 3.12, and indexing [0] on
# an empty match list raised IndexError for prompts without an INSTRUCTION line.
instruction_line = next(
    (line for line in user_text.split("\n") if "INSTRUCTION:" in line),
    "(no INSTRUCTION: line found in prompt)",
)
print(f"\nInstruction: {instruction_line}")
print(f"\nGround Truth:\n{ground_truth}\n")

# Inference: the chat template receives an image placeholder; the actual image
# is passed to the processor together with the rendered prompt text.
messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": user_text}]}]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

from transformers import TextStreamer
# Stream generated tokens as they are produced, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
print("Prediction:")
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=512, use_cache=True, temperature=1.5, min_p=0.1)


<a name="Train"></a>
### Train the model
Now let's train our model. We do 20 steps to speed things up, but for a full run you can set `num_train_epochs` (e.g. `num_train_epochs=1`) and remove the `max_steps` setting. We also support TRL's `DPOTrainer`!

We use our new `UnslothVisionDataCollator` which will help in our vision finetuning setup.

In [None]:
# ========================================
# INITIALIZE WANDB RUN
# ========================================
import wandb

# Project configuration
# The project and run names are identifiers and are left as they were; only the
# free-text notes are corrected to name the checkpoint family that is trained.
WANDB_PROJECT = "qwen2-vl-behaviortree"
WANDB_RUN_NAME = "qwen2-8b-bt-finetune-v1"  # Change for each run
WANDB_NOTES = "Fine-tuning Qwen3-VL 8B on BehaviorTree dataset with LoRA"

# Initialize run. The config dict is descriptive metadata only, so it must be
# kept in sync by hand with the model-loading, LoRA and trainer cells.
wandb.init(
    project=WANDB_PROJECT,
    name=WANDB_RUN_NAME,
    notes=WANDB_NOTES,
    config={
        # Model (the checkpoint passed to FastVisionModel.from_pretrained)
        "model_name": "unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit",
        "quantization": "4bit",

        # LoRA config
        "lora_r": 16,
        "lora_alpha": 16,
        "lora_dropout": 0,
        "target_modules": "all-linear",

        # Training hyperparameters
        # NOTE: the trainer cell sets max_steps and leaves num_train_epochs
        # commented out, so the step limit is what bounds this run.
        "num_epochs": 3,
        "max_steps": 20,

        "batch_size": 2,
        "gradient_accumulation_steps": 4,
        "learning_rate": 2e-4,
        "warmup_steps": 10,
        "optimizer": "adamw_8bit",
        "weight_decay": 0.01,
        "lr_scheduler": "linear",

        # Dataset
        "train_samples": len(train_dataset),
        "val_samples": len(val_dataset),
        "max_seq_length": 2048,

        # Save strategy
        "save_steps": 100,
        "eval_steps": 100,
    }
)

print(f"‚úÖ WandB run initialized: {WANDB_PROJECT}/{WANDB_RUN_NAME}")
print(f"üìä View at: {wandb.run.get_url()}")


In [None]:
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Put the model into training mode before it is handed to the trainer.
FastVisionModel.for_training(model)

# ========================================
# IMPORTANT: checkpoints are written to Google Drive
# ========================================
DRIVE_OUTPUT_DIR = "/content/drive/MyDrive/qwen2_vl_bt_training_outputs"

# The vision data collator is required for this setup.
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

sft_config = SFTConfig(
    # Batching and schedule
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 10,

    # Training duration: a short step-limited run.
    # For a full run use e.g. num_train_epochs = 3 (3 epochs for ~1500 samples).
    max_steps = 20,

    # Optimisation
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 10,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,

    # Checkpointing: save to Drive every 100 steps, keep only the last 3.
    output_dir = DRIVE_OUTPUT_DIR,
    save_strategy = "steps",
    save_steps = 100,
    save_total_limit = 3,

    # Evaluation: every 100 steps; the best model is selected by eval_loss.
    eval_strategy = "steps",
    eval_steps = 100,
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",

    # Reporting: WandB integration.
    report_to = "wandb",
    run_name = WANDB_RUN_NAME,
    logging_first_step = True,

    # The items below MUST be set for vision finetuning.
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    dataset_num_proc = 4,
    max_seq_length = 2048,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = vision_collator,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,  # validation split
    args = sft_config,
)

print(f"üìÅ Checkpoints will be saved to: {DRIVE_OUTPUT_DIR}")
print(f"üíæ Auto-save every 100 steps, keeping last 3 checkpoints")


In [None]:
# @title Show current memory stats
# Snapshot taken before training; start_gpu_memory and max_memory are reused by
# the post-training statistics cells.
gpu_stats = torch.cuda.get_device_properties(0)

reserved_bytes = torch.cuda.max_memory_reserved()
total_bytes = gpu_stats.total_memory

# Bytes -> GB (1024**3 bytes), rounded to three decimals.
start_gpu_memory = round(reserved_bytes / 1024 ** 3, 3)
max_memory = round(total_bytes / 1024 ** 3, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Peak reserved memory after training, in GB, and the share attributed to
# training (peak minus the pre-training snapshot).
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the GPU's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Wall-clock training time reported by the trainer.
train_seconds = trainer_stats.metrics['train_runtime']
train_minutes = round(train_seconds / 60, 2)

print(f"{train_seconds} seconds used for training.")
print(f"{train_minutes} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
# ========================================
# LOG TRAINING SUMMARY AND FINISH WANDB
# ========================================
# Calculate final metrics
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Training time (hours are derived from seconds, not from the rounded minutes,
# so the two roundings do not compound).
training_time_seconds = trainer_stats.metrics['train_runtime']
training_time_minutes = round(training_time_seconds / 60, 2)
training_time_hours = round(training_time_seconds / 3600, 2)

# Losses. The metrics returned by trainer.train() carry no 'eval_loss', so the
# most recent evaluation result is taken from the trainer's log history instead;
# it stays 'N/A' when no evaluation ran (e.g. max_steps below eval_steps).
final_train_loss = trainer_stats.metrics.get('train_loss', 'N/A')
eval_losses = [entry["eval_loss"] for entry in trainer.state.log_history if "eval_loss" in entry]
final_eval_loss = eval_losses[-1] if eval_losses else 'N/A'

# Log summary metrics
wandb.summary["final_train_loss"] = final_train_loss
wandb.summary["final_eval_loss"] = final_eval_loss
wandb.summary["training_time_minutes"] = training_time_minutes
wandb.summary["training_time_hours"] = training_time_hours
wandb.summary["peak_memory_gb"] = used_memory
wandb.summary["peak_memory_percent"] = used_percentage
wandb.summary["lora_memory_gb"] = used_memory_for_lora
wandb.summary["lora_memory_percent"] = lora_percentage

# Print summary
print("\n" + "="*60)
print("üìä TRAINING SUMMARY")
print("="*60)
print(f"‚è±Ô∏è  Training time: {training_time_minutes} min ({training_time_hours} hours)")
print(f"üìâ Final train loss: {final_train_loss}")
print(f"üìâ Final eval loss: {final_eval_loss}")
print(f"üíæ Peak memory: {used_memory} GB ({used_percentage}%)")
print(f"üíæ LoRA memory: {used_memory_for_lora} GB ({lora_percentage}%)")
print("="*60)

# Save artifact to WandB (LoRA weights)
print("\nüì¶ Saving LoRA weights as WandB artifact...")
# No earlier cell writes this directory, and add_dir() needs it to exist, so
# the adapters and tokenizer are saved into it here before the upload.
LORA_ARTIFACT_DIR = "/content/drive/MyDrive/qwen2_vl_8b_bt_lora_FINAL"
model.save_pretrained(LORA_ARTIFACT_DIR)
tokenizer.save_pretrained(LORA_ARTIFACT_DIR)

artifact = wandb.Artifact(
    name=f"qwen2-vl-bt-lora-{wandb.run.id}",
    type="model",
    description="Fine-tuned Qwen3-VL 8B LoRA adapters for BehaviorTree generation"
)
artifact.add_dir(LORA_ARTIFACT_DIR)
wandb.log_artifact(artifact)

print(f"‚úÖ Artifact saved to WandB!")

# Capture the run URL first: once wandb.finish() returns there is no active
# run, so wandb.run can no longer be asked for it.
run_url = wandb.run.get_url()

# Finish WandB run
wandb.finish()

print(f"\nüéâ Training complete! View full report at: {run_url}")


In [None]:
# from unsloth import FastVisionModel
# from unsloth.trainer import UnslothVisionDataCollator
# from trl import SFTTrainer, SFTConfig
# import torch

# # ========================================
# # 1. LOAD BASE MODEL
# # ========================================
# print("Loading base model...")
# model, tokenizer = FastVisionModel.from_pretrained(
#     "unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit",
#     load_in_4bit=True,
#     use_gradient_checkpointing="unsloth",
# )

# # ========================================
# # 2. SETUP LORA (same as initial training)
# # ========================================
# print("Setting up LoRA...")
# model = FastVisionModel.get_peft_model(
#     model,
#     finetune_vision_layers=True,
#     finetune_language_layers=True,
#     finetune_attention_modules=True,
#     finetune_mlp_modules=True,
#     r=16,
#     lora_alpha=16,
#     lora_dropout=0,
#     bias="none",
#     random_state=3407,
#     use_rslora=False,
#     loftq_config=None,
#     target_modules="all-linear",
# )

# FastVisionModel.for_training(model)

# # ========================================
# # 3. CREATE TRAINER (same config)
# # ========================================
# DRIVE_OUTPUT_DIR = "/content/drive/MyDrive/qwen2_vl_bt_training_outputs"

# trainer = SFTTrainer(
#     model = model,
#     tokenizer = tokenizer,
#     data_collator = UnslothVisionDataCollator(model, tokenizer),
#     train_dataset = train_dataset,
#     eval_dataset = val_dataset,
#     args = SFTConfig(
#         per_device_train_batch_size = 2,
#         gradient_accumulation_steps = 4,
#         warmup_steps = 10,

#         # Training duration
#         num_train_epochs = 3,

#         learning_rate = 2e-4,
#         fp16 = not torch.cuda.is_bf16_supported(),
#         bf16 = torch.cuda.is_bf16_supported(),
#         logging_steps = 10,
#         optim = "adamw_8bit",
#         weight_decay = 0.01,
#         lr_scheduler_type = "linear",
#         seed = 3407,

#         # Checkpoint settings
#         output_dir = DRIVE_OUTPUT_DIR,
#         save_strategy = "steps",
#         save_steps = 100,
#         save_total_limit = 3,

#         # Evaluation
#         eval_strategy = "steps",
#         eval_steps = 100,
#         load_best_model_at_end = True,
#         metric_for_best_model = "eval_loss",

#         # Report
#         # WANDB INTEGRATION
#         # ========================================
#         report_to = "wandb",
#         run_name = WANDB_RUN_NAME,
#         logging_first_step = True,

#         # Vision finetuning requirements
#         remove_unused_columns = False,
#         dataset_text_field = "",
#         dataset_kwargs = {"skip_prepare_dataset": True},
#         dataset_num_proc = 4,
#         max_seq_length = 2048,
#     ),
# )

# # ========================================
# # 4. RESUME FROM LATEST CHECKPOINT
# # ========================================
# import os

# # Check if checkpoints exist
# checkpoint_dirs = [d for d in os.listdir(DRIVE_OUTPUT_DIR)
#                    if d.startswith("checkpoint-")] if os.path.exists(DRIVE_OUTPUT_DIR) else []

# if checkpoint_dirs:
#     latest_checkpoint = max(checkpoint_dirs, key=lambda x: int(x.split("-")[1]))
#     checkpoint_path = os.path.join(DRIVE_OUTPUT_DIR, latest_checkpoint)
#     print(f"\nüîÑ Resuming from checkpoint: {checkpoint_path}")
#     trainer_stats = trainer.train(resume_from_checkpoint=checkpoint_path)
# else:
#     print("\nüÜï No checkpoint found. Starting fresh training...")
#     trainer_stats = trainer.train()

# print("\n‚úÖ Training completed!")


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

We use `min_p = 0.1` and `temperature = 1.5`. Read this [Tweet](https://x.com/menhguin/status/1826132708508213629) for more information on why.

In [None]:
# Switch the fine-tuned model to inference mode.
FastVisionModel.for_inference(model)

# Take sample (index 5, clamped so a validation split with fewer than six
# samples does not raise IndexError).
sample = val_dataset[min(5, len(val_dataset) - 1)]
# Converted samples hold the image at content[0] and the text at content[1].
image = sample["messages"][0]["content"][0]["image"]
user_text = sample["messages"][0]["content"][1]["text"]
ground_truth = sample["messages"][1]["content"][0]["text"]

# Preview
print("Image:")
display(image)
# The instruction line is extracted before formatting: a backslash inside an
# f-string expression is a SyntaxError before Python 3.12, and indexing [0] on
# an empty match list raised IndexError for prompts without an INSTRUCTION line.
instruction_line = next(
    (line for line in user_text.split("\n") if "INSTRUCTION:" in line),
    "(no INSTRUCTION: line found in prompt)",
)
print(f"\nInstruction: {instruction_line}")
print(f"\nGround Truth:\n{ground_truth}\n")

# Inference: the chat template receives an image placeholder; the actual image
# is passed to the processor together with the rendered prompt text.
messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": user_text}]}]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

from transformers import TextStreamer
# Stream generated tokens as they are produced, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
print("Prediction:")
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=512, use_cache=True, temperature=1.5, min_p=0.1)


In [None]:
# ========================================
# SAVE LORA ADAPTERS TO GOOGLE DRIVE
# ========================================
# This is the MOST IMPORTANT save - small size, reloadable

import os
from google.colab import drive

# Ensure drive is mounted.
# os.path.exists() is also true for an ordinary local folder at this path (for
# example one created by earlier writes while Drive was not mounted), which
# would skip the mount and leave the adapters on the ephemeral Colab disk.
# os.path.ismount() is only true for a real mount point.
DRIVE_MOUNT_POINT = '/content/drive'
if not os.path.ismount(DRIVE_MOUNT_POINT):
    drive.mount(DRIVE_MOUNT_POINT)

# Save path on Google Drive
DRIVE_SAVE_PATH = "/content/drive/MyDrive/qwen2_vl_8b_bt_lora"

# Save LoRA adapters (small compared with the full model) together with the
# tokenizer, so the folder is self-contained for reloading.
print("Saving LoRA adapters to Google Drive...")
model.save_pretrained(DRIVE_SAVE_PATH)
tokenizer.save_pretrained(DRIVE_SAVE_PATH)

print(f"‚úÖ LoRA adapters saved to: {DRIVE_SAVE_PATH}")
print("üì¶ This is your MAIN checkpoint - use this to resume training or inference!")


In [None]:
# ========================================
# TEST: RELOAD LORA FROM DRIVE
# ========================================
# This tests that your save worked correctly

TEST_RELOAD = False  # Set to True to test reload

if TEST_RELOAD:
    from unsloth import FastVisionModel

    DRIVE_SAVE_PATH = "/content/drive/MyDrive/qwen2_vl_8b_bt_lora"

    print(f"Reloading model from: {DRIVE_SAVE_PATH}")

    # Load base model + LoRA.
    # The base must be the same checkpoint the adapters were trained on (the
    # one loaded at the top of this notebook); adapters trained on one
    # architecture cannot be attached to a different base model.
    model_reloaded, tokenizer_reloaded = FastVisionModel.from_pretrained(
        model_name="unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit",  # Base model
        load_in_4bit=True,
    )

    # Load LoRA weights
    from peft import PeftModel
    model_reloaded = PeftModel.from_pretrained(model_reloaded, DRIVE_SAVE_PATH)

    print("‚úÖ Model reloaded successfully!")
    print("üß™ Test inference:")

    # Quick test
    FastVisionModel.for_inference(model_reloaded)
    sample = val_dataset[0]
    # Converted samples hold the image at content[0] and the text at
    # content[1]; the previous swapped indices raised KeyError.
    image = sample["messages"][0]["content"][0]["image"]
    user_text = sample["messages"][0]["content"][1]["text"]

    messages = [{"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": user_text}
    ]}]

    input_text = tokenizer_reloaded.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer_reloaded(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

    from transformers import TextStreamer
    # Stream generated tokens as they are produced, without echoing the prompt.
    text_streamer = TextStreamer(tokenizer_reloaded, skip_prompt=True)
    _ = model_reloaded.generate(**inputs, streamer=text_streamer, max_new_tokens=256, use_cache=True)
else:
    print("‚è≠Ô∏è Skipped reload test (set TEST_RELOAD=True to test)")
