# Setting up environment

Check the CUDA version and available GPUs

In [24]:
# Print the NVIDIA driver/CUDA versions and the GPUs visible to this session.
!nvidia-smi

Sat Mar 29 07:42:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   43C    P8             10W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

Install packages

In [25]:
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
!pip install -e ".[torch,metrics]"
!pip install bitsandbytes
!pip install liger-kernel

Cloning into 'LLaMA-Factory'...
remote: Enumerating objects: 345, done.[K
remote: Counting objects: 100% (345/345), done.[K
remote: Compressing objects: 100% (281/281), done.[K
remote: Total 345 (delta 82), reused 172 (delta 48), pack-reused 0 (from 0)[K
Receiving objects: 100% (345/345), 9.54 MiB | 17.73 MiB/s, done.
Resolving deltas: 100% (82/82), done.
/kaggle/working/LLaMA-Factory/LLaMA-Factory
Obtaining file:///kaggle/working/LLaMA-Factory/LLaMA-Factory
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: llamafactory
  Building editable for llamafactory (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llamafactory: filename=llamafactory-0.9.3.dev0-0.editable-py3-none-any.whl size=26055 sha256=cd292e5be2f4fb01decc89f85c0449951bb5

# Loading Dataset

Import packages

In [26]:
from datasets import load_dataset, concatenate_datasets, Value
import json

Load video ID mapping

In [27]:
# Read the JSON lookup table that format_data uses to turn a sample's "video"
# value into the file name of the corresponding .mp4.
with open("/kaggle/input/d/seanjeanmoey/next-qa-dataset/map_vid_vidorID.json") as mapping_file:
    video_dir_map = json.loads(mapping_file.read())

Format data

In [28]:
def format_data(sample):
    """Convert one question/answer record into a ShareGPT-style example.

    Args:
        sample: mapping providing ``question``, ``answer`` and ``video``.
            The ``video`` value is looked up in the notebook-level
            ``video_dir_map`` to obtain the name of the ``.mp4`` file.

    Returns:
        dict with two keys:
            ``conversations`` - a human turn (the question prefixed with the
            ``<video>`` placeholder) followed by a gpt turn (the answer);
            ``videos`` - a one-element list holding the video file path.

    Raises:
        KeyError: if a required field is missing from ``sample`` or the video
            key is absent from ``video_dir_map``.
    """
    human_turn = {"from": "human", "value": f"<video>{sample['question']}"}
    gpt_turn = {"from": "gpt", "value": f"{sample['answer']}"}
    video_path = f"/kaggle/input/d/seanjeanmoey/next-qa-dataset/NExTVideo/NExTVideo/{video_dir_map[sample['video']]}.mp4"

    return {
        "conversations": [human_turn, gpt_turn],
        "videos": [video_path],
    }

Format MCQ

In [29]:
def reformat_mcq(sample):
    """Rewrite a multiple-choice record as a plain question/answer record.

    The five options ``a0`` .. ``a4`` are rendered as lettered lines
    (``A. ...`` to ``E. ...``) and appended to the question together with an
    instruction to reply with a single letter. The integer ``answer`` index is
    replaced by the matching letter.

    Args:
        sample: mapping with ``video``, ``frame_count``, ``width``, ``height``,
            ``question``, ``answer`` (index into the options), ``qid``,
            ``type`` and the option fields ``a0`` .. ``a4``.

    Returns:
        dict carrying the metadata fields unchanged, the expanded ``question``,
        the letter ``answer`` and ``additional_ref_answer`` set to ``None``.
    """
    letters = list("ABCDE")

    # Collect every option first, then render them as "<letter>. <text>" lines.
    option_texts = []
    for position in range(len(letters)):
        option_texts.append(sample[f"a{position}"])
    option_lines = []
    for letter, text in zip(letters, option_texts):
        option_lines.append(f"{letter}. {text}")
    options_block = "\n".join(option_lines)

    instruction = "Select one best answer to the above multiple-choice question based on the video. Respond with only the letter (A, B, C, D or E) of the correct option."

    return {
        "video": sample["video"],
        "frame_count": sample["frame_count"],
        "width": sample["width"],
        "height": sample["height"],
        "question": f"{sample['question']}\n{options_block}\n{instruction}",
        "answer": letters[sample["answer"]],
        "qid": sample["qid"],
        "type": sample["type"],
        # NOTE(review): presumably present so the columns match the open-ended
        # subset before concatenation - confirm against that subset's schema.
        "additional_ref_answer": None,
    }

Load dataset

In [None]:
dataset_id = 'lmms-lab/NExTQA'

# --- Multiple-choice (MC) configuration -------------------------------------
# Only its 'test' split is loaded; each record is rewritten by reformat_mcq and
# the raw option columns a0..a4 are dropped.
option_columns = [f"a{i}" for i in range(5)]
mcq_dataset = load_dataset(dataset_id, 'MC')['test'].map(reformat_mcq, remove_columns=option_columns)

# Re-declare the column types: "video" becomes a string, the numeric metadata
# columns become int32.
# NOTE(review): presumably done so the schema matches the OE configuration for
# concatenate_datasets below - confirm.
new_features = mcq_dataset.features.copy()
new_features["video"] = Value("string")
for int_column in ("frame_count", "width", "height", "qid"):
    new_features[int_column] = Value("int32")
mcq_dataset = mcq_dataset.cast(new_features)

# Split: 70% train / 30% held out, then the held-out part is divided 1/3 : 2/3
# into eval and test (10% / 20% of the MC data overall).
train_test_split = mcq_dataset.train_test_split(test_size=0.3, seed=42)
mcq_train_dataset = train_test_split['train']
val_test_split = train_test_split['test'].train_test_split(test_size=2/3, seed=42)
mcq_eval_dataset, mcq_test_dataset = val_test_split['train'], val_test_split['test']

# --- Open-ended (OE) configuration: use the splits it already ships with -----
oe_splits = load_dataset(dataset_id, 'OE', split=['train', 'validation', 'test'])
oe_train_dataset, oe_eval_dataset, oe_test_dataset = oe_splits

# --- Merge both configurations per split -------------------------------------
train_dataset = concatenate_datasets([mcq_train_dataset, oe_train_dataset])
eval_dataset = concatenate_datasets([mcq_eval_dataset, oe_eval_dataset])
test_dataset = concatenate_datasets([mcq_test_dataset, oe_test_dataset])

# --- Convert every record to the conversation layout -------------------------
# From here on the three names hold plain Python lists of dicts.
train_dataset = list(map(format_data, train_dataset))
eval_dataset = list(map(format_data, eval_dataset))
test_dataset = list(map(format_data, test_dataset))

Create dataset info json

In [None]:
# Describe the "nextqa" dataset: which file holds it, its format, and which keys
# of each record carry the conversation and the video paths.
column_mapping = {
    "messages": "conversations",
    "videos": "videos",
}
args = {
    "nextqa": {
        "file_name": "nextqa.json",
        "formatting": "sharegpt",
        "columns": column_mapping,
    },
}

# Note: opening with "w" replaces any existing data/dataset_info.json.
with open("data/dataset_info.json", "w", encoding="utf-8") as info_file:
    info_file.write(json.dumps(args, ensure_ascii=False, indent=4))

Create dataset json

In [None]:
# Serialise the formatted training examples to the file named in dataset_info.json.
nextqa_path = "data/nextqa.json"
with open(nextqa_path, "w", encoding="utf-8") as dataset_file:
    json.dump(train_dataset, dataset_file, ensure_ascii=False, indent=4)

# Fine-tuning Model

Import packages

In [None]:
import json

Create fine-tuning config (`train.json`)

In [None]:
# Training options, grouped by concern. The groups are merged in this order so
# the keys in train.json appear in the same sequence as before.
model_options = {
    "model_name_or_path": "Qwen/Qwen2.5-VL-3B-Instruct",
    # "image_max_pixels": 262144,
    # "video_max_pixels": 16384,
    # "trust_remote_code": True,
}
method_options = {
    # "stage": "sft",
    "do_train": True,
    "finetuning_type": "lora",
    # "lora_rank": 8,
    "lora_target": "all",
}
data_options = {
    "dataset": "nextqa",
    "template": "qwen2_vl",
    # "cutoff_len": 2048,
    # "max_samples": 1000,
    # "overwrite_cache": True,
    # "preprocessing_num_workers": 1,
    # "dataloader_num_workers": 4,
}
output_options = {
    "output_dir": "qwen2.5vl_lora",
    # "logging_steps": 10,
    # "save_steps": 500,
    # "plot_loss": True,
    # "overwrite_output_dir": True,
    # "save_only_model": False,
}
optimisation_options = {
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "learning_rate": 1e-4,
    "num_train_epochs": 3.0,
    # "lr_scheduler_type": "cosine",
    # "warmup_ratio": 0.1,
    # "bf16": True,
    # "ddp_timeout": 180000000,
    # "resume_from_checkpoint": None,
}
# Evaluation options, currently disabled:
# "val_size": 0.1,
# "per_device_eval_batch_size": 1,
# "eval_strategy": "steps",
# "eval_steps": 500,

args = {
    **model_options,
    **method_options,
    **data_options,
    **output_options,
    **optimisation_options,
}

with open("train.json", "w", encoding="utf-8") as config_file:
    config_file.write(json.dumps(args, ensure_ascii=False, indent=4))

Train model

In [None]:
# Run LLaMA-Factory's `train` command with the options stored in train.json.
!llamafactory-cli train train.json

# Merging Fine-tuned Model

Import packages

In [None]:
import json

Create merging config (`merge.json`)

In [None]:
# Export options: the base model plus the LoRA adapter directory written by the
# training run ("qwen2.5vl_lora"), and where/how to write the merged model.
args = dict(
    model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct",
    adapter_name_or_path="qwen2.5vl_lora",
    template="qwen2_vl",
    finetuning_type="lora",
    # trust_remote_code=True,
    export_dir="qwen2.5vl_3b_instruct_lora_merged",
    # NOTE(review): presumably the maximum shard size of the exported
    # weights - confirm the unit in the LLaMA-Factory docs.
    export_size=5,
    export_device="cpu",
    # export_legacy_format=False,
)

with open("merge.json", "w", encoding="utf-8") as config_file:
    config_file.write(json.dumps(args, ensure_ascii=False, indent=4))

Merge model

In [None]:
# Run LLaMA-Factory's `export` command with the options stored in merge.json.
!llamafactory-cli export merge.json