# Setting up environment

Check CUDA version

In [None]:
# Print the NVIDIA driver/CUDA version and the GPUs visible to this session.
!nvidia-smi

Clone GitHub repo

In [None]:
# Shallow clone (--depth 1): only the most recent commit is fetched.
# NOTE(review): no tag or commit is pinned, so different runs may pick up
# different LLaMA-Factory revisions.
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git

Change directory

In [None]:
# Make the cloned repo the working directory; later cells write to paths
# relative to it (data/dataset_info.json, data/nextqa.json, train.json, merge.json).
%cd LLaMA-Factory

Install packages

In [None]:
!pip install -e ".[torch,metrics]"
!pip install deepspeed triton
!pip install flash-attn --no-build-isolation

# Loading Dataset

Import packages

In [None]:
from datasets import load_dataset, concatenate_datasets, Value, Dataset
import json

Load video mapping

In [None]:
# Load the lookup that format_data uses to turn a sample's "video" value into
# the path (without the .mp4 extension) of its clip inside the Kaggle dataset.
# The encoding is passed explicitly, like the other open() calls in this
# notebook, so the read does not depend on the platform's default encoding.
with open("/kaggle/input/d/seanjeanmoey/next-qa-dataset/map_vid_vidorID.json", encoding="utf-8") as file:
    video_dir_map = json.load(file)

Format data

In [None]:
def format_data(
    sample,
    video_map=None,
    video_root="/kaggle/input/d/seanjeanmoey/next-qa-dataset/NExTVideo/NExTVideo",
):
    """Convert one QA sample into a single-turn chat record with its video path.

    Args:
        sample: Mapping with "question", "answer" and "video" entries.
        video_map: Mapping from ``sample["video"]`` to the clip's path relative
            to ``video_root`` (without file extension). When omitted, the
            notebook-level ``video_dir_map`` is used.
        video_root: Directory that the mapped paths are relative to.

    Returns:
        A dict with two keys: "messages", holding a user turn ("<video>"
        followed by the question) and an assistant turn (the answer), and
        "videos", a one-element list with the clip's ``.mp4`` path.

    Raises:
        KeyError: If ``sample`` lacks a required key or its video value is not
            present in ``video_map``.
    """
    if video_map is None:
        # Looked up at call time, so the mapping may be loaded after this
        # function has been defined.
        video_map = video_dir_map

    video_path = f"{video_root}/{video_map[sample['video']]}.mp4"

    return {
        "messages": [
            {
                "content": f"<video>{sample['question']}",
                "role": "user"
            },
            {
                "content": f"{sample['answer']}",
                "role": "assistant"
            }
        ],
        "videos": [video_path]
    }

Format MCQ

In [None]:
def reformat_mcq(sample):
    """Fold the five answer options of a multiple-choice row into its question.

    The options stored under "a0".."a4" are listed as "A. ..." to "E. ..."
    beneath the question, followed by an instruction to reply with a single
    letter. The integer in ``sample["answer"]`` is translated to that letter.

    Returns:
        A dict with the keys "video", "frame_count", "width", "height",
        "question", "answer", "qid", "type" and "additional_ref_answer"
        (always ``None``).
    """
    letters = "ABCDE"
    instruction = "Select one best answer to the above multiple-choice question based on the video. Respond with only the letter (A, B, C, D or E) of the correct option."

    option_lines = []
    for idx, letter in enumerate(letters):
        option_key = f"a{idx}"
        option_lines.append(f"{letter}. {sample[option_key]}")
    options_block = "\n".join(option_lines)

    # Metadata columns are passed through unchanged.
    row = {key: sample[key] for key in ("video", "frame_count", "width", "height")}
    row["question"] = f"{sample['question']}\n{options_block}\n{instruction}"
    row["answer"] = letters[sample["answer"]]
    row["qid"] = sample["qid"]
    row["type"] = sample["type"]
    row["additional_ref_answer"] = None
    return row

Load dataset

In [None]:
dataset_id = 'lmms-lab/NExTQA'

# Multiple-choice data: take the 'test' split of the 'MC' config, fold the five
# options into the question via reformat_mcq, and drop the raw option columns.
mcq_dataset = load_dataset(dataset_id, 'MC')['test'].map(reformat_mcq, remove_columns=['a0', 'a1', 'a2', 'a3', 'a4'])
# Re-type selected columns (video -> string, the numeric ones -> int32).
# Presumably this aligns the MCQ schema with the 'OE' config so the two can be
# concatenated below -- TODO confirm against the OE features.
new_features = mcq_dataset.features.copy()
new_features["video"] = Value("string")
new_features["frame_count"] = Value("int32")
new_features["width"] = Value("int32")
new_features["height"] = Value("int32")
new_features["qid"] = Value("int32")
mcq_dataset = mcq_dataset.cast(new_features)
# Two seeded splits: 70% train, then the remaining 30% is divided 1/3 : 2/3,
# i.e. 10% eval and 20% test of the MCQ rows.
train_test_split = mcq_dataset.train_test_split(test_size=0.3, seed=42)
val_test_split = train_test_split['test'].train_test_split(test_size=2/3, seed=42)
mcq_train_dataset = train_test_split['train']
mcq_eval_dataset = val_test_split['train']
mcq_test_dataset = val_test_split['test']

# Open-ended data: the 'OE' config is loaded with its own train/validation/test splits.
oe_train_dataset, oe_eval_dataset, oe_test_dataset = load_dataset(dataset_id, 'OE', split=['train', 'validation', 'test'])

# Combine MCQ and open-ended rows split by split.
train_dataset = concatenate_datasets([mcq_train_dataset, oe_train_dataset])
eval_dataset = concatenate_datasets([mcq_eval_dataset, oe_eval_dataset])
test_dataset = concatenate_datasets([mcq_test_dataset, oe_test_dataset])

# From here on the three names hold plain Python lists of chat records
# (the output of format_data), no longer `datasets.Dataset` objects.
train_dataset = [format_data(sample) for sample in train_dataset]
eval_dataset = [format_data(sample) for sample in eval_dataset]
test_dataset = [format_data(sample) for sample in test_dataset]

# NOTE(review): `dataset` contains the eval and test records as well as the
# training ones, and it is what gets written to data/nextqa.json for
# fine-tuning, so nothing is held out -- confirm this is intended.
dataset = train_dataset + eval_dataset + test_dataset

Create dataset info json

In [None]:
# Registry entry for the "nextqa" dataset: where its JSON file lives and which
# keys hold the chat messages and the video paths.
args = {
    "nextqa": {
        "file_name": "/kaggle/working/LLaMA-Factory/data/nextqa.json",
        "formatting": "sharegpt",
        # Names of the columns in each record.
        "columns": {"messages": "messages", "videos": "videos"},
        # Keys and role values used inside each message.
        "tags": {
            "role_tag": "role",
            "content_tag": "content",
            "user_tag": "user",
            "assistant_tag": "assistant",
        },
    },
}

# NOTE(review): mode "w" replaces any data/dataset_info.json that already exists.
with open("data/dataset_info.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(args, indent=4, ensure_ascii=False))

Create dataset json

In [None]:
# Write the combined list of chat records (`dataset`) as one JSON array.
# The path is relative to the current working directory; it is presumably the
# same file that dataset_info.json points to by absolute path -- verify that
# the working directory is /kaggle/working/LLaMA-Factory.
with open("data/nextqa.json", "w", encoding="utf-8") as f: 
    json.dump(dataset, f, ensure_ascii=False, indent=4)

# Fine-tuning Model

Import packages

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient

Login to wandb

In [None]:
# Log in to Weights & Biases with the key read from Kaggle's secret store
# (secret name "WANDB_API_KEY"), so the key itself is not written in the notebook.
wandb.login(key=UserSecretsClient().get_secret("WANDB_API_KEY"))

Create train.json

In [None]:
args = {
    "model_name_or_path": "Qwen/Qwen2.5-VL-3B-Instruct",
    "image_max_pixels": 262144,
    "video_max_pixels": 4096,
    "trust_remote_code": True,
    "stage": "sft",
    "do_train": True,
    "finetuning_type": "lora",
    "flash_attn": "auto",
    "lora_rank": 8,
    "lora_alpha": 16,
    "lora_dropout": 0,
    "lora_target": "all",
    "dataset_dir": "/kaggle/working/LLaMA-Factory/data",
    "dataset": "nextqa",
    "template": "qwen2_vl",
    "overwrite_cache": True,
    # "cutoff_len": 2048,
    # "max_samples": 128,
    # "preprocessing_num_workers": 16,
    "dataloader_num_workers": 1,
    "output_dir": "/kaggle/working/finetuned",
    "logging_steps": 5,
    "save_steps": 100,
    "plot_loss": True,
    "overwrite_output_dir": True,
    "save_only_model": False,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "learning_rate": 5.0e-5,
    "num_train_epochs": 3.0,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.1,
    "bf16": True,
    "ddp_timeout": 180000000,
    "resume_from_checkpoint": None,
    "max_grad_norm": 1,
    "warmup_steps": 0,
    "packing": False,
    "report_to": None,
    "optim": "adamw_torch",
    "streaming": True,
    "max_steps": 1000,
    "buffer_size": 128,
    "preprocessing_batch_size": 128,
    "accelerator_config": {
        "dispatch_batches": False,
    },
}
with open("train.json", "w", encoding="utf-8") as f: 
    json.dump(args, f, ensure_ascii=False, indent=4)

Train model

In [None]:
# Start LLaMA-Factory training with the settings written to train.json.
!llamafactory-cli train train.json

# Merging Fine-tuned Model

Import packages

In [None]:
import json

Create merge config (merge.json)

In [None]:
# Settings for `llamafactory-cli export`; serialised to merge.json below.
args = {
    # Base model plus the adapter directory to combine with it.
    "model_name_or_path": "Qwen/Qwen2.5-VL-3B-Instruct",
    "adapter_name_or_path": "/kaggle/working/finetuned",
    "template": "qwen2_vl",
    "finetuning_type": "lora",
    "trust_remote_code": True,
    # Export target.
    "export_dir": "/kaggle/working/merged",
    "export_size": 5,
    "export_device": "cpu",
    "export_legacy_format": False,
}

with open("merge.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(args, indent=4, ensure_ascii=False))

Merge model

In [None]:
# Run LLaMA-Factory's export command with the settings written to merge.json.
!llamafactory-cli export merge.json

Zip the output

In [None]:
# Switch to /kaggle/working, where the "finetuned" and "merged" directories were written.
%cd /kaggle/working
# 7z: `a` adds to an archive, `-r` recurses into subdirectories.
!7z a -r finetuned.zip finetuned
!7z a -r merged.zip merged