# Setting up environment

Check the CUDA version and the available GPUs

In [1]:
# Print the NVIDIA driver / CUDA version and the GPUs visible to this session.
!nvidia-smi

Wed Apr  9 07:55:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

Change CUDA memory config

In [2]:
import os

# `!export ...` runs in a throwaway subshell, so the variable would never reach
# this kernel or any later `!` command. Setting it on os.environ makes it
# visible to the kernel process (this cell runs before torch is imported) and
# to every child process started from the notebook.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

Clone GitHub repo

In [3]:
# Shallow clone (--depth 1): only the latest commit of LLaMA-Factory is fetched.
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git

Cloning into 'LLaMA-Factory'...
remote: Enumerating objects: 353, done.[K
remote: Counting objects: 100% (353/353), done.[K
remote: Compressing objects: 100% (277/277), done.[K
remote: Total 353 (delta 94), reused 193 (delta 61), pack-reused 0 (from 0)[K
Receiving objects: 100% (353/353), 9.74 MiB | 27.86 MiB/s, done.
Resolving deltas: 100% (94/94), done.


Change directory

In [4]:
# Work from the repository root so the editable install (`pip install -e ".[...]"`)
# in the next cell resolves `.` to the cloned checkout.
%cd LLaMA-Factory

/kaggle/working/LLaMA-Factory


Install packages

In [5]:
# PyTorch wheels built for CUDA 12.6, matching the CUDA version reported by nvidia-smi.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
%pip install --upgrade datasets transformers bitsandbytes
# Quote the extras spec so the shell can never treat the brackets as a glob
# pattern (consistent with the quoted spec on the editable install below).
%pip install --upgrade "qwen-vl-utils[decord]"
# Editable install of the cloned LLaMA-Factory checkout; requires the current
# directory to be the repository root.
%pip install -e ".[torch,metrics]"

Looking in indexes: https://download.pytorch.org/whl/cu126
Note: you may need to restart the kernel to use updated packages.
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.51.1-py3-none-any.whl.metadata (38 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading transformers-4.51.1-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)

# Merging Fine-tuned Model

Import packages

In [6]:
import json

Create the merge configuration file (`merge.json`)

In [7]:
# Export settings for the merge step; they are serialised to merge.json, which
# is the file handed to `llamafactory-cli export` in the next code cell.
args = dict(
    model_name_or_path="Qwen/Qwen2.5-VL-7B-Instruct",
    adapter_name_or_path="/kaggle/input/finetuned/other/default/1/finetuned/checkpoint-100",
    template="qwen2_vl",
    finetuning_type="lora",
    trust_remote_code=True,
    export_dir="/kaggle/working/merged/merged-100",
    export_size=5,
    export_device="cpu",
    export_legacy_format=False,
)

# Serialise first, then write the text in one go (4-space indent, non-ASCII kept as-is).
with open("merge.json", "w", encoding="utf-8") as config_file:
    config_file.write(json.dumps(args, ensure_ascii=False, indent=4))

Merge model

In [8]:
# Run LLaMA-Factory's export command with the settings written to merge.json above.
# Presumably this merges the LoRA adapter into the base model and writes the
# result to the configured `export_dir` — confirm against the LLaMA-Factory docs.
!llamafactory-cli export merge.json

2025-04-09 07:56:32.700603: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-09 07:56:33.040477: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-09 07:56:33.141314: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
tokenizer_config.json: 100%|███████████████| 5.70k/5.70k [00:00<00:00, 36.3MB/s]
vocab.json: 100%|██████████████████████████| 2.78M/2.78M [00:00<00:00, 11.8MB/s]
merges.txt: 100%|██████████████████████████| 1.67M/1.67M [00:00<00:00, 32.2MB/s]
tokenizer.json: 100%|██████████████████████| 7.03M/7.03M [00:00<00:00, 22.6MB/s]
[INFO|tokenization_utils_base.py:2060] 20

# Using fine-tuned Qwen 2.5-VL one time

Change directory

In [9]:
# Return to /kaggle/working: the relative "../input/..." video path built by
# retrieve_video() and the "./results.csv" output are resolved against it.
%cd /kaggle/working

/kaggle/working


Import packages

In [10]:
import time
import csv
import torch
from datasets import load_dataset
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info

Load dataset and model

In [11]:
# Directory holding the merged checkpoint (the `export_dir` used by the merge step).
merged_model_dir = "/kaggle/working/merged/merged-100"

dataset = load_dataset("lmms-lab/AISG_Challenge")

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the merged weights with 8-bit bitsandbytes quantisation, strictly from local files.
eight_bit_config = BitsAndBytesConfig(load_in_8bit=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    merged_model_dir,
    quantization_config=eight_bit_config,
    device_map=device,
    local_files_only=True,
)
processor = AutoProcessor.from_pretrained(merged_model_dir, local_files_only=True)

README.md:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/94.1k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Resolve the local video path (videos are read from the attached input dataset, not downloaded)

In [12]:
def retrieve_video(video_id):
    """Return the relative path of the local .mp4 file for `video_id`.

    The path is only constructed, not checked: nothing is downloaded and the
    file's existence is not verified. It is relative, so it is resolved
    against the current working directory by whoever opens it.
    """
    return f"../input/videos/videos/{video_id}.mp4"

Process sample

In [13]:
def process_test_case(example, max_pixels=240 * 426, fps=1, max_new_tokens=1280):
    """Generate the model's answer for one dataset example.

    Builds a single-turn user message containing the example's video and its
    question text, runs generation with the notebook-level `model` and
    `processor`, and decodes only the newly generated tokens.

    Args:
        example: Mapping providing the "video_id", "question",
            "question_prompt" and "qid" keys.
        max_pixels: Value placed under "max_pixels" in the video message entry.
        fps: Value placed under "fps" in the video message entry.
        max_new_tokens: Generation length limit passed to `model.generate`.

    Returns:
        A `(qid, output_text)` tuple: the example's "qid" and the decoded answer.
    """
    question = example["question"]
    question_prompt = example["question_prompt"]
    video_path = retrieve_video(example["video_id"])

    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": video_path,
                    "max_pixels": max_pixels,
                    "fps": fps,
                },
                {"type": "text", "text": f"{question}\n{question_prompt}\nAnswer in English only."},
            ],
        }
    ]

    # Render the chat template as plain text; tokenisation happens in processor() below.
    text = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs, video_kwargs = process_vision_info(conversation, return_video_kwargs=True)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Drop the leading len(in_ids) positions of each output sequence so that
    # only the tokens after the prompt are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return example["qid"], output_text

Run test cases

In [19]:
start_time = time.time()
output_file = "./results.csv"

# Explicit UTF-8: predictions are free-form model text and may contain
# non-ASCII characters, which would fail to encode under a non-UTF-8 locale default.
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["qid", "pred"])
    for sample in dataset['test']:
        print(f"processing qid {sample['qid']}")
        qid, pred = process_test_case(sample)
        writer.writerow([qid, pred])
        # Push each finished row to disk so completed predictions survive an
        # interruption of this long-running loop.
        file.flush()

end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Video URL: https://www.youtube.com/shorts/DtZdE1P45yQ
Question:
Did the substance in the beaker end up on the rod by being pulled out and wrapped around it?
Please state your answer with a brief explanation.
Answer: Yes, the substance was pulled out of the beaker and wrapped around the rod.
Time taken: 31.61251711845398 seconds
