# Setting up environment

Check the GPU and CUDA version

In [None]:
# Show the GPUs visible to this session, along with driver and CUDA version info.
!nvidia-smi

Change CUDA memory config

In [None]:
!export 'PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True'

Install packages

In [None]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
%pip install --upgrade datasets transformers bitsandbytes ffmpeg-python
%pip install --upgrade qwen-vl-utils[decord]

# Using Qwen 2.5-VL one time with better prompt

Import packages

In [None]:
import time
import csv
import torch
import re
import os
import ffmpeg
from datasets import load_dataset
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info

Create tmp folder

In [None]:
# Create the scratch folder for trimmed clips. exist_ok=True keeps this cell
# idempotent, so re-running it does not fail when the folder already exists.
os.makedirs("/kaggle/tmp", exist_ok=True)

Load dataset and model

In [None]:
# Single source of truth for the checkpoint, so the model weights and the
# processor can never be loaded from two different repositories by accident.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

dataset = load_dataset("lmms-lab/AISG_Challenge")

# Load the weights quantized to 8 bits via bitsandbytes and place them on the first GPU.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="cuda:0",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

Resolve the local video path (videos are read from `../input/videos`; nothing is downloaded)

In [None]:
def retrieve_video(video_id):
    """Return the relative path of the MP4 file that belongs to ``video_id``.

    The path is only assembled as a string: nothing is downloaded and the
    file's existence is not checked.
    """
    return "../input/videos/" + f"{video_id}.mp4"

Extract the timestamp range of the key frames

In [None]:
def _parse_timestamp_range(output_text):
    """Parse the first ``[mm:ss-mm:ss]`` range found in ``output_text``.

    Whitespace around the dash is tolerated. Returns ``(start, end)`` in
    seconds with ``start <= end``; a reversed range is swapped.

    Raises:
        ValueError: if the text contains no range in the expected format.
    """
    match = re.search(
        r"\[(\d{1,2}):(\d{2})\s*-\s*(\d{1,2}):(\d{2})\]", output_text
    )
    if match is None:
        # Without this guard the failure would surface as an opaque
        # AttributeError on `None.groups()`.
        raise ValueError(
            f"No [mm:ss-mm:ss] timestamp found in model output: {output_text!r}"
        )

    m1, s1, m2, s2 = map(int, match.groups())
    start = m1 * 60 + s1
    end = m2 * 60 + s2
    if end < start:
        # The model occasionally may emit the bounds in reverse order; a
        # negative duration is never meaningful, so normalise it here.
        start, end = end, start
    return start, end


def extract_timestamp(question, video_path):
    """Ask the model which part of the video is relevant to ``question``.

    The video is passed with a small pixel budget (144 * 256) at 1 fps, and
    the model is instructed to answer only with a ``[mm:ss-mm:ss]`` range.

    Args:
        question: The question text used to locate the relevant segment.
        video_path: Path of the video file handed to the vision pipeline.

    Returns:
        A ``(start, end)`` tuple in seconds, with ``start <= end``.

    Raises:
        ValueError: if the model output contains no ``[mm:ss-mm:ss]`` range.
    """
    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": video_path,
                    "max_pixels": 144 * 256,
                    "fps": 1,
                },
                {"type": "text", "text": f"{question}\nOnly return the timestamp relevant to the question in the format [mm:ss-mm:ss]."},
            ],
        }
    ]

    text = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs, video_kwargs = process_vision_info(conversation, return_video_kwargs=True)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=1280)
    # Strip the prompt tokens so that only the newly generated answer is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return _parse_timestamp_range(output_text)

Process sample

In [None]:
def process_test_case(example):
    """Answer one dataset sample using only the relevant part of its video.

    The relevant time range is obtained from ``extract_timestamp``, the video
    is trimmed to that range with ffmpeg into ``/kaggle/tmp``, and the model
    is then asked the full question on the trimmed clip. If the range has a
    non-positive duration, the untrimmed video is used instead.

    Args:
        example: A mapping with the keys ``video_id``, ``question``,
            ``question_prompt``, ``youtube_url`` and ``qid``.

    Returns:
        A ``(qid, answer_text)`` tuple.
    """
    video_id = example["video_id"]
    question = example["question"]
    question_prompt = example["question_prompt"]

    video_path = retrieve_video(video_id)

    prompt = f"{question}\n{question_prompt}\nAnswer in English only."

    start, end = extract_timestamp(question, video_path)
    duration = end - start
    if duration > 0:
        clip_path = os.path.join("/kaggle/tmp", os.path.basename(video_path))
        (
            ffmpeg
            .input(video_path, ss=start, t=duration)
            .output(clip_path)
            .run(overwrite_output=True)
        )
    else:
        # A zero or negative duration cannot be cut into a usable clip, so
        # answer on the whole video rather than failing the sample.
        clip_path = video_path

    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": clip_path,
                    "max_pixels": 720*1280,
                    "fps": 1,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs, video_kwargs = process_vision_info(conversation, return_video_kwargs=True)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=1280)
    # Strip the prompt tokens so that only the newly generated answer is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    print(f"Video URL: {example['youtube_url']}")
    print(f"Question:\n{question}\n{question_prompt}")
    print(f"Answer: {output_text}")

    return example['qid'], output_text

Run test cases

In [None]:
# Set to a qid string to run a single sample, or to None to run the whole test split.
TARGET_QID = '0957-0'

start_time = time.time()
output_file = "./results.csv"

test_split = dataset['test']
if TARGET_QID is None:
    samples = test_split
else:
    matches = test_split.filter(lambda x: x['qid'] == TARGET_QID)
    if len(matches) == 0:
        # Fail with a clear message instead of an IndexError on `[0]`.
        raise ValueError(f"qid {TARGET_QID!r} not found in the test split")
    samples = [matches[0]]

# Explicit UTF-8: the predictions are free-form model text and may contain
# characters outside the platform's default encoding.
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["qid", "pred"])
    for sample in samples:
        print(f"processing qid {sample['qid']}")
        qid, pred = process_test_case(sample)
        writer.writerow([qid, pred])
        # Flush per row so finished predictions survive a crash on a later sample.
        file.flush()

end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")