# Setting up environment

Check the CUDA version and the GPUs available to this session

In [None]:
# Print the NVIDIA driver / CUDA version and the GPUs visible to this session.
!nvidia-smi

Change CUDA memory config

In [None]:
import os

# `!export ...` runs in a throwaway subshell, so the variable never reaches the
# kernel process. Setting it on os.environ makes it visible to PyTorch; this
# cell must run before torch is imported / CUDA is initialized.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

Install packages

In [None]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
%pip install --upgrade datasets transformers bitsandbytes
%pip install --upgrade qwen-vl-utils[decord]

# Single-pass Qwen2.5-VL inference with an improved prompt

Import packages

In [2]:
import time
import csv
import torch
from datasets import load_dataset
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
from qwen_vl_utils import process_vision_info
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

Login to HuggingFace

In [3]:
# Read the Hugging Face token stored under the Kaggle secret named
# "HuggingFace" and authenticate with it; the token itself is never bound to
# a notebook variable.
secrets_client = UserSecretsClient()
login(token=secrets_client.get_secret("HuggingFace"))

Load dataset and model

In [4]:
# Hub identifiers, named once so each model and its processor/tokenizer
# are guaranteed to come from the same checkpoint.
QWEN_MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
MISTRAL_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

# Benchmark questions (the run cell reads the "test" split).
dataset = load_dataset("lmms-lab/AISG_Challenge")

# Vision-language model that answers the video questions: 8-bit, on GPU 0.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    QWEN_MODEL_ID,
    device_map="cuda:0",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
processor = AutoProcessor.from_pretrained(QWEN_MODEL_ID)

# Text-only model used to rewrite the question: 8-bit, on GPU 1.
mistral_model = AutoModelForCausalLM.from_pretrained(
    MISTRAL_MODEL_ID,
    device_map="cuda:1",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
mistral_tokenizer = AutoTokenizer.from_pretrained(MISTRAL_MODEL_ID)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Video path helper (videos are read from the local input directory; nothing is downloaded)

In [5]:
def retrieve_video(video_id):
    """Return the path where the MP4 for ``video_id`` is expected to live.

    This only builds the string ``../input/videos/<video_id>.mp4``; it does
    not download anything and does not check that the file exists.
    """
    return f"../input/videos/{video_id}.mp4"

Improve prompt

In [28]:
def improve_prompt(question):
    """Prefix ``question`` with a 5W1H-style rewrite produced by Mistral.

    Only the first line of ``question`` is sent to the model; the full,
    unmodified ``question`` (all lines) is appended after the rewrite.

    Args:
        question: Question text. Anything after the first newline is kept in
            the returned prompt but is not shown to the rewriting model.

    Returns:
        ``"<rewritten question> <question>"``, or ``question`` unchanged when
        the model produces no text.
    """
    actual_question = question.split('\n')[0]
    prompt = f"Generalize the question using only one of the most appropriate 5W1H formats (Who, What, When, Where, Why, How).\nOriginal question: {actual_question}\nRewritten question:"
    # NOTE(review): the prompt is sent as raw text, not through the tokenizer's
    # chat template — confirm this is intentional for the Instruct checkpoint.
    model_inputs = mistral_tokenizer([prompt], return_tensors="pt").to(mistral_model.device)
    generated_ids = mistral_model.generate(
        **model_inputs,
        max_new_tokens=100,
        # Passed explicitly to avoid the "Setting `pad_token_id` to
        # `eos_token_id`" warning that generate() otherwise logs on each call.
        pad_token_id=mistral_tokenizer.eos_token_id,
    )
    # generate() returns prompt + continuation; keep only the continuation.
    continuation_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    rewritten = mistral_tokenizer.batch_decode(
        continuation_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0].strip()  # drop the leading space / trailing newlines around the rewrite
    if not rewritten:
        return question
    return rewritten + " " + question

Process sample

In [26]:
def process_test_case(example, max_pixels=240 * 426, fps=1, max_new_tokens=1280):
    """Answer one benchmark question about a video with Qwen2.5-VL.

    Builds the prompt from ``improve_prompt(question)``, the sample's
    ``question_prompt`` and an "Answer in English only." instruction, runs the
    video and prompt through ``model``, and prints the prompt, video URL,
    question and answer.

    Args:
        example: Mapping with the keys ``video_id``, ``question``,
            ``question_prompt``, ``youtube_url`` and ``qid``. A ground-truth
            answer is not required.
        max_pixels: ``max_pixels`` value placed in the video entry of the
            conversation.
        fps: ``fps`` value placed in the video entry of the conversation.
        max_new_tokens: Generation length limit passed to ``model.generate``.

    Returns:
        Tuple ``(qid, answer_text)``.
    """
    video_id = example["video_id"]
    question = example["question"]
    question_prompt = example["question_prompt"]
    video_path = retrieve_video(video_id)

    prompt = f"{improve_prompt(question)}\n{question_prompt}\nAnswer in English only."

    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": video_path,
                    "max_pixels": max_pixels,
                    "fps": fps,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Render the chat-formatted text and extract the vision inputs separately;
    # both are then combined by the processor into model tensors.
    text = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs, video_kwargs = process_vision_info(conversation, return_video_kwargs=True)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + answer tokens; strip the prompt part.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(f"Prompt:\n{prompt}")
    print(f"Video URL: {example['youtube_url']}")
    print(f"Question:\n{question}\n{question_prompt}")
    print(f"Answer: {output_text}")
    return example['qid'], output_text

Run test cases

In [29]:
# Set RUN_QID to a single qid for a quick debugging run, or to None to
# process the whole test split.
RUN_QID = "0008-3"
output_file = "./results.csv"

start_time = time.perf_counter()  # monotonic clock, suited to elapsed time

if RUN_QID is None:
    samples = dataset["test"]
else:
    samples = dataset["test"].filter(lambda x: x["qid"] == RUN_QID)
    if len(samples) == 0:
        raise ValueError(f"qid {RUN_QID!r} not found in the test split")

# Explicit UTF-8 so non-ASCII characters in a prediction cannot break the write.
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["qid", "pred"])
    for sample in samples:
        if RUN_QID is None:
            print(f"processing qid {sample['qid']}")
        qid, pred = process_test_case(sample)
        writer.writerow([qid, pred])
        # Flush per row so partial results survive a crash in a long run.
        file.flush()

end_time = time.perf_counter()
print(f"Time taken: {end_time - start_time} seconds")

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prompt: How did the last person in the video open the bottle? Did the last person in the video open the bottle with a knife while the first two people failed in their attempts?
Please state your answer with a brief explanation.
Answer in English only.
Video URL: https://www.youtube.com/shorts/sj81PWrerDk
Question:
Did the last person in the video open the bottle with a knife while the first two people failed in their attempts?
Please state your answer with a brief explanation.
Answer: No, the last person in the video successfully opened the Coca-Cola bottle using a knife. The first two individuals in the video were shown attempting to open bottles but did not use a knife; instead, they seemed to be struggling or failing in their methods.
Time taken: 11.261261940002441 seconds
