In [None]:
import os
import json
import torch
import transformers
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from transformers import AutoConfig, AutoTokenizer
from qwen_vl_utils import process_vision_info

from PIL import Image
from qwen_vl_utils import smart_resize

In [None]:
# Seed the random number generators through transformers' helper before any
# model code runs.
seed = 42
transformers.set_seed(seed=seed)

In [None]:
# Checkpoint to evaluate and the length limit (in tokens) applied to the
# config and the tokenizer below.
model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
max_length = 65536

# Start from the checkpoint's own configuration and set each length-related
# field to `max_length` before the weights are loaded.
config = AutoConfig.from_pretrained(model_path)
for length_field in ("sliding_window", "max_position_embeddings", "model_max_length"):
    setattr(config, length_field, max_length)

# Load the model in bfloat16 with FlashAttention-2, using the modified config;
# device_map="auto" leaves device placement to the library.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    config=config,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# The tokenizer carries its own length limit, so it is updated separately.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.model_max_length = max_length

# Swap the processor's tokenizer for the one whose limit was just changed.
processor = AutoProcessor.from_pretrained(model_path)
processor.tokenizer = tokenizer

In [None]:
# Load the evaluation examples (iterated below as a sequence of dicts with keys
# such as "duration" and "videoID"). The file is decoded as UTF-8 explicitly so
# parsing does not depend on the platform's default locale encoding.
with open("eval_videomme.json", "r", encoding="utf-8") as f:
    eval_videomme = json.load(f)

In [None]:
# Number of subtitle entries every long video must end up with.
NUM_SUBTITLE_LINES = 768

# Map each long video's id to its list of subtitle lines, where blank lines are
# replaced by the placeholder "N/A".
subtitles_dict = {}
for example in eval_videomme:
    if example["duration"] != "long":
        continue
    video_id = example["videoID"]
    if video_id in subtitles_dict:
        # Several questions share one video; read its subtitles only once.
        continue
    subtitle_path = f"video_mme_long/{video_id}/subtitles.txt"
    # Decode as UTF-8 explicitly so the result does not depend on the locale.
    with open(subtitle_path, "r", encoding="utf-8") as f:
        lines = [line.strip() or "N/A" for line in f]
    if len(lines) == NUM_SUBTITLE_LINES - 1:
        # NOTE(review): presumably a final blank entry that was written without
        # a trailing newline -- confirm against the subtitle export.
        lines.append("N/A")
    elif not lines:
        # An empty file means the video has no subtitles at all.
        lines = ["N/A"] * NUM_SUBTITLE_LINES
    assert len(lines) == NUM_SUBTITLE_LINES, (
        f"{subtitle_path}: expected {NUM_SUBTITLE_LINES} subtitle lines, got {len(lines)}"
    )
    subtitles_dict[video_id] = lines

print(len(subtitles_dict))

In [None]:
def get_all_file_paths(directory_path):
    """Return the paths of all entries in ``directory_path`` in natural order.

    ``os.listdir`` yields names in arbitrary order, which would scramble a
    sequence of video frames. Names are therefore sorted with a natural key,
    so ``frame_2`` precedes ``frame_10`` whether or not the numbers are
    zero-padded.

    Args:
        directory_path: Directory whose entries should be listed.

    Returns:
        A list of ``os.path.join(directory_path, name)`` for every entry,
        sorted by name; empty if the directory is empty.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. the directory is missing).
    """
    import re  # local import: only this helper needs it

    def natural_key(filename):
        # Splitting on a captured digit run alternates text (even index) and
        # digits (odd index), so ints are only ever compared with ints.
        parts = re.split(r"(\d+)", filename)
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    filenames = sorted(os.listdir(directory_path), key=natural_key)
    return [os.path.join(directory_path, filename) for filename in filenames]

def get_text_prompt(question, options):
    """Build the multiple-choice prompt without subtitles.

    Args:
        question: The question text.
        options: Iterable of option strings; they are placed one per line.

    Returns:
        The instruction line, the question, the options and the closing
        "The best answer is:" cue, separated by newlines.
    """
    instruction = (
        "Select the best answer to the following multiple-choice question based on the video. "
        "Respond with only the letter (A, B, C, or D) of the correct option."
    )
    choices_block = "\n".join(options)
    return "\n".join([instruction, f"{question}", choices_block, "The best answer is:"])

def get_text_prompt_with_subtitles(question, options, subtitles):
    """Build the multiple-choice prompt, preceded by the video's subtitles.

    Args:
        question: The question text.
        options: Iterable of option strings; they are placed one per line.
        subtitles: Subtitle text inserted after the subtitle header line.

    Returns:
        The subtitle header, the subtitles, the instruction line, the
        question, the options and the closing "The best answer is:" cue,
        separated by newlines.
    """
    instruction = (
        "Select the best answer to the following multiple-choice question based on the video. "
        "Respond with only the letter (A, B, C, or D) of the correct option."
    )
    choices_block = "\n".join(options)
    sections = [
        "This video's subtitles are listed below:",
        f"{subtitles}",
        instruction,
        f"{question}",
        choices_block,
        "The best answer is:",
    ]
    return "\n".join(sections)

In [None]:
# Pixel and frame limits for vision inputs. Every pixel budget below is a
# multiple of one IMAGE_FACTOR x IMAGE_FACTOR (28 x 28) tile.
# NOTE(review): these appear to mirror defaults from qwen_vl_utils -- confirm.
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * IMAGE_FACTOR ** 2
MAX_PIXELS = 16384 * IMAGE_FACTOR ** 2
MAX_RATIO = 200

# Per-frame pixel bounds for video inputs, plus frame-sampling settings.
VIDEO_MIN_PIXELS = 128 * IMAGE_FACTOR ** 2
VIDEO_MAX_PIXELS = 768 * IMAGE_FACTOR ** 2
FRAME_FACTOR = 2
FPS = 2.0
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768

# Set the maximum number of video token inputs.
# Here, 128K represents the maximum number of input tokens for the VLLM model.
# Remember to adjust it according to your own configuration.
# The override is read from the environment variable named 'VIDEO_MAX_PIXELS',
# even though the value is stored in VIDEO_TOTAL_PIXELS.
VIDEO_TOTAL_PIXELS = int(
    float(
        os.environ.get("VIDEO_MAX_PIXELS", 128000 * IMAGE_FACTOR ** 2 * 0.9)
    )
)

### Using full 768 frames

In [None]:
# Evaluate every "long" Video-MME question using all frames of its video plus
# the video's full subtitle text. Answers are grouped into one record per video
# and the results file is rewritten each time a video is completed.
output_file = "videomme_long_qwen_2_5_vl_7B_with_subtitles_results.json"

results = []
result_video = None  # record of the video currently being processed

# The per-frame pixel bounds are identical for every question, so they are
# computed once instead of on every iteration.
min_pixels = VIDEO_MIN_PIXELS
# NOTE(review): 65536 equals the max_length configured when the model was
# loaded; the 0.8 factor presumably leaves room for text tokens -- confirm.
total_pixels = 65536 * 28 * 28 * 0.8
nframes = 768
max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))

for example in eval_videomme:
    if example["duration"] != "long":
        continue
    video_id = example["videoID"]
    # A change of video id closes the current record. This grouping relies on
    # all questions of one video being adjacent in eval_videomme.
    if result_video is None or video_id != result_video["video_id"]:
        if result_video is not None:
            results.append(result_video)
            # Checkpoint finished videos so an interrupted run keeps them.
            with open(output_file, "w") as f:
                json.dump(results, f, indent=4)
        result_video = {
            "video_id": video_id,
            "duration": example["duration"],
            "domain": example["domain"],
            "sub_category": example["sub_category"],
            "questions": [],
        }
    qa = {
        "question_id": example["question_id"],
        "task_type": example["task_type"],
        "question": example["question"],
        "options": example["options"],
        "answer": example["answer"],
    }
    # Subtitle lines were stripped when loaded, so they are joined with a space;
    # plain concatenation would glue the last word of one line to the first
    # word of the next. Placeholder lines are dropped.
    subtitles = " ".join(line for line in subtitles_dict[video_id] if line != "N/A") or "N/A"
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": get_all_file_paths(f"video_mme_long/{video_id}/frames"),
                    "min_pixels": min_pixels,
                    "max_pixels": max_pixels,
                },
                {
                    "type": "text",
                    # Use get_text_prompt(question, options) here instead to
                    # evaluate without subtitles.
                    "text": get_text_prompt_with_subtitles(example["question"], example["options"], subtitles),
                },
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    inputs = inputs.to("cuda")
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so that only the newly generated answer is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    qa["response"] = output_text[0]
    result_video["questions"].append(qa)

# Append the last video after the loop. Doing this inside the loop on the final
# dataset index would lose the record whenever that final entry is not "long".
if result_video is not None:
    results.append(result_video)

with open(output_file, "w") as f:
    json.dump(results, f, indent=4)

### Using CLaMR selected frames

In [None]:
# Evaluate every "long" Video-MME question using only the frames and subtitle
# lines stored for that question in the CLaMR selection file. Answers are
# grouped into one record per video and the results file is rewritten each time
# a video is completed.
# The selection file is indexed below as [video_id][question_id] and provides
# "subtitles" and "selected_frame_paths" for each question. It is decoded as
# UTF-8 explicitly so parsing does not depend on the platform locale.
with open("videomme_long_CLaMR_selected_results.json", "r", encoding="utf-8") as f:
    videomme_long_clamr_selected = json.load(f)

output_file = "videomme_long_qwen_2_5_vl_7B_clamr_video_asr_similarity_nframes100_with_subtitles_results.json"

results = []
result_video = None  # record of the video currently being processed

# The per-frame pixel bounds are identical for every question, so they are
# computed once instead of on every iteration.
min_pixels = VIDEO_MIN_PIXELS
# NOTE(review): 65536 equals the max_length configured when the model was
# loaded; the 0.8 factor presumably leaves room for text tokens -- confirm.
total_pixels = 65536 * 28 * 28 * 0.8
# NOTE(review): the budget is divided over 768 frames although the output file
# name says nframes100 -- confirm that keeping 768 here is intentional.
nframes = 768
max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))

for example in eval_videomme:
    if example["duration"] != "long":
        continue
    video_id = example["videoID"]
    # A change of video id closes the current record. This grouping relies on
    # all questions of one video being adjacent in eval_videomme.
    if result_video is None or video_id != result_video["video_id"]:
        if result_video is not None:
            results.append(result_video)
            # Checkpoint finished videos so an interrupted run keeps them.
            with open(output_file, "w") as f:
                json.dump(results, f, indent=4)
        result_video = {
            "video_id": video_id,
            "duration": example["duration"],
            "domain": example["domain"],
            "sub_category": example["sub_category"],
            "questions": [],
        }
    qa = {
        "question_id": example["question_id"],
        "task_type": example["task_type"],
        "question": example["question"],
        "options": example["options"],
        "answer": example["answer"],
    }
    # Look up this question's selection once instead of twice.
    selection = videomme_long_clamr_selected[video_id][example["question_id"]]
    # Join subtitle lines with a space; plain concatenation would glue the last
    # word of one line to the first word of the next. Placeholder lines are
    # dropped.
    subtitles = " ".join(line for line in selection["subtitles"] if line != "N/A") or "N/A"
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": selection["selected_frame_paths"],
                    "min_pixels": min_pixels,
                    "max_pixels": max_pixels,
                },
                {
                    "type": "text",
                    # Use get_text_prompt(question, options) here instead to
                    # evaluate without subtitles.
                    "text": get_text_prompt_with_subtitles(example["question"], example["options"], subtitles),
                },
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    inputs = inputs.to("cuda")
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so that only the newly generated answer is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    qa["response"] = output_text[0]
    result_video["questions"].append(qa)

# Append the last video after the loop. Doing this inside the loop on the final
# dataset index would lose the record whenever that final entry is not "long".
if result_video is not None:
    results.append(result_video)

with open(output_file, "w") as f:
    json.dump(results, f, indent=4)