In [None]:
import cv2
import numpy as np
import torch
from moviepy import VideoFileClip
from PIL import Image
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration

def process_video(video_path, max_frames=64):
    """Sample evenly spaced frames from a video and return them as PIL images.

    Args:
        video_path: Path of the video file; passed straight to ``VideoFileClip``.
        max_frames: Upper bound on the number of frames to sample. Fewer
            frames are returned when the clip is shorter than this.

    Returns:
        A list of ``PIL.Image.Image`` objects ordered by position in the clip.
        The list is empty when the clip's duration * fps rounds down to zero.
    """
    # Open the clip as a context manager so it is closed on every exit path,
    # including an exception raised while decoding a frame.
    with VideoFileClip(video_path) as video:
        frame_rate = video.fps
        total_frames = int(video.duration * frame_rate)
        # Evenly spaced integer frame indices covering the first to last frame.
        frame_indices = np.linspace(
            0, total_frames - 1, min(max_frames, total_frames), dtype=int
        )
        # get_frame() is addressed by time in seconds, so convert index -> time.
        return [
            Image.fromarray(video.get_frame(idx / frame_rate))
            for idx in frame_indices
        ]

# Initialize the processor and model
model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
processor = AutoProcessor.from_pretrained(model_id)
# Passing `load_in_4bit=True` directly to from_pretrained is deprecated
# (transformers warns about it); the 4-bit request goes through a
# BitsAndBytesConfig passed as `quantization_config` instead.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
).to("cuda")

# Process the video
video_path = r"C:\Users\leege\Downloads\testVideo.MOV"  # Use raw string literal for Windows paths
# With the default of 64 frames, a recorded run of this cell produced a
# 46669-token prompt, above the 32768-token limit reported for this model
# (roughly 729 tokens per frame, judging by that run). 32 frames keeps the
# prompt plus the 200 generated tokens under the limit.
MAX_FRAMES = 32
frames = process_video(video_path, max_frames=MAX_FRAMES)
if not frames:
    # Fail early with a clear message instead of an opaque processor error.
    raise ValueError(f"No frames could be sampled from {video_path!r}")

# Prepare the conversation: one text part followed by one image part per frame
conversation = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "What are these?"}]
        + [{"type": "image", "image": frame} for frame in frames],
    }
]

# Apply the chat template
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Tokenize the prompt and preprocess the frames, then move everything to the GPU
inputs = processor(images=frames, text=prompt, return_tensors="pt", padding=True).to("cuda", torch.float16)

# Generate response (greedy decoding, no gradient tracking)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=False)

# Decode and print the response
response = processor.decode(outputs[0], skip_special_tokens=True)
print(response)


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '1', 'compatible_brands': 'isommp41mp42', 'creation_time': '2025-03-22T16:19:30.000000Z'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [720, 1280], 'bitrate': 3032, 'fps': 29.68, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'creation_time': '2025-03-22T16:19:30.000000Z', 'handler_name': 'Core Media Video', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 61, 'metadata': {'Metadata': '', 'creation_time': '2025-03-22T16:19:30.000000Z', 'handler_name': 'Core Media Audio', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 3.1, 'bitrate': 3102, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile

Token indices sequence length is longer than the specified maximum sequence length for this model (46669 > 32768). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
