In [None]:
!pip install -U transformers
!python -m pip install av

In [2]:
import av
import torch
import json
import os
import numpy as np
from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
from google.colab import drive

# Mount Google Drive so the query file, the clips and the predictions file
# under /content/drive are reachable from this runtime.
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.

    The stream is decoded from the start and decoding stops as soon as the
    largest requested index has been passed. Frames are returned in decode
    order (ascending frame index), whatever the order of `indices`. An index
    that is requested several times yields that frame several times, so the
    result has `len(indices)` frames whenever every requested index exists in
    the stream. Requested indices that the stream does not reach are skipped.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode. May be a list
            or a 1-D integer array; need not be sorted or unique.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        IndexError: If `indices` is empty.
        ValueError: If none of the requested indices could be decoded.
    '''
    # Map each requested index to the number of times it was requested. This
    # gives O(1) membership tests per decoded frame and keeps duplicates, which
    # appear when a short clip is sampled at more positions than it has frames.
    wanted = {}
    for idx in indices:
        wanted[idx] = wanted.get(idx, 0) + 1
    if not wanted:
        raise IndexError("indices is empty: at least one frame index is required")

    # Use the largest index as the stop bound so unsorted input is handled too.
    end_index = max(wanted)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        repeats = wanted.get(i, 0)
        if repeats:
            frames.extend([frame.to_ndarray(format="rgb24")] * repeats)

    if not frames:
        raise ValueError("none of the requested frame indices could be decoded")
    return np.stack(frames)

In [None]:
# Locations on the mounted Drive: the query list, the directory of clips, and
# the JSON file the collected predictions are written to.
queries_file = "/content/drive/MyDrive/data/top_50_queries/top_queries.json"
video_dir = "/content/drive/MyDrive/data/extracted_clips"
output_file = "/content/drive/MyDrive/data/predictions.json"

# One result dict is appended here per processed query.
predictions = []

# Read the list of query records from disk.
with open(queries_file, 'r') as quer_file:
    queries = json.load(quer_file)

# Video-LLaVA checkpoint with float16 weights; device placement is delegated
# to `device_map="auto"`. The processor comes from the same checkpoint.
model = VideoLlavaForConditionalGeneration.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf",
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")


# Device the processor outputs are moved to. It does not depend on the query,
# so it is resolved once instead of on every iteration.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for query in queries:
    question = query["query"]
    video_id = query["video_id"]
    ground_truth = query["ground_truth"]
    start_time, end_time = ground_truth

    # Path of the clip for this query; the clip is expected to already exist
    # in `video_dir` under the name <video_id>__<start>__<end>.mp4.
    video_path = os.path.join(video_dir, f"{video_id}__{start_time}__{end_time}.mp4")

    if not os.path.exists(video_path):
        print(f"Video file not found: {video_path}")
        continue

    try:
        # Open the clip in a `with` block so the container is closed on every
        # way out: normal completion, the early `continue`s, and decode errors.
        with av.open(video_path) as container:
            if not container.streams.video:
                print(f"No video stream found in file: {video_path}")
                continue

            total_frames = container.streams.video[0].frames
            if total_frames == 0:
                print(f"No frames in video: {video_path}")
                continue

            # 8 evenly spaced frame indices across the clip.
            indices = np.arange(0, total_frames, total_frames / 8).astype(int)
            video = read_video_pyav(container, indices)
    except Exception as e:
        # Best effort: report the broken clip and move on to the next query.
        print(f"Error processing video {video_path}: {e}")
        continue

    # For better results, we recommend to prompt the model in the following format
    prompt = f"USER: <video>\n{question} ASSISTANT:"

    inputs = processor(text=prompt, videos=video, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}

    out = model.generate(**inputs, max_new_tokens=20)
    full_output = processor.batch_decode(
        out, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]

    # Extract response (after "ASSISTANT:")
    response = full_output.split("ASSISTANT:")[-1].strip()

    # Save the result in the predictions list ("prompt" is the full formatted prompt)
    predictions.append({
        "video_id": video_id,
        "prompt": prompt,
        "prediction": response,
    })

    print(f"Processed video {video_id}")

# Save predictions to a JSON file
with open(output_file, 'w') as f:
    json.dump(predictions, f, indent=4)

print(f"Predictions saved to {output_file}")