In [1]:
!pip install -U transformers
!python -m pip install av

Collecting av
  Downloading av-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Downloading av-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-14.0.1


In [2]:
import av
import torch
import json
import os
import numpy as np
from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
from google.colab import drive

# Mount Google Drive at /content/drive; the query file, video clips and output file
# used later in this notebook are all addressed under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def read_video_pyav(container, indices):
    '''
    Decode selected frames of a video with the PyAV decoder.

    The stream is decoded sequentially from the beginning and decoding stops as
    soon as the highest requested index has been collected. Frames are returned
    in decode order (ascending frame index); an index that appears more than
    once in `indices` contributes a single frame.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode. Must not be empty; does
            not need to be sorted.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        ValueError: If `indices` is empty, or if the stream ends before any of
            the requested frames is reached.
    '''
    # A set gives O(1) membership checks per decoded frame and makes the
    # function independent of the order of `indices`.
    wanted = {int(i) for i in indices}
    if not wanted:
        raise ValueError("indices must contain at least one frame index")
    last_wanted = max(wanted)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i in wanted:
            # Convert right away so only the RGB arrays are kept alive
            frames.append(frame.to_ndarray(format="rgb24"))
        if i >= last_wanted:
            # Nothing left to collect; avoid decoding one more frame
            break

    if not frames:
        raise ValueError("none of the requested frame indices could be decoded")
    return np.stack(frames)

In [6]:
# Input and output locations on the mounted Google Drive
queries_file = "/content/drive/MyDrive/data/top_50_queries/top_queries.json"
video_dir = "/content/drive/MyDrive/data/extracted_clips"
output_file = "/content/drive/MyDrive/data/videollava_full_precision_huggingface.json"  # Output file for predictions
predictions = []  # Filled by the inference loop, one dict per processed video


# Load the prompts. Each entry is read later through its "query", "video_id"
# and "ground_truth" keys.
with open(queries_file, 'r', encoding="utf-8") as quer_file:
    queries = json.load(quer_file)


# Load the model in full precision (float32). The dtype is passed explicitly so the
# precision does not depend on the default of whichever transformers version
# `pip install -U` resolved to.
model_id = "LanguageBind/Video-LLaVA-7B-hf"
model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float32)
processor = VideoLlavaProcessor.from_pretrained(model_id)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:  10%|9         | 482M/4.98G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.81G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/148 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/66.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/582 [00:00<?, ?B/s]

In [7]:
# verify the model is full precision

import torch

# Read the dtype of the first parameter tensor and report it
dtype = next(model.parameters()).dtype

# Message for each dtype that has a dedicated wording; any other dtype falls
# back to the generic message below.
_precision_messages = {
    torch.float32: "The model is in 32-bit precision (float32).",
    torch.float16: "The model is in half-precision (float16).",
    torch.bfloat16: "The model is in bfloat16 precision.",
}
print(_precision_messages.get(dtype, f"The model is in {dtype} precision."))

The model is in 32-bit precision (float32).


In [None]:
# Run every query against its video clip and collect the model's answers.

# Pick the device once. The model has to live on the same device as the inputs,
# otherwise generate() fails with a device-mismatch error when CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Start from an empty list so re-running this cell does not append duplicates
predictions = []

# Number of evenly spaced frames sampled from each clip
num_frames = 8

for query in queries:
    question = query["query"]
    video_id = query["video_id"]
    start_time, end_time = query["ground_truth"]

    # Clips are stored as <video_id>__<start>__<end>.mp4
    video_path = os.path.join(video_dir, f"{video_id}__{start_time}__{end_time}.mp4")

    if not os.path.exists(video_path):
        print(f"Video file not found: {video_path}")
        continue

    try:
        # The context manager closes the container on every exit path,
        # including the early `continue`s and decoding errors.
        with av.open(video_path) as container:
            if not container.streams.video:
                print(f"No video stream found in file: {video_path}")
                continue

            total_frames = container.streams.video[0].frames
            if total_frames == 0:
                print(f"No frames in video: {video_path}")
                continue

            indices = np.arange(0, total_frames, total_frames / num_frames).astype(int)
            video = read_video_pyav(container, indices)
    except Exception as e:
        # Best effort: report the broken clip and move on to the next query
        print(f"Error processing video {video_path}: {e}")
        continue

    # For better results, we recommend to prompt the model in the following format
    prompt = f"USER: <video>\n{question} ASSISTANT:"

    inputs = processor(text=prompt, videos=video, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}

    out = model.generate(**inputs, max_new_tokens=20)
    full_output = processor.batch_decode(
        out, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]

    # Extract response (after "ASSISTANT:")
    response = full_output.split("ASSISTANT:")[-1].strip()

    # Save the result in the predictions list
    predictions.append({
        "video_id": video_id,
        "prompt": prompt,
        "prediction": response,
    })

    print(f"Processed video {video_id}")

# Save predictions to a JSON file
with open(output_file, 'w') as f:
    json.dump(predictions, f, indent=4)

print(f"Predictions saved to {output_file}")