In [1]:
# Upgrade of `transformers` is left disabled; a later cell pins transformers==4.28.0 instead.
#!pip install -U transformers
# Install PyAV, which is imported as `av` in the next cell.
!python -m pip install av



In [2]:
import av
import torch
import json
import os
import numpy as np
#from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
from google.colab import drive

# Mount Google Drive: the query file, the video clips and the prediction
# output used later in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def read_video_pyav(container, indices):
    '''
    Decode selected frames of a video with the PyAV decoder.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode. A list or NumPy array
            is accepted, in any order; repeated indices are decoded once.
    Returns:
        result (np.ndarray): np array of decoded frames of shape
            (num_frames, height, width, 3), in ascending frame order.
    Raises:
        IndexError: If `indices` is empty.
        ValueError: If the video contains none of the requested frames.
    '''
    # A set gives O(1) membership per decoded frame; flattening through NumPy
    # lets lists and arrays be handled the same way.
    wanted = set(np.asarray(indices).ravel().tolist())
    if not wanted:
        raise IndexError("indices must contain at least one frame index")
    # Use the true maximum rather than indices[-1] so unsorted input does not
    # stop decoding before every requested frame has been seen.
    end_index = max(wanted)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i in wanted:
            frames.append(frame)
        if i >= end_index:
            # Nothing further is requested; avoid decoding an extra frame.
            break

    if not frames:
        raise ValueError(
            "none of the requested frame indices were found in the video"
        )
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

In [None]:

# Clone the repository and switch into it
!git clone https://github.com/PKU-YuanGroup/Video-LLaVA
%cd Video-LLaVA

# No Python installation is needed (Colab already uses Python 3.10);
# just upgrade pip
!pip install --upgrade pip

# Install the Video-LLaVA package (editable install from the cloned repo)
!pip install -e .

# Install the training dependencies
!pip install -e ".[train]"

# Install FlashAttention (enables fast attention)
!pip install flash-attn --no-build-isolation

# Install the other required dependencies
!pip install decord opencv-python
!pip install git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d

In [None]:
# Pin transformers to 4.28.0.
# NOTE(review): this cell appears after the Video-LLaVA install cell, so it may
# replace the version that step pulled in — presumably this is the version the
# repo's code expects; confirm. If transformers was already imported in this
# session, the runtime likely needs a restart for the pin to take effect.
!pip install transformers==4.28.0

In [None]:
import os
import json
import torch
import numpy as np
from videollava.model.builder import load_pretrained_model
from videollava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from videollava.conversation import conv_templates, SeparatorStyle
from videollava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria

# File paths and run configuration
queries_file = "/content/drive/MyDrive/data/top_50_queries/top_queries.json"
video_dir = "/content/drive/MyDrive/data/extracted_clips"
output_file = "/content/drive/MyDrive/data/predictions_full_precision.json"
predictions = []

# Model configuration
model_path = "LanguageBind/Video-LLaVA-7B"
cache_dir = "/content/cache_dir"
device = "cuda" if torch.cuda.is_available() else "cpu"
# Quantization flags. Previously load_4bit was set to True but never passed to
# the loader, so it had no effect on how the model was loaded. Both flags are
# now forwarded explicitly, and load_4bit is False to state what was actually
# running (consistent with the "full_precision" output file name).
# NOTE(review): assumes the loader's own default for load_4bit is False — confirm.
load_4bit, load_8bit = False, False

# Load the tokenizer, the model and the per-modality processors
tokenizer, model, processor, _ = load_pretrained_model(
    model_path,
    None,
    "videollava",
    load_8bit,
    load_4bit,
    device=device,
    cache_dir=cache_dir,
)

video_processor = processor["video"]

# Load the queries
with open(queries_file, 'r', encoding="utf-8") as quer_file:
    queries = json.load(quer_file)

# The conversation template name and the run of per-frame image placeholders
# are identical for every query, so they are computed once up front.
conv_mode = "llava_v1"
video_placeholder = DEFAULT_IMAGE_TOKEN * model.get_video_tower().config.num_frames

# Process the queries
for query in queries:
    # The raw query text is used as-is. It must NOT be pre-wrapped in the
    # "USER: <video>\n... ASSISTANT:" format: that format belongs to the
    # Hugging Face VideoLlavaProcessor API, whereas here the conversation
    # template below adds the role markers and the video is represented by
    # the image placeholder tokens. Wrapping twice duplicated the role
    # markers and injected a literal "<video>" string into the text.
    prompt = query["query"]
    video_id = query["video_id"]
    ground_truth = query["ground_truth"]
    start_time, end_time = ground_truth

    # Path of the pre-extracted clip for this query
    video_path = os.path.join(video_dir, f"{video_id}__{start_time}__{end_time}.mp4")

    if not os.path.exists(video_path):
        print(f"Video file not found: {video_path}")
        continue

    try:
        # Preprocess the video and move it to the model device in half precision
        video_tensor = video_processor(video_path, return_tensors="pt")["pixel_values"]
        if isinstance(video_tensor, list):
            video_tensor = [v.to(device, dtype=torch.float16) for v in video_tensor]
        else:
            video_tensor = video_tensor.to(device, dtype=torch.float16)

        # Fresh copy of the conversation template for each query so that
        # messages from previous queries do not accumulate
        conv = conv_templates[conv_mode].copy()
        roles = conv.roles

        # Build the prompt: frame placeholders, newline, then the question
        formatted_prompt = f"{video_placeholder}\n{prompt}"
        conv.append_message(roles[0], formatted_prompt)
        conv.append_message(roles[1], None)
        input_prompt = conv.get_prompt()

        # Tokenization and stopping criteria
        input_ids = tokenizer_image_token(
            input_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
        ).unsqueeze(0).to(device)
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)

        # Generate the answer
        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=video_tensor,
                do_sample=True,
                temperature=0.1,
                max_new_tokens=1024,
                stopping_criteria=[stopping_criteria],
            )

        # Decode only the newly generated tokens (everything after the prompt)
        output = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
        # Generation halts on the stop string, which is therefore still at the
        # end of the decoded text; drop it so it is not stored as part of the
        # prediction.
        if stop_str and output.endswith(stop_str):
            output = output[: -len(stop_str)].strip()
        # Defensive: keep only the text after a role marker if the model echoed one
        response = output.split("ASSISTANT:")[-1].strip()

        # Record the prediction ("prompt" holds the raw query text)
        predictions.append({
            "video_id": video_id,
            "prompt": prompt,
            "prediction": response,
        })
        print(f"Processed video {video_id}")

    except Exception as e:
        # Best effort: report the failure and move on to the next query
        print(f"Error processing {video_id}: {e}")
        continue

# Save the predictions to a JSON file
with open(output_file, 'w', encoding="utf-8") as f:
    json.dump(predictions, f, indent=4)

print(f"Predictions saved to {output_file}")