In [1]:
#!pip install -U transformers
!python -m pip install av



In [2]:
import av
import torch
import json
import os
import numpy as np
#from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

In [4]:

# Clona il repository
!git clone https://github.com/PKU-YuanGroup/Video-LLaVA
%cd Video-LLaVA

# Installa una versione compatibile di Python (Colab usa già Python 3.10)
# Aggiorna pip
!pip install --upgrade pip

# Installa il pacchetto Video-LLaVA
!pip install -e .

# Installa le dipendenze per il training
!pip install -e ".[train]"

# Installa FlashAttention (abilita attenzione veloce)
!pip install flash-attn --no-build-isolation

# Installa altre dipendenze necessarie
!pip install decord opencv-python
!pip install git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d

fatal: destination path 'Video-LLaVA' already exists and is not an empty directory.
/content/Video-LLaVA
Obtaining file:///content/Video-LLaVA
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: videollava
  Building editable for videollava (pyproject.toml) ... [?25l[?25hdone
  Created wheel for videollava: filename=videollava-1.0.0-0.editable-py3-none-any.whl size=14071 sha256=ec2192e307bcbf1d9d50b4f1e32988957482516aa326c07dc2c90086145ea69a
  Stored in directory: /tmp/pip-ephem-wheel-cache-2_nc_r0o/wheels/2c/aa/87/e87ec400fd8d50e7ca7616464e202d06a54e73967293159deb
Successfully built videollava
Installing collected packages: videollava
  Attempting uninstall: videollava
    Found existing installation: videollava 1.0.0
    Uninstalling videolla

In [5]:
!pip install transformers==4.28.0

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.31.0
    Uninstalling transformers-4.31.0:
      Successfully uninstalled transformers-4.31.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 3.3.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.28.0 which is incompatible.
videollava 1.0.0 requires transformers==4.31.0, but you have transformers 4.28.0 which is incompatible.[0m[31m
[0mSuccessfully installed transformers-4.28.0


In [6]:
import os
import json
import torch
import numpy as np
from videollava.model.builder import load_pretrained_model
from videollava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from videollava.conversation import conv_templates, SeparatorStyle
from videollava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria

# Percorsi file e configurazioni
queries_file = "/content/drive/MyDrive/data/top_50_queries/top_queries.json"
video_dir = "/content/drive/MyDrive/data/extracted_clips"
output_file = "/content/drive/MyDrive/data/predictions_full_precision.json"
predictions = []

# Configurazioni modello
model_path = "LanguageBind/Video-LLaVA-7B"
cache_dir = "/content/cache_dir"
device = "cuda" if torch.cuda.is_available() else "cpu"
load_4bit, load_8bit = True, False

# Caricamento del modello e del processore
tokenizer, model, processor, _ = load_pretrained_model(model_path, None, "videollava", load_8bit, device=device, cache_dir=cache_dir)

video_processor = processor["video"]

# Caricamento delle query
with open(queries_file, 'r') as quer_file:
    queries = json.load(quer_file)

# Elaborazione delle query
for query in queries:
    prompt = query["query"]
    video_id = query["video_id"]
    ground_truth = query["ground_truth"]
    start_time, end_time = ground_truth

     # For better results, we recommend to prompt the model in the following format
    prompt = f"USER: <video>\n{prompt} ASSISTANT:"

    # Percorso del video clip
    video_path = os.path.join(video_dir, f"{video_id}__{start_time}__{end_time}.mp4")

    if not os.path.exists(video_path):
        print(f"Video file not found: {video_path}")
        continue

    try:
        # Elaborazione del video
        video_tensor = video_processor(video_path, return_tensors="pt")["pixel_values"]
        if isinstance(video_tensor, list):
            video_tensor = [v.to(device, dtype=torch.float16) for v in video_tensor]
        else:
            video_tensor = video_tensor.to(device, dtype=torch.float16)

        # Configurazione della conversazione
        conv_mode = "llava_v1"
        conv = conv_templates[conv_mode].copy()
        roles = conv.roles

        # Formatta il prompt
        formatted_prompt = (
            f"{DEFAULT_IMAGE_TOKEN * model.get_video_tower().config.num_frames}\n{prompt}"
        )
        conv.append_message(roles[0], formatted_prompt)
        conv.append_message(roles[1], None)
        input_prompt = conv.get_prompt()

        # Tokenizzazione e stopping criteria
        input_ids = tokenizer_image_token(
            input_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
        ).unsqueeze(0).to(device)
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)

        # Generazione della risposta
        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=video_tensor,
                do_sample=True,
                temperature=0.1,
                max_new_tokens=1024,
                stopping_criteria=[stopping_criteria],
            )

        # Decodifica dell'output
        output = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
        response = output.split("ASSISTANT:")[-1].strip()

        # Salvataggio della predizione
        predictions.append({
            "video_id": video_id,
            "prompt": prompt,
            "prediction": response,
        })
        print(f"Processed video {video_id}")

    except Exception as e:
        print(f"Error processing {video_id}: {e}")
        continue

# Salvataggio delle predizioni in un file JSON
with open(output_file, 'w') as f:
    json.dump(predictions, f, indent=4)

print(f"Predictions saved to {output_file}")

[2025-01-04 11:28:46,656] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/150k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at LanguageBind/Video-LLaVA-7B were not used when initializing LlavaLlamaForCausalLM: ['model.video_tower.video_tower.encoder.layers.20.self_attn.k_proj.bias', 'model.image_tower.image_tower.encoder.layers.23.self_attn.v_proj.weight', 'model.video_tower.video_tower.encoder.layers.0.self_attn.out_proj.bias', 'model.image_tower.image_tower.encoder.layers.10.mlp.fc1.bias', 'model.video_tower.video_tower.encoder.layers.7.mlp.fc2.bias', 'model.video_tower.video_tower.encoder.layers.12.temporal_layer_norm1.weight', 'model.video_tower.video_tower.encoder.layers.7.temporal_attn.v_proj.weight', 'model.image_tower.image_tower.encoder.layers.11.mlp.fc2.weight', 'model.video_tower.video_tower.encoder.layers.10.layer_norm1.weight', 'model.image_tower.image_tower.encoder.layers.1.self_attn.out_proj.weight', 'model.video_tower.video_tower.encoder.layers.5.temporal_attn.k_proj.weight', 'model.video_tower.video_tower.encoder.layers.19.self_attn.v_proj.bias', 'model.

generation_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Processed video 8698a0ea-f434-49f9-a8e4-45220e5d4b2c
Processed video f082242c-309f-48a1-97fa-5c1d6bd255fb
Processed video 0971527a-6cd3-4c82-9d94-09b3565f4505
Processed video 4ce119de-0f42-4bd1-b387-9e19643fdddc
Processed video 25e093a8-86d5-47e9-b09f-5a8afef85b74
Processed video 805989f6-0696-4de2-ad9b-0f194e0ac48d
Processed video f681f510-cd33-48e3-bc10-4a8f2a518495
Processed video d340e569-12d3-42ef-a56b-a9a25c37ef95
Processed video eb81442c-a322-49ea-b243-a39d2e288b9b
Processed video 53da674a-089d-428a-a719-e322b2de002b
Processed video 90a0cc01-f498-490a-bda1-01ad94db2946
Processed video 6b6d73b9-e92c-4698-86df-bbe687a9e95e
Processed video ff6d3d52-dda5-46dd-8515-b9b772933030
Processed video 1dcc108c-8bd4-42ad-b2c5-03662be62eda
Processed video 9f28e782-417c-4c8b-a7ae-42fc96a0e94f
Processed video b737cd68-4e0d-440a-9813-a6c90080fac5
Processed video 86343e9e-b932-41d3-ad6f-83f2c2fe5486
Processed video c74c1df9-bc28-4561-bc2a-28767632cb2b
Processed video 38737402-19bd-4689-9e74-3af391