In [None]:
!git clone https://github.com/Vision-CAIR/LongVU
!pip install -r /content/LongVU/requirements.txt

In [None]:
!pip install huggingface-hub

In [None]:
# Switch the working directory to the cloned repository. Presumably this is what
# lets the later `from longvu...` imports resolve, since only the requirements
# file is installed above — TODO confirm.
%cd /content/LongVU

In [None]:
# LOAD MODEL
# Imports the LongVU helpers and loads the pretrained checkpoint once. The
# resulting `tokenizer`, `model` and `image_processor` globals are read by
# `infer_video`.

# Print the current working directory (presumably a sanity check that the
# notebook is running inside the cloned repository — TODO confirm).
!pwd

import numpy as np
import torch
from longvu.builder import load_pretrained_model
from longvu.constants import (
    DEFAULT_IMAGE_TOKEN,
    IMAGE_TOKEN_INDEX,
)
from longvu.conversation import conv_templates, SeparatorStyle
from longvu.mm_datautils import (
    KeywordsStoppingCriteria,
    process_images,
    tokenizer_image_token,
)
from decord import cpu, VideoReader

# Arguments are positional: the checkpoint identifier, None, and the string
# "cambrian_qwen". Their parameter names are defined by `load_pretrained_model`
# and are not visible in this notebook.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    "Vision-CAIR/LongVU_Qwen2_7B", None, "cambrian_qwen",
)

# Presumably switches the model to evaluation mode (assumes a torch module —
# confirm). As the last expression of the cell, its return value is also shown
# as the cell output.
model.eval()

In [28]:
def infer_video(video_path, qs, max_new_tokens=20):
    """Answer a text question about a video clip with the loaded LongVU model.

    Relies on the notebook-level globals `tokenizer`, `model` and
    `image_processor` created by the model-loading cell.

    Args:
        video_path: Path of the video file to open with decord's VideoReader.
        qs: Question text. The image placeholder token and a newline are
            prepended to it before the conversation prompt is built.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        The first decoded output sequence with special tokens skipped and
        surrounding whitespace stripped.

    Raises:
        ValueError: If the video contains no frames.
    """
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    num_frames = len(vr)
    if num_frames == 0:
        # np.stack on an empty list would raise a far less helpful ValueError.
        raise ValueError(f"Video contains no frames: {video_path}")

    # Sample roughly one frame per second. round(fps) is 0 for fps < 0.5 and
    # range() rejects a zero step, so clamp the stride to at least one frame.
    fps = float(vr.get_avg_fps())
    step = max(1, round(fps))
    video = np.stack([vr[i].asnumpy() for i in range(0, num_frames, step)])

    # Size of the raw frames, taken from the first two axes of the first frame.
    image_sizes = [video[0].shape[:2]]
    video = process_images(video, image_processor, model.config)
    video = [item.unsqueeze(0) for item in video]

    # Build a single-turn prompt: user question followed by an empty
    # assistant slot for the model to fill in.
    qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
    conv = conv_templates["qwen"].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    input_ids = (
        tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .to(model.device)
    )

    # Stop generation at the conversation separator used by this template.
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)

    with torch.inference_mode():
        # NOTE(review): with do_sample=False the temperature value is
        # presumably unused; it is kept to leave the generate() call unchanged.
        output_ids = model.generate(
            input_ids,
            images=video,
            image_sizes=image_sizes,
            do_sample=False,
            temperature=0.2,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
        )

    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()


In [None]:
# Mount Google Drive at /content/drive; all paths below live under that mount point.
from google.colab import drive

drive.mount('/content/drive')


# JSON file with the queries that the inference cell loads.
queries_file = "/content/drive/MyDrive/data/top_50_queries/top_queries.json"
# Directory holding the video clips that the inference cell looks up.
video_dir = "/content/drive/MyDrive/data/extracted_clips"
# Destination JSON file for the collected predictions.
output_file = "/content/drive/MyDrive/data/predictions_full_precision_LongVU.json"
# Accumulator for results; the inference cell appends one dict per processed clip.
predictions = []

In [None]:
# INFERENCE CODE
# Runs `infer_video` on the clip of every query and writes all predictions to
# `output_file` as JSON.

import json
import os  # used below for path handling; not imported in any other visible cell

# Start from an empty list so that re-running this cell does not append
# duplicates to results left over from a previous run.
predictions = []

# Load the queries
with open(queries_file, 'r', encoding='utf-8') as quer_file:
    queries = json.load(quer_file)

# Process each query
for query in queries:
    prompt = query["query"]
    video_id = query["video_id"]
    ground_truth = query["ground_truth"]
    # The ground truth is unpacked as a (start, end) pair; both values are
    # part of the clip's file name.
    start_time, end_time = ground_truth

    # Path of the video clip
    video_path = os.path.join(video_dir, f"{video_id}__{start_time}__{end_time}.mp4")

    # Skip queries whose clip is missing instead of failing the whole run.
    if not os.path.exists(video_path):
        print(f"Video file not found: {video_path}")
        continue

    response = infer_video(video_path=video_path, qs=prompt)

    # Store the prediction
    predictions.append({
        "video_id": video_id,
        "prompt": prompt,
        "prediction": response,
    })
    print(f"Processed video {video_id}")



# Save the predictions to a JSON file
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(predictions, f, indent=4)

print(f"Predictions saved to {output_file}")

