In [1]:
# Cell 0: Clona e installa le dipendenze UniVTG
!git clone https://github.com/showlab/UniVTG.git
%cd UniVTG
!pip install -q -r requirements.txt
!pip install -q timm ftfy  # torch/transformers già in Colab
!pip install -q transformers>=4.42.0 accelerate av einops decord
!pip install -q moviepy

%cd ..

Cloning into 'UniVTG'...
remote: Enumerating objects: 493, done.[K
remote: Counting objects: 100% (110/110), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 493 (delta 97), reused 46 (delta 46), pack-reused 383 (from 1)[K
Receiving objects: 100% (493/493), 23.22 MiB | 16.58 MiB/s, done.
Resolving deltas: 100% (258/258), done.
/content/UniVTG
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.6/45.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m653.6/653.6 kB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build de

In [2]:
# Cell 1: Mount Drive e setup path
from google.colab import drive
import os, sys, torch, argparse
drive.mount('/content/drive', force_remount=True)
os.makedirs('/content/results', exist_ok=True)
sys.path.append('/content/UniVTG')
torch.serialization.add_safe_globals([argparse.Namespace])

ckpt_path = "/content/drive/MyDrive/results/omni/model_best.ckpt"
print("✔️ checkpoint path:", ckpt_path)

Mounted at /content/drive
✔️ checkpoint path: /content/drive/MyDrive/results/omni/model_best.ckpt


In [3]:
import math
import cv2

# 📥 Caricamento video per calcolare la durata
VIDEO_PATH = "/content/drive/MyDrive/examples/Dataset Benelli/Benelli Nova 12ga Review(3).mp4"
cap = cv2.VideoCapture(VIDEO_PATH)
fps = cap.get(cv2.CAP_PROP_FPS)
frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
video_duration_sec = frame_count / fps
video_duration_min = video_duration_sec / 60
cap.release()

# 📊 Parametri dinamici
# Percentuale di contenuto da analizzare (tra 15% e 25%)
analysis_ratio = 0.25
analysis_total_sec = video_duration_sec * analysis_ratio

# Clip più lunghe: tra 8 e 20 secondi
CLIP_DURATION = max(10, min(30, video_duration_min*2))  # aumenta con la durata video

# Calcolo TOPK = quante clip servono per coprire analysis_total_sec
TOPK = math.floor(analysis_total_sec / CLIP_DURATION)

# Framerate costante
FRAMERATE = 0.5

# 📤 Output info
print(f"🎥 Video di {video_duration_min:.2f} minuti")
print(f"🧩 Durata clip: {CLIP_DURATION:.1f} s")
print(f"#️⃣ Numero clip: {TOPK}")
print(f"⏱️ Tempo totale analizzato: {TOPK * CLIP_DURATION:.1f} s")




🎥 Video di 10.33 minuti
🧩 Durata clip: 20.7 s
#️⃣ Numero clip: 7
⏱️ Tempo totale analizzato: 144.6 s


In [4]:
# Cell 2: Carica il modello UniVTG
import numpy as np
from model.univtg import build_model
from model.transformer_encoder_droppath import TransformerEncoderLayer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False)
opt        = checkpoint["opt"]; opt.device = device
model, _   = build_model(opt)

# patch per droppath backward-compatible
def safe_forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
    q = k = self.with_pos_embed(src, pos)
    src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
                         key_padding_mask=src_key_padding_mask)[0]
    src  = src + self.droppath1(src2); src = self.norm1(src)
    src2 = self.linear2(self.activation(self.linear1(src)))
    src  = src + self.droppath2(src2); src = self.norm2(src)
    return src
TransformerEncoderLayer.forward = safe_forward_pre

model.load_state_dict(checkpoint["model"])
model.to(device).eval()
print("✔️ UniVTG model loaded in eval()")



✔️ UniVTG model loaded in eval()


In [5]:
# Cell 2.5: Installa ffmpeg-python e imageio-ffmpeg
!pip install -q ffmpeg-python imageio-ffmpeg


In [9]:
import torch
from run_on_video.data_utils import VideoLoader, Preprocessing
from torchvision import models, transforms
from transformers import CLIPTokenizer, CLIPTextModel

# 1) Carica e preprocessa video
video_path   = "/content/drive/MyDrive/examples/Dataset Benelli/Benelli Nova 12ga Review(3).mp4"
video_loader = VideoLoader(framerate=FRAMERATE, size=224, centercrop=True)
preprocessor = Preprocessing()
raw_frames   = video_loader.read_video_from_file(video_path)
video_frames = preprocessor(raw_frames).to(device)

# 2) Estrai feature R3D-18
feat_extractor = models.video.r3d_18(weights=models.video.R3D_18_Weights.KINETICS400_V1)
feat_extractor.fc = torch.nn.Identity()
feat_extractor = feat_extractor.eval().to(device)

feat_list = []
with torch.no_grad():
    for f in video_frames:
        x = f.unsqueeze(0).unsqueeze(2)
        feat_list.append(feat_extractor(x))
video_feat = torch.cat(feat_list, dim=0)

# 3) Allinea dimensione a opt.v_feat_dim
T, fd = video_feat.shape
vfd   = getattr(opt, "v_feat_dim", fd)
if fd < vfd:
    pad = torch.zeros((T, vfd-fd), device=device)
    video_feat = torch.cat([video_feat, pad], dim=1)
else:
    video_feat = video_feat[:, :vfd]

# 4) Prepara input UniVTG
src_vid      = video_feat.unsqueeze(0)
src_vid_mask = torch.ones((1, T), dtype=torch.bool, device=device)

# 5) Embedding CLIP del prompt
tokenizer   = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")
text_enc    = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch16").to(device).eval()
prompt      = "Generate a concise summary of the following video"
enc         = tokenizer(prompt, padding="max_length", truncation=True,
                        max_length=opt.max_q_l, return_tensors="pt").to(device)
src_txt     = text_enc(**enc).last_hidden_state
src_txt_mask = ~enc["attention_mask"].bool()

# 6) Inferisci saliency e prendi top-K
with torch.no_grad():
    out = model(src_txt, src_txt_mask, src_vid, src_vid_mask)
saliency    = out["saliency_scores"].cpu().numpy()[0]

# Ordina per saliency decrescente
sorted_indices = saliency.argsort()[::-1]

# Distanza minima tra frame (in frame), dipende da durata clip
min_frame_distance = int(CLIP_DURATION * FRAMERATE)

# Filtro: elimina indici troppo vicini
filtered_indices = []
for idx in sorted_indices:
    if len(filtered_indices) >= TOPK:
        break
    if all(abs(idx - sel) > min_frame_distance for sel in filtered_indices):
        filtered_indices.append(idx)

top_indices = filtered_indices
print("🔑 Top-K frame indices (filtrati):", top_indices)




🔑 Top-K frame indices (filtrati): [np.int64(279), np.int64(89), np.int64(187), np.int64(153), np.int64(297), np.int64(171), np.int64(130)]


In [10]:
import ffmpeg
from pathlib import Path

def extract_clips(video_path: str, indices: list[int],
                  framerate: float = FRAMERATE,
                  clip_duration: float = CLIP_DURATION,
                  out_dir: str = "/content/clip_buffer") -> list[str]:
    """
    Estrae clip di durata `clip_duration` centrata sugli indici.
    """
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    half = clip_duration / 2
    clip_paths = []
    for i, idx in enumerate(indices):
        timestamp = idx / framerate
        start = max(0, timestamp - half)
        out_file = Path(out_dir) / f"clip_{i:02d}.mp4"
        (
            ffmpeg
            .input(video_path, ss=start, t=clip_duration)
            .output(str(out_file), codec="copy")
            .overwrite_output()
            .run(quiet=True)
        )
        clip_paths.append(str(out_file))
    return clip_paths

# Estrai le clip con la durata specificata da CLIP_DURATION
out_dir_drive = "/content/drive/MyDrive/clip_output_univtg/clip_video_3"
clip_paths = extract_clips(video_path, top_indices, out_dir=out_dir_drive)
print("📁 Clip salvate in Drive:")
for path in clip_paths:
    print(path)

📁 Clip salvate in Drive:
/content/drive/MyDrive/clip_output_univtg/clip_video_3/clip_00.mp4
/content/drive/MyDrive/clip_output_univtg/clip_video_3/clip_01.mp4
/content/drive/MyDrive/clip_output_univtg/clip_video_3/clip_02.mp4
/content/drive/MyDrive/clip_output_univtg/clip_video_3/clip_03.mp4
/content/drive/MyDrive/clip_output_univtg/clip_video_3/clip_04.mp4
/content/drive/MyDrive/clip_output_univtg/clip_video_3/clip_05.mp4
/content/drive/MyDrive/clip_output_univtg/clip_video_3/clip_06.mp4


In [11]:
from IPython.display import Video, display

for path in clip_paths:
    # embed=True per l’incorporamento inline
    display(Video(path, embed=True, width=360))

Output hidden; open in https://colab.research.google.com to view.