In [None]:
!pip install open-clip-torch

In [7]:
!pip install opencv-python
!pip install matplotlib


Collecting matplotlib
  Using cached matplotlib-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.56.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (101 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.2.1-py3-none-any.whl.metadata (5.0 kB)
Using cached matplotlib-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
Using cached contourpy-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.wh

In [5]:

def extract_clip_features(image_list, model):
    """
    Trích xuất đặc trưng CLIP từ danh sách ảnh có kích thước khác nhau.

    Args:
        image_list (List[torch.Tensor]): Danh sách các tensor ảnh với kích thước khác nhau (C, H, W).

    Returns:
        torch.Tensor: Tensor (N, 512) chứa vector đặc trưng của mỗi ảnh.
    """
    transform = T.Compose([
        T.Resize((224, 224), interpolation=T.InterpolationMode.BICUBIC),
        T.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
    ])

    # Chuẩn hóa từng ảnh trong danh sách
    image_batch = torch.stack([transform(img) for img in image_list])  # (N, 3, 224, 224)

    # Trích xuất đặc trưng
    with torch.no_grad():
        image_features = model.encode_image(image_batch)

    # Chuẩn hóa vector đặc trưng
    image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)

    return image_features  # (N, 512)

In [13]:
import cv2
import numpy as np
import torch
from torchvision import models, transforms
from torchvision.models.video import r3d_18  # Pretrained 3D-CNN model
from transformers import DetrForObjectDetection
from PIL import Image
import matplotlib.patches as patches
import torchvision.transforms as T
import open_clip
import matplotlib.pyplot as plt


def extract_frames(video_path, N, M):
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    key_frame_indices = np.linspace(0, frame_count - 1, N, dtype=int)

    key_frames = np.zeros((N, height, width, 3), dtype=np.uint8)
    surrounding_frames = np.zeros((N, M, height, width, 3), dtype=np.uint8)

    for i, idx in enumerate(key_frame_indices):
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, key_frame = cap.read()
        if ret:
            key_frames[i] = key_frame

        start_idx = max(0, idx - M // 2)
        end_idx = min(frame_count, idx + M // 2)

        temp_frames = []
        for j in range(start_idx, end_idx):
            cap.set(cv2.CAP_PROP_POS_FRAMES, j)
            ret, frame = cap.read()
            if ret:
                temp_frames.append(frame)

        while len(temp_frames) < M:
            temp_frames.append(temp_frames[-1] if temp_frames else key_frame)

        surrounding_frames[i] = np.array(temp_frames[:M])

    cap.release()
    return key_frames, surrounding_frames

def extract_context_vector(key_frames, clip_model):
  transform = transforms.Compose([
        transforms.ToTensor(),
    ])
  return extract_clip_features([transform(frame) for frame in key_frames], clip_model)

def extract_temporal_features(surrounding_frames, model):
    N, M, h, w, c = surrounding_frames.shape

    transform = transforms.Compose([
        transforms.Resize((112, 112)),  # Resize frames to match model input
        transforms.ToTensor()
    ])

    frames_tensor = torch.stack([transform(Image.fromarray(frame)) for frame in surrounding_frames.reshape(-1, h, w, c)])
    frames_tensor = frames_tensor.view(N, M, 3, 112, 112).permute(0, 2, 1, 3, 4)  # (N, 3, M, 112, 112)

    with torch.no_grad():
        features = model(frames_tensor)  # (N, feature_dim)
    return features

def detect_objects_with_detr(key_frames, detr_model, feature_extractor, clip_model):
    N, h, w, c = key_frames.shape
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])
    key_features = []
    bounding_boxes = []

    for i in range(N):
        frame_tensor = transform(key_frames[i]).unsqueeze(0)
        with torch.no_grad():
            outputs = detr_model(frame_tensor)

        obj_features = []
        scores = outputs.logits.softmax(-1)[0, :, :-1].max(-1)[0]
        top_probs, top_labels = scores.max(-1)


        probabilities = outputs.logits.softmax(-1)[0, :, :-1]  # Bỏ lớp background
        top_probs, top_labels = probabilities.max(-1)

        keep = top_probs > 0.7
        labels = []
        for label, score in zip(top_labels[keep], top_probs[keep]):
          labels.append(id2label[label.item()])

        keep = scores > 0.9  # Confidence threshold

        boxes = outputs.pred_boxes[0, keep]
        # print(labels)
        object_list = []
        for box in boxes:
          x, y, w_box, h_box = box.numpy()
          x_min = int((x - w_box / 2) * w)
          y_min = int((y - h_box / 2) * h)
          x_max = int((x + w_box / 2) * w)
          y_max = int((y + h_box / 2) * h)

          cropped_obj = key_frames[i][y_min:y_max, x_min:x_max]
          if cropped_obj.shape[0] >= 7 and cropped_obj.shape[1] >= 7:  # Đảm bảo đủ lớn

              obj_tensor = transform(cropped_obj)
              object_list.append(obj_tensor)
          else:
              continue

        if object_list:
          with torch.no_grad():
                obj_feature = extract_clip_features(object_list, clip_model).mean(dim=0)
          key_features.append(obj_feature.squeeze().cpu().numpy())
        else:
          key_features.append(np.zeros((512,)))

    return torch.tensor(key_features)


# Example usage
video_path = "./videoplayback.mp4"
N = 15  # Number of key frames
M = 15  # Number of surrounding frames per key frame

# Load models
cnn3d = r3d_18(pretrained=True).eval()  # Pretrained 3D-CNN model
cnn3d.fc = torch.nn.Identity()  # Remove classification layer
detr_model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").eval()
feature_extractor = models.resnet50(pretrained=True)
feature_extractor.fc = torch.nn.Identity()  # Remove classification layer
id2label = detr_model.config.id2label

model_name = 'ViT-B/32'  # Đảm bảo dùng đúng mô hình
clip_model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained='openai')


key_frames, surrounding_frames = extract_frames(video_path, N, M)
temporal_features = extract_temporal_features(surrounding_frames, cnn3d)
object_features = detect_objects_with_detr(key_frames, detr_model, feature_extractor, clip_model)

context_features = extract_context_vector(key_frames, clip_model)

print("Motion Features Shape:", temporal_features.shape)  # (N, feature_dim)
print("Object Features Shape:", object_features.shape)  # (N, feature_dim)
print("Context Features Shape:", context_features.shape)  # (N, feature_dim)

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Temporal Features Shape: torch.Size([15, 512])
Object Features Shape: torch.Size([15, 512])
Context Features Shape: torch.Size([15, 512])


In [14]:
!wget https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip

--2025-03-12 22:43:53--  https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip
129.67.94.2ww.robots.ox.ac.uk (www.robots.ox.ac.uk)... 
connected. to www.robots.ox.ac.uk (www.robots.ox.ac.uk)|129.67.94.2|:443... 
HTTP request sent, awaiting response... 200 OK
Length: 6552768292 (6.1G) [application/zip]
Saving to: ‘MSRVTT.zip’


2025-03-12 23:06:55 (4.53 MB/s) - ‘MSRVTT.zip’ saved [6552768292/6552768292]



In [None]:
!unzip ./MSRVTT.zip

In [17]:
import json
# Load MSRVTT captions from a JSON file (assuming it's in a dictionary format)
# Replace 'msrvtt_captions.json' with the actual path to your file
with open('./MSRVTT/annotation/MSR_VTT.json', 'r') as file:
    msrvtt_data = json.load(file)

In [None]:
msrvtt_data

In [26]:
import torch
import os
from tqdm import tqdm  # Import tqdm để hiển thị progress bar

N = 15
M = 16
output_dir = "video_features"  # Thư mục để lưu file

# Tạo thư mục nếu chưa tồn tại
os.makedirs(output_dir, exist_ok=True)

# Sử dụng tqdm để hiển thị tiến trình
for annotation in tqdm(msrvtt_data['annotations'], desc="Processing videos", unit="video"):
    video_id = annotation['image_id']
    output_file = os.path.join(output_dir, f"{video_id}.pt")

    # Nếu file đã tồn tại, bỏ qua
    if os.path.exists(output_file):
        continue

    video_path = f"./MSRVTT/videos/all/{video_id}.mp4"
    
    key_frames, surrounding_frames = extract_frames(video_path, N, M)
    temporal_features = extract_temporal_features(surrounding_frames, cnn3d)
    object_features = detect_objects_with_detr(key_frames, detr_model, feature_extractor, clip_model) 
    context_features = extract_context_vector(key_frames, clip_model)

    # Lưu tuple đặc trưng vào file riêng biệt
    torch.save((temporal_features, object_features, context_features), output_file)


Processing videos: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 199994/199994 [46:01<00:00, 72.42video/s]
