In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp /content/drive/MyDrive/bitirme/dataset/phoenix-2014-T.v3.tar.gz /content/phoenix-2014-T.v3.tar.gz

In [3]:
!tar -xzvf /content/phoenix-2014-T.v3.tar.gz -C /content/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/dev/29September_2011_Thursday_heute-4235/images0002.png
PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/dev/29September_2011_Thursday_heute-4235/images0042.png
PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/dev/29September_2011_Thursday_heute-4235/images0039.png
PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/dev/29September_2011_Thursday_heute-4235/images0045.png
PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/dev/29September_2011_Thursday_heute-4235/images0037.png
PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/dev/29September_2011_Thursday_heute-4235/images0030.png
PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/dev/29September_2011_Thursday_heute-4235/images0021.png
PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/ful

In [4]:
import os
import torch
import numpy as np
from tqdm import tqdm
import random
from PIL import Image
import cv2
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

In [5]:
!pip install timm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->timm)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->timm)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->timm)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->tim

In [5]:
import timm

In [6]:
def get_video_folders(root_folder, limit):
    sorted_folders = sorted([os.path.join(root_folder, d) for d in os.listdir(root_folder) if os.path.isdir(os.path.join(root_folder, d))])
    if limit == 0: return sorted_folders
    else: return sorted_folders[:limit]

In [7]:
def process_video_folder(folder_path):
    paths = sorted([os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.png')]) #sort the files
    imgs, mask = load_imgs(paths)  # Preprocess and pad/slice to 300 frames
    return imgs, mask

In [8]:
def sample_frames(video_length, target_length=300):
    #300 values spread between 0 and video_length-1
    indices = np.linspace(0, video_length - 1, target_length)
    #Round them to nearest integers so they can serve as frame indices
    indices = np.round(indices).astype(int)
    return indices

In [31]:
def load_imgs(paths, max_length=300, input_size=224, resize=(224, 224), is_train=False):
    """Loads and preprocesses images from the given paths, applies sampling or padding as necessary."""

    # Define data transformations
    data_transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.RandomCrop(size=(224,224)),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.5),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    #subsample every second frame
    paths = paths[::2]

    frame_count = len(paths)

    # Random sampling if more than max_length frames
    if frame_count > max_length:
        sample_frame_indices = sample_frames(frame_count, 300)
        paths = [paths[x] for x in sample_frame_indices]
    else:
        # Zero-padding if fewer than max_length frames
        paths = paths + [0] * (max_length - frame_count)

    mask = [0 if p != 0 else 1 for p in paths]
    mask = torch.tensor(mask, dtype=torch.bool) # Convert to a PyTorch tensor

    # Initialize tensor to store images
    imgs = torch.zeros(max_length, 3, input_size, input_size)

    # Load and process each image
    for i, img_path in enumerate(paths):
        if img_path != 0:  # Skip padding elements
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = Image.fromarray(img)
            img = data_transform(img)
            imgs[i] = img  # Assign to tensor

    return imgs, mask  # Returns tensor of shape (max_length, 3, input_size, input_size)

In [10]:
def generate_embeddings_for_all_videos(model, root_folder, limit, batch_size=8, device='cuda'):
    model = model.to(device).eval()  # Move model to device and set to eval mode

    video_folders = get_video_folders(root_folder, limit)  # Get video subfolders
    all_embeddings = []  # Store embeddings for each video
    masks = [] #Masks for each folder
    current_batch = []  # Collect frames for a batch of videos
    folder_batches = [video_folders[i:i + batch_size] for i in range(0, len(video_folders), batch_size)]
    for folder_batch in tqdm(folder_batches, desc="Processing videos in batches"):
        # Load and preprocess each video in the current batch
        for folder in folder_batch:
            imgs, mask = process_video_folder(folder)
            current_batch.append(imgs)  # Shape: (300, 3, H, W) for each video
            masks.append(mask)

        # Stack batch of videos and move to GPU
        batch_tensor = torch.stack(current_batch).to(device)  # Shape: (batch_size, 300, 3, H, W)

        # Flatten the batch to create a large tensor of frames
        # DINOv2 expects input in the format (batch_size, channels, height, width)
        batch_tensor = batch_tensor.view(-1, 3, 224, 224)  # Shape: (batch_size * 300, 3, H, W)

        # Generate embeddings
        with torch.no_grad():
            # Get the 'last_hidden_state' from the model output
            batch_embeddings = model(batch_tensor)  # Shape: (batch_size * 300, embedding_dim)

        # Reshape embeddings back to (batch_size, 300, embedding_dim)
        batch_embeddings = batch_embeddings.reshape(batch_size, 300, -1)

        # Move embeddings to CPU and store
        all_embeddings.extend(batch_embeddings.cpu())
        # Clear current batch for the next set
        current_batch = []

    masks_tensor = torch.stack(masks)
    return all_embeddings, masks_tensor  # List of tensors, each of shape (300, embedding_dim)

In [13]:
train_root_folder = "/content/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/train"
val_root_folder = "/content/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/dev"
test_root_folder = "/content/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/test"

In [11]:
# Load Supervised DeiT-Small Patch32 model (384-dimensional CLS token)
model = timm.create_model('vit_small_patch32_224', pretrained=True)
# Remove classification head
model.head = torch.nn.Identity()
# Custom forward: return only the CLS token embedding
def forward_cls(x):
    # x: (batch*frames, 3, 224, 224)
    tokens = model.forward_features(x)      # (batch*frames, 1+49, 384)
    cls_token = tokens[:, 0, :]            # (batch*frames, 384)
    return cls_token
model.forward = forward_cls

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/91.5M [00:00<?, ?B/s]

In [None]:
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')

In [32]:
iterations = 3
train_embeddings = []
train_masks = None
for i in range(iterations):
  current_train_embeddings, current_train_masks = generate_embeddings_for_all_videos(model, train_root_folder, 0)
  train_embeddings.extend(current_train_embeddings)
  if train_masks == None: train_masks = current_train_masks.clone().detach()
  else: train_masks = torch.cat((train_masks, current_train_masks), dim=0)

Processing videos in batches: 100%|██████████| 887/887 [1:17:26<00:00,  5.24s/it]
Processing videos in batches: 100%|██████████| 887/887 [1:18:06<00:00,  5.28s/it]
Processing videos in batches: 100%|██████████| 887/887 [1:18:29<00:00,  5.31s/it]


In [15]:
train_embeddings, train_masks = generate_embeddings_for_all_videos(model, train_root_folder, 0)
print(f"\nGenerated embeddings for {len(train_embeddings)} videos.")

val_embeddings, val_masks = generate_embeddings_for_all_videos(model, val_root_folder, 0)
print(f"\nGenerated embeddings for {len(val_embeddings)} videos.")

test_embeddings, test_masks = generate_embeddings_for_all_videos(model, test_root_folder, 0)
print(f"\nGenerated embeddings for {len(test_embeddings)} videos.")

Processing videos in batches: 100%|██████████| 887/887 [43:01<00:00,  2.91s/it]



Generated embeddings for 7096 videos.


Processing videos in batches: 100%|██████████| 65/65 [02:49<00:00,  2.61s/it]



Generated embeddings for 520 videos.


Processing videos in batches: 100%|██████████| 81/81 [04:08<00:00,  3.07s/it]


Generated embeddings for 648 videos.





In [33]:
with open('/content/drive/MyDrive/bitirme/dataset/val_labels.pkl', 'rb') as f:
        val_labels = pickle.load(f)
with open('/content/drive/MyDrive/bitirme/dataset/test_labels.pkl', 'rb') as f:
        test_labels = pickle.load(f)
with open('/content/drive/MyDrive/bitirme/dataset/train_labels.pkl', 'rb') as f:
        labels = pickle.load(f)

In [34]:
train_labels = labels * 3

In [18]:
import pickle
import h5py

In [21]:
def save_hdf5(embeddings, labels, masks, output_path):
    with h5py.File(output_path, 'w') as hf:
        hf.create_dataset('embeddings', data=np.array(embeddings, dtype=np.float32), compression="gzip")
        hf.create_dataset('labels', data=np.array(labels, dtype=np.int64))
        hf.create_dataset('masks', data=np.array(masks, dtype=bool))

In [27]:
for i, emb in enumerate(val_embeddings):
    print(f"Embedding {i} : {emb.shape}")

Embedding 0 : torch.Size([300, 384])
Embedding 1 : torch.Size([300, 384])
Embedding 2 : torch.Size([300, 384])
Embedding 3 : torch.Size([300, 384])
Embedding 4 : torch.Size([300, 384])
Embedding 5 : torch.Size([300, 384])
Embedding 6 : torch.Size([300, 384])
Embedding 7 : torch.Size([300, 384])
Embedding 8 : torch.Size([300, 384])
Embedding 9 : torch.Size([300, 384])
Embedding 10 : torch.Size([300, 384])
Embedding 11 : torch.Size([300, 384])
Embedding 12 : torch.Size([300, 384])
Embedding 13 : torch.Size([300, 384])
Embedding 14 : torch.Size([300, 384])
Embedding 15 : torch.Size([300, 384])
Embedding 16 : torch.Size([300, 384])
Embedding 17 : torch.Size([300, 384])
Embedding 18 : torch.Size([300, 384])
Embedding 19 : torch.Size([300, 384])
Embedding 20 : torch.Size([300, 384])
Embedding 21 : torch.Size([300, 384])
Embedding 22 : torch.Size([300, 384])
Embedding 23 : torch.Size([300, 384])
Embedding 24 : torch.Size([300, 384])
Embedding 25 : torch.Size([300, 384])
Embedding 26 : torch.S

In [28]:
val_embeddings = val_embeddings[:510]
val_masks = val_masks[:510]

In [24]:
test_masks.shape

torch.Size([640, 300])

In [39]:
len(train_labels)

21288

In [40]:
save_hdf5(train_embeddings,
          train_labels,
          train_masks,
          '/content/drive/MyDrive/bitirme/dataset/vit_train_data_aug.h5')

In [29]:
save_hdf5(val_embeddings,
          val_labels,
          val_masks,
          '/content/drive/MyDrive/bitirme/dataset/vit_val_data.h5')

In [30]:
save_hdf5(test_embeddings,
          test_labels,
          test_masks,
          '/content/drive/MyDrive/bitirme/dataset/vit_test_data.h5')

In [41]:
drive.flush_and_unmount()