# DLFeat - Custom Model Registration & Feature Extraction Examples

This notebook demonstrates:
1. The DLFeat registration API for image, video, audio, text, and multimodal models.
2. How to plug pre-trained PyTorch (and Hugging Face-style) models into DLFeat via the `register_*_model` functions.
3. Extracting features with `DLFeatExtractor` and using them in simple scikit-learn classifiers.
4. Running quick sanity checks and self-tests to validate registered models.


In [6]:
pip install git+https://github.com/emanuelegaliano/DLFeat.git

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/emanuelegaliano/DLFeat.git
  Cloning https://github.com/emanuelegaliano/DLFeat.git to /tmp/pip-req-build-ll7xc951
  Running command git clone --filter=blob:none --quiet https://github.com/emanuelegaliano/DLFeat.git /tmp/pip-req-build-ll7xc951
  Resolved https://github.com/emanuelegaliano/DLFeat.git to commit 6b653b23a9d2fa3df55dc439f15721c9110e8479
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Note: you may need to restart the kernel to use updated packages.


In [7]:
import dlfeat, inspect
# Diagnostics: report which dlfeat installation was imported and whether it
# exposes the registration API used later in this notebook.
dlfeat_location = dlfeat.__file__
has_video_registration = hasattr(dlfeat, "register_video_model")
registration_attrs = list(filter(lambda attr: "register" in attr, dir(dlfeat)))

print("dlfeat file:", dlfeat_location)

print("Has register_video_model?", has_video_registration)
print("Attrs containing 'register':", registration_attrs)

dlfeat file: /home/manu/.local/lib/python3.14/site-packages/dlfeat/__init__.py
Has register_video_model? False
Attrs containing 'register': []


## Video custom model example

This example shows how to define a tiny 3D CNN for videos, train it on a small synthetic dataset, register it in DLFeat with `register_video_model`, and then extract fixed-size feature vectors from raw `.mp4` files using `DLFeatExtractor`.


In [5]:
# Video custom model example: tiny 3D CNN + DLFeat registration

import os
import tempfile

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision

# Import DLFeat. The installed package must export `register_video_model` and
# `DLFeatExtractor` at top level; otherwise this line raises ImportError.
from dlfeat import register_video_model, DLFeatExtractor

# ---------------------------
# Hyperparameters
# ---------------------------
clip_len = 8          # number of frames per clip
frame_size = 64       # spatial resolution (H = W)
feature_dim = 128     # output feature dimension for DLFeat
num_classes = 4
num_train_videos = 32
num_val_videos = 8
batch_size = 4
epochs = 3
device = "cuda" if torch.cuda.is_available() else "cpu"
seed = 0              # fixed seed for PyTorch's random number generators

# Seed PyTorch once, before any random data, weights or shuffling order are
# drawn, so that re-running the notebook from a fresh kernel repeats the same
# random choices (GPU kernels may still introduce small numeric differences).
torch.manual_seed(seed)

# ---------------------------
# 1. Create a tiny synthetic video dataset on disk
# ---------------------------
# Keep a handle on the temporary directory so the generated videos can be
# removed again (explicitly via tmp_dir_handle.cleanup(), or when the handle is
# finalized) instead of accumulating under the system temp directory.
tmp_dir_handle = tempfile.TemporaryDirectory(prefix="dlfeat_tiny_video_")
tmp_root = tmp_dir_handle.name

def make_random_video(path, num_frames=clip_len, size=frame_size,
                      step_x=2, step_y=3, square=8):
    """Create a simple 'moving square' RGB video and save it as MP4.

    Args:
        path: Destination file path of the video.
        num_frames: Number of frames to generate.
        size: Height and width of every frame, in pixels.
        step_x: Horizontal displacement of the square per frame, in pixels.
        step_y: Vertical displacement of the square per frame, in pixels.
        square: Side length of the moving square, in pixels.

    Raises:
        ValueError: If ``square`` is not positive or does not fit strictly
            inside a ``size`` x ``size`` frame.
    """
    if not 0 < square < size:
        raise ValueError(
            f"square ({square}) must be positive and smaller than the frame size ({size})"
        )

    # Range of valid top-left positions; the square wraps around once it
    # would leave the frame.
    travel = size - square

    video = torch.zeros(num_frames, size, size, 3, dtype=torch.uint8)  # [T, H, W, C]
    for t in range(num_frames):
        x0 = (t * step_x) % travel
        y0 = (t * step_y) % travel
        # Bright random texture for the square on a black background
        video[t, y0:y0 + square, x0:x0 + square, :] = torch.randint(
            128, 255, (square, square, 3), dtype=torch.uint8
        )
    torchvision.io.write_video(path, video, fps=8)

def generate_split(n_samples, split_name):
    """Write ``n_samples`` synthetic videos for one split and return them.

    Classes are assigned round-robin (``i % num_classes``). The per-frame
    motion of the square depends on the class, so the label is actually
    reflected in the video content.

    Args:
        n_samples: Number of videos to create.
        split_name: Prefix used in the generated file names (e.g. "train").

    Returns:
        Tuple ``(paths, labels)``: the list of written file paths and a
        ``torch.long`` tensor with the matching class indices.
    """
    paths, labels = [], []
    for i in range(n_samples):
        cls = i % num_classes
        filename = os.path.join(tmp_root, f"{split_name}_{i:03d}_class{cls}.mp4")
        # Tie the motion pattern to the label; otherwise every class would
        # contain the same kind of video and the labels would carry no signal.
        make_random_video(filename, step_x=1 + cls, step_y=1 + 2 * cls)
        paths.append(filename)
        labels.append(cls)
    return paths, torch.tensor(labels, dtype=torch.long)

# Write both splits to disk: training videos first, then validation videos.
(train_paths, train_labels), (val_paths, val_labels) = [
    generate_split(count, tag)
    for count, tag in ((num_train_videos, "train"), (num_val_videos, "val"))
]

class VideoFileDataset(Dataset):
    """Dataset that decodes .mp4 files into fixed-length (C, T, H, W) float tensors.

    Args:
        paths: Sequence of video file paths.
        labels: Indexable collection of labels; ``labels[i]`` is returned
            together with the clip decoded from ``paths[i]``.
        clip_len: Number of frames in every returned clip.
        frame_size: Side length, in pixels, that frames are resized to (H = W).
    """
    def __init__(self, paths, labels, clip_len, frame_size):
        self.paths = paths
        self.labels = labels
        self.clip_len = clip_len
        self.frame_size = frame_size

    def _load_video_tensor(self, path):
        """Decode ``path`` into a float32 clip of shape [C, clip_len, frame_size, frame_size].

        Raises:
            ValueError: If no frame could be decoded from the file.
        """
        # read_video returns (video, audio, info); only the frames are used: [T, H, W, C]
        video, _, _ = torchvision.io.read_video(path, pts_unit="sec")
        num_frames = video.size(0)

        # An empty decode cannot be padded (there is no last frame to repeat)
        # and would otherwise fail later with an unrelated shape error.
        if num_frames == 0:
            raise ValueError(f"No frames could be decoded from video file: {path!r}")

        # Sample or pad to a fixed number of frames
        if num_frames < self.clip_len:
            # Too short: repeat the last frame until the clip is long enough
            pad = video[-1:].repeat(self.clip_len - num_frames, 1, 1, 1)
            video = torch.cat([video, pad], dim=0)
        else:
            # Long enough: pick clip_len evenly spaced frame indices
            idx = torch.linspace(0, num_frames - 1, steps=self.clip_len).long()
            video = video[idx]

        # To [T, C, H, W]
        video = video.permute(0, 3, 1, 2)

        # Resize frames, then convert to float and divide by 255
        video = torchvision.transforms.functional.resize(
            video, [self.frame_size, self.frame_size], antialias=True
        )
        video = video.float() / 255.0

        # Final shape [C, T, H, W]
        return video.permute(1, 0, 2, 3)

    def __len__(self):
        """Return the number of videos in the dataset."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Return ``(clip, label)`` for the video at position ``idx``."""
        video_tensor = self._load_video_tensor(self.paths[idx])
        label = self.labels[idx]
        return video_tensor, label

# Wrap each split in a dataset that decodes clips on demand.
train_ds = VideoFileDataset(
    paths=train_paths, labels=train_labels, clip_len=clip_len, frame_size=frame_size
)
val_ds = VideoFileDataset(
    paths=val_paths, labels=val_labels, clip_len=clip_len, frame_size=frame_size
)

# Only the training batches are shuffled; validation keeps the file order.
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)

# ---------------------------
# 2. Define a tiny 3D CNN backbone + classifier head
# ---------------------------

class TinyVideoBackbone(nn.Module):
    """Compact 3D-convolutional encoder: clip [B, 3, T, H, W] -> embedding [B, feature_dim]."""
    def __init__(self, feature_dim=feature_dim):
        super().__init__()
        conv_stack = [
            nn.Conv3d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool3d((1, 2, 2)),             # halve H and W, leave T untouched
            nn.Conv3d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool3d((None, 1, 1)),  # collapse H and W to 1x1, T is preserved
        ]
        self.features = nn.Sequential(*conv_stack)
        self.proj = nn.Linear(32, feature_dim)

    def forward(self, x):
        """Encode a batch of clips ``x`` of shape [B, C, T, H, W]."""
        spatially_pooled = self.features(x)              # [B, 32, T, 1, 1]
        clip_descriptor = spatially_pooled.mean(dim=2)   # average over time -> [B, 32, 1, 1]
        flat = clip_descriptor.flatten(1)                # drop the 1x1 spatial dims -> [B, 32]
        return self.proj(flat)                           # [B, feature_dim]

class TinyVideoClassifier(nn.Module):
    """Backbone + linear head for quick supervised training.

    Args:
        backbone: Module mapping a batch of inputs to feature vectors.
        num_classes: Number of output logits produced by the head.
    """
    def __init__(self, backbone, num_classes):
        super().__init__()
        self.backbone = backbone
        # Size the head from the backbone's own projection layer when it has
        # one, so a backbone built with a non-default feature size still
        # matches; otherwise fall back to the notebook-level feature_dim.
        proj = getattr(backbone, "proj", None)
        in_features = proj.out_features if isinstance(proj, nn.Linear) else feature_dim
        self.head = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Return class logits of shape [B, num_classes] for the batch ``x``."""
        feats = self.backbone(x)
        return self.head(feats)

# Build the encoder, wrap it with the classification head, then move the
# combined model to the target device.
backbone = TinyVideoBackbone(feature_dim=feature_dim)
model = TinyVideoClassifier(backbone=backbone, num_classes=num_classes)
model.to(device)  # nn.Module.to moves the parameters in place and returns the module

# ---------------------------
# 3. Quick training loop (few epochs on tiny synthetic data)
# ---------------------------

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    model.train()
    running_loss = 0.0  # sum of per-sample losses seen in this epoch

    for videos, labels in train_loader:
        # Move the batch to the training device; videos: [B, C, T, H, W]
        videos, labels = videos.to(device), labels.to(device)

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        logits = model(videos)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size so
        # a smaller final batch is accounted for correctly.
        running_loss += videos.size(0) * loss.item()

    avg_loss = running_loss / len(train_ds)
    print(f"Epoch {epoch+1}/{epochs} - train loss: {avg_loss:.4f}")

# ---------------------------
# 4. Register the trained backbone with DLFeat
# ---------------------------

# Collect the registration settings once, then hand them to DLFeat.
tiny_video_registration = dict(
    model_name="tiny_video_cnn",
    dim=feature_dim,
    model=backbone,          # the trained backbone instance (without the classifier head)
    clip_len=clip_len,
    input_size=frame_size,
    overwrite=True,
)
register_video_model(**tiny_video_registration)

# ---------------------------
# 5. Use DLFeatExtractor to get features for a list of video paths
# ---------------------------

# Look the model up by the name it was registered under.
extractor = DLFeatExtractor("tiny_video_cnn", device=device)

# Any list of .mp4 paths can be used; the first four validation videos are reused here.
video_paths = val_paths[0:4]
features = extractor.transform(video_paths, batch_size=2)

# Expected shape: (N_videos, feature_dim)
print("Extracted feature shape:", features.shape)

ImportError: cannot import name 'register_video_model' from 'dlfeat' (/home/manu/.local/lib/python3.14/site-packages/dlfeat/__init__.py)