# 💫 Welcome to the JEDi Tutorial! 💫

First, install our package with pip:

In [1]:
!pip install videojedi==0.1.2



# Get the Moving MNIST (MMNIST) data loaders.

In [2]:
import torch
def custom_collate(batch):
    """Collate label-free video samples into a ``(videos, labels)`` pair.

    Every element of ``batch`` is treated as one video sample. The samples are
    combined with PyTorch's ``default_collate`` and ``None`` is returned in the
    label position, since no labels are collected.

    Args:
        batch: Iterable of per-sample videos, as handed over by a ``DataLoader``.

    Returns:
        Tuple ``(videos, None)`` where ``videos`` is the default-collated batch.
    """
    # Local import keeps the function self-contained and avoids relying on
    # implicit attribute access through ``torch.utils.data.dataloader``.
    from torch.utils.data.dataloader import default_collate

    return default_collate(list(batch)), None

def get_MMNIST_loader(train=True, batch_size=20, image_size=(240, 320)):
    """Build a shuffled ``DataLoader`` over torchvision's Moving MNIST videos.

    The dataset is stored in (and, if missing, downloaded to) the current
    directory. Each sample is divided by 255, has its channel dimension
    repeated three times and is resized to ``image_size``.

    Args:
        train: Requests ``split="train"`` when true, ``split="test"`` otherwise.
            NOTE(review): per the torchvision docs these splits select the
            first / remaining frames of each sequence rather than disjoint
            sets of videos -- confirm this is the intended comparison.
        batch_size: Number of videos per batch.
        image_size: Target size handed to ``torch.nn.functional.interpolate``.

    Returns:
        A ``DataLoader`` whose batches are built by ``custom_collate``, i.e.
        ``(videos, None)`` pairs.
    """
    from torchvision.datasets import MovingMNIST
    from torch.utils.data import DataLoader
    from torchvision import transforms
    import torch.nn as nn

    # TODO: this should be done by a video-level transform when PyTorch
    # provides transforms.ToTensor() for video.
    transform = transforms.Compose([
        # Scale to float; lands in [0, 1] assuming 8-bit pixel values.
        transforms.Lambda(lambda x: x / 255.),
        # Change from 1 channel to 3 (assumes clips laid out as (T, 1, H, W)).
        transforms.Lambda(lambda x: x.repeat(1, 3, 1, 1)),
        # Rescale every frame to the requested size.
        transforms.Lambda(lambda x: nn.functional.interpolate(x, size=image_size)),
    ])
    mmnist = MovingMNIST(
        ".",
        split="train" if train else "test",
        download=True,
        transform=transform,
    )
    return DataLoader(mmnist, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

# One loader per split, both with the default batch size and frame size.
# The training loader is constructed first, then the test loader.
train_loader, test_loader = (
    get_MMNIST_loader(train=True),
    get_MMNIST_loader(train=False),
)

# Compute metric.

In [3]:
from videojedi import JEDiMetric

# Number of videos embedded per split, and the feature width checked below.
NUM_SAMPLES = 1500
FEATURE_DIM = 1280

# Both the feature files and the model files are kept in the current directory.
jedi = JEDiMetric(feature_path='.', model_dir='.')
jedi.load_features(train_loader=train_loader, test_loader=test_loader, num_samples=NUM_SAMPLES)

# Sanity-check that one feature vector was produced per requested sample.
expected_shape = (NUM_SAMPLES, FEATURE_DIM)
assert jedi.train_features.shape == expected_shape, (
    f"train features have shape {jedi.train_features.shape}, expected {expected_shape}"
)
assert jedi.test_features.shape == expected_shape, (
    f"test features have shape {jedi.test_features.shape}, expected {expected_shape}"
)

INFO:root:loaded params...
INFO:root:SLURM vars not set (distributed training not available)
INFO:root:Initialized (rank/world-size) 0/1
INFO:root:Loading pretrained model from ./vith16.pth.tar
  checkpoint = torch.load(pretrained, map_location='cpu')
INFO:root:loaded pretrained model with msg: <All keys matched successfully>
INFO:root:loaded pretrained encoder from epoch: 300
 path: ./vith16.pth.tar


VisionTransformer(
  (patch_embed): PatchEmbed3D(
    (proj): Conv3d(3, 1280, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
  (blocks): ModuleList(
    (0-31): 32 x Block(
      (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1280, out_features=3840, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (mlp): MLP(
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
)


  checkpoint = torch.load(r_path, map_location=device)
INFO:root:loaded pretrained classifier from epoch 20 with msg: <All keys matched successfully>


Computing features for training set


  self.gen = func(*args, **kwds)


Saved features to ./train.npy
Computing features for testing set
Saved features to ./test.npy


In [4]:
# Compute the metric from the features loaded above, then report it.
jedi_score = jedi.compute_metric()
print(f"JEDi Metric: {jedi_score}")

JEDi Metric: 0.00691339373588562


Alternatively, you can load your pre-extracted V-JEPA features with NumPy and compute the distribution distance directly. (The random arrays below are placeholders for your own features.)

In [5]:
from videojedi import JEDiMetric
import numpy as np

# Seeded generator so the placeholder features, and therefore the printed
# score, are reproducible across kernel restarts.
rng = np.random.default_rng(0)

jedi = JEDiMetric()
# Random stand-ins for pre-extracted features: one row per video, one column
# per feature dimension. Replace these with your own arrays.
jedi.train_features = rng.random((5000, 1280))
jedi.test_features = rng.random((5000, 1280))
print(f"JEDi Metric: {jedi.compute_metric()}")

JEDi Metric: 0.001983016839501861
