In [28]:
# -----------------------------------------------------------------------------
# Imports
# -----------------------------------------------------------------------------
import os
from pathlib import Path
from typing import List, Tuple

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as T
from PIL import Image
import numpy as np
from tqdm.notebook import tqdm

# The transformers VideoMAE model & Feature Extractor
from transformers import VideoMAEForVideoClassification, VideoMAEFeatureExtractor

In [29]:
# -----------------------------------------------------------------------------
# Config / Hyperparameters (small test run defaults)
# -----------------------------------------------------------------------------
ROOT_FRAMES = Path(r"C:\Users\rayaa\Downloads\ucf_crime_v2\ucf_crime_frames")  # change if needed
SPLITS = ["Train", "Test", "Validation"]
NUM_FRAMES = 16          # clip length sampled uniformly across each video
RESOLUTION = 224        # square resolution (you chose 224)
BATCH_SIZE = 4          # small for test run; increase if you have GPU memory
NUM_WORKERS = 0
LR = 2e-4
EPOCHS = 2              # minimal smoke test
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_CLASSES = None      # will be inferred from dataset
SEED = 42

torch.manual_seed(SEED)

<torch._C.Generator at 0x287d1a13b30>

In [30]:
# -----------------------------------------------------------------------------
# Dataset: folder-of-frames where each video is a folder of PNG frames
# We will uniformly sample NUM_FRAMES frames across the available frames.
# Output tensor shape: (C, T, H, W) where C=3, T=NUM_FRAMES
# -----------------------------------------------------------------------------
class UCFFolderFramesDataset(Dataset):
    def __init__(self, root: Path, split: str = "Train", num_frames: int = 16, resolution: int = 224, class_names: List[str] = None):
        self.root = Path(root) / split
        self.num_frames = num_frames
        self.resolution = resolution
        self.samples = []  # list of tuples (video_folder_path, class_idx)

        # gather classes
        if class_names is None:
            self.classes = sorted([p.name for p in self.root.iterdir() if p.is_dir()])
        else:
            self.classes = class_names

        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}

        # iterate classes and videos
        for class_name in self.classes:
            class_dir = self.root / class_name
            if not class_dir.exists():
                continue
            for video_folder in class_dir.iterdir():
                if video_folder.is_dir():
                    # count image files (common extensions)
                    frame_files = sorted([p for p in video_folder.iterdir() if p.suffix.lower() in [".png"]])
                    if len(frame_files) < 2:
                        continue
                    self.samples.append((video_folder, self.class_to_idx[class_name]))

        # transforms for each frame
        self.transform = T.Compose([
            T.Resize((self.resolution, self.resolution)),
            T.ToTensor(),  # produces [C, H, W] float in [0,1]
            # we will not normalize here, as we'll rely on the VideoMAE feature extractor if necessary
        ])

    def __len__(self):
        return len(self.samples)

    def _uniform_sample_indices(self, num_total: int) -> List[int]:
        """Return list of indices (length self.num_frames) uniformly sampled across [0, num_total-1].
        If num_total < num_frames, we will pad by repeating the last frame.
        """
        if num_total <= 0:
            raise ValueError("video has no frames")
        if num_total >= self.num_frames:
            # linspace indices rounded to int
            indices = np.linspace(0, num_total - 1, num=self.num_frames, dtype=int).tolist()
        else:
            # take all frames and repeat last
            indices = list(range(num_total))
            while len(indices) < self.num_frames:
                indices.append(num_total - 1)
        return indices

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        video_folder, class_idx = self.samples[idx]
        frame_files = sorted([p for p in video_folder.iterdir() if p.suffix.lower() in [".png"]])
        num_total = len(frame_files)
        indices = self._uniform_sample_indices(num_total)

        frames = []
        for i in indices:
            img = Image.open(frame_files[i]).convert("RGB")
            frames.append(self.transform(img))  # [C,H,W]

        # stack frames -> shape [T, C, H, W]
        frames = torch.stack(frames, dim=0)
        # reorder to [C, T, H, W] which VideoMAE expects
        # frames = frames.permute(1, 0, 2, 3).contiguous()

        return frames, class_idx

In [31]:
# -----------------------------------------------------------------------------
# Utility: build datasets + dataloaders (small subset mode for quick test)
# -----------------------------------------------------------------------------
def build_dataloaders(root_frames: Path, num_frames: int = NUM_FRAMES, resolution: int = RESOLUTION, batch_size: int = BATCH_SIZE, small_run: bool = True):
    train_root = root_frames / "Train"
    # infer classes from train
    classes = sorted([p.name for p in train_root.iterdir() if p.is_dir()])
    print(f"Detected classes ({len(classes)}): {classes}")

    train_ds = UCFFolderFramesDataset(root_frames, split="Train", num_frames=num_frames, resolution=resolution, class_names=classes)
    test_ds = UCFFolderFramesDataset(root_frames, split="Test", num_frames=num_frames, resolution=resolution, class_names=classes)

    # For a minimal smoke test, use small subsets
    if small_run:
        # pick at most 100 train samples, 50 test samples (or fewer if dataset smaller)
        n_train = min(len(train_ds), 100)
        n_test = min(len(test_ds), 50)
        train_ds, _ = random_split(train_ds, [n_train, max(0, len(train_ds) - n_train)])
        test_ds, _ = random_split(test_ds, [n_test, max(0, len(test_ds) - n_test)])

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    return train_loader, test_loader, classes

In [32]:
# -----------------------------------------------------------------------------
# Model setup: VideoMAE from Hugging Face
# Note: we will use the feature extractor to normalize images appropriately
# -----------------------------------------------------------------------------

def build_model(num_labels: int):
    # load feature extractor to handle normalization / resizing guidance
    feature_extractor = VideoMAEFeatureExtractor()

    # instantiate the model for video classification; use a small pretrained checkpoint if available
    model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base", num_labels=num_labels)
    # If the above checkpoint isn't available locally/internet, fallback to random initialization
    # model = VideoMAEForVideoClassification.from_config(config)

    return model, feature_extractor

In [33]:
# -----------------------------------------------------------------------------
# Training + evaluation loops (minimal)
# -----------------------------------------------------------------------------

def train_one_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for batch in tqdm(dataloader, desc="Train", leave=False):
        videos, labels = batch  # videos: [B, C, T, H, W]
        videos = videos.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values=videos, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item() * videos.size(0)
        preds = logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += videos.size(0)

    avg_loss = total_loss / total if total > 0 else 0.0
    acc = correct / total if total > 0 else 0.0
    return avg_loss, acc


def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval", leave=False):
            videos, labels = batch
            videos = videos.to(device)
            labels = labels.to(device)

            outputs = model(pixel_values=videos, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item() * videos.size(0)
            preds = logits.argmax(dim=-1)
            correct += (preds == labels).sum().item()
            total += videos.size(0)

    avg_loss = total_loss / total if total > 0 else 0.0
    acc = correct / total if total > 0 else 0.0
    return avg_loss, acc

In [34]:
# -----------------------------------------------------------------------------
# Main: build dataloaders, model, train and evaluate (small run)
# -----------------------------------------------------------------------------

def main_small_run():
    train_loader, test_loader, classes = build_dataloaders(ROOT_FRAMES, num_frames=NUM_FRAMES, resolution=RESOLUTION, batch_size=BATCH_SIZE, small_run=True)
    num_labels = len(classes)

    model, feature_extractor = build_model(num_labels)
    model.to(DEVICE)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

    print(f"Starting training on device={DEVICE} with {num_labels} classes")

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch+1}/{EPOCHS}")
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, DEVICE)
        print(f" Train loss: {train_loss:.4f}  acc: {train_acc:.4f}")

        val_loss, val_acc = evaluate(model, test_loader, DEVICE)
        print(f" Eval loss:  {val_loss:.4f}  acc: {val_acc:.4f}")

    # Save a small checkpoint
    ckpt_path = Path("videomae_small_run.pth")
    torch.save(model.state_dict(), ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

In [35]:

# -----------------------------------------------------------------------------
# Run the small test run
# -----------------------------------------------------------------------------
main_small_run()

# -----------------------------------------------------------------------------
# Notes / next steps
# - The script uses MCG-NJU/videomae-base pretrained weights as an example; change if you prefer
# - For large scale training: use gradient accumulation, mixed precision, more workers, and a scheduler
# - Consider using the VideoMAEFeatureExtractor to pre-process frames into pixel_values; here we rely on basic tensor transforms.
# - If you encounter shape mismatches, make sure the model expects video shape: (batch, channels, frames, height, width)
# - For reproducibility, set deterministic flags and seeds for data loading.


Detected classes (14): ['Abuse', 'Arrest', 'Arson', 'Assault', 'Burglary', 'Explosion', 'Fighting', 'NormalVideos', 'RoadAccidents', 'Robbery', 'Shooting', 'Shoplifting', 'Stealing', 'Vandalism']


Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training on device=cuda with 14 classes
Epoch 1/2


Train:   0%|          | 0/25 [00:00<?, ?it/s]

 Train loss: 2.7330  acc: 0.1100


Eval:   0%|          | 0/13 [00:00<?, ?it/s]

 Eval loss:  2.8125  acc: 0.1000
Epoch 2/2


Train:   0%|          | 0/25 [00:00<?, ?it/s]

 Train loss: 2.5954  acc: 0.1100


Eval:   0%|          | 0/13 [00:00<?, ?it/s]

 Eval loss:  2.8855  acc: 0.0400
Saved checkpoint to videomae_small_run.pth


In [38]:
# VideoMAE training notebook (filled)
# This notebook implements a minimal VideoMAE training + evaluation pipeline
# using the frame-extracted UCF Crime dataset ("ucf_crime_frames/").
#
# Choices you requested:
# 1) Frame sampling: Uniform sample N frames evenly across the video
# 2) Input resolution: 224x224
# 3) Run mode: Minimal test run (small subset) to verify training loop

# Requirements
# - torch, torchvision
# - transformers (for VideoMAE model and feature extractor)\# - tqdm
# - pillow
# - optionally accelerate for fp16 (not used here)

# If any of these are missing, install via pip, for example:
# !pip install torch torchvision transformers tqdm pillow

# -----------------------------------------------------------------------------
# Imports
# -----------------------------------------------------------------------------
import os
from pathlib import Path
from typing import List, Tuple

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as T
from PIL import Image
import numpy as np
from tqdm.notebook import tqdm

# The transformers VideoMAE model & Feature Extractor
from transformers import VideoMAEForVideoClassification, VideoMAEFeatureExtractor

# -----------------------------------------------------------------------------
# Config / Hyperparameters (small test run defaults)
# -----------------------------------------------------------------------------
ROOT_FRAMES = Path(r"C:\Users\rayaa\Downloads\ucf_crime_v2\ucf_crime_frames")
SPLITS = ["Train", "Test", "Validation"]
NUM_FRAMES = 16          # clip length sampled uniformly across each video
RESOLUTION = 224        # square resolution (you chose 224)
BATCH_SIZE = 4          # small for test run; increase if you have GPU memory
# IMPORTANT FIX for Windows multiprocessing crash during DataLoader
# Windows + PIL image loading often crashes worker processes when using multiple workers.
# Set NUM_WORKERS = 0 to avoid multiprocessing issues.
NUM_WORKERS = 0
LR = 2e-4
EPOCHS = 10              # minimal smoke test
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_CLASSES = None      # will be inferred from dataset
SEED = 42

torch.manual_seed(SEED)

# -----------------------------------------------------------------------------
# Dataset: folder-of-frames where each video is a folder of PNG frames
# We will uniformly sample NUM_FRAMES frames across the available frames.
# Output tensor shape: (C, T, H, W) where C=3, T=NUM_FRAMES
# -----------------------------------------------------------------------------
class UCFFolderFramesDataset(Dataset):
    def __init__(self, root: Path, split: str = "Train", num_frames: int = 16, resolution: int = 224, class_names: List[str] = None):
        self.root = Path(root) / split
        self.num_frames = num_frames
        self.resolution = resolution
        self.samples = []  # list of tuples (video_folder_path, class_idx)

        # gather classes
        if class_names is None:
            self.classes = sorted([p.name for p in self.root.iterdir() if p.is_dir()])
        else:
            self.classes = class_names

        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}

        # iterate classes and videos
        for class_name in self.classes:
            class_dir = self.root / class_name
            if not class_dir.exists():
                continue
            for video_folder in class_dir.iterdir():
                if video_folder.is_dir():
                    # count image files (common extensions)
                    frame_files = sorted([p for p in video_folder.iterdir() if p.suffix.lower() in [".png", ".jpg", ".jpeg"]])
                    if len(frame_files) < 2:
                        continue
                    self.samples.append((video_folder, self.class_to_idx[class_name]))

        # transforms for each frame
        self.transform = T.Compose([
            T.Resize((self.resolution, self.resolution)),
            T.ToTensor(),  # produces [C, H, W] float in [0,1]
            # we will not normalize here, as we'll rely on the VideoMAE feature extractor if necessary
        ])

    def __len__(self):
        return len(self.samples)

    def _uniform_sample_indices(self, num_total: int) -> List[int]:
        """Return list of indices (length self.num_frames) uniformly sampled across [0, num_total-1].
        If num_total < num_frames, we will pad by repeating the last frame.
        """
        if num_total <= 0:
            raise ValueError("video has no frames")
        if num_total >= self.num_frames:
            # linspace indices rounded to int
            indices = np.linspace(0, num_total - 1, num=self.num_frames, dtype=int).tolist()
        else:
            # take all frames and repeat last
            indices = list(range(num_total))
            while len(indices) < self.num_frames:
                indices.append(num_total - 1)
        return indices

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        video_folder, class_idx = self.samples[idx]
        frame_files = sorted([p for p in video_folder.iterdir() if p.suffix.lower() in [".png", ".jpg", ".jpeg"]])
        num_total = len(frame_files)
        indices = self._uniform_sample_indices(num_total)

        frames = []
        for i in indices:
            img = Image.open(frame_files[i]).convert("RGB")
            frames.append(self.transform(img))  # [C,H,W]

        # stack frames -> shape [T, C, H, W]
        frames = torch.stack(frames, dim=0)

        return frames, class_idx

# -----------------------------------------------------------------------------
# Utility: build datasets + dataloaders (small subset mode for quick test)
# -----------------------------------------------------------------------------
def build_dataloaders(root_frames: Path, num_frames: int = NUM_FRAMES, resolution: int = RESOLUTION, batch_size: int = BATCH_SIZE, small_run: bool = True):
    train_root = root_frames / "Train"
    # infer classes from train
    classes = sorted([p.name for p in train_root.iterdir() if p.is_dir()])
    print(f"Detected classes ({len(classes)}): {classes}")

    train_ds = UCFFolderFramesDataset(root_frames, split="Train", num_frames=num_frames, resolution=resolution, class_names=classes)
    test_ds = UCFFolderFramesDataset(root_frames, split="Test", num_frames=num_frames, resolution=resolution, class_names=classes)

    # For a minimal smoke test, use small subsets
    if small_run:
        # pick at most 100 train samples, 50 test samples (or fewer if dataset smaller)
        n_train = min(len(train_ds), 100)
        n_test = min(len(test_ds), 50)
        train_ds, _ = random_split(train_ds, [n_train, max(0, len(train_ds) - n_train)])
        test_ds, _ = random_split(test_ds, [n_test, max(0, len(test_ds) - n_test)])

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    return train_loader, test_loader, classes

# -----------------------------------------------------------------------------
# Model setup: VideoMAE from Hugging Face
# Note: we will use the feature extractor to normalize images appropriately
# -----------------------------------------------------------------------------

def build_model(num_labels: int):
    # load feature extractor to handle normalization / resizing guidance
    feature_extractor = VideoMAEFeatureExtractor()

    # instantiate the model for video classification; use a small pretrained checkpoint if available
    model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base", num_labels=num_labels)
    # If the above checkpoint isn't available locally/internet, fallback to random initialization
    # model = VideoMAEForVideoClassification.from_config(config)

    return model, feature_extractor

# -----------------------------------------------------------------------------
# Training + evaluation loops (minimal)
# -----------------------------------------------------------------------------

def train_one_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for batch in tqdm(dataloader, desc="Train", leave=False):
        videos, labels = batch  # videos: [B, C, T, H, W]
        videos = videos.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values=videos, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item() * videos.size(0)
        preds = logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += videos.size(0)

    avg_loss = total_loss / total if total > 0 else 0.0
    acc = correct / total if total > 0 else 0.0
    return avg_loss, acc


def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval", leave=False):
            videos, labels = batch
            videos = videos.to(device)
            labels = labels.to(device)

            outputs = model(pixel_values=videos, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item() * videos.size(0)
            preds = logits.argmax(dim=-1)
            correct += (preds == labels).sum().item()
            total += videos.size(0)

    avg_loss = total_loss / total if total > 0 else 0.0
    acc = correct / total if total > 0 else 0.0
    return avg_loss, acc

# -----------------------------------------------------------------------------
# Main: build dataloaders, model, train and evaluate (small run)
# -----------------------------------------------------------------------------

def main_small_run():
    train_loader, test_loader, classes = build_dataloaders(ROOT_FRAMES, num_frames=NUM_FRAMES, resolution=RESOLUTION, batch_size=BATCH_SIZE, small_run=False)
    num_labels = len(classes)

    model, feature_extractor = build_model(num_labels)
    model.to(DEVICE)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

    print(f"Starting training on device={DEVICE} with {num_labels} classes")

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch+1}/{EPOCHS}")
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, DEVICE)
        print(f" Train loss: {train_loss:.4f}  acc: {train_acc:.4f}")

        val_loss, val_acc = evaluate(model, test_loader, DEVICE)
        print(f" Eval loss:  {val_loss:.4f}  acc: {val_acc:.4f}")

    # Save a small checkpoint
    ckpt_path = Path("videomae_small_run.pth")
    torch.save(model.state_dict(), ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

# -----------------------------------------------------------------------------
# Run the small test run
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    main_small_run()

# -----------------------------------------------------------------------------
# Notes / next steps
# - The script uses MCG-NJU/videomae-base pretrained weights as an example; change if you prefer
# - For large scale training: use gradient accumulation, mixed precision, more workers, and a scheduler
# - Consider using the VideoMAEFeatureExtractor to pre-process frames into pixel_values; here we rely on basic tensor transforms.
# - If you encounter shape mismatches, make sure the model expects video shape: (batch, channels, frames, height, width)
# - For reproducibility, set deterministic flags and seeds for data loading.


Detected classes (14): ['Abuse', 'Arrest', 'Arson', 'Assault', 'Burglary', 'Explosion', 'Fighting', 'NormalVideos', 'RoadAccidents', 'Robbery', 'Shooting', 'Shoplifting', 'Stealing', 'Vandalism']


Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training on device=cuda with 14 classes
Epoch 1/10


Train:   0%|          | 0/140 [00:00<?, ?it/s]

 Train loss: 2.7139  acc: 0.0768


Eval:   0%|          | 0/18 [00:00<?, ?it/s]

 Eval loss:  2.6248  acc: 0.1429
Epoch 2/10


Train:   0%|          | 0/140 [00:00<?, ?it/s]

 Train loss: 2.6564  acc: 0.0929


Eval:   0%|          | 0/18 [00:00<?, ?it/s]

 Eval loss:  2.6280  acc: 0.1143
Epoch 3/10


Train:   0%|          | 0/140 [00:00<?, ?it/s]

 Train loss: 2.6509  acc: 0.0911


Eval:   0%|          | 0/18 [00:00<?, ?it/s]

 Eval loss:  2.6178  acc: 0.0857
Epoch 4/10


Train:   0%|          | 0/140 [00:00<?, ?it/s]

 Train loss: 2.6156  acc: 0.1071


Eval:   0%|          | 0/18 [00:00<?, ?it/s]

 Eval loss:  2.5830  acc: 0.1286
Epoch 5/10


Train:   0%|          | 0/140 [00:00<?, ?it/s]

 Train loss: 2.5868  acc: 0.1125


Eval:   0%|          | 0/18 [00:00<?, ?it/s]

 Eval loss:  2.5962  acc: 0.1286
Epoch 6/10


Train:   0%|          | 0/140 [00:00<?, ?it/s]

 Train loss: 2.5856  acc: 0.1250


Eval:   0%|          | 0/18 [00:00<?, ?it/s]

 Eval loss:  2.5851  acc: 0.1000
Epoch 7/10


Train:   0%|          | 0/140 [00:00<?, ?it/s]

 Train loss: 2.5504  acc: 0.1321


Eval:   0%|          | 0/18 [00:00<?, ?it/s]

 Eval loss:  2.6460  acc: 0.0429
Epoch 8/10


Train:   0%|          | 0/140 [00:00<?, ?it/s]

 Train loss: 2.5300  acc: 0.1357


Eval:   0%|          | 0/18 [00:00<?, ?it/s]

 Eval loss:  2.5878  acc: 0.2000
Epoch 9/10


Train:   0%|          | 0/140 [00:00<?, ?it/s]

 Train loss: 2.5123  acc: 0.1375


Eval:   0%|          | 0/18 [00:00<?, ?it/s]

 Eval loss:  2.5962  acc: 0.1000
Epoch 10/10


Train:   0%|          | 0/140 [00:00<?, ?it/s]

 Train loss: 2.4724  acc: 0.1464


Eval:   0%|          | 0/18 [00:00<?, ?it/s]

 Eval loss:  2.7030  acc: 0.0857
Saved checkpoint to videomae_small_run.pth
