Imports

In [1]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoImageProcessor, VideoMAEForVideoClassification, TrainingArguments, Trainer
import evaluate
from PIL import Image
import numpy as np
from pathlib import Path
from tqdm import tqdm

from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

import cv2

  from .autonotebook import tqdm as notebook_tqdm


to use pytorchvideo.transforms, i installed torch version 2.0.1, and torchvision version 0.15.2

newer versions of torchvision remove necessary modules for pytorchvideo

pre 2.0 numpy was also needed to use numpy

In [2]:
import pytorchvideo.data

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)




set root and output directories

In [3]:
train_root = r"C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Train"
test_root = r"C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test"
val_root = r"C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Validation"

output_dir = r"C:\Users\Brandon\Documents\DM_Project\output"
os.makedirs(output_dir, exist_ok=True)


Verify Data Loading

In [4]:
def count_videos(root_dir):
    total = 0
    for cls in os.listdir(root_dir):
        cls_path = os.path.join(root_dir, cls)
        if not os.path.isdir(cls_path):
            continue
        # Count subdirectories (each subdirectory is a video folder)
        num_videos = sum(
            1 for file in os.listdir(cls_path)
            if file.lower().endswith('.mp4')
        )
        print(f"{cls}: {num_videos} videos")
        total += num_videos
    return total

train_count = count_videos(train_root)
test_count = count_videos(test_root)
val_count = count_videos(val_root)

print(f"\nTotal Train videos: {train_count}")
print(f"Total Test videos: {test_count}")
print(f"Total Validation videos: {val_count}")

Abuse: 40 videos
Arrest: 40 videos
Arson: 40 videos
Assault: 40 videos
Burglary: 80 videos
Explosion: 40 videos
Fighting: 40 videos
NormalVideos: 170 videos
RoadAccidents: 120 videos
Robbery: 120 videos
Shooting: 40 videos
Shoplifting: 40 videos
Stealing: 80 videos
Vandalism: 40 videos
Abuse: 5 videos
Arrest: 5 videos
Arson: 5 videos
Assault: 5 videos
Burglary: 10 videos
Explosion: 5 videos
Fighting: 5 videos
NormalVideos: 150 videos
RoadAccidents: 15 videos
Robbery: 15 videos
Shooting: 5 videos
Shoplifting: 5 videos
Stealing: 10 videos
Vandalism: 5 videos
Abuse: 5 videos
Arrest: 5 videos
Arson: 5 videos
Assault: 5 videos
Burglary: 10 videos
Explosion: 5 videos
Fighting: 5 videos
NormalVideos: 19 videos
RoadAccidents: 15 videos
Robbery: 15 videos
Shooting: 5 videos
Shoplifting: 5 videos
Stealing: 10 videos
Vandalism: 5 videos

Total Train videos: 930
Total Test videos: 245
Total Validation videos: 114


**Process Data**

Map Video Labels

In [5]:
class_labels = sorted(os.listdir(train_root))
num_classes = len(class_labels)
label2id = {label: idx for idx, label in enumerate(class_labels)}
id2label = {idx: label for label, idx in label2id.items()}

In [6]:
def MapAndPath(root, label2id):
    videopaths = []

    for clss in sorted(os.listdir(root)):
        class_dir = os.path.join(root, clss)
        if not os.path.isdir(class_dir):
            continue

        for file in os.listdir(class_dir):
            if file.lower().endswith('.mp4'):
                filepath = os.path.join(class_dir, file)
                label_id = label2id[clss]
                videopaths.append((filepath, {"label": label_id, "video_path": filepath}))
                
    return videopaths
    

trying to wrap the class to properly allow getting the length and getting items. cv2 was used to try and help some videos being skipped in loading, but its not quite working

In [14]:
import numpy as np
import cv2
import torch
from torch.utils.data import Dataset

class SafeLabeledVideoDataset(Dataset):
    def __init__(self, labeled_video_paths, transform=None, num_frames=16, frame_size=224):
        self.labeled_video_paths = labeled_video_paths
        self.transform = transform
        self.num_frames = num_frames
        self.frame_size = frame_size
        self.length = len(labeled_video_paths)

    def __len__(self):
        return self.length

    def _load_video_cv2(self, video_path):
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError("Could not open video")

        frames = []
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_indices = np.linspace(0, total_frames - 1, self.num_frames, dtype=int)

        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (self.frame_size, self.frame_size))
            frames.append(torch.from_numpy(frame.astype(np.float32) / 255.0).permute(2, 0, 1))  # (C, H, W)

        cap.release()

        if len(frames) == 0:
            raise ValueError("No frames extracted")

        video_tensor = torch.stack(frames, dim=1)  # (C, T, H, W)
        return video_tensor

    def __getitem__(self, idx):
        video_path, metadata = self.labeled_video_paths[idx]
        try:
            video_tensor = self._load_video_cv2(video_path)
            print(f"After load: {video_tensor.shape}")

            if self.transform:
                video_dict = {"video": video_tensor}
                print(f"Before transform: {video_dict['video'].shape}")
                video_dict = self.transform(video_dict)
                print(f"After transform: {video_dict['video'].shape}")
                video_tensor = video_dict["video"]

            return {
                "video": video_tensor,
                "label": metadata["label"],
                "video_path": video_path,
            }

        except Exception as e:
            print(f"[LOAD FAILED] {video_path} | Error: {e}")
            import traceback
            traceback.print_exc()  # This will show the full error trace
            dummy = torch.zeros(3, self.num_frames, self.frame_size, self.frame_size)
            return {"video": dummy, "label": metadata["label"], "video_path": video_path}

Load Model and Processor

In [8]:
import transformers.utils as utils
utils.is_torch_available = lambda: True

In [9]:
model_name = "MCG-NJU/videomae-base"
image_processor = AutoImageProcessor.from_pretrained(model_name)
model = VideoMAEForVideoClassification.from_pretrained(model_name, num_labels=num_classes, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VideoMAEForVideoClassification(
  (videomae): VideoMAEModel(
    (embeddings): VideoMAEEmbeddings(
      (patch_embeddings): VideoMAEPatchEmbeddings(
        (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
      )
    )
    (encoder): VideoMAEEncoder(
      (layer): ModuleList(
        (0-11): 12 x VideoMAELayer(
          (attention): VideoMAEAttention(
            (attention): VideoMAESelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): VideoMAESelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): VideoMAEIntermediate(
            (dense): Lin

transforming videos in a similar way to hugging face documentation

In [10]:
mean = image_processor.image_mean
std = image_processor.image_std
if "shortest_edge" in image_processor.size:
    height = width = image_processor.size["shortest_edge"]
else:
    height = image_processor.size["height"]
    width = image_processor.size["width"]
resize_to = (height, width)

num_frames_to_sample = model.config.num_frames
sample_rate = 4
fps = 30
clip_duration = num_frames_to_sample * sample_rate / fps

train_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    RandomShortSideScale(min_size=256, max_size=320),
                    RandomCrop(resize_to),
                    RandomHorizontalFlip(p=0.5),
                ]
            ),
        ),
    ]
)

val_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    Resize(resize_to),
                ]
            ),
        ),
    ]
)




create datasets

In [11]:
train_paths = MapAndPath(train_root, label2id)
val_paths = MapAndPath(val_root, label2id)
test_paths = MapAndPath(test_root, label2id)

print(f"Train: {len(train_paths)}, Test: {len(test_paths)}, Val: {len(val_paths)}")

train_dataset = SafeLabeledVideoDataset(
    labeled_video_paths=train_paths,
    transform=train_transform
    
)

test_dataset = SafeLabeledVideoDataset(
    labeled_video_paths=test_paths,
    transform=val_transform
    
)

val_dataset = SafeLabeledVideoDataset(
    labeled_video_paths=val_paths,
    transform=val_transform
    
)


Train: 930, Test: 245, Val: 114


Training

In [12]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    preds = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=preds, references=eval_pred.label_ids)

def collate_fn(examples):
    pixel_values = torch.stack([ex["video"] for ex in examples]) 
    labels = torch.tensor([ex["label"] for ex in examples])
    paths = [ex.get("video_path", "unknown") for ex in examples]
    return {"pixel_values": pixel_values, "labels": labels, "video_path": paths}

num_epochs = 10
batch_size = 3

training_args = TrainingArguments(
        output_dir=str(output_dir),
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=10,
        learning_rate=5e-5,
        warmup_ratio=0.1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        num_train_epochs=num_epochs,
        fp16=torch.cuda.is_available(),
        push_to_hub=False,
        remove_unused_columns=False,
        max_steps=(len(train_dataset) // batch_size) * num_epochs,
    )

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=image_processor,
        compute_metrics=compute_metrics,
        data_collator=collate_fn,
    )


max_steps is given, it will override any value given in num_train_epochs


Test Set

In [18]:
model.eval()
all_preds, all_labels, failed_paths = [], [], []

test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=0,
    shuffle=False,
    collate_fn=collate_fn,
)

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        try:
            inputs = {k: v.to(device) for k, v in batch.items() if k not in ["labels", "video_path"]}
            labels = batch["labels"].to(device)
            outputs = model(**inputs)
            preds = outputs.logits.argmax(dim=1).cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().tolist())
        except Exception as e:
            print(f"Batch failed: {batch.get('video_path', [])} | {e}")
            failed_paths.extend(batch.get("video_path", []))
            # Add dummy preds to keep length
            all_preds.extend([0] * len(labels))
            all_labels.extend(labels.cpu().tolist())

print(f"Failed videos: {len(failed_paths)}")
if failed_paths:
    print("First 5 failed paths:", failed_paths[:5])

cm = confusion_matrix(all_labels, all_preds, labels=np.arange(num_classes))
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Test Set Confusion Matrix")
plt.tight_layout()
cm_path = output_dir / "confusion_matrix.png"
plt.savefig(cm_path)
plt.close()
print(f"Confusion matrix saved to {cm_path}")

# Classification report
report = classification_report(
    all_labels, all_preds, target_names=class_labels, digits=4
)
report_path = output_dir / "classification_report.txt"
with open(report_path, "w") as f:
    f.write(report)
print("\nClassification report:\n", report)
print(f"Report saved to {report_path}")

acc = np.trace(cm) / np.sum(cm)
print(f"\nOverall Test Accuracy: {acc:.4f}")

Evaluating:   0%|          | 0/82 [00:00<?, ?it/s]

[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Abuse\Abuse003_x264.mp4 | Error: too many indices for tensor of dimension 4
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Abuse\Abuse011_x264.mp4 | Error: too many indices for tensor of dimension 4


Evaluating:   1%|          | 1/82 [00:00<00:52,  1.55it/s]

[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Abuse\Abuse013_x264.mp4 | Error: too many indices for tensor of dimension 4
Batch failed: ['C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Abuse\\Abuse003_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Abuse\\Abuse011_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Abuse\\Abuse013_x264.mp4'] | Make sure that the channel dimension of the pixel values match with the one set in the configuration.
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Abuse\Abuse039_x264.mp4 | Error: too many indices for tensor of dimension 4
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Abuse\Abuse047_x264.mp4 | Error: too many indices for tensor of dimension 4


Evaluating:   2%|▏         | 2/82 [00:01<00:48,  1.66it/s]

[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Arrest\Arrest013_x264.mp4 | Error: too many indices for tensor of dimension 4
Batch failed: ['C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Abuse\\Abuse039_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Abuse\\Abuse047_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Arrest\\Arrest013_x264.mp4'] | Make sure that the channel dimension of the pixel values match with the one set in the configuration.
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Arrest\Arrest015_x264.mp4 | Error: too many indices for tensor of dimension 4
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Arrest\Arrest023_x264.mp4 | Error: too many indices for tensor of dimension 4


Evaluating:   4%|▎         | 3/82 [00:01<00:49,  1.59it/s]

[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Arrest\Arrest024_x264.mp4 | Error: too many indices for tensor of dimension 4
Batch failed: ['C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Arrest\\Arrest015_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Arrest\\Arrest023_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Arrest\\Arrest024_x264.mp4'] | Make sure that the channel dimension of the pixel values match with the one set in the configuration.
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Arrest\Arrest046_x264.mp4 | Error: too many indices for tensor of dimension 4
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Arson\Arson012_x264.mp4 | Error: too many indices for tensor of dimension 4


Evaluating:   5%|▍         | 4/82 [00:02<00:44,  1.75it/s]

[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Arson\Arson021_x264.mp4 | Error: too many indices for tensor of dimension 4
Batch failed: ['C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Arrest\\Arrest046_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Arson\\Arson012_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Arson\\Arson021_x264.mp4'] | Make sure that the channel dimension of the pixel values match with the one set in the configuration.
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Arson\Arson024_x264.mp4 | Error: too many indices for tensor of dimension 4


Evaluating:   6%|▌         | 5/82 [00:02<00:39,  1.95it/s]

[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Arson\Arson035_x264.mp4 | Error: too many indices for tensor of dimension 4
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Arson\Arson048_x264.mp4 | Error: too many indices for tensor of dimension 4
Batch failed: ['C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Arson\\Arson024_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Arson\\Arson035_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Arson\\Arson048_x264.mp4'] | Make sure that the channel dimension of the pixel values match with the one set in the configuration.
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Assault\Assault001_x264.mp4 | Error: too many indices for tensor of dimension 4
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Assault\Assault010_x264.mp4 | Error: too many indices for tensor of dimension 4


Evaluating:   7%|▋         | 6/82 [00:03<00:39,  1.91it/s]

[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Assault\Assault015_x264.mp4 | Error: too many indices for tensor of dimension 4
Batch failed: ['C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Assault\\Assault001_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Assault\\Assault010_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Assault\\Assault015_x264.mp4'] | Make sure that the channel dimension of the pixel values match with the one set in the configuration.
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Assault\Assault022_x264.mp4 | Error: too many indices for tensor of dimension 4
[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Assault\Assault033_x264.mp4 | Error: too many indices for tensor of dimension 4


Evaluating:   9%|▊         | 7/82 [00:03<00:42,  1.77it/s]

[LOAD FAILED] C:\Users\Brandon\Documents\DM_Project\ucf_crime_v2\Test\Burglary\Burglary002_x264.mp4 | Error: too many indices for tensor of dimension 4
Batch failed: ['C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Assault\\Assault022_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Assault\\Assault033_x264.mp4', 'C:\\Users\\Brandon\\Documents\\DM_Project\\ucf_crime_v2\\Test\\Burglary\\Burglary002_x264.mp4'] | Make sure that the channel dimension of the pixel values match with the one set in the configuration.





KeyboardInterrupt: 