In [1]:
import os
import cv2
import time
import torch
import numpy as np
import torch.nn as nn
import tensorflow as tf
import concurrent.futures
import torch.optim as optim
import matplotlib.pyplot as plt
from collections import Counter
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from tensorflow.keras import layers, models
from torch.optim.lr_scheduler import StepLR
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import AdamW
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification, TrainingArguments, Trainer

In [2]:
def get_unique_videos(folder_path, underscore_count):
    """Return the paths of the ``.mp4`` files in a folder, minus excluded names.

    A file is excluded when its name contains exactly ``underscore_count``
    underscores. The number of ``.mp4`` files before and after filtering is
    printed.

    Args:
        folder_path: Directory to scan (not recursive).
        underscore_count: Underscore count that marks a filename for exclusion.

    Returns:
        list[str]: ``os.path.join(folder_path, name)`` for every kept file,
        ordered by file name.
    """
    # os.listdir yields entries in arbitrary, filesystem-dependent order.
    # Sorting keeps the returned list (and any seeded split built from it)
    # identical from run to run.
    video_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.mp4'))

    print(f"\nTotal videos in '{folder_path}' before filtering: {len(video_files)}")

    # Filter out videos where the filename contains the specified number of underscores
    unique_videos = [os.path.join(folder_path, f) for f in video_files if f.count('_') != underscore_count]

    print(f"Total videos in '{folder_path}' after filtering: {len(unique_videos)}")

    return unique_videos

In [3]:
# Source folders, one per class.
non_shoplifters = "/kaggle/input/shoplift/Shop DataSet/non shop lifters"
shoplifters = "/kaggle/input/shoplift/Shop DataSet/shop lifters"

# Each folder is filtered with its own underscore count (4 and 3 respectively).
non_shop_lifters_videos = get_unique_videos(folder_path=non_shoplifters, underscore_count=4)
shop_lifters_videos = get_unique_videos(folder_path=shoplifters, underscore_count=3)


Total videos in '/kaggle/input/shoplift/Shop DataSet/non shop lifters' before filtering: 531
Total videos in '/kaggle/input/shoplift/Shop DataSet/non shop lifters' after filtering: 313

Total videos in '/kaggle/input/shoplift/Shop DataSet/shop lifters' before filtering: 324
Total videos in '/kaggle/input/shoplift/Shop DataSet/shop lifters' after filtering: 324


In [4]:
dataset_path = "/kaggle/input/shoplift/Shop DataSet"

# Sub-folder name -> integer class label.
categories = {
    "non shop lifters": 0,  # Label 0
    "shop lifters": 1       # Label 1
}

# Parallel lists: video_paths[i] carries the label labels[i].
video_paths = []
labels = []

for category, label in categories.items():
    folder_path = os.path.join(dataset_path, category)
    # Label 0 is filtered on 4 underscores, every other label on 3.
    if label == 0:
        unique_videos = get_unique_videos(folder_path, underscore_count=4)
    else:
        unique_videos = get_unique_videos(folder_path, underscore_count=3)
    video_paths += unique_videos
    labels += [label] * len(unique_videos)  # one label per collected video

# Stratify only when the smallest class has at least two samples.
label_counts = Counter(labels)
min_class_count = min(label_counts.values())
if min_class_count >= 2:
    stratify = labels
else:
    stratify = None

# 70/30 train/validation split with a fixed seed.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    video_paths,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=stratify,
)


Total videos in '/kaggle/input/shoplift/Shop DataSet/non shop lifters' before filtering: 531
Total videos in '/kaggle/input/shoplift/Shop DataSet/non shop lifters' after filtering: 313

Total videos in '/kaggle/input/shoplift/Shop DataSet/shop lifters' before filtering: 324
Total videos in '/kaggle/input/shoplift/Shop DataSet/shop lifters' after filtering: 324


In [5]:
# Pretrained checkpoint used for both the frame processor and the classifier.
model_name = "MCG-NJU/VideoMAE-base"

processor = VideoMAEImageProcessor.from_pretrained(model_name)
# Classifier configured for two output labels.
model = VideoMAEForVideoClassification.from_pretrained(model_name, num_labels=2)

# Run on CUDA when it is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/VideoMAE-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VideoMAEForVideoClassification(
  (videomae): VideoMAEModel(
    (embeddings): VideoMAEEmbeddings(
      (patch_embeddings): VideoMAEPatchEmbeddings(
        (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
      )
    )
    (encoder): VideoMAEEncoder(
      (layer): ModuleList(
        (0-11): 12 x VideoMAELayer(
          (attention): VideoMAESdpaAttention(
            (attention): VideoMAESdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): VideoMAESelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): VideoMAEIntermediate(
            (den

In [6]:
class VideoDataset(Dataset):
    """Map-style dataset that turns video files into processor outputs plus a label.

    Each item is produced by sampling ``num_frames`` evenly spaced frames from
    one video, passing them to ``processor`` and storing the label under the
    ``"labels"`` key of the processor's result.
    """

    def __init__(self, video_paths, labels, processor, num_frames=16):
        """Store the sample list and the preprocessing settings.

        Args:
            video_paths: Sequence of video file paths.
            labels: Sequence of labels aligned index-by-index with ``video_paths``.
            processor: Callable invoked as ``processor(frames, return_tensors="pt")``.
            num_frames: Number of frames sampled from every video.
        """
        self.video_paths = video_paths
        self.labels = labels
        self.processor = processor
        self.num_frames = num_frames

    def __getitem__(self, idx):
        """Build the model inputs for the video at position ``idx``.

        Returns:
            The object returned by ``processor`` with ``"labels"`` set to a
            tensor holding the sample's label.

        Raises:
            ValueError: If no frame of the video could be read.
        """
        video_path = self.video_paths[idx]
        label = self.labels[idx]

        # Extract frames
        frames = self.extract_frames(video_path, self.num_frames)
        if not frames:
            # Fail here with the offending path instead of handing an empty
            # list to the processor.
            raise ValueError(f"No frames could be read from video: {video_path}")

        # Preprocess frames
        inputs = self.processor(frames, return_tensors="pt")

        # Add label
        inputs["labels"] = torch.tensor(label)

        return inputs

    def __len__(self):
        """Return the number of videos in the dataset."""
        return len(self.video_paths)

    def extract_frames(self, video_path, num_frames):
        """Sample ``num_frames`` evenly spaced RGB frames from a video.

        Frames that cannot be decoded are skipped; if at least one frame was
        read, the result is padded by repeating the last readable frame so it
        always holds exactly ``num_frames`` entries.

        Args:
            video_path: Path of the video file to open.
            num_frames: Number of frames to return.

        Returns:
            list: ``num_frames`` frames converted from BGR to RGB, or an empty
            list when nothing could be read.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Clamp so a reported count of 0 samples position 0 instead of
            # producing negative frame positions.
            last_index = max(total_frames - 1, 0)
            frame_indices = np.linspace(0, last_index, num_frames).astype(int)

            for idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                ret, frame = cap.read()
                if ret:
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # Convert to RGB
        finally:
            # Release the capture handle even if reading or conversion fails.
            cap.release()

        if frames:
            # A failed read would otherwise leave this sample shorter than the
            # others; repeat the last good frame to keep the length fixed.
            frames.extend([frames[-1]] * (num_frames - len(frames)))
        return frames

In [7]:
# Training split: dataset plus a loader that reshuffles the samples.
train_dataset = VideoDataset(video_paths=train_paths, labels=train_labels, processor=processor)
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)

# Validation split: same construction, batches follow dataset order.
val_dataset = VideoDataset(video_paths=val_paths, labels=val_labels, processor=processor)
val_loader = DataLoader(dataset=val_dataset, batch_size=4, shuffle=False)

In [8]:
# Classification loss applied to the model's logits.
criterion = nn.CrossEntropyLoss()
# AdamW over all model parameters.
optimizer = optim.AdamW(params=model.parameters(), lr=5e-5)
# Multiply the learning rate by 0.1 every 5 scheduler steps.
scheduler = StepLR(optimizer=optimizer, step_size=5, gamma=0.1)

In [9]:
def _prepare_batch(batch, device):
    """Move one collated batch to ``device`` and return ``(pixel_values, labels)``."""
    labels = batch["labels"].to(device).long()
    pixel_values = batch["pixel_values"].to(device)

    # Remove the extra dimension (if present)
    if pixel_values.dim() == 6:
        pixel_values = pixel_values.squeeze(1)  # Remove the second dimension (1)

    # Ensure pixel_values is in the correct shape
    # VideoMAE expects (batch_size, num_frames, num_channels, height, width)
    if pixel_values.dim() != 5:
        raise ValueError(f"Unexpected shape for pixel_values: {pixel_values.shape}")

    return pixel_values, labels


def train_model(model, optimizer, scheduler, criterion, train_loader, val_loader, num_epochs=10, save_path='/kaggle/working/shoplifting_detector.pt'):
    """Train ``model`` and checkpoint the weights with the lowest validation loss.

    Every epoch runs one pass over ``train_loader`` with gradient updates and
    one pass over ``val_loader`` without gradients, prints the mean losses and
    accuracies, saves ``model.state_dict()`` to ``save_path`` whenever the
    validation loss improves, and then steps ``scheduler`` once.

    Args:
        model: Module called as ``model(pixel_values=...)`` whose output
            exposes ``.logits``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        criterion: Loss called as ``criterion(logits, labels)``.
        train_loader: Iterable of batches with ``"pixel_values"`` and ``"labels"``.
        val_loader: Iterable of batches with the same keys, used for validation.
        num_epochs: Number of epochs to run.
        save_path: File that receives the best ``state_dict``.

    Returns:
        The same ``model`` object, holding the weights of the final epoch
        (the best-validation weights are only stored at ``save_path``).

    Raises:
        ValueError: If a batch's ``pixel_values`` is not 5-dimensional after
            the optional squeeze.
    """
    # Follow the model's own device instead of relying on a notebook-level
    # `device` variable being defined and matching the model.
    device = next(model.parameters()).device
    best_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss, correct = 0.0, 0

        for batch in train_loader:
            pixel_values, labels = _prepare_batch(batch, device)

            optimizer.zero_grad()
            outputs = model(pixel_values=pixel_values)

            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()

        # Loss is averaged over batches, accuracy over samples.
        train_loss /= len(train_loader)
        train_acc = correct / len(train_loader.dataset)

        # Validation loop
        model.eval()
        val_loss, val_correct = 0.0, 0
        with torch.no_grad():
            for batch in val_loader:
                pixel_values, labels = _prepare_batch(batch, device)

                outputs = model(pixel_values=pixel_values)
                val_loss += criterion(outputs.logits, labels).item()
                val_correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()

        val_loss /= len(val_loader)
        val_acc = val_correct / len(val_loader.dataset)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        # Save the best model
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), save_path)
            print(f"Best model saved with val loss {best_loss:.4f}")

        # Step the scheduler
        scheduler.step()

    return model

In [10]:
print("Training Model...")
# Run training with the function's default epoch count and checkpoint path.
trained_model = train_model(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
)

Training Model...
Epoch 1/10, Train Loss: 0.6274, Train Acc: 0.6517, Val Loss: 0.3955, Val Acc: 0.8958
Best model saved with val loss 0.3955
Epoch 2/10, Train Loss: 0.2193, Train Acc: 0.9169, Val Loss: 0.3220, Val Acc: 0.8542
Best model saved with val loss 0.3220
Epoch 3/10, Train Loss: 0.1147, Train Acc: 0.9618, Val Loss: 0.0362, Val Acc: 0.9948
Best model saved with val loss 0.0362
Epoch 4/10, Train Loss: 0.0037, Train Acc: 1.0000, Val Loss: 0.0306, Val Acc: 0.9948
Best model saved with val loss 0.0306
Epoch 5/10, Train Loss: 0.0550, Train Acc: 0.9730, Val Loss: 0.0291, Val Acc: 0.9844
Best model saved with val loss 0.0291
Epoch 6/10, Train Loss: 0.0064, Train Acc: 1.0000, Val Loss: 0.0140, Val Acc: 0.9948
Best model saved with val loss 0.0140
Epoch 7/10, Train Loss: 0.0023, Train Acc: 1.0000, Val Loss: 0.0138, Val Acc: 0.9948
Best model saved with val loss 0.0138
Epoch 8/10, Train Loss: 0.0015, Train Acc: 1.0000, Val Loss: 0.0139, Val Acc: 0.9948
Epoch 9/10, Train Loss: 0.0011, Trai