# Experiment 2 - Stage 1: Binary Crime Classification with Attention

## Overview
This notebook implements Stage 1 of Experiment 2, which performs binary classification (Normal vs. Crime) on UCF Crime videos.

### Key Features:
1. **Improved Data Ingestion**: Uses Experiment 1's approach - loading videos directly from MP4 files using OpenCV
2. **Attention Mechanism**: Identifies which frames are most important for crime detection
3. **Gradient Clipping**: Caps the global gradient norm (at 1.0) to keep training stable; the attention weights themselves are kept between 0 and 1 by a sigmoid
4. **High-Attention Frame Extraction**: Prepares data for Stage 2 by identifying relevant frames

### Stage 1 Goals:
- Train a binary classifier (Normal=0, Crime=1)
- Extract attention weights to identify crime-relevant frames
- Save high-attention data for Stage 2 training

---

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
import json
import shutil
import numpy as np
import cv2

In [None]:
class UCFCrimeBinaryDataset(Dataset):
    """
    Binary (Normal vs. Crime) video dataset that decodes MP4 files with OpenCV
    (matching Experiment 1's data ingestion approach).

    Expected layout: ``root_dir/<category>/<video>.mp4``. The category folder
    named "normalvideos" (case-insensitive) is labelled 0; every other category
    folder is labelled 1.

    Each item is a tuple ``(clip, label, video_path)``: ``clip`` stacks the
    per-frame tensors along dim 1 (i.e. (C, T, H, W) when the transform yields
    (C, H, W) tensors), ``label`` is a scalar ``torch.long`` tensor and
    ``video_path`` is the source file path.

    Args:
        root_dir: Directory containing one sub-directory per category.
        clip_len: Number of frames sampled from every video.
        transform: Optional callable applied to each frame (passed as a PIL
            image); when omitted, ``transforms.ToTensor()`` is used.
        frame_size: Frames are resized to ``frame_size x frame_size`` pixels.
    """
    def __init__(self, root_dir, clip_len=16, transform=None, frame_size=112):
        self.root_dir = root_dir
        self.clip_len = clip_len
        self.transform = transform
        self.frame_size = frame_size

        # (video_path, label) pairs, collected once at construction time
        self.samples = []
        self._prepare_samples()

    def _prepare_samples(self):
        """Scan root_dir for MP4 files and record (video_path, label) pairs.

        Directory listings are sorted so that sample indices are reproducible
        across runs and platforms (os.listdir returns entries in arbitrary order).
        """
        for category in sorted(os.listdir(self.root_dir)):
            category_path = os.path.join(self.root_dir, category)
            if not os.path.isdir(category_path):
                continue

            label = 0 if category.lower() == "normalvideos" else 1  # normal=0, crime=1

            # Keep only entries whose name ends in .mp4 (case-insensitive)
            for file in sorted(os.listdir(category_path)):
                if file.lower().endswith('.mp4'):
                    video_path = os.path.join(category_path, file)
                    self.samples.append((video_path, label))

    def _load_video_cv2(self, video_path):
        """Decode ``clip_len`` RGB frames from one video with OpenCV.

        Frames are sampled uniformly across the video when the container
        reports a positive frame count; otherwise the first ``clip_len`` frames
        are read sequentially. If decoding stops early, the last decoded frame
        is repeated to pad the clip to ``clip_len``.

        Returns:
            List of ``clip_len`` RGB frames (arrays as produced by OpenCV),
            each resized to ``frame_size x frame_size``.

        Raises:
            ValueError: If the video cannot be opened or yields no frames.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            if not cap.isOpened():
                raise ValueError(f"Could not open video: {video_path}")

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            if total_frames > 0:
                # Sample frames uniformly across the video
                frame_indices = np.linspace(0, total_frames - 1, self.clip_len, dtype=int)
            else:
                # No usable frame count: seeking would target negative indices,
                # so fall back to reading frames in order from the start.
                frame_indices = None

            for step in range(self.clip_len):
                if frame_indices is not None:
                    cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_indices[step]))
                ret, frame = cap.read()
                if not ret:
                    break
                # Convert BGR to RGB
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Resize frame
                frame = cv2.resize(frame, (self.frame_size, self.frame_size))
                frames.append(frame)
        finally:
            # Always free the capture handle, even when decoding raises
            cap.release()

        if len(frames) == 0:
            raise ValueError(f"No frames extracted from: {video_path}")

        # Pad with last frame if needed
        while len(frames) < self.clip_len:
            frames.append(frames[-1])

        return frames

    def __len__(self):
        """Return the number of discovered videos."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(clip, label, video_path)`` for the sample at ``idx``.

        Loading is best-effort: on any failure the error is printed and an
        all-zero clip of shape (3, clip_len, frame_size, frame_size) is
        returned together with the sample's real label and path.
        """
        video_path, label = self.samples[idx]

        try:
            # Load frames using OpenCV
            frames = self._load_video_cv2(video_path)

            # Build the fallback converter once per clip instead of once per frame
            to_tensor = self.transform if self.transform else transforms.ToTensor()

            # Frames go through PIL because the transforms operate on PIL images
            imgs = [to_tensor(Image.fromarray(frame)) for frame in frames]

            # Stack as (C, T, H, W)
            clip_tensor = torch.stack(imgs, dim=1)
            return clip_tensor, torch.tensor(label, dtype=torch.long), video_path

        except Exception as e:
            # Deliberate best-effort: one unreadable video must not abort an epoch.
            # NOTE: the zero clip still carries the real label, so many failures
            # would feed the model blank inputs - watch for this log line.
            print(f"[LOAD FAILED] {video_path} | Error: {e}")
            # Return dummy tensor
            dummy = torch.zeros(3, self.clip_len, self.frame_size, self.frame_size)
            return dummy, torch.tensor(label, dtype=torch.long), video_path


In [None]:
class BinaryCrimeDetector(nn.Module):
    """
    3D-CNN binary classifier whose feature maps are gated by a learned attention map.

    Components:
      * ``features``   - two factorised stages (spatial conv, then temporal conv);
        the first stage pools space only, the second also halves the time axis.
      * ``attention``  - 1x1x1 convolutions ending in a sigmoid, giving one weight
        between 0 and 1 per feature-map location.
      * ``classifier`` - global average pooling followed by a 2-way linear layer.
    """

    @staticmethod
    def _stage(in_ch, out_ch, pool):
        """Layers of one stage: spatial conv -> ReLU -> temporal conv -> BN -> ReLU -> max-pool."""
        return [
            nn.Conv3d(in_ch, out_ch, (1, 3, 3), padding=(0, 1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv3d(out_ch, out_ch, (3, 1, 1), padding=(1, 0, 0)),
            nn.BatchNorm3d(out_ch),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(pool),
        ]

    def __init__(self):
        super().__init__()

        # Layers are registered in one flat Sequential so parameter names stay
        # features.0 ... features.11
        first_stage = self._stage(3, 32, (1, 2, 2))
        second_stage = self._stage(32, 64, (2, 2, 2))
        self.features = nn.Sequential(*first_stage, *second_stage)

        # Attention head: scores every location of the 64-channel feature map
        self.attention = nn.Sequential(
            nn.Conv3d(64, 32, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv3d(32, 1, kernel_size=1),
            nn.Sigmoid(),  # squashes scores into the 0..1 range
        )

        # Classification head
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool3d(1),
            nn.Flatten(),
            nn.Linear(64, 2),
        )

    def forward(self, x, return_attention=False):
        """
        Run the detector on a batch of clips.

        Args:
            x: input tensor (B, C, T, H, W)
            return_attention: when True, also return the temporal attention weights

        Returns:
            logits: classification logits (B, 2)
            attention_weights (optional): (B, T') importance weights, where T' is the
                temporal length after the feature extractor's pooling (T // 2), so
                each weight covers a pooled time step rather than a single input frame
        """
        feature_maps = self.features(x)

        # One gate value per location: (B, 1, T', H', W')
        gate = self.attention(feature_maps)

        # Gate the features, then classify
        logits = self.classifier(feature_maps * gate)

        if not return_attention:
            return logits

        # Average the gate over H' and W', then drop the singleton channel -> (B, T')
        temporal_weights = gate.mean(dim=(3, 4)).squeeze(1)
        return logits, temporal_weights


In [None]:
# Clip geometry shared by the train and test splits
CLIP_LEN = 16
FRAME_SIZE = 112

# Run on the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Per-frame preprocessing: tensor conversion, then channel-wise normalisation
# (the constants look like the usual ImageNet statistics - confirm if it matters).
# Resizing is not part of this pipeline; the dataset resizes frames with OpenCV.
_to_tensor = transforms.ToTensor()
_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transform = transforms.Compose([_to_tensor, _normalize])

# UPDATE THESE PATHS TO YOUR VIDEO DIRECTORIES (they must hold .mp4 files, not frame directories)
train_dir = r"C:\Users\rayaa\Downloads\ucf_crime_v2\Train"
test_dir  = r"C:\Users\rayaa\Downloads\ucf_crime_v2\Test"

# Both splits are built with identical clip settings
_dataset_kwargs = dict(clip_len=CLIP_LEN, transform=transform, frame_size=FRAME_SIZE)
train_data = UCFCrimeBinaryDataset(train_dir, **_dataset_kwargs)
test_data  = UCFCrimeBinaryDataset(test_dir, **_dataset_kwargs)

# Training batches are shuffled; test batches keep dataset order
train_loader = DataLoader(train_data, batch_size=4, shuffle=True, num_workers=0)
test_loader  = DataLoader(test_data, batch_size=2, shuffle=False, num_workers=0)

print(f"Train clips: {len(train_data)}, Test clips: {len(test_data)}")
print(f"Using device: {DEVICE}")


Train clips: 1610, Test clips: 290


### Verify Data Distribution
---


In [None]:
def count_videos_per_class(dataset):
    """Print and return the number of samples carrying each label.

    Reads ``dataset.samples`` (an iterable of ``(path, label)`` pairs) and
    ``len(dataset)``.

    Returns:
        A ``collections.Counter`` mapping label -> number of samples.
    """
    from collections import Counter

    labels_count = Counter()
    for _path, label in dataset.samples:
        labels_count[label] += 1

    # Emit the whole report in one call; a missing label reads as 0 from the Counter
    report_lines = [
        "\nClass Distribution:",
        f"  Normal (0): {labels_count[0]} videos",
        f"  Crime (1): {labels_count[1]} videos",
        f"  Total: {len(dataset)} videos",
    ]
    print("\n".join(report_lines))
    return labels_count

# Banner line reused around each split's heading
_rule = "=" * 60

# Training split
print(_rule, "TRAINING SET", _rule, sep="\n")
train_dist = count_videos_per_class(train_data)

# Test split (preceded by a blank line)
print("\n" + _rule, "TEST SET", _rule, sep="\n")
test_dist = count_videos_per_class(test_data)


### Training with Gradient Clipping & Attention
---


In [None]:
EPOCHS = 5
GRADIENT_CLIP_VALUE = 1.0  # Upper bound on the global gradient norm (stabilises training)

model = BinaryCrimeDetector().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# Learning rate scheduler: cosine annealing over exactly the training run,
# so changing EPOCHS keeps the schedule in sync
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

print("\nStarting training with attention mechanism...")
print(f"Epochs: {EPOCHS}")
print(f"Gradient clipping: {GRADIENT_CLIP_VALUE}")
print(f"Device: {DEVICE}\n")

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    # Learning rate in effect for this epoch; read before the scheduler advances,
    # otherwise the log would show the next epoch's value
    epoch_lr = optimizer.param_groups[0]["lr"]

    for clips, labels, _ in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        clips, labels = clips.to(DEVICE), labels.to(DEVICE)

        optimizer.zero_grad()

        # The attention map is applied inside the model; the weights themselves
        # are not needed for the loss, so only the logits are requested
        outputs = model(clips)

        loss = criterion(outputs, labels)
        loss.backward()

        # Gradient clipping: rescales gradients whose global norm exceeds the limit.
        # This bounds the size of each update; it does not constrain the weights.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP_VALUE)

        optimizer.step()

        total_loss += loss.item()

        # Calculate accuracy
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    # Scheduler step (once per epoch)
    scheduler.step()

    # max(..., 1) avoids a ZeroDivisionError when the loader is empty
    avg_loss = total_loss / max(len(train_loader), 1)
    accuracy = 100 * correct / max(total, 1)

    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Accuracy: {accuracy:.2f}% | LR: {epoch_lr:.6f}")

# save checkpoint (model, optimizer and scheduler state after the final epoch)
output_dir = r"C:\Users\rayaa\Downloads\ucf_crime_v2\checkpoints"
os.makedirs(output_dir, exist_ok=True)
checkpoint_path = os.path.join(output_dir, "binary_stage1_with_attention.pt")
torch.save({
    'epoch': EPOCHS,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
}, checkpoint_path)
print(f"\n✓ Model checkpoint saved to: {checkpoint_path}")


Epoch 1/5:   0%|          | 0/403 [00:00<?, ?it/s]

Epoch 1/5: 100%|██████████| 403/403 [09:43<00:00,  1.45s/it]


Epoch 1 Loss: 0.6681


Epoch 2/5: 100%|██████████| 403/403 [03:52<00:00,  1.73it/s]


Epoch 2 Loss: 0.6361


Epoch 3/5: 100%|██████████| 403/403 [03:41<00:00,  1.82it/s]


Epoch 3 Loss: 0.6067


Epoch 4/5: 100%|██████████| 403/403 [03:32<00:00,  1.90it/s]


Epoch 4 Loss: 0.6061


Epoch 5/5: 100%|██████████| 403/403 [03:34<00:00,  1.88it/s]

Epoch 5 Loss: 0.5924





### Stage 1 Inference - Extract Anomaly Clips with High Attention
---


In [None]:
model.eval()

# Output locations; both directories are created up front
anomaly_dir = "./stage1_output/anomaly_clips"
attention_output_dir = "./stage1_output/attention_data"
for _out_dir in (anomaly_dir, attention_output_dir):
    os.makedirs(_out_dir, exist_ok=True)

anomaly_records = []      # JSON-serialisable summary per detected clip
all_attention_data = []   # richer per-clip payload (arrays/tensors) for Stage 2

print("\nRunning Stage 1 inference with attention extraction...")
print("=" * 60)

with torch.no_grad():
    for clips, _labels, paths in tqdm(test_loader, desc="Stage 1 Inference"):
        clips = clips.to(DEVICE)

        # Logits plus temporal attention weights for the whole batch
        outputs, attention_weights = model(clips, return_attention=True)
        probs = torch.softmax(outputs, dim=1)
        batch_preds = probs.argmax(dim=1).cpu().numpy()
        batch_crime_probs = probs[:, 1].cpu().numpy()  # probability of class 1 ('crime')

        # Visit only the clips predicted as crime, in batch order
        for row in np.flatnonzero(batch_preds == 1):
            row = int(row)
            video_path = paths[row]
            confidence = float(batch_crime_probs[row])
            att_weights = attention_weights[row].cpu().numpy()

            # Positions of the 5 largest weights (ascending by weight). They index
            # the model's attention output, whose length is the pooled temporal
            # size rather than the number of input frames.
            top_positions = np.argsort(att_weights)[-5:].tolist()

            anomaly_records.append({
                "video_path": video_path,
                "confidence": confidence,
                "attention_weights": att_weights.tolist(),
                "top_attention_indices": top_positions
            })

            # Same clip, kept with its raw array and input tensor for Stage 2
            all_attention_data.append({
                "video_path": video_path,
                "confidence": confidence,
                "attention_weights": att_weights,
                "clip_tensor": clips[row].cpu()
            })

# Anomaly records -> JSON
anomalies_json_path = "./stage1_output/anomalies.json"
with open(anomalies_json_path, "w") as f:
    f.write(json.dumps(anomaly_records, indent=2))

# Attention payload -> torch file for Stage 2
attention_data_path = os.path.join(attention_output_dir, "stage1_attention_data.pt")
torch.save(all_attention_data, attention_data_path)

print("\n" + "=" * 60)
print(f"✓ Stage 1 Complete!")
print(f"  - Detected {len(anomaly_records)} crime clips")
print(f"  - Anomaly records saved to: {anomalies_json_path}")
print(f"  - Attention data saved to: {attention_data_path}")
print(f"  - Ready for Stage 2 training")
print("=" * 60)


Stage 1 Inference: 100%|██████████| 145/145 [10:22<00:00,  4.29s/it]

Stage 1 complete — 153 crime clips saved to ./stage1_output/anomaly_clips





### (Optional) Visualize Attention Weights
---


In [None]:
import matplotlib.pyplot as plt

def visualize_attention_examples(num_examples=3):
    """Plot attention weights for a random sample of detected crime clips.

    Reads the module-level ``anomaly_records`` list, draws one bar chart per
    sampled record (the entries listed in ``top_attention_indices`` are
    re-drawn in a darker colour), saves the figure to
    ``./stage1_output/attention_visualization.png`` and shows it.

    Args:
        num_examples: Maximum number of records to plot. Fewer panels are drawn
            when fewer records exist; values below 1 plot nothing.
    """
    if not anomaly_records:
        print("No anomaly records found. Run inference first.")
        return

    # Select random examples
    import random
    sample_size = min(num_examples, len(anomaly_records))
    if sample_size < 1:
        # plt.subplots cannot build a grid with zero rows
        print("Nothing to plot: num_examples must be at least 1.")
        return
    examples = random.sample(anomaly_records, sample_size)

    # Size the grid from the records actually sampled (not the requested count) so
    # no blank panels appear; squeeze=False always returns a 2-D array of axes,
    # which removes the single-panel special case.
    fig, axes = plt.subplots(len(examples), 1, figsize=(12, 3 * len(examples)), squeeze=False)
    axes = axes[:, 0]

    for ax, example in zip(axes, examples):
        video_name = os.path.basename(example['video_path'])
        attention = np.array(example['attention_weights'])
        confidence = example['confidence']

        ax.bar(range(len(attention)), attention, color='crimson', alpha=0.7)
        ax.set_xlabel('Frame Index')
        ax.set_ylabel('Attention Weight')
        ax.set_title(f'{video_name} | Confidence: {confidence:.3f}')
        ax.grid(True, alpha=0.3)

        # Highlight top frames (skip any index that falls outside the array)
        for top_idx in example['top_attention_indices']:
            if top_idx < len(attention):
                ax.bar(top_idx, attention[top_idx], color='darkred', alpha=1.0)

    output_path = './stage1_output/attention_visualization.png'
    # The directory normally exists after inference; create it for standalone use
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    plt.tight_layout()
    fig.savefig(output_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f"\n✓ Attention visualization saved to: {output_path}")

# Plot up to three examples, but only when Stage 1 flagged at least one clip
if not anomaly_records:
    print("No anomalies detected to visualize.")
else:
    _panel_count = min(3, len(anomaly_records))
    visualize_attention_examples(num_examples=_panel_count)
