# UCF-Crime Anomaly Detection with Multi-Task Learning

This Jupyter Notebook implements a Multi-Task Learning (MTL) pipeline for anomaly detection on an image-based version of the UCF-Crime dataset. The pipeline processes `.png` images, trains a model with four tasks (general anomaly detection, violence detection, property crime detection, anomaly type classification), analyzes task relationships, and conducts an ablation study. The dataset is assumed to be at `/home/user/ucf_crime_dataset`, organized as `train/<class>/*.png` and `test/<class>/*.png`; the annotation files are generated by the notebook itself and written to the current working directory.

## Setup Instructions
1. Install dependencies: `pip install tensorflow opencv-python numpy pandas scikit-learn scipy`
2. Download the Kaggle dataset: https://www.kaggle.com/datasets/mission-ai/crimeucfdataset
3. Extract `.png` images or use `extract_frames.py` to convert videos to images.
4. Update paths in cells as needed (e.g., `data_dir`).
5. Run cells sequentially, inspecting outputs for debugging.


# Stage 1: Setup
Import the required libraries and define the dataset path and training constants.

In [None]:
# Import libraries
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50
import numpy as np
import os
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from scipy.stats import pearsonr


In [None]:
# Define constants
# Root folder of the image dataset; the 'train' and 'test' split folders are read from it.
DATA_DIR = '/home/user/ucf_crime_dataset'  # Update to your dataset path
# Annotation files are relative paths, so they are created in the current working directory.
TRAIN_ANNOTATION_FILE = 'train_annotations.txt'
TEST_ANNOTATION_FILE = 'test_annotations.txt'
IMAGE_SIZE = (224, 224)  # Image size for ResNet50
BATCH_SIZE = 32  # Mini-batch size passed to train_mtl_model
EPOCHS = 10  # Number of epochs passed to train_mtl_model
NUM_ANOMALY_TYPES = 14  # 13 anomaly classes + Normal

# Verify dataset directory
# Fail early with a clear message instead of failing later while listing the splits.
if not os.path.exists(DATA_DIR):
    raise ValueError(f"Dataset directory {DATA_DIR} does not exist. Update DATA_DIR.")

print("Setup complete. Dataset directory:", DATA_DIR)

# Stage 2: Annotation Generation
Generate annotation files (train_annotations.txt, test_annotations.txt) for the image-based dataset, mapping .png images to class labels based on folder structure.



In [None]:
import glob

def generate_annotation_file(dataset_root, split, output_file):
    """
    Generate an annotation file for a dataset split (train or test) of .png images.

    Every ``<dataset_root>/<split>/<class_name>/*.png`` file produces one line
    ``<path relative to dataset_root> <class_name>`` in ``output_file``.
    Class folders and the images inside them are visited in sorted order so
    repeated runs (and different file systems) produce identical files.

    Args:
        dataset_root: Root directory of the dataset.
        split: Name of the split sub-directory, e.g. 'train' or 'test'.
        output_file: Path of the annotation file to create or overwrite.

    Raises:
        ValueError: If ``<dataset_root>/<split>`` does not exist.
    """
    split_dir = os.path.join(dataset_root, split)
    if not os.path.exists(split_dir):
        raise ValueError(f"Directory {split_dir} does not exist.")

    annotations = []
    # os.listdir and glob return entries in arbitrary order; sort for reproducibility.
    for class_name in sorted(os.listdir(split_dir)):
        class_dir = os.path.join(split_dir, class_name)
        if not os.path.isdir(class_dir):
            continue  # stray files next to the class folders carry no label

        # Escape the directory part so folder names containing glob
        # metacharacters ('[', '?', '*') are matched literally.
        pattern = os.path.join(glob.escape(class_dir), "*.png")
        for image_path in sorted(glob.glob(pattern)):
            relative_path = os.path.relpath(image_path, dataset_root)
            # NOTE: the two fields are separated by a single space; the class
            # name is always the last space-separated token of the line.
            annotations.append(f"{relative_path} {class_name}")

    # Explicit UTF-8 keeps the file independent of the platform's locale encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{annotation}\n" for annotation in annotations)

    print(f"Generated {output_file} with {len(annotations)} entries.")

# Write one annotation file per dataset split.
for split_name, annotation_path in (
    ('train', TRAIN_ANNOTATION_FILE),
    ('test', TEST_ANNOTATION_FILE),
):
    generate_annotation_file(DATA_DIR, split_name, annotation_path)

# Stop here if either annotation file is absent after generation.
if not (os.path.exists(TRAIN_ANNOTATION_FILE) and os.path.exists(TEST_ANNOTATION_FILE)):
    raise FileNotFoundError("Annotation files not found. Check generation process.")
print("Annotation files created successfully.")

# Stage 3: Data Loading
Load .png images and labels from the annotation files using load_ucf_crime_data. Outputs the number of loaded images and label shapes for debugging.



In [None]:
def load_ucf_crime_data(data_dir, annotation_file, image_size=(224, 224)):
    """
    Load UCF-Crime images and derive the per-task labels from an annotation file.

    Each non-empty line of ``annotation_file`` is ``<relative image path> <class label>``.
    Lines are split on their LAST space, so image paths that contain spaces are
    parsed correctly. Images that cannot be read are reported and skipped
    together with their labels.

    Args:
        data_dir: Directory the relative image paths are resolved against.
        annotation_file: Path of the annotation file to read (UTF-8).
        image_size: Target size handed to ``cv2.resize`` (width, height).

    Returns:
        Tuple ``(images, labels)``. ``images`` is a float32 array holding the
        loaded images divided by 255. ``labels`` maps 'general_anomaly',
        'violence', 'property_crime' and 'anomaly_type' to integer arrays with
        one entry per loaded image; 'anomaly_type' is the index into the
        anomaly class list, or ``len(anomaly_classes)`` for any other label.
    """
    anomaly_classes = ['Abuse', 'Arrest', 'Arson', 'Assault', 'Burglary', 'Explosion',
                       'Fighting', 'Robbery', 'Shooting', 'Stealing', 'Shoplifting',
                       'Vandalism', 'RoadAccident']
    violent_classes = {'Assault', 'Fighting', 'Shooting'}
    property_classes = {'Burglary', 'Stealing', 'Shoplifting', 'Vandalism'}
    anomaly_index = {name: idx for idx, name in enumerate(anomaly_classes)}
    normal_index = len(anomaly_classes)

    # Parse the annotation file by hand: a space-separated CSV reader would
    # split paths containing spaces and raises on an empty file.
    with open(annotation_file, encoding='utf-8') as f:
        raw_lines = [line.strip() for line in f]

    images = []
    labels = {
        'general_anomaly': [],
        'violence': [],
        'property_crime': [],
        'anomaly_type': []
    }
    catch_all_labels = set()

    for line in raw_lines:
        if not line:
            continue
        relative_path, separator, label = line.rpartition(' ')
        if not separator:
            print(f"Skipping malformed annotation line: {line!r}")
            continue

        image_path = os.path.join(data_dir, relative_path)
        image = cv2.imread(image_path)
        if image is None:
            print(f"Failed to load image: {image_path}")
            continue

        image = cv2.resize(image, image_size)
        # float32 halves the memory of the default float64 division result.
        # NOTE(review): ImageNet-pretrained ResNet50 is usually fed through
        # keras.applications.resnet50.preprocess_input; here pixels are only
        # scaled by 1/255 — confirm this is intended.
        images.append(image.astype(np.float32) / 255.0)

        is_anomaly = label in anomaly_index
        if not is_anomaly:
            catch_all_labels.add(label)

        labels['general_anomaly'].append(int(is_anomaly))
        labels['violence'].append(int(label in violent_classes))
        labels['property_crime'].append(int(label in property_classes))
        labels['anomaly_type'].append(anomaly_index.get(label, normal_index))

    if images:
        images = np.array(images, dtype=np.float32)
    else:
        # Keep a well-formed 4-D shape even when nothing could be loaded.
        images = np.zeros((0, image_size[1], image_size[0], 3), dtype=np.float32)
    for task in labels:
        labels[task] = np.array(labels[task], dtype=int)

    print(f"Loaded {len(images)} images from {annotation_file}")
    print("Label shapes:", {task: len(labels[task]) for task in labels})
    if catch_all_labels:
        # Surfaces folder names that silently fell into the non-anomalous class
        # (e.g. a spelling that differs from the anomaly class list).
        print("Labels treated as non-anomalous:", sorted(catch_all_labels))
    return images, labels

# Read both splits into memory; the test annotations serve as the validation set.
X_train, y_train = load_ucf_crime_data(DATA_DIR, TRAIN_ANNOTATION_FILE, IMAGE_SIZE)
X_val, y_val = load_ucf_crime_data(DATA_DIR, TEST_ANNOTATION_FILE, IMAGE_SIZE)

# Report the array shapes as a quick sanity check.
shape_report = (
    ("Training images shape:", X_train),
    ("Validation images shape:", X_val),
)
for caption, split_images in shape_report:
    print(caption, split_images.shape)

# Stage 4: Model Definition
Define the MTL model with a ResNet50 backbone and four task-specific heads using build_mtl_model. Display the model summary for inspection.



In [None]:
def build_mtl_model(input_shape=(224, 224, 3), num_anomaly_types=14):
    """
    Assemble the multi-task network for image inputs.

    The network is a frozen ImageNet-initialised ResNet50 (without its top),
    global average pooling, one shared 512-unit ReLU layer and four heads fed
    from that shared layer: three single-unit sigmoid heads
    ('general_anomaly', 'violence', 'property_crime') and one softmax head
    ('anomaly_type') with ``num_anomaly_types`` units.

    Returns the Keras model whose outputs are ordered as listed above.
    """
    image_input = tf.keras.Input(shape=input_shape)

    # Pretrained feature extractor; its weights are not updated during training.
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    backbone.trainable = False

    feature_map = backbone(image_input)
    shared_features = layers.Dense(512, activation='relu')(
        layers.GlobalAveragePooling2D()(feature_map)
    )

    # (output name, units, activation) for each task head, in output order.
    head_specs = (
        ('general_anomaly', 1, 'sigmoid'),
        ('violence', 1, 'sigmoid'),
        ('property_crime', 1, 'sigmoid'),
        ('anomaly_type', num_anomaly_types, 'softmax'),
    )
    task_outputs = [
        layers.Dense(units, activation=activation, name=head_name)(shared_features)
        for head_name, units, activation in head_specs
    ]

    return models.Model(image_input, task_outputs)

# Build model and display summary
# Derive the input shape from IMAGE_SIZE so the model always matches the size
# the images were resized to. cv2.resize takes (width, height) while the model
# expects (height, width, channels), hence the swapped indices.
model = build_mtl_model(
    input_shape=(IMAGE_SIZE[1], IMAGE_SIZE[0], 3),
    num_anomaly_types=NUM_ANOMALY_TYPES,
)
model.summary()

# Stage 5: Task Relationship Functions
Define functions to analyze task relationships: compute_gradient_alignment (cosine similarity of gradients) and compute_loss_correlation (Pearson correlation of losses).



In [None]:
def compute_gradient_alignment(model, data, labels, task_names):
    """
    Compute cosine similarity between task gradients on their shared parameters.

    For every task the mean loss on ``data`` is differentiated with respect to
    ``model.trainable_variables``. Two tasks are compared only on variables
    that receive a gradient from BOTH of them, because a task-specific head has
    no gradient for the other task and would otherwise misalign the vectors.

    Args:
        model: Multi-output Keras model whose outputs follow ``task_names`` order.
        data: Batch of input images.
        labels: Dict mapping task name to a label array; only the first
            ``len(data)`` labels of each task are used.
        task_names: Task names; 'anomaly_type' uses sparse categorical
            cross-entropy, every other task uses binary cross-entropy.

    Returns:
        Dict mapping ``'<task1>_vs_<task2>'`` (task1 < task2 alphabetically) to
        the cosine similarity as a float; NaN when the two tasks share no
        differentiable variable or a gradient norm is zero.
    """
    num_samples = len(data)
    variables = model.trainable_variables

    gradients = {}
    for task_idx, task in enumerate(task_names):
        # Keep labels aligned with the batch even if the caller passed more.
        task_labels = tf.convert_to_tensor(labels[task])[:num_samples]
        with tf.GradientTape() as tape:
            predictions = model(data, training=False)
            if not isinstance(predictions, (list, tuple)):
                predictions = [predictions]
            task_pred = predictions[task_idx]
            if task == 'anomaly_type':
                per_sample = tf.keras.losses.sparse_categorical_crossentropy(
                    task_labels, task_pred
                )
            else:
                # Match the prediction shape: a (N,) label vector would
                # otherwise broadcast against the (N, 1) prediction to (N, N).
                y_true = tf.reshape(
                    tf.cast(task_labels, task_pred.dtype), tf.shape(task_pred)
                )
                per_sample = tf.keras.losses.binary_crossentropy(y_true, task_pred)
            loss = tf.reduce_mean(per_sample)
        gradients[task] = tape.gradient(loss, variables)

    similarities = {}
    for task1 in task_names:
        for task2 in task_names:
            if task1 >= task2:
                continue  # visit each unordered pair once, skip self-pairs
            shared = [
                (ga, gb)
                for ga, gb in zip(gradients[task1], gradients[task2])
                if ga is not None and gb is not None
            ]
            key = f'{task1}_vs_{task2}'
            if not shared:
                similarities[key] = float('nan')
                continue
            g1 = tf.concat([tf.reshape(ga, [-1]) for ga, _ in shared], axis=0)
            g2 = tf.concat([tf.reshape(gb, [-1]) for _, gb in shared], axis=0)
            cos_sim = tf.reduce_sum(g1 * g2) / (tf.norm(g1) * tf.norm(g2))
            similarities[key] = float(cos_sim.numpy())

    print("Computed gradient similarities:", similarities)
    return similarities

def compute_loss_correlation(losses_dict, task_names):
    """
    Compute Pearson correlation between the loss curves of every task pair.

    Args:
        losses_dict: Dict mapping task name to a sequence of loss values
            (one per epoch).
        task_names: Names of the tasks to compare.

    Returns:
        Dict mapping ``'<task1>_vs_<task2>'`` (task1 < task2 alphabetically) to
        the Pearson correlation coefficient as a float. The value is NaN when
        either curve has fewer than two points (e.g. a single training epoch),
        since the correlation is undefined there and ``pearsonr`` would raise.
    """
    correlations = {}
    for task1 in task_names:
        for task2 in task_names:
            if task1 >= task2:
                continue  # visit each unordered pair once, skip self-pairs
            curve1 = losses_dict[task1]
            curve2 = losses_dict[task2]
            if len(curve1) < 2 or len(curve2) < 2:
                corr = float('nan')
            else:
                corr, _ = pearsonr(curve1, curve2)
                corr = float(corr)
            correlations[f'{task1}_vs_{task2}'] = corr
    print("Computed loss correlations:", correlations)
    return correlations

# Stage 6: Training Function
Define train_mtl_model to train the MTL model, compute task relationships, and return training history. Monitor training progress via printed outputs.



In [None]:
def train_mtl_model(model, X_train, y_train, X_val, y_val, epochs=10, batch_size=32):
    """
    Train the MTL model and analyse the relationships between its tasks.

    Args:
        model: Multi-output Keras model with outputs named 'general_anomaly',
            'violence', 'property_crime' and 'anomaly_type' (in that order).
        X_train, X_val: Training and validation images.
        y_train, y_val: Dicts mapping each task name to its label array.
        epochs: Number of training epochs.
        batch_size: Mini-batch size; also the number of validation samples
            used for the gradient-alignment analysis.

    Returns:
        Tuple ``(history, grad_similarities, loss_correlations)``: the Keras
        History object, the result of ``compute_gradient_alignment`` and the
        result of ``compute_loss_correlation`` on the per-epoch training losses.
    """
    task_names = ['general_anomaly', 'violence', 'property_crime', 'anomaly_type']

    model.compile(
        optimizer='adam',
        loss={
            'general_anomaly': 'binary_crossentropy',
            'violence': 'binary_crossentropy',
            'property_crime': 'binary_crossentropy',
            'anomaly_type': 'sparse_categorical_crossentropy'
        },
        loss_weights={
            'general_anomaly': 1.0,
            'violence': 0.5,
            'property_crime': 0.5,
            'anomaly_type': 1.0
        },
        metrics={
            'general_anomaly': ['accuracy', tf.keras.metrics.AUC(name='auc')],
            'violence': ['accuracy'],
            'property_crime': ['accuracy'],
            'anomaly_type': ['accuracy']
        }
    )

    history = model.fit(
        X_train,
        [y_train[task] for task in task_names],
        validation_data=(X_val, [y_val[task] for task in task_names]),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1
    )

    # Per-epoch training loss of every task, used for the correlation analysis.
    losses_dict = {task: history.history[f'{task}_loss'] for task in task_names}

    # The gradient analysis runs on a single validation batch, so the labels
    # must be sliced to exactly the same samples as the images.
    probe_images = X_val[:batch_size]
    probe_labels = {task: y_val[task][:batch_size] for task in task_names}
    grad_similarities = compute_gradient_alignment(model, probe_images, probe_labels, task_names)
    loss_correlations = compute_loss_correlation(losses_dict, task_names)

    return history, grad_similarities, loss_correlations

# Train model (uncomment to run after verifying previous stages)
# history, grad_similarities, loss_correlations = train_mtl_model(model, X_train, y_train, X_val, y_val, epochs=EPOCHS, batch_size=BATCH_SIZE)

# Stage 7: Ablation Study
Define ablation_study to evaluate the model with subsets of tasks, reporting AUC for general anomaly detection.



In [None]:
def ablation_study(X_train, y_train, X_val, y_val, tasks_to_include, input_shape=(224, 224, 3),
                   num_anomaly_types=14, epochs=5, batch_size=32):
    """
    Train a fresh model with a subset of tasks and score general anomaly detection.

    The model uses a frozen ImageNet ResNet50 backbone, global average pooling,
    a shared 512-unit ReLU layer and one head per requested task.

    Args:
        X_train, X_val: Training and validation images.
        y_train, y_val: Dicts mapping each task name to its label array.
        tasks_to_include: Task names to train on; each must be one of
            'general_anomaly', 'violence', 'property_crime', 'anomaly_type'.
        input_shape: Input image shape given to the model.
        num_anomaly_types: Number of units of the 'anomaly_type' softmax head.
        epochs: Number of training epochs (default 5).
        batch_size: Mini-batch size (default 32).

    Returns:
        Validation ROC AUC of the 'general_anomaly' head, or None when
        'general_anomaly' is not among ``tasks_to_include``.

    Raises:
        ValueError: If ``tasks_to_include`` is empty or names an unknown task.
    """
    binary_tasks = ('general_anomaly', 'violence', 'property_crime')
    tasks_to_include = list(tasks_to_include)

    # Validate before the backbone is built so mistakes fail fast and clearly.
    if not tasks_to_include:
        raise ValueError("tasks_to_include must contain at least one task.")
    unknown_tasks = [
        task for task in tasks_to_include
        if task not in binary_tasks and task != 'anomaly_type'
    ]
    if unknown_tasks:
        raise ValueError(f"Unknown task(s) in tasks_to_include: {unknown_tasks}")

    inputs = tf.keras.Input(shape=input_shape)
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    base_model.trainable = False

    x = base_model(inputs)
    pooled = layers.GlobalAveragePooling2D()(x)
    shared_dense = layers.Dense(512, activation='relu')(pooled)

    outputs = []
    loss_dict = {}
    metrics_dict = {}

    for task in tasks_to_include:
        if task in binary_tasks:
            output = layers.Dense(1, activation='sigmoid', name=task)(shared_dense)
            loss_dict[task] = 'binary_crossentropy'
        else:  # 'anomaly_type' (guaranteed by the validation above)
            output = layers.Dense(num_anomaly_types, activation='softmax', name=task)(shared_dense)
            loss_dict[task] = 'sparse_categorical_crossentropy'
        metrics_dict[task] = ['accuracy']
        outputs.append(output)

    model = models.Model(inputs, outputs)
    model.compile(
        optimizer='adam',
        loss=loss_dict,
        metrics=metrics_dict
    )

    model.fit(
        X_train,
        [y_train[task] for task in tasks_to_include],
        validation_data=(X_val, [y_val[task] for task in tasks_to_include]),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1
    )

    if 'general_anomaly' not in tasks_to_include:
        return None

    val_preds = model.predict(X_val)
    # A single-output model may return a bare array instead of a list; wrap it
    # so that indexing selects a task head rather than a single sample.
    if not isinstance(val_preds, (list, tuple)):
        val_preds = [val_preds]
    general_idx = tasks_to_include.index('general_anomaly')
    # Flatten the (N, 1) sigmoid output to the 1-D score vector expected by roc_auc_score.
    general_scores = np.ravel(val_preds[general_idx])
    auc = roc_auc_score(y_val['general_anomaly'], general_scores)
    print(f"AUC for tasks {tasks_to_include}: {auc:.4f}")
    return auc

# Stage 8: Execution and Results
Run the pipeline, train the model, compute task relationships, and perform the ablation study. Display results for gradient alignment, loss correlation, and AUC scores.



In [None]:
# Fit the full four-task model and collect the task-relationship statistics.
history, grad_similarities, loss_correlations = train_mtl_model(
    model,
    X_train,
    y_train,
    X_val,
    y_val,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

# Report both relationship measures, one task pair per line.
for heading, pair_scores in (
    ("\nGradient Alignment (Cosine Similarity):", grad_similarities),
    ("\nLoss Correlation (Pearson):", loss_correlations),
):
    print(heading)
    for pair, score in pair_scores.items():
        print(f"{pair}: {score:.4f}")

# Task subsets evaluated in the ablation study; every subset keeps the
# general anomaly head so that its AUC can be compared across runs.
task_combinations = [
    ['general_anomaly'],
    ['general_anomaly', 'violence', 'property_crime'],
    ['general_anomaly', 'anomaly_type'],
    ['general_anomaly', 'violence', 'property_crime', 'anomaly_type']
]

print("\nAblation Study Results (AUC for General Anomaly Detection):")
for tasks in task_combinations:
    auc = ablation_study(X_train, y_train, X_val, y_val, tasks)
    if auc is None:
        continue
    print(f"Tasks: {tasks}, AUC: {auc:.4f}")