### Preprocessing

Contains a helper to reorganize the UCF Crime dataset found on [Kaggle](https://www.kaggle.com/datasets/odins0n/ucf-crime-dataset).

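The dataset's file structure is organized at frame granularity: each class folder holds the frames of all of its videos side by side. For a video transformer, it is more useful to have the frames grouped by the video they belong to, which is what this script does.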

In [None]:
import os
import shutil
import re
from tqdm import tqdm

def reorganize_ucfcrime_frames(root_dir):
    """
    Reorganize UCF-Crime dataset frames into video subfolders.

    For each of the ``Train`` and ``Test`` splits under ``root_dir``, every
    class folder is scanned for ``.jpg``/``.png`` files named
    ``<video_id>_x264_<frame_number>.<ext>``; each such frame is moved into a
    subfolder of its class folder named ``<video_id>``.

    Example input structure:
        Train/Vandalism/Vandalism046_x264_10.jpg
        Train/Vandalism/Vandalism046_x264_20.jpg

    Example output structure:
        Train/Vandalism/Vandalism046/Vandalism046_x264_10.jpg
        Train/Vandalism/Vandalism046/Vandalism046_x264_20.jpg

    Args:
        root_dir: Path of the folder that contains the ``Train`` and/or
            ``Test`` split folders. A split that does not exist is reported
            and skipped.

    Returns:
        None. Files are moved in place on disk. Only files sitting directly
        in a class folder are looked at, so running the function again does
        not touch frames that were already moved into a video subfolder.
    """
    # The progress bar is cosmetic. Import it locally and fall back to plain
    # iteration so the helper still works where tqdm is not installed.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    # Everything in front of the "_x264_<frame>.<ext>" suffix is the video id.
    # The earlier letters-then-digits pattern silently skipped ids containing
    # underscores (e.g. "Normal_Videos_003_x264_10.png"); the non-greedy
    # prefix yields the same id for every name the earlier pattern accepted.
    frame_pattern = re.compile(r"(.+?)_x264_\d+\.(?:jpg|png)", re.IGNORECASE)

    for split in ("Train", "Test"):
        split_path = os.path.join(root_dir, split)
        if not os.path.exists(split_path):
            print(f"Skipping {split_path} (not found)")
            continue

        print(f"\n📂 Processing {split_path}...")
        unrecognized = 0
        for class_name in progress(os.listdir(split_path)):
            class_path = os.path.join(split_path, class_name)
            if not os.path.isdir(class_path):
                continue

            # Snapshot the image files first: the loop below creates
            # subfolders and moves files inside this same directory.
            with os.scandir(class_path) as entries:
                frame_files = [
                    entry.name
                    for entry in entries
                    if entry.is_file()
                    and entry.name.lower().endswith((".jpg", ".png"))
                ]

            for frame_file in frame_files:
                match = frame_pattern.fullmatch(frame_file)
                if match is None:
                    # Not a "<video_id>_x264_<n>" frame: leave it where it
                    # is, but count it so the skip is visible to the user.
                    unrecognized += 1
                    continue

                video_folder = os.path.join(class_path, match.group(1))

                # Make subfolder if needed
                os.makedirs(video_folder, exist_ok=True)

                shutil.move(
                    os.path.join(class_path, frame_file),
                    os.path.join(video_folder, frame_file),
                )

        if unrecognized:
            print(
                f"⚠️ Left {unrecognized} image file(s) with an unrecognized "
                f"name in place under {split_path}"
            )
        print(f"✅ Finished reorganizing {split_path}")


# Point DATASET_ROOT at the absolute path of the ucf_crime dataset on your
# machine before running this cell; the frames are moved in place on disk.
DATASET_ROOT = r"C:\Users\rayaa\Downloads\ucf_crime"
reorganize_ucfcrime_frames(root_dir=DATASET_ROOT)
