In [13]:
import os
import re
from collections import defaultdict

# Base path to the original data folder. The scan further down joins it with
# "Train" and "Test" and treats every sub-directory of those as one class.
data_dir = "data"

# define how to extract video prefixes per class
def extract_video_prefix(filename, class_name):
    """Return the video prefix encoded in a frame filename, or ``None``.

    Frame files are expected to look like ``<prefix>_<frame number>.png`` where
    ``<prefix>`` is ``<class_name><digits>_x264``. The ``NormalVideos`` class is
    special-cased: its files start with ``Normal_Videos`` and may or may not
    have an underscore before the digits.

    Args:
        filename: Bare file name (no directory part) of one frame image.
        class_name: Name of the class folder the file lives in. It is matched
            literally and case-insensitively.

    Returns:
        The prefix exactly as it is spelled in ``filename``, or ``None`` when
        the name does not follow the expected scheme.
    """
    if class_name.lower() == "normalvideos":
        # The optional underscore covers both naming styles in one pattern.
        stem = r"Normal_Videos_?\d+_x264"
    else:
        # Escape the folder name so characters such as "+", "." or "(" are
        # matched literally instead of being read as regex syntax.
        stem = rf"{re.escape(class_name)}\d+_x264"

    # re.match anchors at the start of the name only, like the per-pattern
    # match it replaces; the re module caches the compiled pattern.
    match = re.match(rf"({stem})_\d+\.png", filename, re.IGNORECASE)
    return match.group(1) if match else None

# count unique video prefixes in a folder
def count_videos_in_class(class_dir, class_name):
    """Return how many distinct videos have frames stored in ``class_dir``.

    Only entries whose name ends in ".png" are considered. Each one is mapped
    to its video prefix with ``extract_video_prefix``; names that yield no
    prefix are ignored, and the number of distinct prefixes is returned.
    """
    candidates = (
        extract_video_prefix(entry, class_name)
        for entry in os.listdir(class_dir)
        if entry.endswith(".png")
    )
    # The set collapses the many frames of one video into a single entry.
    return len({prefix for prefix in candidates if prefix})

# Scan both the Train and Test folders and print, for every class folder,
# the number of distinct videos its frame files belong to.
for split in ["Train", "Test"]:
    split_path = os.path.join(data_dir, split)
    print(f"{split} Set Video Counts:")
    
    # sorted() gives the report a stable order regardless of listdir order
    for class_name in sorted(os.listdir(split_path)):
        class_path = os.path.join(split_path, class_name)
        # only directories are treated as classes; other entries are skipped
        if os.path.isdir(class_path):
            num_videos = count_videos_in_class(class_path, class_name)
            print(f"{class_name:15s} → {num_videos:3d} videos")

Train Set Video Counts:
Arrest          →  45 videos
Arson           →  41 videos
Assault         →  47 videos
Burglary        →  87 videos
Explosion       →  29 videos
Fighting        →  45 videos
NormalVideos    → 800 videos
Shooting        →  27 videos
Test Set Video Counts:
Arrest          →   5 videos
Arson           →   9 videos
Assault         →   3 videos
Burglary        →  13 videos
Explosion       →  21 videos
Fighting        →   5 videos
NormalVideos    → 150 videos
Shooting        →  23 videos


In [14]:
import shutil
import os
import re
from collections import defaultdict

# Source dataset root and the root the trimmed copy is written to.
original_root = "data"
trimmed_root = "data_trimmed"

# Number of videos to keep per class in the Train split. Keys must equal the
# class folder names, because they are used to build the source paths.
train_video_limits = {
    "Arrest": 30,
    "Arson": 20,
    "Assault": 35,
    "Burglary": 40,
    "Explosion": 20,
    "Fighting": 35,
    "Shooting": 27,
    "NormalVideos": 80
}

# Number of videos to keep per class in the Test split (same key convention).
test_video_limits = {
    "Arrest": 5,
    "Arson": 9,
    "Assault": 3,
    "Burglary": 8,
    "Explosion": 10,
    "Fighting": 5,
    "NormalVideos": 15,
    "Shooting": 10,
}


def trim_class_videos(src_dir, dst_dir, class_name, num_videos_to_keep):
    """Copy the frames of the first ``num_videos_to_keep`` videos of one class.

    Frame files in ``src_dir`` are grouped by video prefix
    (``<class_name><digits>_x264``, or ``Normal_Videos[_]<digits>_x264`` for the
    ``NormalVideos`` class). The prefixes are sorted as strings, the first
    ``num_videos_to_keep`` are selected, and every frame of those videos is
    copied to ``dst_dir`` with ``shutil.copy2``. A one-line summary is printed.

    Args:
        src_dir: Directory holding the original ".png" frames of the class.
        dst_dir: Directory to copy into; created if it does not exist.
        class_name: Class folder name, matched literally and case-insensitively
            at the start of each file name.
        num_videos_to_keep: Upper bound on the number of videos selected.

    Notes:
        * The sort is lexicographic, not numeric ("10" sorts before "9").
        * Files already present in ``dst_dir`` are never removed, so re-running
          with a smaller limit leaves the earlier copies in place.
    """
    os.makedirs(dst_dir, exist_ok=True)

    if class_name.lower() == "normalvideos":
        # The optional underscore covers both naming styles in one pattern.
        stem = r"Normal_Videos_?\d+_x264"
    else:
        # Escape the folder name so regex metacharacters in it are literal.
        stem = rf"{re.escape(class_name)}\d+_x264"
    frame_pattern = re.compile(rf"({stem})_\d+\.png", re.IGNORECASE)

    # video prefix -> names of all frame files that belong to that video
    video_groups = defaultdict(list)
    for file in os.listdir(src_dir):
        if not file.endswith(".png"):
            continue
        match = frame_pattern.match(file)
        if match:
            video_groups[match.group(1)].append(file)

    selected_videos = sorted(video_groups)[:num_videos_to_keep]
    # Each file name belongs to exactly one group, so no de-duplication is needed.
    files_to_copy = [file for vid in selected_videos for file in video_groups[vid]]

    for file in files_to_copy:
        shutil.copy2(os.path.join(src_dir, file), os.path.join(dst_dir, file))

    print(f"[{class_name}] Kept {len(selected_videos)} videos ({len(files_to_copy)} files)")


# Trim every class listed in the limit tables, first for the Train split and
# then for the Test split. Both splits go through the same steps, so a single
# parameterised loop replaces two copy-pasted ones; the printed output and the
# order of processing are unchanged.
for split, video_limits in (("Train", train_video_limits), ("Test", test_video_limits)):
    for class_name, limit in video_limits.items():
        print(f"Processing {split}: {class_name}")
        src_class_dir = os.path.join(original_root, split, class_name)
        dst_class_dir = os.path.join(trimmed_root, split, class_name)
        trim_class_videos(src_class_dir, dst_class_dir, class_name, limit)

Processing Train: Arrest
[Arrest] Kept 30 videos (12342 files)
Processing Train: Arson
[Arson] Kept 20 videos (19088 files)
Processing Train: Assault
[Assault] Kept 35 videos (7443 files)
Processing Train: Burglary
[Burglary] Kept 40 videos (14997 files)
Processing Train: Explosion
[Explosion] Kept 20 videos (3277 files)
Processing Train: Fighting
[Fighting] Kept 35 videos (15318 files)
Processing Train: Shooting
[Shooting] Kept 27 videos (7140 files)
Processing Train: NormalVideos
[NormalVideos] Kept 80 videos (39711 files)
Processing Test: Arrest
[Arrest] Kept 5 videos (3365 files)
Processing Test: Arson
[Arson] Kept 9 videos (2793 files)
Processing Test: Assault
[Assault] Kept 3 videos (2657 files)
Processing Test: Burglary
[Burglary] Kept 8 videos (3724 files)
Processing Test: Explosion
[Explosion] Kept 10 videos (3529 files)
Processing Test: Fighting
[Fighting] Kept 5 videos (1231 files)
Processing Test: NormalVideos
[NormalVideos] Kept 15 videos (2606 files)
Processing Test: Shooting
[Shooting] Kept 10 videos (2009 files)

In [15]:
def get_folder_size(path):
    """Return the combined size in bytes of all files below ``path``.

    The tree is traversed with ``os.walk``; every entry that
    ``os.path.isfile`` accepts contributes its ``os.path.getsize`` value.
    A tree without such entries yields 0.
    """
    def _file_sizes():
        for folder, _subdirs, names in os.walk(path):
            for name in names:
                candidate = os.path.join(folder, name)
                # entries that are not regular files add nothing to the total
                if os.path.isfile(candidate):
                    yield os.path.getsize(candidate)

    return sum(_file_sizes())

# Measure the trimmed dataset on disk and report it in binary units
# (1 MB = 1024**2 bytes, 1 GB = 1024**3 bytes).
size_bytes = get_folder_size(trimmed_root)
size_mb = size_bytes / (1024 ** 2)
size_gb = size_bytes / (1024 ** 3)

# The header has no placeholders, so it is a plain string rather than an f-string.
print("Size of trimmed dataset:")
print(f" {size_mb:.2f} MB")
print(f" {size_gb:.2f} GB")

Size of trimmed dataset:
 1085.39 MB
 1.06 GB


In [16]:
import os

def count_png_files(root_path):
    """Return the number of files below ``root_path`` whose name ends in ".png".

    All directory levels are visited with ``os.walk``. The suffix comparison is
    case-sensitive, so a name ending in ".PNG" is not counted.
    """
    return sum(
        1
        for _folder, _subdirs, names in os.walk(root_path)
        for name in names
        if name.endswith(".png")
    )

# Root of the trimmed dataset (same value as assigned in the trimming cell)
# and the two split folders below it.
trimmed_root = "data_trimmed"
train_path = os.path.join(trimmed_root, "Train")
test_path = os.path.join(trimmed_root, "Test")

# Count the frame images per split; the total is simply the sum of both.
train_count = count_png_files(train_path)
test_count = count_png_files(test_path)
total_count = train_count + test_count

# The ":," format spec inserts thousands separators into the printed counts.
print(f"Train Set: {train_count:,} .png files")
print(f"Test Set:  {test_count:,} .png files")
print(f"Total:     {total_count:,} .png files in 'data_trimmed'")

Train Set: 119,316 .png files
Test Set:  21,914 .png files
Total:     141,230 .png files in 'data_trimmed'
