In [None]:
# Filter Stamp Dataset - Keep Only First 3 Classes

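This notebook filters the stamp_yolov9 dataset: it keeps only the annotations of classes 0, 1, 2 (circle, oval, rectangle), remaps all of them to a single `stamp` class (ID 1), and drops the annotations of classes 3, 4 (triangle, word). Images left with no remaining annotations are skipped.

**Note:** the cells below appear in reverse execution order (In [10] down to In [6]). Run them from the bottom up, starting with the imports/configuration cell; otherwise names such as `Path` and `OUTPUT_DIR` are undefined.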

In [10]:
# Verify class distribution in filtered dataset
from collections import Counter

def count_classes(split):
    """
    Tally annotations per class ID for one split of the filtered dataset.

    Reads every ``*.txt`` file under ``OUTPUT_DIR/<split>/labels`` and counts
    the leading integer of each line that has at least five whitespace-separated
    fields. Shorter lines (including blank ones) are ignored.

    Args:
        split: Name of the split sub-directory, e.g. ``'train'``.

    Returns:
        A ``Counter`` mapping class ID -> number of annotations. Empty when the
        labels directory does not exist.
    """
    tally = Counter()
    label_root = Path(OUTPUT_DIR) / split / "labels"

    if label_root.exists():
        for txt_path in label_root.glob("*.txt"):
            with open(txt_path, 'r') as handle:
                rows = (row.strip().split() for row in handle)
                # First field of a row is the class ID
                tally.update(int(fields[0]) for fields in rows if len(fields) >= 5)

    return tally

print("\nClass Distribution in Filtered Dataset:")
print("="*50)

# Key the display name on the configured target ID instead of a literal 1, so the
# report stays correct if NEW_CLASS_ID is changed in the configuration cell.
class_names = {NEW_CLASS_ID: 'stamp'}
for split in ['train', 'valid', 'test']:
    counts = count_classes(split)
    # Splits with no label files (or no labels directory) are omitted from the report
    if counts:
        print(f"\n{split.upper()}:")
        for class_id in sorted(counts.keys()):
            # Any ID other than the remap target indicates a filtering problem
            class_name = class_names.get(class_id, f"unknown({class_id})")
            print(f"  Class {class_id} ({class_name}): {counts[class_id]} annotations")
        print(f"  Total annotations: {sum(counts.values())}")


Class Distribution in Filtered Dataset:

TRAIN:
  Class 1 (stamp): 3136 annotations
  Total annotations: 3136

VALID:
  Class 1 (stamp): 318 annotations
  Total annotations: 318

TEST:
  Class 1 (stamp): 154 annotations
  Total annotations: 154


## Verify Results (Optional)

In [9]:
# Create updated data.yaml whose class list matches the IDs written to the label files.
# YOLO class IDs are zero-based indices into `names`, so a label with ID NEW_CLASS_ID
# needs at least NEW_CLASS_ID + 1 entries. Indices below NEW_CLASS_ID never occur in
# the filtered labels and get placeholder names. With NEW_CLASS_ID = 0 this yields
# the plain single-class file (nc: 1, names: ['stamp']).
yaml_class_names = [f"unused_{i}" for i in range(NEW_CLASS_ID)] + ["stamp"]

yaml_content = f"""train: ../train/images
val: ../valid/images
test: ../test/images

nc: {len(yaml_class_names)}
names: {yaml_class_names}

roboflow:
  workspace: swp-3jks1
  project: stamp-shape-filtered
  version: 3
  license: Public Domain
  url: https://universe.roboflow.com/swp-3jks1/stamp-shape/dataset/3
"""

yaml_path = Path(OUTPUT_DIR) / "data.yaml"
with open(yaml_path, 'w') as f:
    f.write(yaml_content)

print("✓ data.yaml created")
print(f"\nFiltered dataset ready at: {OUTPUT_DIR}")
print(f"All stamp shapes (circle, oval, rectangle) remapped to class ID {NEW_CLASS_ID}")

✓ data.yaml created

Filtered dataset ready at: ../datasets/stamp_yolov9_filtered
All stamp shapes (circle, oval, rectangle) remapped to class ID 1


## Create Updated data.yaml

In [8]:
def _find_image(images_dir, stem, extensions):
    """Return the first existing ``images_dir/<stem><ext>`` (lower- or upper-case ext), else None."""
    for ext in extensions:
        for candidate_ext in (ext, ext.upper()):
            candidate = images_dir / (stem + candidate_ext)
            if candidate.exists():
                return candidate
    return None


def filter_labels_and_copy(split, image_extensions=(".jpg", ".png", ".jpeg", ".bmp", ".webp")):
    """
    Filter one split's label files down to KEEP_CLASSES, remap them to
    NEW_CLASS_ID, and copy the matching images into the output dataset.

    For every ``*.txt`` under ``SOURCE_DIR/<split>/labels``:
      * lines with fewer than five fields, or whose class ID is not in
        KEEP_CLASSES, are dropped;
      * remaining lines get their class ID replaced by NEW_CLASS_ID;
      * if nothing remains, the image is counted as skipped;
      * otherwise the image is looked up in ``SOURCE_DIR/<split>/images`` and,
        only if found, both the filtered label file and the image are written
        to ``OUTPUT_DIR/<split>``. A label whose image is missing is reported
        and NOT written, so the output never contains orphan label files.

    Args:
        split: Split sub-directory name, e.g. ``'train'``.
        image_extensions: Extensions to probe, in priority order. Each is tried
            as given and upper-cased.

    Returns:
        ``(kept_count, skipped_count)``: images copied, and label files that had
        no annotation in KEEP_CLASSES. Label files whose image is missing are
        in neither count. ``(0, 0)`` if the split has no labels directory.

    Raises:
        ValueError: if the first field of a 5+ field line is not an integer.
    """
    source_labels = Path(SOURCE_DIR) / split / "labels"
    source_images = Path(SOURCE_DIR) / split / "images"
    output_labels = Path(OUTPUT_DIR) / split / "labels"
    output_images = Path(OUTPUT_DIR) / split / "images"

    if not source_labels.exists():
        print(f"⚠ {split} split not found, skipping...")
        return 0, 0

    # Do not rely on another cell having created the output tree
    output_labels.mkdir(parents=True, exist_ok=True)
    output_images.mkdir(parents=True, exist_ok=True)

    kept_count = 0
    skipped_count = 0

    for label_file in source_labels.glob("*.txt"):
        # Read and filter annotations
        filtered_lines = []
        with open(label_file, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 5 and int(parts[0]) in KEEP_CLASSES:
                    # Collapse every kept class into the single target ID
                    parts[0] = str(NEW_CLASS_ID)
                    filtered_lines.append(' '.join(parts) + '\n')

        # Nothing left after filtering: drop the image from the dataset
        if not filtered_lines:
            skipped_count += 1
            continue

        # Locate the image BEFORE writing anything, so a missing image cannot
        # leave a label file without a counterpart in the output.
        # stem + ext (not with_suffix) keeps dotted stems such as "x.rf.abc" intact.
        img_path = _find_image(source_images, label_file.stem, image_extensions)
        if img_path is None:
            print(f"⚠ Image not found for {label_file.name}")
            continue

        with open(output_labels / label_file.name, 'w') as f:
            f.writelines(filtered_lines)
        shutil.copy(img_path, output_images / img_path.name)
        kept_count += 1

    return kept_count, skipped_count


# Run the filter over every split, reporting per-split counts and an overall total
print("\nProcessing datasets...")
split_results = []

for split in ['train', 'valid', 'test']:
    print(f"\n{split.upper()}:")
    kept, skipped = filter_labels_and_copy(split)
    split_results.append((kept, skipped))
    print(f"  ✓ Kept: {kept} images")
    print(f"  ✗ Skipped: {skipped} images (no valid classes)")

# Totals across all splits
total_kept = sum(pair[0] for pair in split_results)
total_skipped = sum(pair[1] for pair in split_results)

print(f"\n{'='*50}")
print(f"TOTAL: {total_kept} images kept, {total_skipped} images skipped")


Processing datasets...

TRAIN:
  ✓ Kept: 2030 images
  ✗ Skipped: 574 images (no valid classes)

VALID:
  ✓ Kept: 206 images
  ✗ Skipped: 44 images (no valid classes)

TEST:
  ✓ Kept: 98 images
  ✗ Skipped: 27 images (no valid classes)

TOTAL: 2334 images kept, 645 images skipped


## Filter Labels and Copy Images

In [7]:
# Build OUTPUT_DIR/<split>/{images,labels} for every split (no error if already present)
for split in ['train', 'valid', 'test']:
    for subdir in ("images", "labels"):
        os.makedirs(os.path.join(OUTPUT_DIR, split, subdir), exist_ok=True)

print("✓ Directory structure created")

✓ Directory structure created


## Create Output Directory Structure

In [6]:
import os
import shutil
from pathlib import Path

# Label remapping: annotations whose class ID is in KEEP_CLASSES are rewritten
# to NEW_CLASS_ID; all other annotations are dropped.
# NOTE(review): YOLO class IDs are zero-based indices into data.yaml `names`, so
# ID 1 is only valid if data.yaml declares at least two classes — confirm whether
# 0 was intended for a single-class dataset.
NEW_CLASS_ID = 1
KEEP_CLASSES = {0, 1, 2}

# Dataset locations (relative paths, resolved against the current working directory)
SOURCE_DIR = "../datasets/stamp_yolov9"
OUTPUT_DIR = "../datasets/stamp_yolov9_filtered"

# Echo the effective configuration
print(
    f"Source: {SOURCE_DIR}\n"
    f"Output: {OUTPUT_DIR}\n"
    f"Keeping classes: {KEEP_CLASSES}\n"
    f"Remapping to class ID: {NEW_CLASS_ID}"
)

Source: ../datasets/stamp_yolov9
Output: ../datasets/stamp_yolov9_filtered
Keeping classes: {0, 1, 2}
Remapping to class ID: 1
