In [2]:
import os
import pandas as pd
from PIL import Image
from collections import defaultdict
import shutil
from sklearn.model_selection import train_test_split

In [4]:
# === Paths ===
# Inputs: the annotation table and the directory holding the raw images.
CSV_PATH = "faces.csv"
IMAGES_DIR = "images"

# Output: root folder that receives the images/ and labels/ sub-trees.
OUTPUT_DIR = "clean_dataset"

In [6]:
# === Step 1: Load CSV ===
# Read the annotation table (one row per bounding box).
df = pd.read_csv(CSV_PATH)

# Normalise header names so lookups below are insensitive to case and
# stray whitespace in the CSV header row.
df.columns = [c.strip().lower() for c in df.columns]

# Columns the rest of the notebook reads; fail early and name the ones that
# are absent so a malformed CSV is easy to diagnose.
required = ['image_name', 'width', 'height', 'x0', 'y0', 'x1', 'y1']
missing = [col for col in required if col not in df.columns]
if missing:
    raise ValueError(f"CSV missing required columns: {missing}")

In [8]:
# === Step 2: Keep only valid image entries ===
# Treat names as trimmed strings so they compare cleanly with file names.
df['image_name'] = df['image_name'].astype(str).str.strip()

# Collect the image files actually present on disk (extension match is
# case-insensitive; the stored name keeps its original case).
image_files = set()
for entry in os.listdir(IMAGES_DIR):
    if entry.lower().endswith(('.jpg', '.jpeg', '.png')):
        image_files.add(entry)

# Keep only annotation rows whose image exists in IMAGES_DIR.
has_image = df['image_name'].isin(image_files)
df = df[has_image]

In [10]:
# === Step 3: Drop duplicates and invalid boxes ===
# Two rows are duplicates when they describe the same box on the same image.
unique_boxes = df.drop_duplicates(subset=['image_name', 'x0', 'y0', 'x1', 'y1'])

# A box is kept only if it has strictly positive width and height.
has_width = unique_boxes['x1'] > unique_boxes['x0']
has_height = unique_boxes['y1'] > unique_boxes['y0']
df = unique_boxes[has_width & has_height]

In [12]:
# === Step 4: Group by image ===
# Map each image name to the list of its annotation rows (as Series).
# sort=False keeps image names in order of first appearance, and rows keep
# their original order within each image.
grouped = defaultdict(list)
for name, boxes in df.groupby('image_name', sort=False):
    grouped[name].extend(record for _, record in boxes.iterrows())

In [14]:
# === Step 5: Remove images with no faces ===
# Only images that still have at least one row after the filtering above
# appear as keys in `grouped`, so its keys are the images to keep.
image_names = [name for name in grouped]

In [16]:
# === Step 6: Split into train/val ===
# 80/20 split at image level; the fixed seed keeps the split reproducible.
train_imgs, val_imgs = train_test_split(
    image_names,
    test_size=0.2,
    random_state=42,
)
splits = dict(train=train_imgs, val=val_imgs)

In [18]:
# === Step 7: Create folders ===
# Build <OUTPUT_DIR>/{images,labels}/{train,val}; existing folders are reused.
for split in ('train', 'val'):
    for kind in ('images', 'labels'):
        os.makedirs(f"{OUTPUT_DIR}/{kind}/{split}", exist_ok=True)

In [20]:
# === Step 8: Generate YOLO labels ===
def _yolo_line(row):
    """Convert one annotation row into a YOLO label line, or None if unusable.

    The row must provide 'width', 'height', 'x0', 'y0', 'x1', 'y1' in pixels.
    The box is clipped to the image rectangle before normalising so that the
    written centre and size always lie within [0, 1]. Rows with a non-positive
    (or NaN) image size, or whose box has no area left after clipping, yield
    None.
    """
    w, h = row['width'], row['height']
    # Written as a positive test so NaN sizes are rejected too, and so that a
    # zero width/height never reaches the divisions below.
    if not (w > 0 and h > 0):
        return None

    # Clip the corners to the image bounds; boxes that spill over the edge
    # would otherwise produce out-of-range normalised coordinates.
    x0 = min(max(row['x0'], 0), w)
    x1 = min(max(row['x1'], 0), w)
    y0 = min(max(row['y0'], 0), h)
    y1 = min(max(row['y1'], 0), h)
    if not (x1 > x0 and y1 > y0):
        return None

    x_center = ((x0 + x1) / 2) / w
    y_center = ((y0 + y1) / 2) / h
    bw = (x1 - x0) / w
    bh = (y1 - y0) / h
    # Single class (id 0); values use six decimal places.
    return f"0 {x_center:.6f} {y_center:.6f} {bw:.6f} {bh:.6f}\n"


for split, img_list in splits.items():
    for img_name in img_list:
        # Copy the image into its split folder under the same file name.
        src_path = os.path.join(IMAGES_DIR, img_name)
        dst_img = os.path.join(OUTPUT_DIR, "images", split, img_name)
        shutil.copy(src_path, dst_img)

        # The label file shares the image's stem. It is always created, and
        # stays empty when none of the image's boxes is usable.
        label_path = os.path.join(OUTPUT_DIR, "labels", split, os.path.splitext(img_name)[0] + ".txt")
        label_lines = [_yolo_line(row) for row in grouped[img_name]]
        with open(label_path, "w") as f:
            f.writelines(line for line in label_lines if line is not None)

print("✅ Dataset cleaned and saved to:", OUTPUT_DIR)
print("📁 Train images:", len(train_imgs))
print("📁 Val images:", len(val_imgs))

✅ Dataset cleaned and saved to: clean_dataset
📁 Train images: 1763
📁 Val images: 441


In [22]:
# === Step 9: Sanity check ===
# Number of remaining boxes per image, largest first.
faces_per_image = df.groupby('image_name').size()
count_per_img = faces_per_image.sort_values(ascending=False)

print("\n🧠 Top 10 images with highest face counts:")
print(count_per_img.head(10))

# Images on disk that ended up with no remaining annotation rows.
n_with_faces = len(count_per_img)
n_removed = len(image_files) - n_with_faces
print("\n✅ Total images with at least one face:", n_with_faces)
print("🚫 Images removed (no faces):", n_removed)


🧠 Top 10 images with highest face counts:
image_name
00000657.jpg    12
00003172.jpg    11
00003165.jpg    11
00000508.jpg    10
00000609.jpg     9
00000737.jpg     9
00003361.jpg     9
00000593.jpg     9
00000578.jpg     9
00000548.jpg     8
dtype: int64

✅ Total images with at least one face: 2204
🚫 Images removed (no faces): 0
