In [10]:
import os
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split
import glob

print("Libraries imported successfully!")

Libraries imported successfully!


In [11]:
# Configuration
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.05  # 5% for validation
DATA_DIR = Path('../data') / 'colorize_dataset' / 'data'

# Original folders (the backup step renames them to the backup names below)
ORIGINAL_TRAIN_COLOR = DATA_DIR.joinpath('train_color')
ORIGINAL_TRAIN_BLACK = DATA_DIR.joinpath('train_black')

# Backup folders: siblings of the originals with an '_original' suffix
BACKUP_TRAIN_COLOR = ORIGINAL_TRAIN_COLOR.with_name('train_color_original')
BACKUP_TRAIN_BLACK = ORIGINAL_TRAIN_BLACK.with_name('train_black_original')

# New folders. The new training folders deliberately reuse the original
# paths, so the split data ends up where the full training set used to be.
NEW_TRAIN_COLOR = ORIGINAL_TRAIN_COLOR
NEW_TRAIN_BLACK = ORIGINAL_TRAIN_BLACK
VAL_COLOR = DATA_DIR.joinpath('val_color')
VAL_BLACK = DATA_DIR.joinpath('val_black')

print(f"Data directory: {DATA_DIR}")
print(f"Validation split: {VALIDATION_SPLIT * 100}%")
print(f"Random seed: {RANDOM_SEED}")

Data directory: ..\data\colorize_dataset\data
Validation split: 5.0%
Random seed: 42


In [12]:
# Get all image files from the full (unsplit) training data.
#
# Once the backup step has run, 'train_color' / 'train_black' no longer hold
# the full dataset: the originals live in the '*_original' backup folders and
# the training folders contain only the reduced training subset. Reading from
# the backup whenever it exists keeps this cell idempotent, so re-running the
# notebook does not split already-split data a second time.
source_color_dir = BACKUP_TRAIN_COLOR if BACKUP_TRAIN_COLOR.exists() else ORIGINAL_TRAIN_COLOR
source_black_dir = BACKUP_TRAIN_BLACK if BACKUP_TRAIN_BLACK.exists() else ORIGINAL_TRAIN_BLACK

# Sorted so that index i refers to the same image in both lists.
color_images = sorted(glob.glob(str(source_color_dir / '*.jpg')))
black_images = sorted(glob.glob(str(source_black_dir / '*.jpg')))

print(f"Found {len(color_images)} color images")
print(f"Found {len(black_images)} black images")

# An empty listing almost always means a wrong DATA_DIR; fail here with a
# clear message instead of later inside train_test_split.
assert color_images, f"No .jpg images found in {source_color_dir}"

# Verify they match
assert len(color_images) == len(black_images), "Mismatch between color and black images!"

# Extract filenames to ensure pairing
color_names = [Path(p).name for p in color_images]
black_names = [Path(p).name for p in black_images]
assert color_names == black_names, "Image names don't match between color and black folders!"

print("✓ All images are properly paired")

Found 5000 color images
Found 5000 black images
✓ All images are properly paired


In [13]:
# Split the data: shuffle the image indices once and reuse the same index
# split for both folders so color/black pairs stay together.
all_indices = range(len(color_images))
train_indices, val_indices = train_test_split(
    all_indices, test_size=VALIDATION_SPLIT, random_state=RANDOM_SEED
)

print(f"Training set: {len(train_indices)} images ({(1-VALIDATION_SPLIT)*100:.1f}%)")
print(f"Validation set: {len(val_indices)} images ({VALIDATION_SPLIT*100:.1f}%)")

# Materialise the file lists for each subset, walking color and black in step.
train_color_files, train_black_files = [], []
for idx in train_indices:
    train_color_files.append(color_images[idx])
    train_black_files.append(black_images[idx])

val_color_files, val_black_files = [], []
for idx in val_indices:
    val_color_files.append(color_images[idx])
    val_black_files.append(black_images[idx])

Training set: 4750 images (95.0%)
Validation set: 250 images (5.0%)


In [14]:
# Backup original folders (rename)
print("\nCreating backups...")

# Each folder is handled independently. Checking only the color backup would
# skip the black folder forever if an earlier run moved the color folder and
# then failed before moving the black one.
for original_dir, backup_dir in (
    (ORIGINAL_TRAIN_COLOR, BACKUP_TRAIN_COLOR),
    (ORIGINAL_TRAIN_BLACK, BACKUP_TRAIN_BLACK),
):
    if backup_dir.exists():
        print(f"Backup already exists: {backup_dir}")
        # The backup is where the original folder was moved to, so the message
        # must not suggest deleting it in order to re-run.
        print("  Skipping backup creation; keeping the existing backup.")
    else:
        shutil.move(str(original_dir), str(backup_dir))
        print(f"  ✓ Backed up {original_dir.name} → {backup_dir.name}")


Creating backups...
  ✓ Backed up train_color → train_color_original
  ✓ Backed up train_black → train_black_original


In [15]:
# Create new directories (no error if a folder is already present)
print("\nCreating new directories...")
split_dirs = (NEW_TRAIN_COLOR, NEW_TRAIN_BLACK, VAL_COLOR, VAL_BLACK)

for split_dir in split_dirs:
    split_dir.mkdir(exist_ok=True)

# Report only after every folder has been created.
for split_dir in split_dirs:
    print(f"  ✓ Created {split_dir.name}")


Creating new directories...
  ✓ Created train_color
  ✓ Created train_black
  ✓ Created val_color
  ✓ Created val_black


In [16]:
# Copy files to new training folders
print("\nCopying training files...")

# (selected files, backup folder to read from, folder to write to)
training_copy_jobs = (
    (train_color_files, BACKUP_TRAIN_COLOR, NEW_TRAIN_COLOR),
    (train_black_files, BACKUP_TRAIN_BLACK, NEW_TRAIN_BLACK),
)

# The new training folders reuse the original folder paths, so the backup is
# the only valid source: without it, source and destination would be the same
# file. Check once up front (not once per file) and fail with a clear message
# before anything is copied.
for _, backup_dir, _ in training_copy_jobs:
    if not backup_dir.exists():
        raise FileNotFoundError(
            f"Backup folder not found: {backup_dir}. Run the backup cell before copying files."
        )

for selected_files, backup_dir, target_dir in training_copy_jobs:
    for listed_path in selected_files:
        # Only the filename matters; the file is always read from the backup.
        file_name = Path(listed_path).name
        shutil.copy2(backup_dir / file_name, target_dir / file_name)

print(f"  ✓ Copied {len(train_color_files)} images to train_color")
print(f"  ✓ Copied {len(train_black_files)} images to train_black")


Copying training files...
  ✓ Copied 4750 images to train_color
  ✓ Copied 4750 images to train_black


In [17]:
# Copy files to validation folders
print("\nCopying validation files...")

# (selected files, backup folder to read from, folder to write to)
validation_copy_jobs = (
    (val_color_files, BACKUP_TRAIN_COLOR, VAL_COLOR),
    (val_black_files, BACKUP_TRAIN_BLACK, VAL_BLACK),
)

# Validation images must come from the backup. If the backup is missing, the
# listed paths point into the live training folders, and copying from there
# would leave every validation image in the training set as well. Check once
# up front and stop before anything is copied.
for _, backup_dir, _ in validation_copy_jobs:
    if not backup_dir.exists():
        raise FileNotFoundError(
            f"Backup folder not found: {backup_dir}. Run the backup cell before copying files."
        )

for selected_files, backup_dir, target_dir in validation_copy_jobs:
    for listed_path in selected_files:
        # Only the filename matters; the file is always read from the backup.
        file_name = Path(listed_path).name
        shutil.copy2(backup_dir / file_name, target_dir / file_name)

print(f"  ✓ Copied {len(val_color_files)} images to val_color")
print(f"  ✓ Copied {len(val_black_files)} images to val_black")


Copying validation files...
  ✓ Copied 250 images to val_color
  ✓ Copied 250 images to val_black


In [18]:
# Verify the split
print("\nVerification:")

# Sorted .jpg filenames found on disk in each split folder.
split_names = {
    label: sorted(p.name for p in folder.glob('*.jpg'))
    for label, folder in (
        ('train_color', NEW_TRAIN_COLOR),
        ('train_black', NEW_TRAIN_BLACK),
        ('val_color', VAL_COLOR),
        ('val_black', VAL_BLACK),
    )
}
for label, names in split_names.items():
    print(f"  {label}: {len(names)} images")

# Actually check the result instead of only printing counts: every color
# image needs its black counterpart, and no image may sit in both training
# and validation (e.g. leftovers from an earlier run with another seed).
assert split_names['train_color'] == split_names['train_black'], \
    "train_color and train_black contain different files!"
assert split_names['val_color'] == split_names['val_black'], \
    "val_color and val_black contain different files!"
leaked_names = set(split_names['train_color']) & set(split_names['val_color'])
assert not leaked_names, \
    f"{len(leaked_names)} images are present in both training and validation folders!"

# Check test set still exists
test_color = DATA_DIR / 'test_color'
test_black = DATA_DIR / 'test_black'
if test_color.exists() and test_black.exists():
    print(f"  test_color: {len(list(test_color.glob('*.jpg')))} images")
    print(f"  test_black: {len(list(test_black.glob('*.jpg')))} images")

print("\nValidation split created successfully!")
print(f"\nNote: Original data is backed up in:")
print(f"   - {BACKUP_TRAIN_COLOR.name}")
print(f"   - {BACKUP_TRAIN_BLACK.name}")


Verification:
  train_color: 4750 images
  train_black: 4750 images
  val_color: 250 images
  val_black: 250 images
  test_color: 739 images
  test_black: 739 images

Validation split created successfully!

Note: Original data is backed up in:
   - train_color_original
   - train_black_original
