In [None]:
!pip install facenet-pytorch
!pip install torch torchvision


In [None]:
!pip uninstall pillow -y
!pip install pillow


In [None]:
import os
import shutil
import random
from zipfile import ZipFile

# Configuration: absolute paths under /kaggle/working; change them to run elsewhere.
DATASET_DIR = "/kaggle/working/cropped_dataset"  # Input root; each immediate sub-directory is treated as one class
OUTPUT_DIR = "/kaggle/working/shuffled_dataset"  # Output root; receives the Train/ and Test/ sub-directories
ZIP_FILE = "/kaggle/working/shuffled_dataset.zip"  # Destination of the ZIP archive built from OUTPUT_DIR
TRAIN_RATIO = 0.8  # Fraction of each class's images copied to Train; the remainder goes to Test

def _list_class_images(class_dir):
    """Return the sorted names of .jpg/.jpeg/.png files (case-insensitive) in class_dir."""
    return sorted(
        name for name in os.listdir(class_dir)
        if name.lower().endswith(('.jpg', '.png', '.jpeg'))
    )


def _copy_images(image_names, src_dir, dest_dir):
    """Copy each named file from src_dir into dest_dir, creating dest_dir if needed."""
    os.makedirs(dest_dir, exist_ok=True)
    for img_name in image_names:
        shutil.copy(os.path.join(src_dir, img_name), os.path.join(dest_dir, img_name))


def _zip_directory(source_dir, zip_file):
    """Archive every file under source_dir into zip_file, with paths relative to source_dir."""
    zip_abspath = os.path.abspath(zip_file)
    with ZipFile(zip_file, 'w') as zipf:
        for root, _dirs, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                # If the archive lives inside source_dir, do not add it to itself.
                if os.path.abspath(file_path) == zip_abspath:
                    continue
                # Relative arcname preserves the Train/Test/<class> layout in the ZIP.
                zipf.write(file_path, os.path.relpath(file_path, source_dir))


def shuffle_and_split_dataset(dataset_dir, output_dir, zip_file, train_ratio=0.8, seed=None):
    """
    Shuffle each class's images, split them into Train and Test, and zip the result.

    Every immediate sub-directory of ``dataset_dir`` is treated as one class.
    Its image files (.jpg, .jpeg, .png) are shuffled, the first
    ``int(n * train_ratio)`` are copied to ``output_dir/Train/<class>`` and the
    rest to ``output_dir/Test/<class>``. Finally all files under ``output_dir``
    are written to ``zip_file``.

    Any existing ``Train`` and ``Test`` directories under ``output_dir`` are
    deleted first. Without this, re-running with a different shuffle would leave
    files from the previous run behind, so the same image could end up in both
    Train and Test.

    Args:
        dataset_dir (str): Path to the input dataset (one sub-directory per class).
        output_dir (str): Path to save the shuffled dataset.
        zip_file (str): Path to save the zipped dataset.
        train_ratio (float): Proportion of data to use for training; must be in [0, 1].
        seed (int | None): Seed for a private random generator, making the split
            reproducible. When None (default) the global ``random`` module state
            is used, exactly as before.

    Returns:
        None

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
        FileNotFoundError: If ``dataset_dir`` does not exist (raised by ``os.listdir``).
    """
    # A negative ratio would turn into a negative slice index and silently
    # produce a wrong split, so reject out-of-range values up front.
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    # Read the input before touching the output so a bad dataset_dir fails
    # without leaving empty output directories behind.
    class_names = sorted(
        entry for entry in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, entry))
    )

    # The module itself exposes shuffle(), so it can stand in for a Random instance.
    rng = random if seed is None else random.Random(seed)

    train_dir = os.path.join(output_dir, "Train")
    test_dir = os.path.join(output_dir, "Test")
    for split_dir in (train_dir, test_dir):
        # Drop the results of any previous run to avoid Train/Test overlap.
        if os.path.isdir(split_dir):
            shutil.rmtree(split_dir)
        os.makedirs(split_dir, exist_ok=True)

    for class_name in class_names:
        class_dir = os.path.join(dataset_dir, class_name)

        # Names are sorted before shuffling so that a given seed yields the same
        # split regardless of the order os.listdir happens to return.
        images = _list_class_images(class_dir)
        rng.shuffle(images)

        split_index = int(len(images) * train_ratio)
        train_images = images[:split_index]
        test_images = images[split_index:]

        _copy_images(train_images, class_dir, os.path.join(train_dir, class_name))
        _copy_images(test_images, class_dir, os.path.join(test_dir, class_name))

        print(f"Class '{class_name}': {len(train_images)} images in Train, {len(test_images)} images in Test.")

    # Create a ZIP archive of the shuffled dataset
    print(f"Creating ZIP file at {zip_file}...")
    _zip_directory(output_dir, zip_file)
    print(f"ZIP file created at {zip_file}")

# Build the Train/Test split and its ZIP archive using the configuration above.
shuffle_and_split_dataset(
    dataset_dir=DATASET_DIR,
    output_dir=OUTPUT_DIR,
    zip_file=ZIP_FILE,
    train_ratio=TRAIN_RATIO,
)
