In [1]:
import os
import shutil
import random
import zipfile

# Input and output locations used by the whole script.
dataset_path = "/kaggle/input/final-dataset"  # adjust to where the dataset actually lives
output_path = "/kaggle/working/output_dataset"  # root of the reorganized dataset
zip_output_path = "/kaggle/working/dataset_split.zip"  # archive written at the end

# Fraction of each class's files assigned to the training split.
train_ratio = 0.8

# Create the Train/ and Test/ directories under the output root.
train_path, test_path = (
    os.path.join(output_path, split_name) for split_name in ("Train", "Test")
)
os.makedirs(train_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)

def _unique_name(base_name, source_label, taken):
    """Return a file name that is not in *taken* and record it there.

    The original name is kept when it is free; otherwise it is prefixed with
    *source_label* (and a counter, if that is still not unique).
    """
    candidate = base_name
    if candidate in taken:
        candidate = f"{source_label}_{base_name}"
    counter = 1
    while candidate in taken:
        candidate = f"{source_label}_{counter}_{base_name}"
        counter += 1
    taken.add(candidate)
    return candidate


def combine_classes_and_split(source_dir=None, train_dir=None, test_dir=None,
                              ratio=None, seed=None):
    """Merge same-named class folders from every source folder and split them.

    The expected layout is ``source_dir/<main_folder>/<class_name>/<files>``.
    For each class folder the files are shuffled, the first ``ratio`` share is
    copied to ``train_dir/<class_name>`` and the rest to
    ``test_dir/<class_name>``.

    Several main folders may contain the same class name, so their files end
    up in the same destination directories. To keep a later file from
    overwriting an earlier one with the same name (or from putting the same
    name into both Train and Test), a colliding destination name is prefixed
    with the name of the main folder it came from.

    Args:
        source_dir: Dataset root; defaults to the module-level ``dataset_path``.
        train_dir: Training output root; defaults to ``train_path``.
        test_dir: Testing output root; defaults to ``test_path``.
        ratio: Share of files sent to training; defaults to ``train_ratio``.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Note:
        Files already present in the output directories from an earlier run
        are not removed.
    """
    source_dir = dataset_path if source_dir is None else source_dir
    train_dir = train_path if train_dir is None else train_dir
    test_dir = test_path if test_dir is None else test_dir
    ratio = train_ratio if ratio is None else ratio
    rng = random if seed is None else random.Random(seed)

    # class name -> destination file names already handed out in this run
    # (shared between Train and Test so a name never appears in both).
    taken_names = {}

    # Sorted listings make the result independent of os.listdir() ordering,
    # which is what makes a fixed seed reproducible.
    for main_folder in sorted(os.listdir(source_dir)):
        main_folder_path = os.path.join(source_dir, main_folder)
        if not os.path.isdir(main_folder_path):
            continue

        for class_name in sorted(os.listdir(main_folder_path)):
            class_path = os.path.join(main_folder_path, class_name)
            if not os.path.isdir(class_path):
                continue

            # Collect the regular files of this class (sub-directories are skipped).
            candidates = (os.path.join(class_path, f) for f in os.listdir(class_path))
            files = sorted(path for path in candidates if os.path.isfile(path))

            rng.shuffle(files)

            split_index = int(len(files) * ratio)
            train_files = files[:split_index]
            test_files = files[split_index:]

            class_taken = taken_names.setdefault(class_name, set())
            for subset, subset_root in ((train_files, train_dir), (test_files, test_dir)):
                destination_dir = os.path.join(subset_root, class_name)
                os.makedirs(destination_dir, exist_ok=True)
                for file in subset:
                    destination_name = _unique_name(
                        os.path.basename(file), main_folder, class_taken
                    )
                    shutil.copy(file, os.path.join(destination_dir, destination_name))

            print(f"Processed class '{class_name}': {len(train_files)} training files, {len(test_files)} testing files.")

def zip_dataset():
    """Write every file under ``output_path`` into the archive at ``zip_output_path``.

    Entries are stored with DEFLATE compression under their path relative to
    ``output_path``. Only files are added; directories get no entries of
    their own.
    """
    archive = zipfile.ZipFile(zip_output_path, mode='w', compression=zipfile.ZIP_DEFLATED)
    with archive:
        for folder, _subfolders, names in os.walk(output_path):
            for name in names:
                source = os.path.join(folder, name)
                # Store the path relative to the output root, not the absolute path.
                archive.write(source, os.path.relpath(source, output_path))
    print(f"Dataset successfully zipped at {zip_output_path}")

# Run the process: first copy the files into the Train/Test layout, then
# archive the result. The order matters because the zip step walks the
# directory tree that the split step populates.
combine_classes_and_split()
zip_dataset()

# Final confirmation, printed after both steps have returned.
print("Dataset successfully combined, split by class, and zipped!")


Processed class 'surprise': 793 training files, 199 testing files.
Processed class 'fear': 793 training files, 199 testing files.
Processed class 'neutral': 793 training files, 199 testing files.
Processed class 'sad': 793 training files, 199 testing files.
Processed class 'happy': 793 training files, 199 testing files.
Processed class 'anger': 793 training files, 199 testing files.
Processed class 'surprise': 1864 training files, 466 testing files.
Processed class 'fear': 1864 training files, 466 testing files.
Processed class 'neutral': 1864 training files, 466 testing files.
Processed class 'sad': 1864 training files, 466 testing files.
Processed class 'happy': 1864 training files, 466 testing files.
Processed class 'anger': 1864 training files, 466 testing files.
Dataset successfully zipped at /kaggle/working/dataset_split.zip
Dataset successfully combined, split by class, and zipped!
