In [None]:
import os
import zipfile
from sklearn.model_selection import train_test_split

# Function to get all files from a directory with subdirectories representing classes
def get_files_from_classes(directory):
    """Collect file paths and class labels from a class-per-subdirectory tree.

    Each immediate subdirectory of *directory* is treated as one class, and
    every regular file directly inside it becomes one sample labelled with
    the subdirectory's name. Files sitting directly in *directory* and
    anything nested deeper than one level are ignored.

    Entries are visited in sorted order. ``os.listdir`` returns names in
    arbitrary order, so without sorting the result (and any seeded shuffle
    or split applied to it afterwards) could differ from one machine or
    filesystem to another.

    Args:
        directory: Path of the directory whose subdirectories are classes.

    Returns:
        A ``(data, labels)`` tuple of two equal-length lists: the file paths,
        and the class name of the file at the same index.

    Raises:
        OSError: If *directory* or one of its subdirectories cannot be listed.
    """
    data = []
    labels = []
    for class_name in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_name)
        # Only subdirectories define classes; skip loose top-level files.
        if not os.path.isdir(class_path):
            continue
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            # Nested directories inside a class folder are not samples.
            if os.path.isfile(file_path):
                data.append(file_path)
                labels.append(class_name)
    return data, labels

# Input and output locations inside the Kaggle runtime
input_path = "/kaggle/input/final-dataset"
output_path = "/kaggle/working"

# Gather file paths and class labels from the dataset's Train and Test folders
train_data, train_labels = get_files_from_classes(os.path.join(input_path, "Train"))
test_data, test_labels = get_files_from_classes(os.path.join(input_path, "Test"))

# Pool both folders so the split below is drawn from the whole dataset
combined_data = train_data + test_data
combined_labels = train_labels + test_labels

# Re-split the pooled samples 80/20; the fixed random_state keeps the shuffle
# repeatable for a given input order
X_train, X_test, y_train, y_test = train_test_split(
    combined_data, combined_labels, test_size=0.2, random_state=42
)

# Zip every pooled file (the archive holds ALL files; the split above is not
# reflected in it), storing each under its path relative to the deepest
# directory shared by all files
zip_filename = os.path.join(output_path, "combined_data.zip")
# The shared ancestor is identical for every file, so compute it once instead
# of rescanning the whole path list on each loop iteration
archive_root = os.path.commonpath(combined_data)
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for file_path in combined_data:
        arcname = os.path.relpath(file_path, start=archive_root)
        zipf.write(file_path, arcname)

# Report the split sizes and where the archive was written
print(f"Number of training samples: {len(X_train)}")
print(f"Number of testing samples: {len(X_test)}")
print(f"Combined data has been zipped into: {zip_filename}")
