In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

def create_split_csv(input_csv_path, output_csv_path, test_size=0.2, val_size=0.1, seed=42):
    """Assign every row of a metadata CSV to a train/val/test split and save it.

    Rows are split with stratification on the combination of the
    ``benign_malignant`` and ``patches`` columns, so each split keeps roughly
    the same mix of those combinations. The output CSV holds all input
    columns plus an integer ``split`` column: 0 = train, 1 = validation,
    2 = test.

    Args:
        input_csv_path: Path of the CSV to read. It must contain the
            ``benign_malignant`` and ``patches`` columns.
        output_csv_path: Path the annotated CSV is written to (without the
            DataFrame index).
        test_size: Fraction of all rows assigned to the test split.
        val_size: Fraction of all rows assigned to the validation split.
        seed: Random state passed to both ``train_test_split`` calls.

    Returns:
        The DataFrame that was written: train rows, then validation rows,
        then test rows, each carrying its ``split`` value.

    Raises:
        ValueError: If ``test_size`` or ``val_size`` is not strictly between
            0 and 1, or if together they leave no rows for training.
    """
    # Validate before touching the filesystem so a bad configuration fails
    # fast with a clear message instead of a ZeroDivisionError or an opaque
    # scikit-learn error further down.
    if not (0 < test_size < 1 and 0 < val_size < 1):
        raise ValueError(
            f"test_size and val_size must be fractions strictly between 0 and 1, "
            f"got test_size={test_size!r}, val_size={val_size!r}"
        )
    if test_size + val_size >= 1:
        raise ValueError(
            f"test_size + val_size must be < 1 to leave rows for training, "
            f"got {test_size!r} + {val_size!r}"
        )

    # Load the dataset
    metadata = pd.read_csv(input_csv_path)

    # One integer label per distinct (benign_malignant, patches) pair.
    # Unlike `benign_malignant * 2 + patches`, this cannot merge two different
    # pairs into one label and does not require the columns to be numeric 0/1.
    # For 0/1 columns the labels keep the same relative order as that formula.
    strata = metadata.groupby(
        ['benign_malignant', 'patches'], sort=True, dropna=False
    ).ngroup()

    # Split into (train + validation) and test with combined stratification
    remaining, test_data = train_test_split(
        metadata,
        test_size=test_size,
        random_state=seed,
        stratify=strata,
    )

    # val_size is a fraction of the whole dataset, so rescale it to a fraction
    # of the rows that are left after removing the test split.
    train_data, val_data = train_test_split(
        remaining,
        test_size=val_size / (1 - test_size),
        random_state=seed,
        stratify=strata.loc[remaining.index],
    )

    # `assign` returns new frames, so the split label is never written into a
    # slice of `metadata` (avoids pandas' SettingWithCopyWarning).
    combined_df = pd.concat([
        train_data.assign(split=0),
        val_data.assign(split=1),
        test_data.assign(split=2),
    ])

    # Save to the output CSV
    combined_df.to_csv(output_csv_path, index=False)
    return combined_df

# Example usage. Guarded so that importing this file does not read or write
# any CSV; it still runs when executed as a script or in a notebook cell.
# The paths below are machine-specific.
if __name__ == "__main__":
    create_split_csv(
        input_csv_path='C:/Users/elmop/Downloads/isic/metadata.csv',
        output_csv_path='C:/Users/elmop/Downloads/metadata_w_split_elmo.csv',
        test_size=0.2,
        val_size=0.1,
        seed=7,
    )
