# In [11]:
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold
import os

# Configuration
class CFG:
    """Static settings for this script, read as class attributes (``CFG.seed``, ``CFG.n_fold``)."""

    # Number of cross-validation folds (handed to the splitter as ``n_splits``).
    n_fold: int = 5
    # Random seed handed to ``set_seed`` and to the splitter's ``random_state``.
    seed: int = 101

# Set the seed for reproducibility
def set_seed(seed=42):
    """Seed the NumPy, Python and PyTorch random generators with one value.

    Also switches cuDNN to deterministic mode with benchmarking disabled,
    exports ``PYTHONHASHSEED`` and prints a confirmation line.

    Args:
        seed: Value passed to every seeding call (default 42).
    """
    # Imported inside the function, so these packages are only required
    # when seeding is actually requested.
    import random as py_random

    import numpy
    import torch

    # Same order as before: NumPy, stdlib random, torch CPU, torch CUDA.
    seeders = (
        numpy.random.seed,
        py_random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # setting it here presumably only affects child processes - confirm that
    # this is the intent.
    os.environ['PYTHONHASHSEED'] = str(seed)

    print('> SEEDING DONE')
    
set_seed(CFG.seed)

# Where the dataset lives, and the training manifest inside it.
DATA_DIR = "/Users/elizabethnemeti/Desktop/uw-madison-gi-tract-image-segmentation"
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")


def _mask_path_for(scan_id):
    """Build the ``.npy`` path for one row id.

    The id is split on ``'_'``; its first two tokens name the two directory
    levels below ``DATA_DIR/train`` (presumably case and day - verify against
    the dataset layout), and the full id is the file stem inside ``scans``.
    """
    tokens = scan_id.split('_')
    first, second = tokens[0], tokens[1]
    return os.path.join(
        DATA_DIR, 'train', first, first + '_' + second, 'scans', scan_id + '.npy'
    )


# Load the manifest and derive the helper columns in a single chain.
# Each lambda receives the frame as built so far, so later columns can read
# the ones defined just above them.
train_df = pd.read_csv(TRAIN_CSV).assign(
    # Missing segmentations become empty strings so len() is always valid.
    segmentation=lambda df: df['segmentation'].fillna(''),
    # Length of the segmentation string; 0 means nothing was annotated.
    rle_len=lambda df: df['segmentation'].map(len),
    # Path of the mask array associated with each row.
    mask_path=lambda df: df['id'].apply(_mask_path_for),
    # Boolean flag for rows with no segmentation, used later for stratifying.
    empty=lambda df: df['rle_len'].eq(0),
)

# Create folds
# Rows are stratified on the 'empty' flag and grouped by the first '_' token
# of the id (presumably the case identifier - verify), so every row sharing
# that token ends up in the same fold.
skf = StratifiedGroupKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
group_keys = train_df["id"].str.split('_').str[0]

# Pre-create the column as integers. Letting `.loc` create it on the first
# assignment fills the other rows with NaN, which makes the column float64
# and writes folds as 0.0, 1.0, ... into the CSV.
train_df['fold'] = -1
fold_col = train_df.columns.get_loc('fold')

# `split` yields positional row indices, so assign with `.iloc`; label-based
# `.loc` is only equivalent when the frame has a default RangeIndex.
# (`_train_idx` rather than `_`, which is the last-output variable in IPython.)
for fold, (_train_idx, val_idx) in enumerate(
    skf.split(train_df, train_df['empty'], groups=group_keys)
):
    train_df.iloc[val_idx, fold_col] = fold

# Save the splits to a CSV file
# The validation sets of a K-fold split cover every row once, so no -1
# placeholder is left by the time the file is written.
splits_path = '/Users/elizabethnemeti/Desktop/uwdatapreprocessing/splits.csv'
train_df[['id', 'fold']].to_csv(splits_path, index=False)

print(f"Splits saved to {splits_path}")

# Output:
# > SEEDING DONE
# Splits saved to /Users/elizabethnemeti/Desktop/uwdatapreprocessing/splits.csv
