# Imports

In [None]:
import os
import random
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import shutil

# Config


In [None]:
# Mount Google Drive so the dataset folders are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Dataset locations. These are placeholders: both must be set to pathlib.Path
# objects before running the cells below, which combine them with the `/`
# operator (e.g. base_dir / "sand_task_1.xlsx") and would fail on None.
base_dir = None
output_dir = None

# Fraction of each class assigned to every split.
split_ratios = {"train": 0.8, "val": 0.2}

# Seed every RNG the notebook relies on. The shuffles below use
# DataFrame.sample(), which draws from NumPy's global random state when no
# random_state is passed, so seeding only the stdlib `random` module would
# leave the train/val membership different on every run.
random.seed(42)
np.random.seed(42)

# Split IDs

In [None]:
# Read the participant metadata. IDs are read as strings; they are later
# matched against the ID prefix parsed from the audio file names.
excel_file_path = base_dir / "sand_task_1.xlsx"
metadata = pd.read_excel(excel_file_path, dtype={"ID": str})

# Separate participants by class (labels 1 through 5).
class_1, class_2, class_3, class_4, class_5 = (
    metadata[metadata['Class'] == label] for label in range(1, 6)
)

print('Class 1:')
print(class_1)

# Shuffle the rows inside each class before splitting.
# An explicit random_state makes the shuffle identical on every run and on
# every re-execution of this cell; without it, DataFrame.sample() uses NumPy's
# global random state, which random.seed() in the config cell does not touch.
# 42 mirrors the seed value used in the config cell.
shuffle_seed = 42
c1_shuffled, c2_shuffled, c3_shuffled, c4_shuffled, c5_shuffled = (
    subset.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
    for subset in (class_1, class_2, class_3, class_4, class_5)
)

print('\nClass 1 shuffled:')
print(c1_shuffled)

# split up each class for train / val
def split(data, train_split=0.8):
    """Cut `data` into a leading train part and a trailing val part.

    Args:
        data: A DataFrame (or other object supporting ``len`` and ``.iloc``
            slicing). Row order is preserved; shuffle beforehand if needed.
        train_split: Fraction of rows that go to the train part.

    Returns:
        A ``(train, val)`` tuple. The train part holds the first
        ``int(len(data) * train_split)`` rows (truncated toward zero, so very
        small inputs may yield an empty train part); the val part holds the
        remaining rows.
    """
    cutoff = int(len(data) * train_split)
    return data.iloc[:cutoff], data.iloc[cutoff:]

# Split every shuffled class into train / val with the configured train ratio,
# so each split keeps roughly the class proportions of the full dataset.
train_ratio = split_ratios['train']
shuffled_classes = [c1_shuffled, c2_shuffled, c3_shuffled, c4_shuffled, c5_shuffled]
train_parts, val_parts = zip(*(split(shuffled, train_ratio) for shuffled in shuffled_classes))

# Keep the per-class names available for interactive inspection.
c1_train, c2_train, c3_train, c4_train, c5_train = train_parts
c1_val, c2_val, c3_val, c4_val, c5_val = val_parts

# Combine the per-class pieces into one train frame and one val frame.
train_mixed = pd.concat(list(train_parts), ignore_index=True)
val_mixed = pd.concat(list(val_parts), ignore_index=True)

# Shuffle the row order of each dataset so classes are interleaved.
# The explicit random_state keeps the order identical across runs;
# DataFrame.sample() would otherwise use NumPy's global random state.
train_mixed_shuffled = train_mixed.sample(frac=1, random_state=42).reset_index(drop=True)
val_mixed_shuffled = val_mixed.sample(frac=1, random_state=42).reset_index(drop=True)


def _print_class_distribution(title, parts, total):
    """Print each part's share of `total` as a percentage, one line per class.

    `parts` is an ordered sequence of per-class frames; classes are numbered
    from 1 in that order.
    """
    print(title)
    for label, part in enumerate(parts, start=1):
        print(f'class {label}: {len(part) / total * 100}%')


# Visual check: class proportions in each split versus the original dataset.
_print_class_distribution('\ndistribution of class in train set:',
                          train_parts, len(train_mixed_shuffled))
_print_class_distribution('\ndistribution of class in val set:',
                          val_parts, len(val_mixed_shuffled))
_print_class_distribution('\ndistribution of original dataset:',
                          [class_1, class_2, class_3, class_4, class_5], len(metadata))

# Extract the subject IDs of each split as plain lists.
train_ids = train_mixed_shuffled['ID'].tolist()
val_ids   = val_mixed_shuffled['ID'].tolist()

# Sanity checks: a subject in both splits would leak validation data into
# training, and a count mismatch means some rows (e.g. a class label outside
# 1-5) were silently left out of both splits.
assert not set(train_ids) & set(val_ids), "a subject ID appears in both train and val"
assert len(train_ids) + len(val_ids) == len(metadata), "some subjects were not assigned to a split"

splits = {"train": train_ids, "val": val_ids}
print(f"\nTotal subjects: {len(metadata)} | Train: {len(train_ids)} | Val: {len(val_ids)}")

# Visual check of the first few participants (by ID) in each split.
print('\ntraining set head:')
print(train_mixed_shuffled.sort_values(by='ID').head())

print('\nval set head:')
print(val_mixed_shuffled.sort_values(by='ID').head())

Class 1:
        ID  Age Sex  Class
60   ID077   72   F      1
86   ID111   53   F      1
107  ID138   60   F      1
171  ID213   72   F      1
210  ID261   68   M      1
253  ID316   60   M      1

Class 1 shuffled:
      ID  Age Sex  Class
0  ID111   53   F      1
1  ID213   72   F      1
2  ID316   60   M      1
3  ID261   68   M      1
4  ID138   60   F      1
5  ID077   72   F      1

distribution of class in train set:
class 1: 1.8691588785046727%
class 2: 9.345794392523365%
class 3: 21.02803738317757%
class 4: 28.037383177570092%
class 5: 39.719626168224295%

distribution of class in val set:
class 1: 3.4482758620689653%
class 2: 10.344827586206897%
class 3: 20.689655172413794%
class 4: 27.586206896551722%
class 5: 37.93103448275862%

distribution of original dataset:
class 1: 2.2058823529411766%
class 2: 9.558823529411764%
class 3: 20.955882352941178%
class 4: 27.941176470588236%
class 5: 39.338235294117645%

Total subjects: 272 | Train: 214 | Val: 58

training set head:
      

# Copy Files

In [None]:
# List the task folders (sub-directories of base_dir) once, so both loops
# below work on the same directories and tqdm can display a real total.
task_folders = [entry for entry in base_dir.iterdir() if entry.is_dir()]

# Create one output folder per (split, task) pair, even for tasks that end up
# with no audio files. The loop variable is named split_name rather than
# `split` so that it does not overwrite the split() helper defined earlier.
for split_name in tqdm(splits):
    for task_folder in task_folders:
        (output_dir / split_name / task_folder.name).mkdir(parents=True, exist_ok=True)

# Map each subject ID to its split for constant-time lookup per file.
# setdefault keeps the first split that lists an ID, matching the order in
# which `splits` is iterated.
split_of_subject = {}
for split_name, ids in splits.items():
    for known_id in ids:
        split_of_subject.setdefault(known_id, split_name)

# Copy files (the originals are left in place).
n_copied = 0
n_skipped = 0
for task_folder in tqdm(task_folders):
    for wav_path in task_folder.glob("*.wav"):
        # The subject ID is the part of the file name before the first "_".
        subj_id = wav_path.name.split("_")[0]
        target_split = split_of_subject.get(subj_id)
        if target_split is None:
            # File belongs to a subject that is in neither split.
            n_skipped += 1
            continue
        dest = output_dir / target_split / task_folder.name / wav_path.name
        dest.parent.mkdir(parents=True, exist_ok=True)  # ensure folder exists
        shutil.copyfile(wav_path, dest)  # copy the file
        n_copied += 1

if n_skipped:
    print(f"Copied {n_copied} files; skipped {n_skipped} files whose subject ID is in no split.")

print("✅ Dataset split complete!")

100%|██████████| 2/2 [00:00<00:00, 39.18it/s]
9it [02:10, 14.48s/it]

✅ Dataset split complete!



