In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import librosa
from pathlib import Path
from tqdm import tqdm

In [3]:
# Configuration
# Split proportions are fractions of the *full* dataset; the seed is shared by
# both split steps so the partition is reproducible.
RANDOM_STATE = 42
EVAL_SIZE = 0.15     # 15% for eval
HOLDOUT_SIZE = 0.10  # 10% for holdout (from remaining data)

# Input / output locations. These are relative paths, so they are resolved
# against the kernel's current working directory when they are used.
AUDIO_DIR = Path("..", "..", "data", "raw", "Audio")
TRAIN_CSV = Path("..", "..", "data", "raw", "Train.csv")
OUTPUT_DIR = Path("..", "..", "data", "splits")

In [4]:
def extract_audio_duration(audio_path):
    """Return the duration of one audio file, or ``None`` if it cannot be read.

    Args:
        audio_path: Location of the audio file; forwarded unchanged to
            ``librosa.get_duration(path=...)``.

    Returns:
        Whatever ``librosa.get_duration`` reports for the file, or ``None``
        when any exception is raised while measuring it (the error is printed
        so the caller can skip the clip and keep going).
    """
    try:
        return librosa.get_duration(path=audio_path)
    except Exception as err:
        # Best-effort: report the problem and let the caller drop this clip.
        print(f"Error processing {audio_path}: {err}")
    return None

def create_duration_bins(durations, n_bins=4):
    """Build bin edges and labels for bucketing durations.

    Two schemes are supported:

    * ``n_bins == 3``: fixed edges ``[0, 4, 8, inf]`` labelled
      ``short`` / ``medium`` / ``long``. ``durations`` is not consulted.
    * ``n_bins == 4``: edges at the 25th, 50th and 75th percentiles of
      ``durations``, labelled ``short`` / ``medium_short`` / ``medium_long``
      / ``long``.

    Args:
        durations: Array-like of numeric durations. Only used for the
            4-bin (quartile) scheme.
        n_bins: Which scheme to use; must be 3 or 4.

    Returns:
        ``(bins, labels)`` where ``bins`` is a list of ``n_bins + 1`` edges
        starting at 0 and ending at ``np.inf``, and ``labels`` is a list of
        ``n_bins`` strings.

    Raises:
        ValueError: If ``n_bins`` is neither 3 nor 4, or if the quartile
            scheme is requested for empty ``durations``.

    Note:
        When many durations are identical the quartiles can coincide, giving
        repeated edges; ``pandas.cut`` rejects duplicate edges by default.
    """
    if n_bins == 3:
        return [0, 4, 8, np.inf], ['short', 'medium', 'long']

    # Previously any other value silently produced the 4-bin scheme; reject
    # unsupported requests instead of returning a different number of bins.
    if n_bins != 4:
        raise ValueError(f"n_bins must be 3 or 4, got {n_bins!r}")

    values = np.asarray(durations)
    if values.size == 0:
        raise ValueError("cannot compute quartile bins from empty durations")

    q1, q2, q3 = np.percentile(values, [25, 50, 75])
    bins = [0, q1, q2, q3, np.inf]
    labels = ['short', 'medium_short', 'medium_long', 'long']
    return bins, labels


In [5]:
# Ensure the destination folder for the split files exists (including any
# missing parents) before loading anything.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Read the full training table; every later cell derives from `train_df`.
print("Loading training data...")
train_df = pd.read_csv(TRAIN_CSV)

print(f"Total samples: {train_df.shape[0]}")

Loading training data...
Total samples: 19856


In [7]:
# Measure every clip listed in the training table.
# `durations` and `valid_indices` are filled in lockstep: entry i of each
# refers to the same row, and rows whose clip could not be read are skipped.
print("\nExtracting audio durations...")
durations, valid_indices = [], []

for row_label, record in tqdm(train_df.iterrows(), total=len(train_df)):
    clip_path = AUDIO_DIR / f"{record['ID']}.wav"  # Adjust extension if needed
    clip_duration = extract_audio_duration(clip_path)

    if clip_duration is None:
        # Unreadable clip: the helper already reported it, so just move on.
        continue

    valid_indices.append(row_label)
    durations.append(clip_duration)


Extracting audio durations...


100%|██████████| 19856/19856 [00:12<00:00, 1629.79it/s]


In [8]:
# Filter to valid samples only.
# `valid_indices` holds the index *labels* yielded by `train_df.iterrows()`,
# so rows must be selected by label (`.loc`). Positional `.iloc` only matches
# while the index is a pristine 0..n-1 range, and would pick the wrong rows
# (or raise) if this cell is re-run after rows have been dropped.
train_df = train_df.loc[valid_indices].copy()

# `durations` is aligned element-for-element with `valid_indices`.
train_df['duration'] = durations

print(f"\nValid samples: {len(train_df)}")
print("Duration statistics:")
print(train_df['duration'].describe())


Valid samples: 19856
Duration statistics:
count    19856.000000
mean         6.049944
std          3.386629
min          2.000000
25%          3.442000
50%          5.301000
75%          7.643000
max         29.694000
Name: duration, dtype: float64


In [9]:
# Create duration bins for stratification
print("\nCreating duration bins...")
bins, labels = create_duration_bins(train_df['duration'].values, n_bins=4)

# The lowest edge is 0 and pd.cut intervals are right-closed by default, so a
# duration of exactly 0.0 would otherwise fall outside every bin and become
# NaN. include_lowest=True closes the first interval on the left as well.
train_df['duration_bin'] = pd.cut(
    train_df['duration'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Fail here, with a clear message, rather than later inside the stratified
# split if any row still has no bin (e.g. a NaN duration).
n_unbinned = int(train_df['duration_bin'].isna().sum())
if n_unbinned:
    raise ValueError(f"{n_unbinned} samples could not be assigned to a duration bin")

print("\nDuration bin distribution:")
print(train_df['duration_bin'].value_counts().sort_index())


Creating duration bins...

Duration bin distribution:
duration_bin
short           4969
medium_short    4965
medium_long     4963
long            4959
Name: count, dtype: int64


In [10]:
# Step 1 of 2: carve the eval set out of the full dataset.
# The split is done on index labels and stratified on the duration bin so the
# eval set keeps the same short/long mix as the whole.
print(f"\nStep 1: Splitting off eval set ({EVAL_SIZE*100}%)...")
train_temp_indices, eval_indices = train_test_split(
    train_df.index,
    stratify=train_df['duration_bin'],
    test_size=EVAL_SIZE,
    random_state=RANDOM_STATE,
)

# Materialise both parts as independent copies of the selected rows.
train_temp, eval_set = (
    train_df.loc[part].copy() for part in (train_temp_indices, eval_indices)
)


Step 1: Splitting off eval set (15.0%)...


In [11]:
# Step 2 of 2: divide what is left after the eval split into train and holdout.
# HOLDOUT_SIZE is a fraction of the *full* dataset, so it is rescaled to a
# fraction of the remaining (1 - EVAL_SIZE) portion before splitting.
remaining_fraction = 1 - EVAL_SIZE
holdout_size_adjusted = HOLDOUT_SIZE / remaining_fraction

print(f"Step 2: Splitting train and holdout ({HOLDOUT_SIZE*100}% of total)...")
train_indices, holdout_indices = train_test_split(
    train_temp.index,
    stratify=train_temp['duration_bin'],
    test_size=holdout_size_adjusted,
    random_state=RANDOM_STATE,
)

# Materialise both parts as independent copies of the selected rows.
train_set, holdout_set = (
    train_df.loc[part].copy() for part in (train_indices, holdout_indices)
)

Step 2: Splitting train and holdout (10.0% of total)...


In [12]:
# Validation: report how well each split preserves the overall distribution.
print("\n" + "=" * 80)
print("SPLIT VALIDATION")
print("=" * 80)

report_splits = [("Train", train_set), ("Eval", eval_set), ("Holdout", holdout_set)]
n_total = len(train_df)

# Row count and share of the full dataset for each split.
print("\nDataset sizes:")
for split_name, split_df in report_splits:
    print(f"  {split_name + ':':<10}{len(split_df):5d} ({len(split_df)/n_total*100:.1f}%)")
print(f"  Total:    {n_total:5d}")

# Share of each duration bin within every split.
print("\nDuration bin distribution (proportions):")
for split_name, split_df in report_splits:
    print(f"\n{split_name}:")
    print(split_df['duration_bin'].value_counts(normalize=True).sort_index())

# Summary statistics of the raw durations within every split.
print("\nDuration statistics:")
for split_name, split_df in report_splits:
    print(f"\n{split_name}:")
    print(split_df['duration'].describe())


SPLIT VALIDATION

Dataset sizes:
  Train:    14891 (75.0%)
  Eval:      2979 (15.0%)
  Holdout:   1986 (10.0%)
  Total:    19856

Duration bin distribution (proportions):

Train:
duration_bin
short           0.250285
medium_short    0.250017
medium_long     0.249950
long            0.249748
Name: proportion, dtype: float64

Eval:
duration_bin
short           0.250084
medium_short    0.250084
medium_long     0.250084
long            0.249748
Name: proportion, dtype: float64

Holdout:
duration_bin
short           0.250252
medium_short    0.250252
medium_long     0.249748
long            0.249748
Name: proportion, dtype: float64

Duration statistics:

Train:
count    14891.000000
mean         6.048614
std          3.381003
min          2.000000
25%          3.442000
50%          5.301000
75%          7.643000
max         29.468000
Name: duration, dtype: float64

Eval:
count    2979.000000
mean        6.051557
std         3.364252
min         2.000000
25%         3.442500
50%         5.28

In [13]:
# Save splits
print("\n" + "=" * 80)
print("SAVING SPLITS")
print("=" * 80)

# File stem -> split frame, in the order the files are written and listed.
splits_by_stem = {'train': train_set, 'eval': eval_set, 'holdout': holdout_set}

# Clean versions: only the ID and Transcription columns.
for stem, split_df in splits_by_stem.items():
    split_df[['ID', 'Transcription']].to_csv(OUTPUT_DIR / f'{stem}.csv', index=False)

# Metadata versions: every column currently on the split frames.
for stem, split_df in splits_by_stem.items():
    split_df.to_csv(OUTPUT_DIR / f'{stem}_with_metadata.csv', index=False)

print(f"\nFiles saved to {OUTPUT_DIR}/:")
print("  Core splits (for modeling):")
for stem in splits_by_stem:
    print(f"    - {stem}.csv")
print("\n  With metadata (for analysis):")
for stem in splits_by_stem:
    print(f"    - {stem}_with_metadata.csv")


SAVING SPLITS

Files saved to ..\..\data\splits/:
  Core splits (for modeling):
    - train.csv
    - eval.csv
    - holdout.csv

  With metadata (for analysis):
    - train_with_metadata.csv
    - eval_with_metadata.csv
    - holdout_with_metadata.csv


In [14]:
# Save split info summary: a one-row CSV recording the sizes, percentages,
# seed and stratification scheme used for this run.
n_total = len(train_df)
split_sizes = {
    'train': len(train_set),
    'eval': len(eval_set),
    'holdout': len(holdout_set),
}

# Insertion order fixes the column order of the CSV: total, counts,
# percentages, then the run settings.
summary = {'total_samples': n_total}
summary.update({f'{stem}_samples': size for stem, size in split_sizes.items()})
summary.update({f'{stem}_pct': size / n_total * 100 for stem, size in split_sizes.items()})
summary['random_state'] = RANDOM_STATE
summary['stratification'] = 'duration_bin (4 bins based on quartiles)'

summary_df = pd.DataFrame([summary])
summary_df.to_csv(OUTPUT_DIR / 'split_summary.csv', index=False)
print("\n  Summary:")
print("    - split_summary.csv")

print("\n✓ Split creation complete!")


  Summary:
    - split_summary.csv

✓ Split creation complete!
