# Data Splitting

Create train/validation/test splits for model development.

In [1]:
import sys
import os
from pathlib import Path

# Resolve project directories relative to the current working directory.
# NOTE: despite its name, `notebook_dir` is the *parent* of the working
# directory (the directory that must contain the `src` package imported
# below), and `build_dir` is one level above that.
notebook_dir = Path(os.path.abspath('')).parent
build_dir = notebook_dir.parent

# Make the `src` package importable. The membership check keeps this cell
# idempotent: re-running it no longer appends duplicate sys.path entries.
if str(notebook_dir) not in sys.path:
    sys.path.append(str(notebook_dir))

import pandas as pd
from src.preprocessing.data_splitter import DataSplitter

In [2]:
# Read the tab-separated dataset that lives in the build directory
data = pd.read_csv(build_dir / 'test_set.tsv', sep='\t')

# Columns handed to the splitter as model inputs
feature_cols = [
    'mH2', 'mHD', 'mAD', 'mHDp', 'alpha',
    'L2', 'L8', 'vs', 'm22sq',
]

# Columns handed to the splitter as targets
label_cols = [
    'valid_BFB', 'valid_Uni', 'valid_STU', 'valid_Higgs',
]

# Build the splitter, then generate the splits with its default settings
# NOTE(review): split ratios and seeding are decided inside DataSplitter,
# which is not visible here -- confirm the split is reproducible.
splitter = DataSplitter(data, feature_cols, label_cols)
splits = splitter.create_splits()

Training set: 81298 samples (70.0%)
Validation set: 17421 samples (15.0%)
Test set: 17421 samples (15.0%)


In [3]:
# Persist every split under <build_dir>/data_splits
# (the target is passed as a plain string, as save_splits received before)
splits_dir = str(build_dir / 'data_splits')
splitter.save_splits(splits, splits_dir)

Saved train set to /home/maien/work/ScannerS-master/build/data_splits/train_set.tsv
Saved val set to /home/maien/work/ScannerS-master/build/data_splits/val_set.tsv
Saved test set to /home/maien/work/ScannerS-master/build/data_splits/test_set.tsv


## Verify Splits

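Check the label distributions in each split to confirm they are representative: the class proportions of every label should be nearly identical across the train, validation, and test sets.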

In [4]:
# Report the normalized value counts of every label column, split by split
separator = "-" * 40
for split_name, (X, y) in splits.items():
    # Header line followed by the rule, exactly one item per output line
    print(f"\n{split_name.upper()} SET LABEL DISTRIBUTIONS:", separator, sep="\n")
    for label in label_cols:
        # normalize=True yields proportions rather than raw counts
        dist = y[label].value_counts(normalize=True)
        print(f"\n{label}:", dist, sep="\n")


TRAIN SET LABEL DISTRIBUTIONS:
----------------------------------------

valid_BFB:
valid_BFB
1    0.603766
0    0.396234
Name: proportion, dtype: float64

valid_Uni:
valid_Uni
0    0.503432
1    0.496568
Name: proportion, dtype: float64

valid_STU:
valid_STU
1    0.559337
0    0.440663
Name: proportion, dtype: float64

valid_Higgs:
valid_Higgs
1    0.572253
0    0.427747
Name: proportion, dtype: float64

VAL SET LABEL DISTRIBUTIONS:
----------------------------------------

valid_BFB:
valid_BFB
1    0.602147
0    0.397853
Name: proportion, dtype: float64

valid_Uni:
valid_Uni
0    0.504162
1    0.495838
Name: proportion, dtype: float64

valid_STU:
valid_STU
1    0.558407
0    0.441593
Name: proportion, dtype: float64

valid_Higgs:
valid_Higgs
1    0.577808
0    0.422192
Name: proportion, dtype: float64

TEST SET LABEL DISTRIBUTIONS:
----------------------------------------

valid_BFB:
valid_BFB
1    0.607657
0    0.392343
Name: proportion, dtype: float64

valid_Uni:
valid_Uni
0    0.