# Feature Preprocessing

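Fit the feature preprocessor (Yeo-Johnson transform followed by scaling) on the training split, save the fitted transformers to disk, and verify that they can be reloaded and applied to the validation split.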

In [1]:
import sys
import os
from pathlib import Path

# Resolve project locations relative to the current working directory.
# NOTE: despite its name, `notebook_dir` is the *parent* of the working
# directory; it is expected to contain the `src` package imported below
# (verify against the repository layout). `build_dir` is one level above it.
notebook_dir = Path.cwd().parent
build_dir = notebook_dir.parent

# Make `src` importable by adding its parent directory to the module search
# path. The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path each time.
if str(notebook_dir) not in sys.path:
    sys.path.append(str(notebook_dir))

import pandas as pd
import numpy as np
from src.preprocessing.preprocessor import FeaturePreprocessor

In [2]:
# Location of the data split files inside the build directory.
splits_dir = os.path.join(build_dir, 'data_splits')

# Feature columns that are handed to the preprocessor, in this order.
feature_cols = [
    'mH2', 'mHD', 'mAD', 'mHDp', 'alpha',
    'L2', 'L8', 'vs', 'm22sq',
]

# Only the training split is read here: the preprocessor is fit on training
# data alone.
train_path = os.path.join(splits_dir, 'train_set.tsv')
train_data = pd.read_csv(train_path, sep='\t')
X_train = train_data.loc[:, feature_cols]

In [3]:
# Build a preprocessor with both the Yeo-Johnson step and the scaler enabled,
# then fit it on the training features and keep the transformed result.
preprocessor = FeaturePreprocessor(apply_yj=True, apply_scaler=True)
X_train_processed = preprocessor.fit_transform(X_train)

# Collect summary statistics for each stage so the effect of every
# transformation can be compared.
stats = preprocessor.describe_transformations(X_train)

# Print one heading followed by the matching statistics table per stage.
for heading, stage_key in (
    ("Original Data Statistics:", 'original'),
    ("\nAfter Yeo-Johnson:", 'after_yj'),
    ("\nAfter Scaling:", 'after_scaling'),
):
    print(heading)
    print(stats[stage_key])

Original Data Statistics:
                mH2           mHD           mAD          mHDp         alpha  \
count  81298.000000  81298.000000  81298.000000  81298.000000  81298.000000   
mean     756.116787    696.081561    694.813204    843.290329     -0.012395   
std      396.567008    380.291064    379.271306    356.407160      0.590349   
min      125.098308      1.201244      1.011494      5.091194     -1.570259   
25%      408.405833    399.826236    399.305965    561.508174     -0.204915   
50%      725.630913    664.080251    662.258240    814.770169     -0.007623   
75%     1091.444487    980.967621    978.983405   1142.121298      0.185137   
max     1499.957514   1499.950285   1499.989430   1499.992267      1.570347   

                 L2            L8            vs          m22sq  
count  81298.000000  81298.000000  81298.000000   81298.000000  
mean       6.954888      1.809151    499.960266  233773.526880  
std        5.573781     11.035877    381.849480  143639.857164  
mi



In [4]:
# Write the preprocessor's transformers to <build_dir>/preprocessor and report
# the destination path.
preprocessor_dir = str(Path(build_dir) / 'preprocessor')
preprocessor.save_transformers(preprocessor_dir)
print(f"Saved preprocessor to {preprocessor_dir}")

Saved preprocessor to /home/maien/work/ScannerS-master/build/preprocessor


In [5]:
# Round-trip check, part 1: read the validation split and select the same
# feature columns that were used for training.
val_path = os.path.join(splits_dir, 'val_set.tsv')
val_data = pd.read_csv(val_path, sep='\t')
X_val = val_data.loc[:, feature_cols]

# Round-trip check, part 2: restore the preprocessor from disk and apply it to
# the validation features (transform only; nothing is re-fit here).
loaded_preprocessor = FeaturePreprocessor.load_transformers(preprocessor_dir)
X_val_processed = loaded_preprocessor.transform(X_val)

# Per-column summary of the transformed validation features.
print("Validation set statistics after preprocessing:")
val_frame = pd.DataFrame(X_val_processed, columns=feature_cols)
print(val_frame.describe())

Validation set statistics after preprocessing:
                mH2           mHD           mAD          mHDp         alpha  \
count  17421.000000  17421.000000  17421.000000  17421.000000  17421.000000   
mean       0.008537      0.000182      0.006776      0.010591     -0.004821   
std        1.000533      1.002813      0.998217      1.003741      0.993972   
min       -1.918485     -2.432998     -2.429876     -2.890977     -2.663512   
25%       -0.808887     -0.708749     -0.706885     -0.758927     -0.321349   
50%        0.066824      0.014774      0.013272     -0.009883      0.016370   
75%        0.869955      0.771323      0.775250      0.869012      0.332269   
max        1.648301      1.859613      1.865832      1.732041      2.656103   

                 L2            L8            vs         m22sq  
count  17421.000000  17421.000000  17421.000000  17421.000000  
mean       0.000207      0.015253      0.003122      0.010594  
std        0.998872      1.004136      0.992185  