# Feature Scaling Validation

This notebook validates the feature scaling by:
1. Loading the saved scaler (created by `scripts/apply_scaling.py`)
2. Applying it to a stratified train/test split of the dataset
3. Verifying scaling properties
4. Checking for data leakage


In [48]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# The import below is `from src.features.scaling import ...`, which resolves
# only when the directory that CONTAINS the `src` package is on sys.path.
# Adding `<parent>/src` alone exposes `features`, not `src.features`.
# Both entries are added: `<parent>/src` is kept for anything relying on the
# original entry, and `<parent>` makes the `src.` prefix importable.
# Like the '../data' and '../models' paths used later, this assumes the
# notebook runs from a directory directly under the project root.
_project_root = Path.cwd().parent
for _import_root in (_project_root / "src", _project_root):
    # Guard against piling up duplicate entries when the cell is re-run.
    if str(_import_root) not in sys.path:
        sys.path.insert(0, str(_import_root))

from src.features.scaling import FeatureScaler, NUMERICAL_FEATURES
from sklearn.model_selection import train_test_split


## 1. Load Data and Scaler


In [49]:
# --- Dataset ---------------------------------------------------------------
# Read the cleaned CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/processed/openfoodfacts_cleaned.csv')
print(f"Dataset shape: {df.shape}")

# --- Scaler discovery ------------------------------------------------------
# One candidate file per supported method. Dict order sets precedence, so
# 'standard' is chosen when both files are present.
models_dir = Path('../models')
scaler_paths = {
    'standard': models_dir / 'scaler_standard.pkl',
    'minmax': models_dir / 'scaler_minmax.pkl'
}

scaler_path, method = None, None
for candidate_method, candidate_path in scaler_paths.items():
    if not candidate_path.exists():
        continue
    # First existing file wins; remember both the file and the method it implies.
    scaler_path, method = candidate_path, candidate_method
    break
else:
    # Loop finished without a hit: no scaler has been saved yet.
    raise FileNotFoundError("No scaler found! Run 'python scripts/apply_scaling.py' first.")

# --- Scaler loading --------------------------------------------------------
# NOTE(review): the `.pkl` suffix suggests pickle-based loading — only load
# scaler files from a trusted source; confirm against FeatureScaler.load.
scaler = FeatureScaler.load(str(scaler_path))
print(f"✓ Scaler loaded: {scaler_path.name}")
print(f"  Method: {method.upper()}")
print(f"  Features: {len(scaler.features)}")


Dataset shape: (96132, 20)
✓ Scaler loaded: scaler_minmax.pkl
  Method: MINMAX
  Features: 9


## 2. Split Data


In [50]:
# Target column first, then the feature frame: every column except the
# target and the `code` column.
y = df['nutriscore_grade']
X = df.drop(columns=['nutriscore_grade', 'code'])

# 80/20 split, stratified on the target, with a fixed seed so the same rows
# land in each split on every run.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_count, test_count = len(X_train), len(X_test)
print(f"Train: {train_count:,} | Test: {test_count:,}")


Train: 76,905 | Test: 19,227


## 3. Apply Scaling


In [51]:
# Run both splits through the loaded scaler, train first and then test.
# Only `transform` is called in this cell; the scaler is used exactly as loaded.
X_train_scaled, X_test_scaled = [
    scaler.transform(split) for split in (X_train, X_test)
]

print(f"✓ Scaling applied ({method.upper()})")
print(f"  Train: {len(X_train_scaled):,} | Test: {len(X_test_scaled):,}")


✓ Scaling applied (MINMAX)
  Train: 76,905 | Test: 19,227


## 4. Verify Scaling Properties


In [52]:
# Summary statistics of the scaled numerical columns on the training split.
scaled_features = X_train_scaled[NUMERICAL_FEATURES]

print("Scaled features statistics (training set):")
print(scaled_features.describe().T[['mean', 'std', 'min', 'max']].round(4))

# Verify scaling properties based on method
print(f"\n{'='*70}")
print(f"VERIFICATION: {method.upper()} Properties")
print(f"{'='*70}")

# Largest deviation from the ideal value that any single feature may show.
tolerance = 0.1

# The verdict is computed per feature. Aggregating across features first
# (mean of means, global min/max) lets one badly scaled feature hide behind
# the others, e.g. means of +0.5 and -0.5 average to 0.
if method == 'standard':
    feature_means = scaled_features.mean()
    feature_stds = scaled_features.std()
    mean_of_means = feature_means.mean()
    mean_of_stds = feature_stds.mean()
    print(f"Mean of feature means: {mean_of_means:.6f} (should be ~0)")
    print(f"Mean of feature stds:  {mean_of_stds:.4f} (should be ~1)")

    # Each feature must individually have mean ~0 and std ~1.
    off_target = (feature_means.abs() >= tolerance) | ((feature_stds - 1).abs() >= tolerance)
    if not off_target.any():
        print("✓ StandardScaler properties verified")
    else:
        print("✗ Warning: Properties not as expected")
        print(f"  Features outside tolerance: {list(off_target[off_target].index)}")

elif method == 'minmax':
    feature_mins = scaled_features.min()
    feature_maxs = scaled_features.max()
    min_val = feature_mins.min()
    max_val = feature_maxs.max()
    print(f"Min value: {min_val:.4f} (should be ~0)")
    print(f"Max value: {max_val:.4f} (should be ~1)")

    # Each feature must individually span ~[0, 1].
    off_target = (feature_mins.abs() >= tolerance) | ((feature_maxs - 1).abs() >= tolerance)
    if not off_target.any():
        print("✓ MinMaxScaler properties verified")
    else:
        print("✗ Warning: Properties not as expected")
        print(f"  Features outside tolerance: {list(off_target[off_target].index)}")


Scaled features statistics (training set):
                      mean     std  min  max
energy-kcal_100g    0.0494  0.0327  0.0  1.0
fat_100g            0.1248  0.1436  0.0  1.0
saturated-fat_100g  0.0482  0.0680  0.0  1.0
carbohydrates_100g  0.2767  0.2728  0.0  1.0
sugars_100g         0.1265  0.1869  0.0  1.0
fiber_100g          0.0236  0.0358  0.0  1.0
proteins_100g       0.0853  0.0883  0.0  1.0
salt_100g           0.0205  0.0419  0.0  1.0
additives_n         0.0323  0.0625  0.0  1.0

VERIFICATION: MINMAX Properties
Min value: 0.0000 (should be ~0)
Max value: 1.0000 (should be ~1)
✓ MinMaxScaler properties verified


## 5. Check Data Leakage Prevention


In [53]:
# Verify the test set was transformed with training statistics (not its own).
# A scaler fit on the test split would give that split "perfect" properties:
# every feature with mean 0 / std 1 (standard) or min 0 / max 1 (minmax).
# The verdict below is conditional on that signature being absent.
# Limitation: this only detects a scaler fit on this exact test split; it
# cannot prove the test rows were absent from whatever data the scaler saw.
test_scaled = X_test_scaled[NUMERICAL_FEATURES]

print("Test set statistics (should differ from training):")
print(test_scaled.describe().T[['mean', 'std']].round(4))

leak_free_msg = "✓ Test set transformed with training statistics (no data leakage)"
leak_warn_msg = ("✗ Warning: test set shows exact fitted-scaler properties for every feature; "
                 "the scaler may have been fit on test data")

if method == 'standard':
    test_mean = test_scaled.mean().mean()
    test_std = test_scaled.std().mean()
    print(f"\nTest set - Mean of means: {test_mean:.4f} | Mean of stds: {test_std:.4f}")
    # Population std (ddof=0) is used for the check because that is the
    # quantity a standard scaler normalises to exactly 1 on its fit data.
    fit_on_test = bool(
        test_scaled.mean().abs().le(1e-6).all()
        and (test_scaled.std(ddof=0) - 1).abs().le(1e-6).all()
    )
    print(leak_warn_msg if fit_on_test else leak_free_msg)
elif method == 'minmax':
    test_min = test_scaled.min().min()
    test_max = test_scaled.max().max()
    print(f"\nTest set - Min: {test_min:.4f} | Max: {test_max:.4f}")
    # Every feature hitting exactly 0 and exactly 1 is the signature of a
    # min-max scaler fit on this very split.
    fit_on_test = bool(
        test_scaled.min().abs().le(1e-9).all()
        and (test_scaled.max() - 1).abs().le(1e-9).all()
    )
    print(leak_warn_msg if fit_on_test else leak_free_msg)


Test set statistics (should differ from training):
                      mean     std
energy-kcal_100g    0.0493  0.0351
fat_100g            0.1246  0.1419
saturated-fat_100g  0.0478  0.0671
carbohydrates_100g  0.2744  0.2720
sugars_100g         0.1251  0.1854
fiber_100g          0.0233  0.0348
proteins_100g       0.0853  0.0880
salt_100g           0.0207  0.0420
additives_n         0.0325  0.0628

Test set - Min: 0.0000 | Max: 1.8377
✓ Test set transformed with training statistics (no data leakage)


## 6. Summary

In [54]:
# Assemble the closing report line by line and emit it with one print call;
# the resulting text matches printing each line separately.
banner = "=" * 70
summary_lines = [
    banner,
    "VALIDATION SUMMARY",
    banner,
    f"\nScaler loaded: {scaler_path.name}",
    f"Method: {method.upper()}",
    f"Features scaled: {len(NUMERICAL_FEATURES)}",
    "\nDataset:",
    f"  Train: {len(X_train_scaled):,} samples",
    f"  Test:  {len(X_test_scaled):,} samples",
    "\n✓ Scaling validation completed",
    "✓ Ready for model training",
]
print("\n".join(summary_lines))


VALIDATION SUMMARY

Scaler loaded: scaler_minmax.pkl
Method: MINMAX
Features scaled: 9

Dataset:
  Train: 76,905 samples
  Test:  19,227 samples

✓ Scaling validation completed
✓ Ready for model training
