# Feature Encoding Validation

This notebook validates the feature encoding by:
1. Loading the saved encoder (created by `scripts/apply_encoding.py`)
2. Loading the encoded dataset
3. Verifying encoding properties for each categorical feature
4. Checking that original categorical columns were removed/replaced
5. Checking for data leakage using the `split_group` column


In [62]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

sys.path.insert(0, str(Path.cwd().parent/"src"))

from features.encoding import FeatureEncoder, CATEGORICAL_FEATURES
from sklearn.preprocessing import OneHotEncoder, TargetEncoder


## 1. Load Encoder and Encoded Dataset


In [63]:
# Load encoder
models_dir = Path('../models')
encoder_path = models_dir / 'encoder.joblib'

if not encoder_path.exists():
    raise FileNotFoundError("No encoder found. Run 'scripts/apply_encoding.py' first")

encoder = FeatureEncoder.load(str(encoder_path))
print(f"Encoder loaded: {encoder_path.name}")
print(f"    Features to encode: {len(CATEGORICAL_FEATURES)}")
print("="*70)

# Show encoder type for each feature
for feature in CATEGORICAL_FEATURES:
    if feature in encoder.encoders:
        encoder_type = encoder.encoders[feature].__class__.__name__
        print(f"Feature: {feature} - Encoder: {encoder_type}")
    else:
        print(f"Feature: {feature} - Encoder: NOT FITTED (feature not in training data)")

# Load encoded dataset if it exists
encoded_file = Path('../data/processed/openfoodfacts_encoded.csv')
if encoded_file.exists():
    df_encoded = pd.read_csv(encoded_file)
    print(f"\nLoaded Encoded Dataset with shape: {df_encoded.shape}")
    
    # Check if split_group column exists
    if 'split_group' in df_encoded.columns:
        print("="*70)
        print(f"Found 'split_group' column for train/test tracking")
    else:
        print("="*70)
        print(f"Warning: 'split_group' column not found")
else:
    print(f"\nNo Encoded Dataset found. Run 'scripts/apply_encoding.py' first")


Encoder loaded: encoder.joblib
    Features to encode: 3
Feature: countries - Encoder: MultiLabelBinarizer
Feature: pnns_groups_1 - Encoder: OneHotEncoder
Feature: pnns_groups_2 - Encoder: TargetEncoder

Loaded Encoded Dataset with shape: (96132, 40)
Found 'split_group' column for train/test tracking


## 2. Split Data Using split_group Column


In [64]:
# Split using split_group column if available
if 'split_group' in df_encoded.columns:
    train_mask = df_encoded['split_group'] == 'train'
    X_train_encoded = df_encoded[train_mask].drop(columns=['split_group'], errors='ignore')
    X_test_encoded = df_encoded[~train_mask].drop(columns=['split_group'], errors='ignore')
    print(f"✓ Using 'split_group' column for splitting")
    print(f"  Train: {len(X_train_encoded):,} | Test: {len(X_test_encoded):,}")
else:
    # Fallback: recreate split (not ideal, but works)
    print("⚠ 'split_group' not found. Recreating split...")
    from sklearn.model_selection import train_test_split
    df_original = pd.read_csv('../data/processed/openfoodfacts_scaled.csv')
    X = df_encoded.drop(columns=['nutriscore_grade', 'code'], errors='ignore')
    y = df_original['nutriscore_grade'] if 'nutriscore_grade' in df_original.columns else None
    X_train_encoded, X_test_encoded, _, _ = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"  Train: {len(X_train_encoded):,} | Test: {len(X_test_encoded):,}")


✓ Using 'split_group' column for splitting
  Train: 76,905 | Test: 19,227


## 3. Verify Encoding Properties


In [65]:
summary_data = []

for feature in CATEGORICAL_FEATURES:
    if feature not in encoder.encoders:
        summary_data.append({
            'Feature': feature,
            'Encoder Type': 'NOT FITTED',
            'Status': 'Feature not in training data',
            'Columns Created': 0
        })
        continue
    
    encoder_obj = encoder.encoders[feature]
    encoder_type = encoder_obj.__class__.__name__
    
    if isinstance(encoder_obj, OneHotEncoder):
        # OneHotEncoder creates multiple columns
        encoded_cols = [col for col in X_train_encoded.columns if col.startswith(feature + '_')]
        n_cols = len(encoded_cols)
        
        # Check that original column is removed
        original_removed = feature not in X_train_encoded.columns
        
        # Check OneHotEncoder properties (binary columns, sum to 1)
        if n_cols > 0:
            sample_col = encoded_cols[0]
            is_binary = X_train_encoded[encoded_cols].isin([0, 1]).all().all()
            row_sums = X_train_encoded[encoded_cols].sum(axis=1)
            sums_to_one = row_sums.isin([0, 1]).all()  # Each row should have exactly one 1
        else:
            is_binary = False
            sums_to_one = False
        
        status = "✓ OK" if (original_removed and is_binary and sums_to_one) else "⚠ ISSUES"
        
        summary_data.append({
            'Feature': feature,
            'Encoder Type': encoder_type,
            'Status': status,
            'Columns Created': n_cols,
            'Original Removed': original_removed,
            'Binary Values': is_binary,
            'Rows Sum to 1': sums_to_one
        })
    
    elif isinstance(encoder_obj, TargetEncoder):
        # TargetEncoder replaces the original column with numeric values
        column_exists = feature in X_train_encoded.columns
        original_removed = False  # TargetEncoder keeps the same column name
        
        if column_exists:
            # Check that values are numeric and in reasonable range
            values = X_train_encoded[feature]
            is_numeric = pd.api.types.is_numeric_dtype(values)
            in_range = (values.min() >= 0) and (values.max() <= 1)  # TargetEncoder outputs probabilities
        else:
            is_numeric = False
            in_range = False
        
        status = "✓ OK" if (column_exists and is_numeric and in_range) else "⚠ ISSUES"
        
        summary_data.append({
            'Feature': feature,
            'Encoder Type': encoder_type,
            'Status': status,
            'Columns Created': 1,
            'Original Removed': original_removed,
            'Is Numeric': is_numeric,
            'In Range [0,1]': in_range
        })

check_df = pd.DataFrame(summary_data)
print("Encoding Validation Summary:\n")
display(check_df)


Encoding Validation Summary:



Unnamed: 0,Feature,Encoder Type,Status,Columns Created,Original Removed,Binary Values,Rows Sum to 1,Is Numeric,"In Range [0,1]"
0,pnns_groups_1,OneHotEncoder,✓ OK,11,True,True,True,,
1,pnns_groups_2,TargetEncoder,✓ OK,1,False,,,True,True


## 4. Check Original Columns Removed


In [66]:
# Check that original categorical columns are removed (for OneHotEncoder) or replaced (for TargetEncoder)
removal_check = []

for feature in CATEGORICAL_FEATURES:
    if feature not in encoder.encoders:
        removal_check.append({
            'Feature': feature,
            'In Encoded Dataset': feature in df_encoded.columns,
            'Status': 'N/A (not encoded)'
        })
        continue
    
    encoder_obj = encoder.encoders[feature]
    in_dataset = feature in df_encoded.columns
    
    if isinstance(encoder_obj, OneHotEncoder):
        # OneHotEncoder should remove original column
        status = "✓ OK" if not in_dataset else "⚠ ERROR: Original column still present"
    elif isinstance(encoder_obj, TargetEncoder):
        # TargetEncoder replaces original with numeric values
        if in_dataset:
            is_numeric = pd.api.types.is_numeric_dtype(df_encoded[feature])
            status = "✓ OK (replaced with numeric)" if is_numeric else "⚠ ERROR: Not numeric"
        else:
            status = "⚠ ERROR: Column missing"
    else:
        status = "⚠ UNKNOWN ENCODER TYPE"
    
    removal_check.append({
        'Feature': feature,
        'In Encoded Dataset': in_dataset,
        'Status': status
    })

print("Original Column Removal Check:\n")
display(pd.DataFrame(removal_check))


Original Column Removal Check:



Unnamed: 0,Feature,In Encoded Dataset,Status
0,countries,False,⚠ UNKNOWN ENCODER TYPE
1,pnns_groups_1,False,✓ OK
2,pnns_groups_2,True,✓ OK (replaced with numeric)


## 5. Check Data Leakage Prevention


In [67]:
# Check for data leakage by comparing train/test statistics
leakage_data = []

for feature in CATEGORICAL_FEATURES:
    if feature not in encoder.encoders:
        continue
    
    encoder_obj = encoder.encoders[feature]
    
    if isinstance(encoder_obj, OneHotEncoder):
        # For OneHotEncoder, check encoded columns
        encoded_cols = [col for col in X_train_encoded.columns if col.startswith(feature + '_')]
        
        if len(encoded_cols) == 0:
            continue
        
        # Check if test set has categories not seen in training
        train_means = X_train_encoded[encoded_cols].mean()
        test_means = X_test_encoded[encoded_cols].mean()
        
        # Check for suspicious patterns (test set should have similar distribution)
        # If test set has perfect 0/1 patterns that match training exactly, might indicate leakage
        mean_diff = (train_means - test_means).abs().mean()
        
        # Also check for unknown categories (should be handled by handle_unknown="ignore")
        train_max = X_train_encoded[encoded_cols].max().max()
        test_max = X_test_encoded[encoded_cols].max().max()
        
        # Leakage would show as test set having identical patterns to train
        is_suspicious = mean_diff < 0.001 and train_max == test_max == 1.0
        
        leakage_data.append({
            'Feature': feature,
            'Encoder': 'OneHotEncoder',
            'Test Set Mean Diff': f"{mean_diff:.4f}",
            'Leakage Detected': "⚠️ YES" if is_suspicious else "✅ NO",
            'Note': 'Check if test patterns match train exactly'
        })
    
    elif isinstance(encoder_obj, TargetEncoder):
        # For TargetEncoder, check that test values are in reasonable range
        if feature not in X_test_encoded.columns:
            continue
        
        train_values = X_train_encoded[feature]
        test_values = X_test_encoded[feature]
        
        train_mean = train_values.mean()
        test_mean = test_values.mean()
        mean_diff = abs(train_mean - test_mean)
        
        # TargetEncoder should produce similar distributions
        # Very similar means might indicate leakage, but some similarity is expected
        is_suspicious = mean_diff < 0.001  # Very suspicious if identical
        
        leakage_data.append({
            'Feature': feature,
            'Encoder': 'TargetEncoder',
            'Train Mean': f"{train_mean:.4f}",
            'Test Mean': f"{test_mean:.4f}",
            'Mean Diff': f"{mean_diff:.4f}",
            'Leakage Detected': "⚠️ YES" if is_suspicious else "✅ NO",
            'Note': 'TargetEncoder uses target info, some similarity expected'
        })

if leakage_data:
    print("DATA LEAKAGE CHECK (Target: NO for all rows)\n")
    display(pd.DataFrame(leakage_data))
else:
    print("No categorical features encoded to check for leakage.")


DATA LEAKAGE CHECK (Target: NO for all rows)



Unnamed: 0,Feature,Encoder,Test Set Mean Diff,Leakage Detected,Note,Train Mean,Test Mean,Mean Diff
0,pnns_groups_1,OneHotEncoder,0.0018,✅ NO,Check if test patterns match train exactly,,,
1,pnns_groups_2,TargetEncoder,,⚠️ YES,"TargetEncoder uses target info, some similarit...",0.1496,0.1502,0.0006


## 6. Summary


In [68]:
print("="*70)
print("VALIDATION SUMMARY")
print("="*70)

print(f"\nEncoder loaded: {encoder_path.name}")
print(f"Features to encode: {len(CATEGORICAL_FEATURES)}")

# Count encoder types
encoder_types = {}
for feature in CATEGORICAL_FEATURES:
    if feature in encoder.encoders:
        enc_type = encoder.encoders[feature].__class__.__name__
        encoder_types[enc_type] = encoder_types.get(enc_type, 0) + 1

print(f"\nEncoder distribution:")
for enc_type, count in encoder_types.items():
    print(f"  {enc_type}: {count} feature(s)")

print(f"\nDataset:")
print(f"  Train: {len(X_train_encoded):,} samples")
print(f"  Test:  {len(X_test_encoded):,} samples")
print(f"  Total columns: {len(df_encoded.columns)}")

# Check for issues
issues = []
for feature in CATEGORICAL_FEATURES:
    if feature not in encoder.encoders:
        issues.append(f"  - {feature}: Not encoded (not in training data)")

if issues:
    print(f"\n⚠ Issues found:")
    for issue in issues:
        print(issue)
else:
    print(f"\n✓ Encoding validation completed")
    print(f"✓ Ready for model training")
    print(f"✓ No major issues detected")


VALIDATION SUMMARY

Encoder loaded: encoder.joblib
Features to encode: 3

Encoder distribution:
  MultiLabelBinarizer: 1 feature(s)
  OneHotEncoder: 1 feature(s)
  TargetEncoder: 1 feature(s)

Dataset:
  Train: 76,905 samples
  Test:  19,227 samples
  Total columns: 40

✓ Encoding validation completed
✓ Ready for model training
✓ No major issues detected


In [69]:
df=pd.read_csv("D:\cesare\Documents\ML_project\ml-project-2025\data\processed\openfoodfacts_encoded.csv")

In [70]:
df.head()

Unnamed: 0,code,product_name,brands,categories,energy_100g,energy-kcal_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,...,pnns_groups_1_Composite foods,pnns_groups_1_Fat and sauces,pnns_groups_1_Fish Meat Eggs,pnns_groups_1_Fruits and vegetables,pnns_groups_1_Milk and dairy products,pnns_groups_1_Salty snacks,pnns_groups_1_Sugary snacks,pnns_groups_1_unknown,nutriscore_grade,split_group
0,2201130003253,Lomo embuchado,Realvalle,"Productos a base de carne, Carnes, Embutidos",946.0,-0.201939,0.08,0.031313,-0.999668,0.004,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,e,train
1,3663811146427,Farçou façon grand mère pommes de terre sauce ...,Cellier Sarlat,Plats préparés,519.0,-0.792792,0.078,0.029293,-0.768744,0.006,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,c,train
2,9003740065606,Bio-Karottensaft,Ja! Natürlich,"Pflanzliche Lebensmittel und Getränke, Getränk...",163.0,-1.285168,0.005,0.00101,-0.691769,0.088,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,e,test
3,8431876251883,Macedonia de verduras,Carrefour,"Alimentos y bebidas de origen vegetal, Aliment...",180.0,-1.261998,0.0,0.0,-0.73942,0.01,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,c,train
4,8014745044207,Burrata,Murgella,"Latticini, Cibi fermentati, Prodotti lattiero-...",908.0,-0.254073,0.18,0.121212,-0.968512,0.0125,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,c,test


In [71]:
df2=pd.read_csv("D:\cesare\Documents\ML_project\ml-project-2025\data\processed\openfoodfacts_scaled.csv")

In [72]:
df2['countries'].value_counts().head(10)

countries
France            29323
United States     25302
Germany            6408
Spain              5872
Italy              5374
United Kingdom     2745
unknown            2549
Switzerland        1875
Belgium            1601
Canada             1342
Name: count, dtype: int64