# Feature Engineering Validation

This notebook validates the feature engineering pipeline that creates derived nutritional features.

## Objectives
1. Load cleaned data
2. Apply feature engineering transformations
3. Validate no NaN/Inf values
4. Analyze correlation with target (nutriscore_grade)
5. Assess feature importance and usefulness
6. Save the engineered dataset and the fitted feature engineer

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path
sys.path.append('../src')

from features.feature_engineering import FeatureEngineer

# Pandas display: no limit on the number of columns shown, at most 100 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot defaults: seaborn 'whitegrid' style and a 12x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Cleaned Data

In [None]:
# Read the cleaned CSV into a DataFrame
df = pd.read_csv('../data/processed/openfoodfacts_cleaned.csv')

# Quick orientation: dimensions, column names, then a preview of the top rows
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    f"\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# Nutritional columns whose completeness is verified before feature engineering
nutritional_cols = [
    'energy-kcal_100g',
    'fat_100g',
    'saturated-fat_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'fiber_100g',
    'proteins_100g',
    'salt_100g',
]

# Per-column count of missing entries
print("Missing values in nutritional columns:")
missing_per_column = df[nutritional_cols].isna().sum()
print(missing_per_column)

## 2. Apply Feature Engineering

In [None]:
# Configure the feature engineer with every feature family switched on
feature_options = dict(
    add_ratios=True,
    add_energy_density=True,
    add_caloric_contributions=True,
    add_boolean_flags=True,
)
engineer = FeatureEngineer(**feature_options)

# Fit on the cleaned data, then transform that same data
engineer.fit(df)
df_engineered = engineer.transform(df)

# Report how the column count changed
n_cols_before = df.shape[1]
n_cols_after = df_engineered.shape[1]
print(f"Original shape: {df.shape}")
print(f"Engineered shape: {df_engineered.shape}")
print(f"\nNew features added: {n_cols_after - n_cols_before}")

# Columns that exist only after the transform, kept in their output order
new_cols = [name for name in df_engineered.columns if name not in df.columns]
print(f"\nNew feature columns:")
for position, name in enumerate(new_cols, start=1):
    print(f"{position}. {name}")

In [None]:
# Preview the first ten rows of only the newly created columns
print("Sample of engineered features:")
df_engineered.loc[:, new_cols].head(10)

## 3. Validate No NaN/Inf Values

In [None]:
# Count missing entries in each engineered column
print("NaN values in engineered features:")
nan_counts = df_engineered[new_cols].isna().sum()
print(nan_counts)

# Any non-zero count means at least one engineered value is NaN
has_nan = nan_counts.sum() > 0
print("\n⚠️  WARNING: NaN values detected!" if has_nan else "\n✓ No NaN values detected")

In [None]:
# Tally +/-infinity entries column by column
print("Inf values in engineered features:")
inf_counts = {name: np.isinf(df_engineered[name]).sum() for name in new_cols}

inf_counts_series = pd.Series(inf_counts)
print(inf_counts_series)

# Any non-zero count means at least one engineered value is infinite
has_inf = inf_counts_series.sum() > 0
print("\n⚠️  WARNING: Inf values detected!" if has_inf else "\n✓ No Inf values detected")

In [None]:
# Descriptive statistics for the engineered columns only
print("Statistical summary of engineered features:")
df_engineered.loc[:, new_cols].describe()

## 4. Correlation Analysis with Target

In [None]:
# Encode the Nutri-Score letter grade as an ordinal integer so it can be correlated.
# Nutri-Score grades: a (best), b, c, d, e (worst)
grade_mapping = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
df_engineered['nutriscore_numeric'] = df_engineered['nutriscore_grade'].map(grade_mapping)

# Series.map leaves every grade that is not a key of grade_mapping (and every
# missing grade) as NaN, and corr() then excludes those rows pairwise without
# any message. Report the count so a silently shrunken sample is visible.
unmapped_rows = int(df_engineered['nutriscore_numeric'].isna().sum())
if unmapped_rows > 0:
    print(f"⚠️  WARNING: {unmapped_rows} rows have a missing nutriscore_grade or one outside "
          f"{sorted(grade_mapping)}; they are excluded from the correlations\n")

# Correlation of each engineered feature with the numeric target, strongest positive first
correlations = df_engineered[new_cols + ['nutriscore_numeric']].corr()['nutriscore_numeric'].drop('nutriscore_numeric')
correlations = correlations.sort_values(ascending=False)

print("Correlation of engineered features with Nutri-Score (higher grade = worse):")
print(correlations)
print("\nNote: Positive correlation = feature increases with worse Nutri-Score")
print("      Negative correlation = feature decreases with worse Nutri-Score")

In [None]:
# Horizontal bar chart of the correlations: green bars for negative values, red otherwise
bar_colors = ['green' if value < 0 else 'red' for value in correlations]

corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
correlations.plot(kind='barh', color=bar_colors, ax=corr_ax)
corr_ax.set_xlabel('Correlation with Nutri-Score')
corr_ax.set_title('Correlation of Engineered Features with Nutri-Score\n(Positive = worse score, Negative = better score)')
# Dashed reference line at zero separates the two signs
corr_ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
corr_fig.tight_layout()
plt.show()

In [None]:
# Baseline: correlate the original nutritional columns with the same numeric target
original_nutritional_cols = [
    'energy-kcal_100g',
    'fat_100g',
    'saturated-fat_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'fiber_100g',
    'proteins_100g',
    'salt_100g',
]

target_col = 'nutriscore_numeric'
original_correlations = (
    df_engineered[original_nutritional_cols + [target_col]]
    .corr()[target_col]
    .drop(target_col)            # remove the target's correlation with itself
    .sort_values(ascending=False)
)

print("Correlation of original features with Nutri-Score:")
print(original_correlations)

In [None]:
# Rank each feature family by absolute correlation, strongest first
engineered_abs = correlations.abs().sort_values(ascending=False).rename('Engineered Features')
original_abs = original_correlations.abs().sort_values(ascending=False).rename('Original Features')

# Combined reference table. The two Series are indexed by different feature
# names, so the DataFrame constructor aligns them on the union of labels: each
# row has a value in one column and NaN in the other, and the rows are no longer
# ordered by correlation. Never take the "top N" from this table.
comparison = pd.DataFrame({
    'Engineered Features': engineered_abs,
    'Original Features': original_abs,
})

# Print the top five from the sorted Series themselves so the ranking is real
print("\nTop correlated features (by absolute value):")
print("\nEngineered Features:")
print(engineered_abs.head(5))
print("\nOriginal Features:")
print(original_abs.head(5))

## 5. Feature Importance Assessment

In [None]:
# Split the engineered features by |correlation| to flag candidates for removal
threshold = 0.1  # Features with |correlation| < 0.1 are considered weak

abs_correlations = correlations.abs()
weak_features = correlations[abs_correlations < threshold]
strong_features = correlations[abs_correlations >= threshold]

# Within each group, list features from largest to smallest magnitude
strong_ranked = strong_features.sort_values(key=abs, ascending=False)
weak_ranked = weak_features.sort_values(key=abs, ascending=False)

print(f"Features with strong correlation (|r| >= {threshold}):")
print(strong_ranked)
print(f"\nNumber of strong features: {len(strong_features)}")

print(f"\n\nFeatures with weak correlation (|r| < {threshold}):")
print(weak_ranked)
print(f"\nNumber of weak features: {len(weak_features)}")

In [None]:
# Box plot of every engineered feature grouped by Nutri-Score grade.
# The grid is sized from the number of features: a fixed 3x3 grid holds only
# nine panels and would silently skip any feature beyond the ninth.
n_plot_cols = 3
n_plot_rows = max(1, (len(new_cols) + n_plot_cols - 1) // n_plot_cols)  # ceiling division, at least one row
fig, axes = plt.subplots(
    n_plot_rows,
    n_plot_cols,
    figsize=(15, 4 * n_plot_rows),  # 4 inches per row, i.e. (15, 12) for three rows
    squeeze=False,                  # always a 2-D array, even for a single row
)
axes = axes.flatten()

for ax, col in zip(axes, new_cols):
    df_engineered.boxplot(column=col, by='nutriscore_grade', ax=ax)
    ax.set_title(f'{col}')
    ax.set_xlabel('Nutri-Score Grade')
    ax.set_ylabel('Value')

# Hide the panels left over when the feature count does not fill the last row
for ax in axes[len(new_cols):]:
    ax.set_visible(False)

plt.suptitle('Distribution of Engineered Features by Nutri-Score Grade', y=1.00)
plt.tight_layout()
plt.show()

## 6. Save Engineered Data

In [None]:
# Write the engineered dataset to disk without the temporary numeric target column
output_path = '../data/processed/openfoodfacts_engineered.csv'

# Drop on a copy rather than in place: df_engineered stays intact for the
# analysis cells above, and errors='ignore' lets this cell be re-run (or run
# before the correlation cell) without raising KeyError.
df_to_save = df_engineered.drop(columns='nutriscore_numeric', errors='ignore')
df_to_save.to_csv(output_path, index=False)
print(f"Engineered dataset saved to: {output_path}")
print(f"Shape: {df_to_save.shape}")

In [None]:
# Persist the feature engineer (fitted on df earlier in this notebook) through its own save() method.
# NOTE(review): save() is defined in features.feature_engineering and is not visible here;
# the .pkl extension suggests pickle serialization -- confirm before loading from untrusted sources.
engineer_path = '../models/feature_engineer.pkl'
engineer.save(engineer_path)
print(f"Feature engineer saved to: {engineer_path}")

## Summary

### Engineered Features Created:
1. **Macronutrient ratios**: fat_to_protein_ratio, sugar_to_carb_ratio, saturated_to_total_fat_ratio
2. **Energy density**: energy_density (kcal per gram)
3. **Caloric contributions**: calories_from_fat, calories_from_carbs, calories_from_protein
4. **Boolean flags**: high_fat, high_sugar, high_salt

### Validation Results:
- NaN/Inf checks: see the outputs of Section 3 above
- Correlation analysis completed (Section 4)
- Feature importance assessed via the |r| ≥ 0.1 correlation threshold (Section 5)

### Next Steps:
- Remove features with very low correlation if needed
- Use engineered features in model training
- Evaluate impact on model performance