# Week 4: Data Preprocessing & Feature Engineering

This notebook covers:
1. Data cleaning and preprocessing
2. Feature encoding and scaling
3. Feature engineering and selection
4. Data splitting for ML
5. Preparation for model training

In [None]:
# Import required libraries
import json
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Scikit-learn preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# Directories used throughout the notebook, given relative to the working directory
DATA_PROCESSED, MODELS = Path('../data/processed'), Path('../models')

# Make sure the artifact folder is there; this is a no-op when it already exists
MODELS.mkdir(exist_ok=True)

print("✅ Libraries imported and paths set up")

In [None]:
# Read the ingredient toxicity table from the processed-data folder
print("📊 Loading ingredient database...")
db_path = DATA_PROCESSED / 'ingredient_toxicity_db.csv'
ingredient_db = pd.read_csv(db_path)

# Short report: row count, column names and overall dimensions
n_ingredients = len(ingredient_db)
column_names = list(ingredient_db.columns)
print(f"✅ Loaded {n_ingredients} ingredients")
print(f"Columns: {column_names}")
print(f"Shape: {ingredient_db.shape}")

# Preview the first rows (rendered as the cell's output)
ingredient_db.head()

In [None]:
# Data preprocessing pipeline: report missing values, drop exact duplicate
# rows, and store the known categorical columns with the `category` dtype.
print("🔧 PREPROCESSING PIPELINE")
print("=" * 40)

# Work on a copy so the frame loaded from disk stays untouched
df_processed = ingredient_db.copy()

# Step 1: missing values are only reported here; nothing is imputed or dropped
print("\n1️⃣ Checking for missing values...")
missing_values = df_processed.isnull().sum()
total_missing = missing_values.sum()
print(f"Missing values: {total_missing}")
if total_missing == 0:
    print("✅ No missing values found")
else:
    # Show only the columns that actually contain gaps
    print(missing_values[missing_values > 0])

# Step 2: exact duplicate rows are removed when present
print("\n2️⃣ Checking for duplicates...")
duplicates = df_processed.duplicated().sum()
print(f"Duplicate rows: {duplicates}")
if duplicates == 0:
    print("✅ No duplicates found")
else:
    df_processed = df_processed.drop_duplicates()
    print(f"✅ Removed {duplicates} duplicate rows")

# Step 3: dtype optimization
print("\n3️⃣ Optimizing data types...")
print("Before optimization:")
print(df_processed.dtypes)

# Only convert the listed columns that exist in this dataset
categorical_cols = ['category', 'health_impact', 'risk_level', 'allergen_risk']
present_categoricals = [name for name in categorical_cols if name in df_processed.columns]
for name in present_categoricals:
    df_processed[name] = df_processed[name].astype('category')

print("\nAfter optimization:")
print(df_processed.dtypes)
print("✅ Data types optimized")

In [None]:
# Feature encoding: integer-encode every categorical/object feature column
print("🏷️ FEATURE ENCODING")
print("=" * 30)

# Everything except the target and the ingredient name is treated as a feature
target_col = 'is_toxic'
non_feature_cols = {target_col, 'ingredient_name'}
feature_cols = [name for name in df_processed.columns if name not in non_feature_cols]

print(f"Target variable: {target_col}")
print(f"Feature columns: {feature_cols}")

# Fitted encoders are kept per source column so they can be persisted later
label_encoders = {}
encoded_features = df_processed[feature_cols].copy()

# Columns that still hold labels rather than numbers
categorical_features = encoded_features.select_dtypes(include=['category', 'object']).columns
print(f"\nCategorical features to encode: {list(categorical_features)}")

for cat_col in categorical_features:
    encoder = LabelEncoder()
    # The encoded values go into a new `<name>_encoded` column
    encoded_features[f'{cat_col}_encoded'] = encoder.fit_transform(encoded_features[cat_col])
    label_encoders[cat_col] = encoder
    print(f"✅ Encoded {cat_col}: {len(encoder.classes_)} unique values")

# The raw label columns are no longer needed once their encoded twins exist
encoded_features = encoded_features.drop(columns=categorical_features)

print(f"\nFinal feature matrix shape: {encoded_features.shape}")
print(f"Features: {list(encoded_features.columns)}")

In [None]:
# Feature engineering: derive four extra columns from `toxicity_score`
print("⚙️ FEATURE ENGINEERING")
print("=" * 35)

# Derive new columns on a copy so `encoded_features` stays untouched
engineered_features = encoded_features.copy()
toxicity = engineered_features['toxicity_score']

# The bins and the /100 normalization below assume a 0-100 scale. Fail early
# with a clear message rather than letting an out-of-range or missing score
# turn into NaN inside pd.cut and crash in `.astype(int)` with a cryptic error.
invalid_scores = ~toxicity.between(0, 100)
if invalid_scores.any():
    raise ValueError(
        "toxicity_score must be non-missing and within [0, 100]; "
        f"{int(invalid_scores.sum())} row(s) violate this"
    )

# 1. Toxicity level categories
engineered_features['toxicity_level'] = pd.cut(
    toxicity,
    bins=[0, 20, 40, 70, 100],
    labels=[0, 1, 2, 3],  # Safe, Low, Medium, High
    include_lowest=True,  # intervals are right-closed, so 0 must be included explicitly
).astype(int)

# 2. High toxicity flag
engineered_features['high_toxicity_flag'] = (toxicity > 70).astype(int)

# 3. Toxicity score squared (for non-linear relationships)
engineered_features['toxicity_score_squared'] = toxicity ** 2

# 4. Toxicity score normalized (0-1 scale)
engineered_features['toxicity_score_normalized'] = toxicity / 100

print(f"✅ Created {len(engineered_features.columns) - len(encoded_features.columns)} new features")
print(f"New features: {[col for col in engineered_features.columns if col not in encoded_features.columns]}")
print(f"Total features: {engineered_features.shape[1]}")

# Display feature statistics (rendered as the cell's output)
print("\n📊 Feature Statistics:")
engineered_features.describe()

In [None]:
# Feature scaling: stratified train/test split, then standardization fitted
# on the training portion only.
print("📏 FEATURE SCALING")
print("=" * 25)

# Feature matrix and target vector
X, y = engineered_features, df_processed[target_col]

print(f"Feature matrix X: {X.shape}")
print(f"Target vector y: {y.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")

# Split first so the scaler never sees the held-out rows
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"\n📊 Data Split:")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training target distribution: {y_train.value_counts().to_dict()}")
print(f"Test target distribution: {y_test.value_counts().to_dict()}")

# Learn the scaling statistics from the training rows, then apply them to both parts
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the scaled arrays back into DataFrames, keeping column names and row index
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

print("\n✅ Features scaled using StandardScaler")
print(f"Scaled training features mean: {X_train_scaled.mean().mean():.3f}")
print(f"Scaled training features std: {X_train_scaled.std().mean():.3f}")

In [None]:
# Feature selection: flag strongly correlated feature pairs, then rank the
# features with a univariate F-test against the target.
print("🎯 FEATURE SELECTION")
print("=" * 30)

# 1. Correlation-based feature selection
print("\n1️⃣ Correlation Analysis:")
correlation_matrix = X_train_scaled.corr()

# Visit each unordered pair once (strict upper triangle, row by row)
corr_values = correlation_matrix.to_numpy()
corr_names = correlation_matrix.columns
upper_rows, upper_cols = np.triu_indices(len(corr_names), k=1)
high_corr_pairs = [
    (corr_names[r], corr_names[c], corr_values[r, c])
    for r, c in zip(upper_rows, upper_cols)
    if abs(corr_values[r, c]) > 0.8
]

if not high_corr_pairs:
    print("✅ No highly correlated features found")
else:
    print("High correlation pairs (>0.8):")
    for feat1, feat2, corr in high_corr_pairs:
        print(f"  {feat1} - {feat2}: {corr:.3f}")

# 2. Statistical feature selection
print("\n2️⃣ Statistical Feature Selection:")
# k='all' keeps every column; the selector is used here for its scores
selector = SelectKBest(score_func=f_classif, k='all')
X_train_selected = selector.fit_transform(X_train_scaled, y_train)

# One row per feature, best score first
feature_scores = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'score': selector.scores_,
    'p_value': selector.pvalues_
})
feature_scores = feature_scores.sort_values('score', ascending=False)

print("Feature importance scores:")
print(feature_scores)

# Keep the 8 best-scoring features, or all of them when fewer exist
top_k = min(8, X_train_scaled.shape[1])
top_rows = feature_scores.head(top_k)
top_features = top_rows['feature'].tolist()

print(f"\n🎯 Selected top {len(top_features)} features:")
for rank, (feat, score) in enumerate(zip(top_rows['feature'], top_rows['score']), 1):
    print(f"  {rank}. {feat}: {score:.2f}")

In [None]:
# Dimensionality reduction (PCA): fit on the scaled training features, project
# both splits, and save a two-panel explained-variance figure.
print("📉 DIMENSIONALITY REDUCTION (PCA)")
print("=" * 45)

# A float n_components asks PCA to keep enough components for 95% of the variance
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Original features: {X_train_scaled.shape[1]}")
print(f"PCA components: {X_train_pca.shape[1]}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total explained variance: {pca.explained_variance_ratio_.sum():.3f}")

# Plot explained variance: per component (left) and cumulative (right)
component_ids = range(1, len(pca.explained_variance_ratio_) + 1)
fig, (ax_each, ax_cumulative) = plt.subplots(1, 2, figsize=(10, 6))

ax_each.bar(component_ids, pca.explained_variance_ratio_)
ax_each.set_title('PCA Explained Variance by Component')
ax_each.set_xlabel('Principal Component')
ax_each.set_ylabel('Explained Variance Ratio')

ax_cumulative.plot(component_ids, np.cumsum(pca.explained_variance_ratio_), 'bo-')
ax_cumulative.set_title('Cumulative Explained Variance')
ax_cumulative.set_xlabel('Number of Components')
ax_cumulative.set_ylabel('Cumulative Explained Variance')
ax_cumulative.axhline(y=0.95, color='r', linestyle='--', label='95% threshold')
ax_cumulative.legend()

fig.tight_layout()

# savefig does not create missing folders, so make sure the target exists first
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_dir / 'pca_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ PCA analysis completed and saved")

In [None]:
# Persist every dataset variant as CSV files and the fitted preprocessing
# objects as joblib pickles.
print("💾 SAVING PREPROCESSED DATA")
print("=" * 40)

# Each variant pairs its own train/test feature frames with the shared targets
feature_views = {
    'original': (X_train, X_test),
    'scaled': (X_train_scaled, X_test_scaled),
    'selected_features': (X_train_scaled[top_features], X_test_scaled[top_features]),
    'pca': (
        pd.DataFrame(X_train_pca, index=X_train.index),
        pd.DataFrame(X_test_pca, index=X_test.index),
    ),
}
datasets = {
    view_name: {
        'X_train': train_part,
        'X_test': test_part,
        'y_train': y_train,
        'y_test': y_test
    }
    for view_name, (train_part, test_part) in feature_views.items()
}

# One sub-folder per variant, one CSV per split; the row index is written too
for dataset_name, data in datasets.items():
    dataset_dir = DATA_PROCESSED / dataset_name
    dataset_dir.mkdir(exist_ok=True)

    for data_name, frame in data.items():
        frame.to_csv(dataset_dir / f'{data_name}.csv', index=True)

    print(f"✅ Saved {dataset_name} dataset: {data['X_train'].shape}")

# Fitted transformers and the chosen feature list, keyed by output file stem
preprocessing_objects = {
    'scaler': scaler,
    'label_encoders': label_encoders,
    'pca': pca,
    'feature_selector': selector,
    'top_features': top_features
}

for obj_name, obj in preprocessing_objects.items():
    joblib.dump(obj, MODELS / f'{obj_name}.pkl')
    print(f"✅ Saved {obj_name}")

print(f"\n📁 All preprocessed data saved to: {DATA_PROCESSED}")
print(f"📁 All preprocessing objects saved to: {MODELS}")

In [None]:
# Preprocessing summary: collect headline numbers from the earlier cells,
# print them, and store them as JSON next to the processed data.
print("📋 PREPROCESSING SUMMARY")
print("=" * 35)


def _json_default(value):
    """Fallback serializer for values the json module cannot encode itself.

    NumPy scalars (for example the integer counts returned by ``.sum()``) are
    unwrapped to native Python numbers so they are written as JSON numbers
    rather than quoted strings. Anything else falls back to ``str(value)``.
    """
    if isinstance(value, np.generic):
        return value.item()
    return str(value)


summary = {
    'Original Data': {
        'Samples': len(ingredient_db),
        'Features': len(ingredient_db.columns) - 2,  # Exclude target and ingredient name
        'Missing Values': ingredient_db.isnull().sum().sum(),
        'Duplicates': ingredient_db.duplicated().sum()
    },
    'Feature Engineering': {
        'Original Features': len(encoded_features.columns),
        'Engineered Features': len(engineered_features.columns),
        'New Features Created': len(engineered_features.columns) - len(encoded_features.columns)
    },
    'Data Splitting': {
        'Training Samples': len(X_train),
        'Test Samples': len(X_test),
        'Training Positive Class': y_train.sum(),
        'Test Positive Class': y_test.sum()
    },
    'Feature Selection': {
        'Total Features': len(X_train_scaled.columns),
        'Selected Features': len(top_features),
        'PCA Components': X_train_pca.shape[1],
        'PCA Variance Explained': f"{pca.explained_variance_ratio_.sum():.1%}"
    }
}

for section, metrics in summary.items():
    print(f"\n📊 {section}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")

# Save summary (json is imported with the other libraries at the top of the notebook)
with open(DATA_PROCESSED / 'preprocessing_summary.json', 'w') as f:
    json.dump(summary, f, indent=2, default=_json_default)

print("\n💾 Preprocessing summary saved")
print("\n🎉 Week 4 Data Preprocessing Complete!")
print("\n📋 Next Steps (Week 5):")
print("  1. Baseline model development")
print("  2. Decision Tree implementation")
print("  3. Random Forest training")
print("  4. Model evaluation and comparison")