# Advanced Machine Learning Modeling
## Ensemble Methods for Food Health Classification

This notebook implements advanced machine learning approaches including:
- Multiple algorithm comparison (Random Forest, XGBoost, Neural Networks)
- Ensemble methods and model stacking
- Advanced cross-validation strategies
- Feature importance analysis
- Model interpretability (SHAP values)

Goal: Classify foods as healthy/unhealthy using a comprehensive feature set

In [None]:
# Core ML libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import (
    train_test_split, cross_val_score, StratifiedKFold, 
    GridSearchCV, RandomizedSearchCV
)
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)

# Advanced ML libraries
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Model interpretation
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False
    print("SHAP not available - install with: pip install shap")

import warnings
warnings.filterwarnings('ignore')

# Setup directories
from pathlib import Path
import joblib

# Output locations, all rooted at ../RESULTS relative to the working directory.
results_dir = Path('../RESULTS')
models_dir, figures_dir, reports_dir = (
    results_dir / subfolder for subfolder in ('models', 'figures', 'reports')
)

# The root comes first so that each sub-folder finds its parent in place.
for directory in (results_dir, models_dir, figures_dir, reports_dir):
    directory.mkdir(exist_ok=True)

print("Advanced ML libraries loaded")

## Target Variable Creation

In [None]:
class HealthClassifier:
    """Rule-based labeller that derives binary health labels from features.

    A composite score is built as a weighted sum of selected feature columns
    (see ``health_weights``) and then thresholded at a percentile of its own
    distribution to obtain 0/1 labels.
    """

    def __init__(self):
        # Signed contribution of each feature to the composite score:
        # positive weights raise it, negative weights lower it.
        self.health_weights = {
            'ingredient_health_score': 0.3,
            'preservatives_score': -0.2,
            'artificial_colors_score': -0.25,
            'artificial_sweeteners_score': -0.15,
            'natural_sweeteners_score': 0.1,
            'whole_grains_score': 0.2,
            'healthy_fats_score': 0.15,
            'processing_claims_count': 0.1,
            'complexity_score': -0.1,
            'category_health_score': 0.2
        }

    def create_health_labels(self, features_df, healthy_percentile=70):
        """Create binary health labels using weighted scoring.

        Args:
            features_df: DataFrame of engineered features. Weighted columns
                that are absent are skipped; missing values count as 0.
            healthy_percentile: Percentile (0-100) of the composite score at
                or above which a row is labelled healthy. The default of 70
                labels roughly the top 30% as healthy.

        Returns:
            Tuple ``(health_labels, health_score)``: an integer array of 0/1
            labels and the float array of composite scores, both with one
            entry per row of ``features_df``. An empty frame yields two empty
            arrays.

        Note:
            The comparison is ``>=``, so every row tied with the threshold is
            labelled healthy. If all scores are equal (for example when none
            of the weighted columns is present) every row gets label 1.
        """
        n_rows = len(features_df)
        if n_rows == 0:
            # Nothing to score; avoid taking a max/percentile of empty data.
            return np.zeros(0, dtype=int), np.zeros(0)

        health_score = np.zeros(n_rows)

        for feature, weight in self.health_weights.items():
            if feature not in features_df.columns:
                continue

            # Work on a plain float array so the accumulation is purely
            # positional and independent of the frame's index.
            values = features_df[feature].fillna(0).to_numpy(dtype=float)
            column_max = values.max()

            # Scale by the column maximum. This maps non-negative data onto
            # 0-1; negative entries stay negative. Columns whose maximum is
            # not positive contribute nothing.
            if column_max > 0:
                health_score += weight * (values / column_max)

        # Rows at or above the chosen percentile are labelled healthy (1).
        threshold = np.percentile(health_score, healthy_percentile)
        health_labels = (health_score >= threshold).astype(int)

        return health_labels, health_score

    def get_feature_importance_weights(self):
        """Return the feature-weight mapping used for scoring.

        The returned dict is the instance's own ``health_weights`` object,
        not a copy.
        """
        return self.health_weights


health_classifier = HealthClassifier()
print("Health classification system ready")

## Model Pipeline & Hyperparameter Tuning

In [None]:
class AdvancedModelPipeline:
    """Tune, ensemble, compare and persist a set of binary classifiers.

    State is kept in plain dicts keyed by model name:

    * ``models``      - fitted estimators (tuned base models and ensembles)
    * ``best_params`` - best hyperparameters found for each base model
    * ``results``     - held-out metrics for each model
    """

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.models = {}
        self.best_params = {}
        self.results = {}

        # Base estimator plus hyperparameter search space for each algorithm.
        self.model_configs = {
            'random_forest': {
                'model': RandomForestClassifier(random_state=random_state),
                'params': {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [10, 20, None],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                }
            },
            'xgboost': {
                'model': xgb.XGBClassifier(random_state=random_state, eval_metric='logloss'),
                'params': {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [3, 6, 9],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'subsample': [0.8, 0.9, 1.0]
                }
            },
            'neural_network': {
                'model': MLPClassifier(random_state=random_state, max_iter=1000),
                'params': {
                    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
                    'activation': ['relu', 'tanh'],
                    'alpha': [0.0001, 0.001, 0.01],
                    'learning_rate': ['constant', 'adaptive']
                }
            },
            'logistic_regression': {
                'model': LogisticRegression(random_state=random_state, max_iter=1000),
                'params': {
                    'C': [0.1, 1, 10, 100],
                    'penalty': ['l1', 'l2'],
                    'solver': ['liblinear', 'saga']
                }
            }
        }

    @staticmethod
    def _evaluate_model(model, X_test, y_test):
        """Return accuracy, precision, recall, F1 and AUC of a fitted model."""
        y_pred = model.predict(X_test)
        # Column 1 of predict_proba is used as the positive-class score.
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        return {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

    def train_individual_models(self, X_train, y_train, X_test, y_test, cv_folds=5, n_iter=20):
        """Tune every configured model and score it on the held-out set.

        Args:
            X_train, y_train: Training features and binary labels.
            X_test, y_test: Held-out features and labels used for reporting.
            cv_folds: Number of stratified folds used during the search.
            n_iter: Number of parameter settings passed to the randomized
                search for each model.

        Returns:
            Tuple ``(models, results)`` - the instance's own dicts, updated
            with one entry per configured model. Each result additionally
            carries ``cv_score``, the best cross-validated F1 of the search.
        """
        print("Training individual models with hyperparameter tuning...")

        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.random_state)

        for name, config in self.model_configs.items():
            print(f"   🔧 Tuning {name}...")

            # Randomized rather than exhaustive search, to bound tuning time.
            search = RandomizedSearchCV(
                config['model'],
                config['params'],
                n_iter=n_iter,
                cv=cv,
                scoring='f1',
                n_jobs=-1,
                random_state=self.random_state
            )
            search.fit(X_train, y_train)

            best_estimator = search.best_estimator_
            self.models[name] = best_estimator
            self.best_params[name] = search.best_params_

            metrics = self._evaluate_model(best_estimator, X_test, y_test)
            metrics['cv_score'] = search.best_score_
            self.results[name] = metrics

            print(f"     {name}: F1={self.results[name]['f1']:.3f}, AUC={self.results[name]['auc']:.3f}")

        return self.models, self.results

    def create_ensemble_models(self, X_train, y_train, X_test, y_test):
        """Fit a soft-voting and a stacking ensemble on the tuned base models.

        Must be called after ``train_individual_models``: the tuned
        estimators are looked up in ``self.models`` by name.

        Returns:
            The instance's ``models`` dict, now also containing
            ``voting_ensemble`` and ``stacking_ensemble``.

        Raises:
            KeyError: If a required base model is missing from ``self.models``.
        """
        print("🔗 Creating ensemble models...")

        # Voting Classifier
        voting_clf = VotingClassifier(
            estimators=[
                ('rf', self.models['random_forest']),
                ('xgb', self.models['xgboost']),
                ('lr', self.models['logistic_regression'])
            ],
            voting='soft'
        )
        voting_clf.fit(X_train, y_train)
        self.models['voting_ensemble'] = voting_clf

        # Stacking Classifier with a logistic-regression meta-learner
        stacking_clf = StackingClassifier(
            estimators=[
                ('rf', self.models['random_forest']),
                ('xgb', self.models['xgboost']),
                ('nn', self.models['neural_network'])
            ],
            final_estimator=LogisticRegression(random_state=self.random_state),
            cv=5
        )
        stacking_clf.fit(X_train, y_train)
        self.models['stacking_ensemble'] = stacking_clf

        # Ensembles are not tuned via search, so their results have no
        # 'cv_score' entry.
        for ensemble_name in ['voting_ensemble', 'stacking_ensemble']:
            self.results[ensemble_name] = self._evaluate_model(
                self.models[ensemble_name], X_test, y_test
            )
            print(f"   {ensemble_name}: F1={self.results[ensemble_name]['f1']:.3f}, AUC={self.results[ensemble_name]['auc']:.3f}")

        return self.models

    def get_best_model(self):
        """Return ``(name, model)`` for the model with the highest F1 score.

        Raises:
            ValueError: If no model has been evaluated yet.
        """
        if not self.results:
            raise ValueError(
                "No model results available; run train_individual_models() first"
            )

        best_model_name = max(self.results, key=lambda name: self.results[name]['f1'])
        return best_model_name, self.models[best_model_name]

    def save_models(self):
        """Persist every trained model and the results summary to disk.

        Each model is written with joblib to ``<models_dir>/<name>_model.pkl``
        and the metrics to ``<models_dir>/model_results.json``.

        Returns:
            The number of models saved.
        """
        import json

        print("Saving trained models...")

        # Make sure the target folder exists even if the setup cell was skipped.
        models_dir.mkdir(parents=True, exist_ok=True)

        for model_name, model in self.models.items():
            model_path = models_dir / f'{model_name}_model.pkl'
            joblib.dump(model, model_path)
            print(f"  Saved {model_name} to {model_path}")

        # Save results summary
        results_path = models_dir / 'model_results.json'
        with open(results_path, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2)

        print(f"  Saved results summary to {results_path}")
        return len(self.models)


print("Advanced model pipeline ready")

## Model Evaluation & Comparison

In [None]:
def create_comprehensive_evaluation(results, models, X_test, y_test, feature_names):
    """Build the model comparison dashboard and print a metrics summary.

    Writes a 2x2 bar-chart comparison (and, when a random forest is present,
    a top-15 feature-importance chart) as HTML into ``figures_dir``, shows
    both figures, prints a summary table and returns it.

    Args:
        results: Mapping of model name to a dict of metric values.
        models: Mapping of model name to fitted estimator.
        X_test, y_test: Accepted for interface compatibility; not used here.
        feature_names: Feature labels matching the random forest's importances.

    Returns:
        DataFrame with one row per model and the rounded f1, accuracy,
        precision, recall and auc columns.
    """
    # One row per model, one column per metric.
    metrics_df = pd.DataFrame(results).T

    # --- 1. Performance comparison: one bar chart per metric on a 2x2 grid ---
    panel_metrics = ['f1', 'accuracy', 'precision', 'recall']
    panel_titles = ['F1 Score', 'Accuracy', 'Precision', 'Recall']

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=panel_titles,
        specs=[[{"type": "bar"} for _ in range(2)] for _ in range(2)]
    )

    for panel_index, metric in enumerate(panel_metrics):
        # Fill the grid row by row: 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1), ...
        grid_row, grid_col = divmod(panel_index, 2)
        bars = go.Bar(
            x=metrics_df.index,
            y=metrics_df[metric],
            name=metric.title(),
            showlegend=False
        )
        fig.add_trace(bars, row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(title="Model Performance Comparison", height=600)
    fig.write_html(figures_dir / 'model_performance_comparison.html')
    fig.show()

    # --- 2. Feature importance, only when a random forest was trained ---
    if 'random_forest' in models:
        importance_table = pd.DataFrame({
            'feature': feature_names,
            'importance': models['random_forest'].feature_importances_
        })
        rf_importance = importance_table.sort_values('importance', ascending=False).head(15)

        fig_importance = px.bar(
            rf_importance,
            x='importance',
            y='feature',
            orientation='h',
            title="Top 15 Feature Importances (Random Forest)"
        )
        fig_importance.write_html(figures_dir / 'feature_importance_rf.html')
        fig_importance.show()

    # --- 3. Printed summary table ---
    summary_columns = ['f1', 'accuracy', 'precision', 'recall', 'auc']
    summary_df = metrics_df[summary_columns].round(3)

    print("\n MODEL PERFORMANCE SUMMARY")
    print("=" * 60)
    print(summary_df.to_string())

    # Winners by F1 and by AUC, taken from the rounded table.
    best_f1_model = summary_df['f1'].idxmax()
    best_auc_model = summary_df['auc'].idxmax()
    best_f1_value = summary_df.loc[best_f1_model, 'f1']
    best_auc_value = summary_df.loc[best_auc_model, 'auc']

    print("\nBEST MODELS:")
    print(f"   Best F1 Score: {best_f1_model} ({best_f1_value:.3f})")
    print(f"   Best AUC Score: {best_auc_model} ({best_auc_value:.3f})")

    return summary_df


print("Evaluation framework ready")

## Model Interpretability with SHAP

In [None]:
def analyze_model_interpretability(model, X_test, feature_names,
                                   background_size=100, explain_size=200):
    """Analyze model interpretability using SHAP values.

    Draws a background sample and an explanation sample from ``X_test``,
    computes SHAP values for the explanation sample, and shows a summary plot
    plus a waterfall plot for its first row.

    Args:
        model: Fitted estimator. Nothing is plotted unless it exposes
            ``predict_proba``.
        X_test: DataFrame of held-out features to sample from.
        feature_names: Feature labels shown on the summary plot.
        background_size: Maximum number of rows used as SHAP background data.
        explain_size: Maximum number of rows to explain and plot.

    Returns:
        None. Any failure inside the SHAP analysis is reported with a printed
        message instead of being raised, so that an optional diagnostic
        cannot abort the surrounding pipeline.
    """
    if not SHAP_AVAILABLE:
        print("SHAP not available for model interpretability analysis")
        return

    print("Analyzing model interpretability with SHAP...")

    try:
        if hasattr(model, 'predict_proba'):
            # Cap both sizes at the available rows so small test sets do not
            # make DataFrame.sample raise.
            n_rows = len(X_test)
            background = X_test.sample(min(background_size, n_rows))

            # Sample the rows to explain exactly once: the summary plot must
            # receive the same rows the SHAP values were computed for,
            # otherwise feature values and SHAP values are misaligned.
            explained = X_test.sample(min(explain_size, n_rows))

            explainer = shap.Explainer(model, background)
            shap_values = explainer(explained)

            # Summary plot
            shap.summary_plot(shap_values, explained, feature_names=feature_names, show=False)
            plt.title("SHAP Feature Importance Summary")
            plt.tight_layout()
            plt.show()

            # Waterfall plot for a single prediction
            shap.waterfall_plot(shap_values[0], show=False)
            plt.title("SHAP Waterfall Plot (Single Prediction)")
            plt.tight_layout()
            plt.show()

            print("SHAP analysis complete")

    except Exception as e:
        # Deliberately broad: interpretability is best-effort.
        print(f"SHAP analysis failed: {str(e)}")


print("Model interpretability framework ready")

## Main Modeling Pipeline Execution

In [None]:
# This cell would execute the full modeling pipeline with real data
def run_complete_modeling_pipeline(features_df, test_size=0.2, random_state=42):
    """Execute the complete advanced modeling pipeline.

    Steps: derive labels with the module-level ``health_classifier``, select
    the modeling features present in ``features_df``, split train/test, tune
    the individual models, build the ensembles, produce the evaluation
    dashboard and run the SHAP analysis on the best model.

    Args:
        features_df: DataFrame of engineered features.
        test_size: Fraction of rows held out for testing.
        random_state: Seed used for the train/test split and the pipeline.

    Returns:
        Tuple ``(pipeline, summary_df, best_model_name)``.

    Raises:
        ValueError: If none of the expected feature columns is present.
    """
    print("STARTING ADVANCED MODELING PIPELINE")
    print("=" * 50)

    # 1. Create target variable
    print("\n Creating health classification labels...")
    y, _ = health_classifier.create_health_labels(features_df)

    # Select features for modeling.
    # NOTE(review): every column weighted in HealthClassifier.health_weights
    # is also listed here, so the label is a deterministic function of the
    # model inputs. Confirm this is intended before trusting the scores.
    feature_columns = [
        'brand_product_count', 'brand_category_diversity', 'brand_premium_score',
        'preservatives_score', 'artificial_colors_score', 'artificial_sweeteners_score',
        'natural_sweeteners_score', 'whole_grains_score', 'healthy_fats_score',
        'processing_claims_count', 'ingredient_count', 'complexity_score',
        'ingredient_health_score', 'category_frequency', 'category_health_score'
    ]

    # Keep only the columns that exist, preserving the order above.
    available_features = [f for f in feature_columns if f in features_df.columns]
    if not available_features:
        raise ValueError(
            "features_df contains none of the expected modeling feature columns"
        )
    X = features_df[available_features].fillna(0)

    print(f"   Using {len(available_features)} features for modeling")
    print(f"   Target distribution: {np.bincount(y)} (Healthy: {y.sum()}, Unhealthy: {len(y)-y.sum()})")

    # 2. Train-test split, stratified so both parts keep the class ratio
    print("\nSplitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # 3. Initialize and run pipeline
    print("\n Training models...")
    pipeline = AdvancedModelPipeline(random_state=random_state)

    # Both calls record their outputs on the pipeline instance, which is what
    # the later steps read from.
    pipeline.train_individual_models(X_train, y_train, X_test, y_test)
    pipeline.create_ensemble_models(X_train, y_train, X_test, y_test)

    # 4. Comprehensive evaluation
    print("\n Model evaluation...")
    summary_df = create_comprehensive_evaluation(
        pipeline.results, pipeline.models, X_test, y_test, available_features
    )

    # 5. Model interpretability
    print("\n Model interpretability analysis...")
    best_model_name, best_model = pipeline.get_best_model()
    analyze_model_interpretability(best_model, X_test, available_features)

    print(f"\n MODELING PIPELINE COMPLETE!")
    print(f"   Best model: {best_model_name}")
    print(f"   Best F1 score: {pipeline.results[best_model_name]['f1']:.3f}")

    return pipeline, summary_df, best_model_name


# Placeholder for execution with real data
print("Complete modeling pipeline ready for execution")
print("   Run: pipeline, summary, best_model = run_complete_modeling_pipeline(features_df)")

## Modeling Summary & Insights

### Advanced ML Pipeline Features:

1. **Multi-Algorithm Comparison**:
   - Random Forest with hyperparameter tuning
   - XGBoost with gradient boosting optimization
   - Neural Networks with adaptive learning
   - Logistic Regression baseline

2. **Ensemble Methods**:
   - Voting Classifier (soft voting)
   - Stacking Classifier with meta-learner
   - Performance comparison across methods

3. **Advanced Evaluation**:
   - Stratified cross-validation
   - Multiple metrics (F1, AUC, Precision, Recall)
   - Feature importance analysis
   - Model interpretability with SHAP

### Expected Outcomes:
- **Model Performance**: F1 scores likely in the 0.75-0.85 range
- **Key Features**: Ingredient health scores, preservatives, processing claims
- **Best Algorithm**: Likely ensemble methods or XGBoost
- **Interpretability**: Clear feature contribution analysis

### Business Value:
- Automated food health classification
- Ingredient quality assessment
- Brand positioning insights
- Consumer health guidance

**Next Notebook**: `05_Evaluation.ipynb` - Detailed model validation and testing

In [None]:
# Execute the complete modeling pipeline (placeholder for actual execution)
# NOTE: nothing below trains or saves anything - every real step is commented
# out and only the progress messages are printed.
print("MACHINE LEARNING PIPELINE EXECUTION")
print("=" * 40)

# Note: This cell would typically load the engineered features and run the pipeline
# For demonstration, we show the expected workflow:

print("Step 1: Loading engineered features...")
# features_df = pd.read_pickle('../RESULTS/features/engineered_features.pkl')

print("Step 2: Creating health labels...")
# health_classifier = HealthClassifier()
# create_health_labels returns a (labels, scores) tuple, so unpack both:
# y, health_scores = health_classifier.create_health_labels(features_df)

print("Step 3: Training models...")
# Select the feature matrix and split it first (see
# run_complete_modeling_pipeline for the feature list):
# X = features_df[available_features].fillna(0)
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )
# pipeline = AdvancedModelPipeline()
# models, results = pipeline.train_individual_models(X_train, y_train, X_test, y_test)
# ensemble_models = pipeline.create_ensemble_models(X_train, y_train, X_test, y_test)

print("Step 4: Saving models...")
# pipeline.save_models()

print("Step 5: Generating evaluation visualizations...")
# evaluation_summary = create_comprehensive_evaluation(
#     pipeline.results, pipeline.models, X_test, y_test, available_features
# )

print("Step 6: Selecting best model...")
# best_model_name, best_model = pipeline.get_best_model()

print("\nPipeline complete! All models and results saved to RESULTS/models/")
print("Visualizations saved to RESULTS/figures/")
print("Proceed to 05_Evaluation.ipynb for detailed analysis")