In [1]:
# =====================================
# CONFIGURATION: TESTING vs PRODUCTION
# =====================================

# Single switch for the whole notebook:
#   True  -> quick smoke run (tiny budgets, validates the pipeline end to end)
#   False -> full-length training
TESTING_MODE = True  # Set to False for final production run

# Announce the active mode and select the matching training budget.
if not TESTING_MODE:
    print("üöÄ PRODUCTION MODE - Full training with optimal parameters")
    TRAINING_CONFIG = dict(
        iterations=500,      # CatBoost iterations
        n_estimators=500,    # Tree-based models
        epochs=100,          # Neural networks
        n_trials=100,        # Optuna optimization trials
        cv_folds=5,          # Cross-validation folds
        max_features=100,    # Full feature count
        early_stopping=50,   # Early stopping rounds
    )
else:
    print("üß™ TESTING MODE - Using minimal epochs for quick pipeline validation")
    TRAINING_CONFIG = dict(
        iterations=10,       # CatBoost iterations (minimum viable)
        n_estimators=10,     # Tree-based models (minimum viable)
        epochs=1,            # Neural networks
        n_trials=2,          # Optuna optimization trials
        cv_folds=2,          # Cross-validation folds
        max_features=20,     # Reduced feature count for testing
        early_stopping=5,    # Early stopping rounds
    )

print(f"üìä Training Configuration: {TRAINING_CONFIG}")
print("=" * 80)

üß™ TESTING MODE - Using minimal epochs for quick pipeline validation
üìä Training Configuration: {'iterations': 10, 'n_estimators': 10, 'epochs': 1, 'n_trials': 2, 'cv_folds': 2, 'max_features': 20, 'early_stopping': 5}


In [2]:
# =====================================
# ULTIMATE POLLUTION PREDICTION MODEL V5
# Enhanced with 100+ Features, BoxCox Transformation, and Consistent Model Management
# =====================================

import pandas as pd
import numpy as np
import warnings
import os
import json
import joblib
import glob
import shutil
from datetime import datetime
from scipy import stats
from scipy.stats import boxcox, yeojohnson
import gc

# Core ML Libraries
from sklearn.model_selection import (
    train_test_split, cross_val_score, TimeSeriesSplit, KFold,
    StratifiedKFold, GroupKFold
)
from sklearn.preprocessing import (
    RobustScaler, StandardScaler, MinMaxScaler, PowerTransformer,
    PolynomialFeatures, QuantileTransformer
)
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    explained_variance_score, median_absolute_error
)
from sklearn.feature_selection import (
    SelectKBest, f_regression, RFE, SelectFromModel,
    mutual_info_regression
)
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Advanced Models
from sklearn.ensemble import (
    RandomForestRegressor, ExtraTreesRegressor,
    VotingRegressor, StackingRegressor, BaggingRegressor,
    GradientBoostingRegressor, AdaBoostRegressor
)
from sklearn.linear_model import (
    Ridge, Lasso, ElasticNet, HuberRegressor,
    QuantileRegressor, TheilSenRegressor, LinearRegression,
    BayesianRidge, SGDRegressor
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Gradient Boosting Libraries
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# Hyperparameter Optimization
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

# Advanced Analysis (if available)
# SHAP is an optional dependency: its availability is recorded in SHAP_AVAILABLE
# so a missing package prints a notice instead of failing this cell.
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False
    print("SHAP not available - feature importance analysis will be limited")

# NOTE(review): this suppresses every warning category for the rest of the
# kernel session, including deprecation and convergence warnings.
warnings.filterwarnings('ignore')
# Seed NumPy's global random generator once, at setup time.
np.random.seed(42)

# Clean and create directories
def clean_and_create_directories():
    """Delete artefacts from earlier runs and make sure the output tree exists.

    Paths are relative to the current working directory. Every file matching
    one of the stale-artefact glob patterns (pickles under ``models``, JSON
    files under ``results``, CSV files under ``predictions``) is removed, then
    each output directory is created if it is missing.

    A path that cannot be removed is reported and skipped; it never aborts the
    clean-up.
    """
    directories = [
        'models', 'models/individual', 'models/ensembles', 'models/robust',
        'results', 'submissions', 'feature_analysis', 'transformations', 'predictions'
    ]
    stale_patterns = ['models/*.pkl', 'models/*/*.pkl', 'results/*.json', 'predictions/*.csv']

    print("üßπ Cleaning old models and creating directories...")

    # Remove old model files
    for pattern in stale_patterns:
        for file in glob.glob(pattern):
            try:
                os.remove(file)
            except OSError as exc:
                # Best effort: say what was left behind instead of hiding it.
                print(f"   Could not remove {file}: {exc}")
            else:
                print(f"   Removed: {file}")

    # Create directories
    for directory in directories:
        os.makedirs(directory, exist_ok=True)

    print("‚úÖ Directories cleaned and created")

# Runs every time this cell executes: artefacts matching the clean-up patterns
# from earlier runs are deleted before anything new is written.
clean_and_create_directories()

# Startup banner.
print("üöÄ ULTIMATE POLLUTION PREDICTION MODEL V5")
print("üîß Enhanced with 100+ Features & BoxCox Transformation")
print("üíæ Automatic Prediction Saving & Consistent Model Management")
print("=" * 80)

# =====================================
# COMPREHENSIVE MODEL REGISTRY (Dynamic Configuration)
# =====================================

def create_model_registry(config):
    """Build the registry of candidate regressors for the current run.

    Args:
        config: Mapping that provides at least the keys ``'iterations'``,
            ``'n_estimators'``, ``'epochs'`` and ``'early_stopping'``.

    Returns:
        dict mapping a model name to ``{'class': estimator_class,
        'params': constructor_kwargs}``. Insertion order is preserved and is
        relied on by the ensemble step, which takes the first few entries.

    Note:
        The Theil-Sen entry also reads the module-level ``TESTING_MODE`` flag.
    """
    import inspect

    def reduced_estimators(divisor):
        # Slower learners get a fraction of the tree budget, but never fewer
        # than one estimator (a value of 0 is rejected by scikit-learn).
        return max(1, config['n_estimators'] // divisor)

    # scikit-learn 1.2 renamed BaggingRegressor's ``base_estimator`` argument
    # to ``estimator`` and later removed the old name; use whichever this
    # installation accepts.
    bagging_base_key = (
        'estimator'
        if 'estimator' in inspect.signature(BaggingRegressor).parameters
        else 'base_estimator'
    )

    # Parameter sets shared by several variants of the same library.
    catboost_common = {
        'iterations': config['iterations'],
        'depth': 6,
        'learning_rate': 0.05,
        'l2_leaf_reg': 3.0,
        'bootstrap_type': 'Bayesian',
        'random_state': 42,
        'verbose': False
    }
    lgb_common = {
        'n_estimators': config['n_estimators'],
        'learning_rate': 0.05,
        'num_leaves': 31,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'random_state': 42,
        'verbose': -1
    }
    xgb_common = {
        'n_estimators': config['n_estimators'],
        'learning_rate': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42
    }
    forest_common = {
        'n_estimators': config['n_estimators'],
        'max_depth': 15,
        'min_samples_split': 10,
        'min_samples_leaf': 5,
        'max_features': 'sqrt',
        'bootstrap': True,
        'random_state': 42,
        'n_jobs': -1
    }

    return {
        # Gradient Boosting Models
        'catboost_mae': {
            'class': cb.CatBoostRegressor,
            'params': {
                **catboost_common,
                'loss_function': 'MAE',
                'bagging_temperature': 1.0,
                'od_type': 'IncToDec',
                'od_wait': config['early_stopping']
            }
        },
        'catboost_quantile': {
            'class': cb.CatBoostRegressor,
            'params': {**catboost_common, 'loss_function': 'Quantile:alpha=0.5'}
        },
        'catboost_rmse': {
            'class': cb.CatBoostRegressor,
            'params': {**catboost_common, 'loss_function': 'RMSE'}
        },
        'lgb_mae': {
            'class': lgb.LGBMRegressor,
            'params': {**lgb_common, 'objective': 'mae'}
        },
        'lgb_rmse': {
            'class': lgb.LGBMRegressor,
            'params': {**lgb_common, 'objective': 'regression'}
        },
        'xgb_mae': {
            'class': xgb.XGBRegressor,
            'params': {**xgb_common, 'objective': 'reg:absoluteerror'}
        },
        'xgb_rmse': {
            'class': xgb.XGBRegressor,
            'params': {**xgb_common, 'objective': 'reg:squarederror'}
        },
        'gradient_boost': {
            'class': GradientBoostingRegressor,
            'params': {
                'n_estimators': reduced_estimators(2),  # Adjust for slower training
                'learning_rate': 0.1,
                'max_depth': 6,
                'subsample': 0.8,
                'random_state': 42
            }
        },

        # Tree-based Models
        'rf_robust': {
            'class': RandomForestRegressor,
            'params': dict(forest_common)
        },
        'extra_trees': {
            'class': ExtraTreesRegressor,
            'params': dict(forest_common)
        },
        'bagging': {
            'class': BaggingRegressor,
            'params': {
                bagging_base_key: DecisionTreeRegressor(max_depth=10),
                'n_estimators': reduced_estimators(2),
                'random_state': 42,
                'n_jobs': -1
            }
        },
        'ada_boost': {
            'class': AdaBoostRegressor,
            'params': {
                'n_estimators': reduced_estimators(3),  # AdaBoost is slower
                'learning_rate': 0.1,
                'random_state': 42
            }
        },

        # Linear Models
        'ridge': {
            'class': Ridge,
            'params': {
                'alpha': 1.0,
                'random_state': 42
            }
        },
        'lasso': {
            'class': Lasso,
            'params': {
                'alpha': 0.1,
                'random_state': 42,
                'max_iter': 2000
            }
        },
        'elastic_net': {
            'class': ElasticNet,
            'params': {
                'alpha': 0.1,
                'l1_ratio': 0.5,
                'random_state': 42,
                'max_iter': 2000
            }
        },
        'huber': {
            'class': HuberRegressor,
            'params': {
                'epsilon': 1.35,
                'max_iter': 300
            }
        },
        'theil_sen': {
            'class': TheilSenRegressor,
            'params': {
                'random_state': 42,
                # Smaller subpopulation in testing mode (module-level flag).
                'max_subpopulation': 1e4 if not TESTING_MODE else 1e3
            }
        },
        'bayesian_ridge': {
            'class': BayesianRidge,
            'params': {
                'compute_score': True
            }
        },

        # Neural Network
        'mlp_regressor': {
            'class': MLPRegressor,
            'params': {
                'hidden_layer_sizes': (100, 50),
                'max_iter': config['epochs'] * 10,  # MLPRegressor uses iterations
                'random_state': 42,
                'early_stopping': True,
                'validation_fraction': 0.1,
                'n_iter_no_change': config['early_stopping']
            }
        },

        # Other Models
        'knn': {
            'class': KNeighborsRegressor,
            'params': {
                'n_neighbors': 5,
                'weights': 'distance'
            }
        },
        'svr_rbf': {
            'class': SVR,
            'params': {
                'kernel': 'rbf',
                'C': 1.0,
                'gamma': 'scale'
            }
        }
    }

# Placeholder so the name exists as soon as this cell has run.
# The next cell replaces it with create_model_registry(TRAINING_CONFIG).
MODEL_REGISTRY = {}

üßπ Cleaning old models and creating directories...
‚úÖ Directories cleaned and created
üöÄ ULTIMATE POLLUTION PREDICTION MODEL V5
üîß Enhanced with 100+ Features & BoxCox Transformation
üíæ Automatic Prediction Saving & Consistent Model Management


In [None]:
# =====================================
# INITIALIZE MODEL REGISTRY & ENHANCED FEATURE ENGINEERING
# =====================================

# Build the registry from TRAINING_CONFIG (set in the configuration cell);
# this replaces the empty placeholder assigned in the previous cell.
MODEL_REGISTRY = create_model_registry(TRAINING_CONFIG)

def print_model_summary(registry=None, testing_mode=None):
    """Print the registered models grouped by family.

    Each model is listed once, under the first category (in the order declared
    below) whose keyword occurs in its name. This keeps a name such as
    ``catboost_quantile`` under Gradient Boosting instead of also listing it
    under Linear Models because of the ``quantile`` keyword. Models matching no
    keyword are not listed but still count towards the total.

    Args:
        registry: Mapping keyed by model name. Defaults to the module-level
            ``MODEL_REGISTRY``.
        testing_mode: Flag shown in the footer. Defaults to the module-level
            ``TESTING_MODE``.
    """
    if registry is None:
        registry = MODEL_REGISTRY
    if testing_mode is None:
        testing_mode = TESTING_MODE

    print("\nüìã COMPREHENSIVE MODEL REGISTRY SUMMARY")
    print("=" * 60)

    categories = {
        'Gradient Boosting': ['catboost', 'lgb', 'xgb', 'gradient'],
        'Tree-based': ['rf', 'extra', 'ada', 'decision', 'bagging'],
        'Linear Models': ['ridge', 'lasso', 'elastic', 'huber', 'quantile', 'theil', 'bayesian', 'sgd', 'linear'],
        'Other Models': ['knn', 'svr', 'mlp']
    }

    # Assign every model to exactly one category, keeping registry order.
    grouped = {}
    for name in registry:
        for category, keywords in categories.items():
            if any(keyword in name for keyword in keywords):
                grouped.setdefault(category, []).append(name)
                break

    for category in categories:
        models = grouped.get(category)
        if models:
            print(f"\n{category}:")
            for model in models:
                print(f"  ‚úì {model}")

    print(f"\nüìä Total Models: {len(registry)}")
    print(f"üîß Training Mode: {'TESTING' if testing_mode else 'PRODUCTION'}")

# Show the registry that was just built, grouped by model family.
print_model_summary()

# =====================================
# ENHANCED FEATURE ENGINEERING (100+ Features with BoxCox)
# =====================================

def _fit_power_transform(observed):
    """Return a power-transformed copy of a finite 1-D float array, or None.

    Tries Box-Cox (after shifting the data to be strictly positive), then
    Yeo-Johnson on the unshifted data, then ``log1p`` of the shifted data.
    """
    lowest = observed.min()
    shifted = observed - lowest + 1 if lowest <= 0 else observed

    attempts = (
        lambda: boxcox(shifted)[0],
        lambda: yeojohnson(observed)[0],
        lambda: np.log1p(shifted),
    )
    for attempt in attempts:
        try:
            return np.asarray(attempt(), dtype=float)
        except Exception:
            continue
    return None


def apply_boxcox_transform(data, target_col=None):
    """Power-transform every numeric feature column of ``data``.

    For each numeric column (excluding ``target_col`` when given) the transform
    is fitted on the finite values only; NaN and infinite entries keep their
    positions and values. Without this, one missing value (for example the
    first row of a lag feature) would poison the fitted Box-Cox parameter for
    the whole column.

    Columns with fewer than two finite values, or with (near-)constant values,
    are left unchanged, as are non-numeric columns and any column for which
    every transformation attempt fails.

    Args:
        data: Input DataFrame; it is not modified.
        target_col: Optional column name to leave untransformed.

    Returns:
        A transformed copy of ``data``.
    """
    transformed_data = data.copy()

    numeric_cols = transformed_data.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in numeric_cols if col != target_col]

    print(f"üîÑ Applying BoxCox transformation to {len(feature_cols)} features...")

    for col in feature_cols:
        try:
            values = transformed_data[col].to_numpy(dtype=float, na_value=np.nan)
        except (TypeError, ValueError):
            # Not representable as real floats (e.g. complex dtype): keep as is.
            continue

        finite = np.isfinite(values)
        observed = values[finite]

        # Skip if there is nothing to fit or all values are the same
        if observed.size < 2 or np.std(observed) < 1e-8:
            continue

        converted = _fit_power_transform(observed)
        if converted is None:
            # Keep original if all transformations fail
            continue

        result = values.copy()
        result[finite] = converted
        transformed_data[col] = result

    print("‚úÖ BoxCox transformation completed!")
    return transformed_data

def create_ultimate_features(data, is_train=True, max_features=None):
    """Expand a raw frame into engineered features and trim it to ``max_features``.

    Steps, in order: rolling / lag / power features, pairwise interactions,
    degree-2 interaction polynomials, K-means labels and centre distances,
    synthetic time-index features, row-wise aggregates, power transform,
    inf/NaN clean-up, and variance-ranked selection.

    Args:
        data: Input DataFrame; it is not modified. A ``'Pollution_value'``
            column, when present, is set aside before engineering and
            re-attached unchanged at the end.
        is_train: Accepted for interface compatibility; currently unused.
        max_features: Maximum number of feature columns to keep. Defaults to
            ``TRAINING_CONFIG['max_features']``.

    Returns:
        A new DataFrame of engineered features (plus the target if it existed).
        When selection runs, only numeric columns can be retained.
    """

    if max_features is None:
        max_features = TRAINING_CONFIG['max_features']

    print(f"\n? Creating ultimate features (target: {max_features})...")

    # Start with original features
    df = data.copy()

    # Remove target if present for feature engineering
    target_col = 'Pollution_value' if 'Pollution_value' in df.columns else None
    if target_col:
        target = df[target_col].copy()
        df = df.drop(columns=[target_col])
        print(f"Target column preserved for feature engineering")
    else:
        target = None

    # Counted after the target is set aside so the reported increase covers
    # feature columns only.
    initial_features = len(df.columns)

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    print(f"üìä Starting with {len(numeric_cols)} numeric features")

    # 1. Statistical Features
    print("? Creating statistical features...")
    for col in numeric_cols[:10]:  # Limit for testing mode
        if col in df.columns:
            # Rolling statistics (if time series structure exists)
            try:
                df[f'{col}_rolling_mean_3'] = df[col].rolling(window=3, min_periods=1).mean()
                df[f'{col}_rolling_std_3'] = df[col].rolling(window=3, min_periods=1).std()
                df[f'{col}_rolling_max_5'] = df[col].rolling(window=5, min_periods=1).max()
                df[f'{col}_rolling_min_5'] = df[col].rolling(window=5, min_periods=1).min()
            except Exception as e:
                print(f"‚ö†Ô∏è Rolling features for {col} failed: {e}")

            # Lag features
            try:
                df[f'{col}_lag_1'] = df[col].shift(1)
                df[f'{col}_lag_2'] = df[col].shift(2)
            except Exception as e:
                print(f"‚ö†Ô∏è Lag features for {col} failed: {e}")

            # Mathematical transformations (only for strictly positive columns)
            if np.min(df[col]) > 0:
                df[f'{col}_log'] = np.log1p(df[col])
                df[f'{col}_sqrt'] = np.sqrt(df[col])
                df[f'{col}_inv'] = 1 / (df[col] + 1e-8)

            # Squared and cubed features
            df[f'{col}_squared'] = df[col] ** 2
            df[f'{col}_cubed'] = df[col] ** 3

    # 2. Interaction Features
    print("üîó Creating interaction features...")
    feature_limit = min(8, len(numeric_cols))  # Limit combinations for testing
    for i, col1 in enumerate(numeric_cols[:feature_limit]):
        for col2 in numeric_cols[i+1:feature_limit]:
            if col1 in df.columns and col2 in df.columns:
                try:
                    # Basic interactions
                    df[f'{col1}_mult_{col2}'] = df[col1] * df[col2]
                    df[f'{col1}_div_{col2}'] = df[col1] / (df[col2] + 1e-8)
                    df[f'{col1}_add_{col2}'] = df[col1] + df[col2]
                    df[f'{col1}_sub_{col2}'] = df[col1] - df[col2]

                    # Ratio features
                    df[f'{col1}_ratio_{col2}'] = df[col1] / (df[col1] + df[col2] + 1e-8)
                except Exception as e:
                    print(f"‚ö†Ô∏è Interaction features for {col1}/{col2} failed: {e}")

    # 3. Polynomial Features (limited selection)
    print("üìà Creating polynomial features...")
    try:
        selected_numeric = numeric_cols[:5]  # Limit for testing
        if len(selected_numeric) >= 2:
            poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
            poly_features = poly.fit_transform(df[selected_numeric].fillna(0))
            # The leading columns repeat the inputs; keep only the new products.
            poly_names = [f"poly_{i}" for i in range(poly_features.shape[1] - len(selected_numeric))]
            poly_df = pd.DataFrame(poly_features[:, len(selected_numeric):],
                                 columns=poly_names, index=df.index)
            df = pd.concat([df, poly_df], axis=1)
    except Exception as e:
        print(f"‚ö†Ô∏è Polynomial features creation failed: {e}")

    # 4. Clustering Features
    print("üéØ Creating clustering features...")
    try:
        clustering_features = df[numeric_cols[:6]].fillna(0)
        if len(clustering_features.columns) >= 2:
            # K-means clustering
            for n_clusters in [3, 5, 8]:
                kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                df[f'cluster_{n_clusters}'] = kmeans.fit_predict(clustering_features)

                # Distance to cluster centers
                centers = kmeans.cluster_centers_
                for i in range(n_clusters):
                    distances = np.sqrt(np.sum((clustering_features.values - centers[i])**2, axis=1))
                    df[f'dist_to_cluster_{n_clusters}_{i}'] = distances
    except Exception as e:
        print(f"‚ö†Ô∏è Clustering features creation failed: {e}")

    # 5. Time-based Features (if applicable)
    print("‚è∞ Creating time-based features...")
    try:
        # Create synthetic time features from the row position
        df['synthetic_time'] = np.arange(len(df))
        df['synthetic_time_sin'] = np.sin(2 * np.pi * df['synthetic_time'] / 24)
        df['synthetic_time_cos'] = np.cos(2 * np.pi * df['synthetic_time'] / 24)
        df['synthetic_trend'] = np.arange(len(df)) / len(df)
    except Exception as e:
        print(f"‚ö†Ô∏è Time-based features creation failed: {e}")

    # 6. Statistical Aggregations
    print("üìä Creating aggregation features...")
    try:
        # Row-wise statistics
        numeric_subset = df.select_dtypes(include=[np.number])
        df['row_mean'] = numeric_subset.mean(axis=1)
        df['row_std'] = numeric_subset.std(axis=1)
        df['row_max'] = numeric_subset.max(axis=1)
        df['row_min'] = numeric_subset.min(axis=1)
        df['row_median'] = numeric_subset.median(axis=1)
        df['row_skew'] = numeric_subset.skew(axis=1)
        df['row_kurt'] = numeric_subset.kurtosis(axis=1)
    except Exception as e:
        print(f"‚ö†Ô∏è Aggregation features creation failed: {e}")

    # Apply BoxCox transformation (the target was already set aside above)
    df = apply_boxcox_transform(df)

    # Handle infinite and NaN values
    df = df.replace([np.inf, -np.inf], np.nan)
    # numeric_only avoids a TypeError on non-numeric columns in pandas >= 2.
    df = df.fillna(df.median(numeric_only=True).fillna(0))

    # Feature selection to meet target count
    current_features = len(df.columns)
    print(f"üìä Generated {current_features} features")

    if current_features > max_features:
        print(f"üéØ Selecting top {max_features} features...")
        # Simple variance-based selection for now
        variances = df.var(numeric_only=True).sort_values(ascending=False)
        selected_features = variances.head(max_features).index.tolist()
        df = df[selected_features]

    # Restore target column if it existed
    if target_col and target is not None:
        df[target_col] = target
        print(f"Target column restored after feature engineering")

    final_features = len(df.columns) - (1 if target_col else 0)
    print(f"‚úÖ Ultimate feature engineering completed!")
    print(f"? Final feature count: {final_features}")
    print(f"üìà Feature increase: {final_features - initial_features}")

    return df

def save_predictions(predictions, model_name, timestamp):
    """Write predictions to ``predictions/<model_name>_<timestamp>_predictions.csv``.

    The CSV has two columns: ``id`` (0-based position) and
    ``predicted_pollution``. The ``predictions`` directory is created when
    missing.

    Returns:
        The relative path of the file that was written.
    """
    out_dir = 'predictions'
    os.makedirs(out_dir, exist_ok=True)

    # Consistent naming: <model>_<timestamp>_predictions.csv
    filename = "/".join([out_dir, f"{model_name}_{timestamp}_predictions.csv"])

    n_rows = len(predictions)
    pd.DataFrame({
        'id': range(n_rows),
        'predicted_pollution': predictions
    }).to_csv(filename, index=False)

    print(f"üíæ Predictions saved: {filename}")
    return filename

# Status lines printed once the helper functions above are defined;
# the feature target is read from TRAINING_CONFIG.
print("‚úÖ Enhanced feature engineering functions ready!")
print(f"üéØ Target features: {TRAINING_CONFIG['max_features']}")
print("üß™ BoxCox transformation enabled!")

In [None]:
# =====================================
# MAIN EXECUTION PIPELINE
# =====================================

def train_and_evaluate_models(X_train, X_val, y_train, y_val, timestamp):
    """Fit every estimator in ``MODEL_REGISTRY`` and score it on both splits.

    Each model is built from its registry entry, fitted on the training split,
    scored with MAE / RMSE / R2 on the training and validation splits, and
    pickled to ``models/individual/<name>_<timestamp>.pkl``. A model that
    raises at any step is reported and skipped.

    Returns:
        ``(model_results, trained_models)``: the first maps a model name to its
        metrics plus the fitted estimator under ``'model'``; the second maps a
        model name to the fitted estimator.
    """
    scores_by_name = {}
    fitted_by_name = {}

    print(f"\nüöÄ Training {len(MODEL_REGISTRY)} models...")
    print("=" * 60)

    for name, spec in MODEL_REGISTRY.items():
        try:
            print(f"\nüîß Training {name}...")

            # Build and fit the estimator from its registry entry
            estimator = spec['class'](**spec['params'])
            estimator.fit(X_train, y_train)

            # Predict on both splits
            fit_pred = estimator.predict(X_train)
            holdout_pred = estimator.predict(X_val)

            # Metrics are evaluated in the order they are listed here
            entry = {
                'train_mae': mean_absolute_error(y_train, fit_pred),
                'val_mae': mean_absolute_error(y_val, holdout_pred),
                'train_rmse': np.sqrt(mean_squared_error(y_train, fit_pred)),
                'val_rmse': np.sqrt(mean_squared_error(y_val, holdout_pred)),
                'train_r2': r2_score(y_train, fit_pred),
                'val_r2': r2_score(y_val, holdout_pred),
                'model': estimator
            }

            scores_by_name[name] = entry
            fitted_by_name[name] = estimator

            # Persist the fitted estimator
            joblib.dump(estimator, f"models/individual/{name}_{timestamp}.pkl")

            val_mae, val_rmse, val_r2 = entry['val_mae'], entry['val_rmse'], entry['val_r2']
            print(f"    ‚úÖ VAL MAE: {val_mae:.4f} | VAL RMSE: {val_rmse:.4f} | VAL R¬≤: {val_r2:.4f}")

        except Exception as e:
            print(f"    ‚ùå Failed to train {name}: {e}")

    return scores_by_name, fitted_by_name

def create_ensemble_models(trained_models, X_train, X_val, y_train, y_val, timestamp):
    """Fit a voting and a stacking ensemble on top of the individual models.

    Members are taken in the iteration order of ``trained_models`` (not ranked
    by score): up to five for the voting regressor, the first three for the
    stacking regressor (only built when at least three models exist). Each
    ensemble is fitted on the training split, scored by validation MAE and
    pickled under ``models/ensembles``. A failure in one ensemble is reported
    and does not prevent the other from being built.

    Returns:
        dict mapping ensemble name to ``{'val_mae': float, 'model': estimator}``;
        empty when fewer than two trained models are supplied.
    """
    print(f"\nüéØ Creating ensemble models...")

    ensemble_results = {}

    if len(trained_models) < 2:
        print("‚ö†Ô∏è Not enough models for ensemble creation")
        return ensemble_results

    def _fit_score_persist(key, ensemble):
        # Fit on train, score on validation, record the result, then pickle.
        ensemble.fit(X_train, y_train)
        score = mean_absolute_error(y_val, ensemble.predict(X_val))
        ensemble_results[key] = {
            'val_mae': score,
            'model': ensemble
        }
        joblib.dump(ensemble, f"models/ensembles/{key}_{timestamp}.pkl")
        return score

    # --- Voting regressor -------------------------------------------------
    try:
        print("üó≥Ô∏è Creating Voting Regressor...")
        members = list(trained_models.items())[:min(5, len(trained_models))]  # Limit for testing
        val_mae = _fit_score_persist('voting_regressor', VotingRegressor(members))
        print(f"    ‚úÖ Voting Regressor VAL MAE: {val_mae:.4f}")
    except Exception as e:
        print(f"    ‚ùå Voting Regressor failed: {e}")

    # --- Stacking regressor (needs at least three base models) -------------
    try:
        if len(trained_models) >= 3:
            print("üìö Creating Stacking Regressor...")
            base_models = list(trained_models.items())[:3]  # First three in registry order
            stacker = StackingRegressor(
                estimators=base_models,
                final_estimator=Ridge(random_state=42),
                cv=TRAINING_CONFIG['cv_folds']
            )
            val_mae = _fit_score_persist('stacking_regressor', stacker)
            print(f"    ‚úÖ Stacking Regressor VAL MAE: {val_mae:.4f}")
    except Exception as e:
        print(f"    ‚ùå Stacking Regressor failed: {e}")

    return ensemble_results

def main():
    """Run the complete pipeline with testing/production mode support.

    Stages: load ``train.csv`` / ``test.csv`` from the working directory,
    engineer features, split 80/20, scale with ``RobustScaler`` (fitted on the
    training split only), train the individual models and ensembles, pick the
    lowest validation MAE, predict on the test set, and write the predictions,
    a submission CSV and a JSON results summary.

    Any exception is caught, reported with its traceback, and ends the run
    without re-raising. Returns ``None``.
    """

    print("\n" + "=" * 80)
    print("üöÄ ULTIMATE POLLUTION PREDICTION PIPELINE")
    print(f"üß™ Mode: {'TESTING' if TESTING_MODE else 'PRODUCTION'}")
    print(f"üéØ Target Features: {TRAINING_CONFIG['max_features']}")
    print("=" * 80)

    # Create timestamp for consistent file naming
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    try:
        # 1. Load Data
        print("\nüìÇ Loading data...")
        train_data = pd.read_csv('train.csv')
        test_data = pd.read_csv('test.csv')

        print(f"    ‚úÖ Train data: {train_data.shape}")
        print(f"    ‚úÖ Test data: {test_data.shape}")

        # 2. Feature Engineering with configurable features
        print(f"\nüîß Feature engineering with {TRAINING_CONFIG['max_features']} features...")
        train_engineered = create_ultimate_features(train_data, is_train=True)
        # Keep every engineered test column: the feature subset is decided by
        # the training frame alone. Selecting on the test frame independently
        # can pick different columns and shrink the usable intersection.
        test_engineered = create_ultimate_features(
            test_data, is_train=False, max_features=float('inf')
        )

        # Ensure consistent columns, in the training frame's column order so
        # the feature layout is identical from run to run (a set intersection
        # has no stable order).
        test_columns = set(test_engineered.columns)
        common_features = [
            col for col in train_engineered.columns
            if col in test_columns and col != 'Pollution_value'
        ]

        print(f"    ‚úÖ Common features: {len(common_features)}")

        # 3. Prepare training data
        print("\nüéØ Preparing training data...")
        X = train_engineered[common_features]
        y = train_engineered['Pollution_value']
        X_test = test_engineered[common_features]

        # Train-validation split
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        print(f"    ‚úÖ Training set: {X_train.shape}")
        print(f"    ‚úÖ Validation set: {X_val.shape}")
        print(f"    ‚úÖ Test set: {X_test.shape}")

        # 4. Scale features (fit on the training split only)
        print("\n‚öñÔ∏è Scaling features...")
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(X_test)

        # Convert back to DataFrames
        X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
        X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_val.columns, index=X_val.index)
        X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

        # Save scaler
        scaler_path = f"transformations/scaler_{timestamp}.pkl"
        joblib.dump(scaler, scaler_path)
        print(f"    ‚úÖ Scaler saved: {scaler_path}")

        # 5. Train individual models
        model_results, trained_models = train_and_evaluate_models(
            X_train_scaled, X_val_scaled, y_train, y_val, timestamp
        )

        # 6. Create ensemble models
        ensemble_results = create_ensemble_models(
            trained_models, X_train_scaled, X_val_scaled, y_train, y_val, timestamp
        )

        # 7. Select best model and make predictions
        print(f"\nüèÜ Selecting best model...")
        all_results = {**model_results, **ensemble_results}

        if all_results:
            # Lowest validation MAE wins
            best_model_name = min(all_results.keys(), key=lambda x: all_results[x]['val_mae'])
            best_model = all_results[best_model_name]['model']
            best_mae = all_results[best_model_name]['val_mae']

            print(f"    üèÜ Best Model: {best_model_name}")
            print(f"    üìä Best VAL MAE: {best_mae:.4f}")

            # Make final predictions
            print(f"\nüîÆ Making final predictions...")
            final_predictions = best_model.predict(X_test_scaled)

            # Save predictions
            pred_filename = save_predictions(final_predictions, best_model_name, timestamp)

            # Create submission file
            # NOTE(review): ids are 0-based row positions; confirm this matches
            # the identifier expected by whoever consumes the submission.
            submission_df = pd.DataFrame({
                'id': range(len(final_predictions)),
                'Pollution_value': final_predictions
            })

            submission_path = f"submissions/submission_{best_model_name}_{timestamp}.csv"
            submission_df.to_csv(submission_path, index=False)
            print(f"üì§ Submission saved: {submission_path}")

            # Save results summary (fitted estimators are left out; metrics are
            # converted to plain floats so they serialise as JSON)
            results_summary = {
                'timestamp': timestamp,
                'mode': 'TESTING' if TESTING_MODE else 'PRODUCTION',
                'best_model': best_model_name,
                'best_val_mae': float(best_mae),
                'n_features': len(common_features),
                'training_config': TRAINING_CONFIG,
                'model_results': {k: {metric: float(v) for metric, v in model_results[k].items()
                                     if metric != 'model'} for k in model_results.keys()},
                'ensemble_results': {k: {metric: float(v) for metric, v in ensemble_results[k].items()
                                        if metric != 'model'} for k in ensemble_results.keys()}
            }

            results_path = f"results/results_summary_{timestamp}.json"
            with open(results_path, 'w') as f:
                json.dump(results_summary, f, indent=2)

            print(f"üìä Results summary saved: {results_path}")

            # Print final summary
            print(f"\n" + "=" * 80)
            print(f"üéâ PIPELINE COMPLETED SUCCESSFULLY!")
            print(f"üèÜ Best Model: {best_model_name}")
            print(f"üìä Validation MAE: {best_mae:.4f}")
            print(f"üîß Features Used: {len(common_features)}")
            print(f"üíæ Predictions: {pred_filename}")
            print(f"üì§ Submission: {submission_path}")
            print(f"üß™ Mode: {'TESTING' if TESTING_MODE else 'PRODUCTION'}")
            print("=" * 80)

        else:
            print("‚ùå No models were successfully trained!")

    except Exception as e:
        print(f"‚ùå Pipeline failed with error: {e}")
        import traceback
        traceback.print_exc()

# =====================================
# MANUAL EXECUTION SECTION
# =====================================

# Usage hint only: defining the functions above does not start the pipeline,
# main() has to be called explicitly (the next cell does this).
print("\nüéØ PIPELINE READY!")
print("üìù To run the pipeline, call: main()")
print("üß™ Current mode:", "TESTING" if TESTING_MODE else "PRODUCTION")
print("‚öôÔ∏è To change mode, modify TESTING_MODE in the first cell")
print("\nüí° Example usage:")
print("   main()  # Run the complete pipeline")
print("=" * 60)

In [None]:
# Run the full pipeline; main() reads train.csv and test.csv from the working directory.
main()