In [1]:
# ===============================
# 🔧 HUBER REGRESSION HYPERPARAMETER TUNING
# Separate file for optimizing HuberRegressor parameters
# ===============================

import pandas as pd
import numpy as np
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
import optuna
import warnings
warnings.filterwarnings('ignore')

print("🔧 HUBER REGRESSION HYPERPARAMETER TUNING")
print("=" * 60)

# ===============================
# 📂 LOAD DATA
# ===============================
print("📂 LOADING DATA...")
df = pd.read_csv('final_df_binary.csv')
df['date'] = pd.to_datetime(df['date'])

# Feature engineering
df['year'] = df['date'].dt.year
df['is_working_day'] = ((df['is_holiday'] == 0) & (df['is_weekend'] == 0)).astype(int)

# Use training data only (2021-2023)
train_data = df[df['year'] <= 2023].copy()
working_train = train_data[train_data['is_working_day'] == 1]

print(f"✅ Data loaded: {len(working_train):,} working day records")
print(f"📊 Sections: {working_train['section_id'].nunique()}")

# ===============================
# 🎯 HUBER REGRESSION TUNING FUNCTION
# ===============================
def tune_huber_regression(section_data, section_id, n_trials=100):
    """
    Tune HuberRegressor hyperparameters for a specific section
    
    Parameters to tune:
    - epsilon: Robustness parameter (1.0 - 2.0)
    - alpha: Regularization strength (1e-6 - 1e-1)
    - max_iter: Maximum iterations (100 - 2000)
    - fit_intercept: Whether to fit intercept (True/False)
    """
    
    print(f"\n🔧 Tuning HuberRegressor for {section_id}")
    print(f"   Training samples: {len(section_data)}")
    
    # Prepare data
    X = section_data[['total_task_time_minutes']].values
    y = section_data['employees_on_duty'].values
    
    def objective(trial):
        # Hyperparameters to tune
        epsilon = trial.suggest_float('epsilon', 1.0, 2.5, step=0.05)
        alpha = trial.suggest_float('alpha', 1e-6, 1e-1, log=True)
        max_iter = trial.suggest_int('max_iter', 100, 2000, step=50)
        fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
        
        # Create model
        model = HuberRegressor(
            epsilon=epsilon,
            alpha=alpha,
            max_iter=max_iter,
            fit_intercept=fit_intercept,
            warm_start=False
        )
        
        # Cross-validation
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        
        # Use negative MAE as score (since we want to minimize MAE)
        cv_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1)
        
        return -cv_scores.mean()  # Return positive MAE for minimization
    
    # Create study
    study = optuna.create_study(direction='minimize', study_name=f'huber_tuning_{section_id}')
    
    # Optimize
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    
    # Get best parameters
    best_params = study.best_params
    best_score = study.best_value
    
    # Test final model with best parameters
    final_model = HuberRegressor(**best_params)
    final_model.fit(X, y)
    
    # Calculate detailed metrics
    y_pred = final_model.predict(X)
    final_mae = mean_absolute_error(y, y_pred)
    final_rmse = np.sqrt(mean_squared_error(y, y_pred))
    final_r2 = r2_score(y, y_pred)
    
    print(f"   ✅ Best MAE: {best_score:.4f}")
    print(f"   📊 Final metrics - MAE: {final_mae:.4f}, RMSE: {final_rmse:.4f}, R²: {final_r2:.4f}")
    print(f"   🎯 Best params: {best_params}")
    
    return {
        'section_id': section_id,
        'best_params': best_params,
        'cv_mae': best_score,
        'final_mae': final_mae,
        'final_rmse': final_rmse,
        'final_r2': final_r2,
        'training_samples': len(section_data),
        'model': final_model
    }

# ===============================
# 🚀 SECTION-WISE HYPERPARAMETER TUNING
# ===============================
print("\n🚀 STARTING SECTION-WISE TUNING...")

tuning_results = []
optimized_models = {}

sections = working_train['section_id'].unique()

for section in sections:
    section_data = working_train[working_train['section_id'] == section].copy()
    
    # Only tune if we have enough data
    if len(section_data) < 30:
        print(f"\n⚠️  Skipping {section}: Insufficient data ({len(section_data)} samples)")
        continue
    
    try:
        # Tune hyperparameters
        result = tune_huber_regression(section_data, section, n_trials=100)
        tuning_results.append(result)
        optimized_models[section] = result['model']
        
    except Exception as e:
        print(f"   ❌ Error tuning {section}: {str(e)}")
        continue

print(f"\n✅ Tuning completed for {len(tuning_results)} sections")

# ===============================
# 📊 RESULTS ANALYSIS
# ===============================
print("\n📊 TUNING RESULTS ANALYSIS")
print("=" * 60)

# Create results DataFrame
results_df = pd.DataFrame([{
    'Section': r['section_id'],
    'Best_Epsilon': r['best_params']['epsilon'],
    'Best_Alpha': r['best_params']['alpha'],
    'Best_MaxIter': r['best_params']['max_iter'],
    'Fit_Intercept': r['best_params']['fit_intercept'],
    'CV_MAE': r['cv_mae'],
    'Final_MAE': r['final_mae'],
    'Final_RMSE': r['final_rmse'],
    'Final_R2': r['final_r2'],
    'Training_Samples': r['training_samples']
} for r in tuning_results])

print("🎯 DETAILED RESULTS:")
print(results_df.round(4))

# ===============================
# 📈 COMPARISON WITH DEFAULT PARAMETERS
# ===============================
print("\n📈 COMPARISON WITH DEFAULT PARAMETERS")
print("=" * 40)

default_results = []

for section in sections:
    section_data = working_train[working_train['section_id'] == section].copy()
    
    if len(section_data) < 30:
        continue
    
    X = section_data[['total_task_time_minutes']].values
    y = section_data['employees_on_duty'].values
    
    # Default model (your current settings)
    default_model = HuberRegressor(epsilon=1.35)  # Your current default
    default_model.fit(X, y)
    
    y_pred_default = default_model.predict(X)
    default_mae = mean_absolute_error(y, y_pred_default)
    default_r2 = r2_score(y, y_pred_default)
    
    default_results.append({
        'Section': section,
        'Default_MAE': default_mae,
        'Default_R2': default_r2
    })

default_df = pd.DataFrame(default_results)

# Merge with tuned results for comparison
comparison_df = results_df[['Section', 'Final_MAE', 'Final_R2']].merge(
    default_df, on='Section', how='inner'
)

comparison_df['MAE_Improvement'] = comparison_df['Default_MAE'] - comparison_df['Final_MAE']
comparison_df['R2_Improvement'] = comparison_df['Final_R2'] - comparison_df['Default_R2']

print("🆚 TUNED vs DEFAULT COMPARISON:")
print(comparison_df.round(4))

print(f"\n📊 SUMMARY:")
print(f"   Average MAE improvement: {comparison_df['MAE_Improvement'].mean():.4f}")
print(f"   Average R² improvement: {comparison_df['R2_Improvement'].mean():.4f}")
print(f"   Sections with improved MAE: {(comparison_df['MAE_Improvement'] > 0).sum()}/{len(comparison_df)}")
print(f"   Sections with improved R²: {(comparison_df['R2_Improvement'] > 0).sum()}/{len(comparison_df)}")

# ===============================
# 💾 SAVE OPTIMIZED PARAMETERS
# ===============================
print("\n💾 SAVING OPTIMIZED PARAMETERS")
print("=" * 40)

# Create optimized parameters dictionary (for use in main code)
OPTIMIZED_HUBER_PARAMS = {}
for result in tuning_results:
    section_id = result['section_id']
    OPTIMIZED_HUBER_PARAMS[section_id] = result['best_params']

print("🔧 OPTIMIZED HUBER PARAMETERS:")
print("# Copy this to your main code:")
print("OPTIMIZED_HUBER_PARAMS = {")
for section, params in OPTIMIZED_HUBER_PARAMS.items():
    print(f"    '{section}': {params},")
print("}")

# Default parameters for new sections
print(f"\n🎯 RECOMMENDED DEFAULT PARAMETERS:")
avg_epsilon = results_df['Best_Epsilon'].mean()
most_common_intercept = results_df['Fit_Intercept'].mode()[0]
avg_max_iter = int(results_df['Best_MaxIter'].mean())

print(f"DEFAULT_HUBER_PARAMS = {{")
print(f"    'epsilon': {avg_epsilon:.4f},")
print(f"    'alpha': 0.0001,  # Median of log-uniform distribution")
print(f"    'max_iter': {avg_max_iter},")
print(f"    'fit_intercept': {most_common_intercept}")
print(f"}}")

# ===============================
# 💾 SAVE FILES
# ===============================
# Save detailed results
results_df.to_csv('huber_regression_tuning_results.csv', index=False)
comparison_df.to_csv('huber_regression_comparison.csv', index=False)

# Save optimized parameters as JSON for easy loading
import json
with open('optimized_huber_params.json', 'w') as f:
    json.dump(OPTIMIZED_HUBER_PARAMS, f, indent=2)

print(f"\n📁 FILES SAVED:")
print(f"   - huber_regression_tuning_results.csv: Detailed tuning results")
print(f"   - huber_regression_comparison.csv: Tuned vs default comparison")
print(f"   - optimized_huber_params.json: Parameters for main code")

print(f"\n🎉 HUBER REGRESSION TUNING COMPLETED!")
print(f"✅ Tuned {len(tuning_results)} sections successfully")

# ===============================
# 📋 INTEGRATION INSTRUCTIONS
# ===============================
print("\n📋 HOW TO INTEGRATE WITH YOUR MAIN CODE:")
print("=" * 50)
print("1. Copy the OPTIMIZED_HUBER_PARAMS dictionary above")
print("2. In your main code, replace the HuberRegressor creation with:")
print("""
# In Block 3 of your main code:
def get_huber_params(section_id):
    return OPTIMIZED_HUBER_PARAMS.get(section_id, DEFAULT_HUBER_PARAMS)

# Replace your model creation:
for section in working_train['section_id'].unique():
    section_data = working_train[working_train['section_id'] == section]
    if len(section_data) > 30:
        X = section_data[['total_task_time_minutes']].values
        y = section_data['employees_on_duty'].values
        
        # Get optimized parameters
        params = get_huber_params(section)
        
        # Build optimized model
        model = HuberRegressor(**params)
        model.fit(X, y)
        # ... rest of your code
""")

print("\n✅ Ready to boost your regression performance!")

🔧 HUBER REGRESSION HYPERPARAMETER TUNING
📂 LOADING DATA...


[I 2025-08-23 08:44:44,199] A new study created in memory with name: huber_tuning_SEC-001


✅ Data loaded: 4,362 working day records
📊 Sections: 6

🚀 STARTING SECTION-WISE TUNING...

🔧 Tuning HuberRegressor for SEC-001
   Training samples: 727


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-08-23 08:44:53,632] Trial 0 finished with value: 0.5439009075542561 and parameters: {'epsilon': 1.8, 'alpha': 0.0004637649370899819, 'max_iter': 1200, 'fit_intercept': True}. Best is trial 0 with value: 0.5439009075542561.
[I 2025-08-23 08:44:58,930] Trial 1 finished with value: 0.5436514941500775 and parameters: {'epsilon': 2.1500000000000004, 'alpha': 0.0007064519938686251, 'max_iter': 1900, 'fit_intercept': True}. Best is trial 1 with value: 0.5436514941500775.
[I 2025-08-23 08:44:59,080] Trial 2 finished with value: 0.5866091089487917 and parameters: {'epsilon': 1.0, 'alpha': 9.507686776416121e-05, 'max_iter': 1750, 'fit_intercept': False}. Best is trial 1 with value: 0.5436514941500775.
[I 2025-08-23 08:44:59,150] Trial 3 finished with value: 0.5853354481008503 and parameters: {'epsilon': 1.1, 'alpha': 0.0009433270597962071, 'max_iter': 1600, 'fit_intercept': False}. Best is trial 1 with value: 0.5436514941500775.
[I 2025-08-23 08:44:59,201] Trial 4 finished with value: 0.

[I 2025-08-23 08:45:07,815] A new study created in memory with name: huber_tuning_SEC-002


[I 2025-08-23 08:45:07,691] Trial 99 finished with value: 0.543587585994887 and parameters: {'epsilon': 2.45, 'alpha': 2.3566559387282013e-06, 'max_iter': 900, 'fit_intercept': True}. Best is trial 81 with value: 0.5435778822090033.
   ✅ Best MAE: 0.5436
   📊 Final metrics - MAE: 0.5420, RMSE: 0.6771, R²: 0.7948
   🎯 Best params: {'epsilon': 2.5, 'alpha': 2.3819256646753313e-06, 'max_iter': 450, 'fit_intercept': True}

🔧 Tuning HuberRegressor for SEC-002
   Training samples: 727


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-08-23 08:45:07,993] Trial 0 finished with value: 0.5656744490265578 and parameters: {'epsilon': 1.65, 'alpha': 0.020287488224781474, 'max_iter': 1850, 'fit_intercept': True}. Best is trial 0 with value: 0.5656744490265578.
[I 2025-08-23 08:45:08,088] Trial 1 finished with value: 0.5657170021783509 and parameters: {'epsilon': 2.05, 'alpha': 0.00027292942889257016, 'max_iter': 250, 'fit_intercept': True}. Best is trial 0 with value: 0.5656744490265578.
[I 2025-08-23 08:45:08,146] Trial 2 finished with value: 0.5859956075814912 and parameters: {'epsilon': 2.25, 'alpha': 0.017633225786067314, 'max_iter': 250, 'fit_intercept': False}. Best is trial 0 with value: 0.5656744490265578.
[I 2025-08-23 08:45:08,211] Trial 3 finished with value: 0.5853166155538477 and parameters: {'epsilon': 1.2, 'alpha': 4.243636460919507e-06, 'max_iter': 1750, 'fit_intercept': False}. Best is trial 0 with value: 0.5656744490265578.
[I 2025-08-23 08:45:08,400] Trial 4 finished with value: 0.585843720497500

[I 2025-08-23 08:45:22,501] A new study created in memory with name: huber_tuning_SEC-003


   ✅ Best MAE: 0.5655
   📊 Final metrics - MAE: 0.5644, RMSE: 0.7095, R²: 0.8244
   🎯 Best params: {'epsilon': 1.3, 'alpha': 0.043570676782234655, 'max_iter': 600, 'fit_intercept': True}

🔧 Tuning HuberRegressor for SEC-003
   Training samples: 727


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-08-23 08:45:22,672] Trial 0 finished with value: 0.5530205178691872 and parameters: {'epsilon': 2.3, 'alpha': 8.719075099108077e-06, 'max_iter': 1000, 'fit_intercept': True}. Best is trial 0 with value: 0.5530205178691872.
[I 2025-08-23 08:45:22,764] Trial 1 finished with value: 0.5532988036008659 and parameters: {'epsilon': 1.45, 'alpha': 7.779994292670727e-06, 'max_iter': 1000, 'fit_intercept': True}. Best is trial 0 with value: 0.5530205178691872.
[I 2025-08-23 08:45:22,804] Trial 2 finished with value: 0.5725110051078836 and parameters: {'epsilon': 2.35, 'alpha': 0.017630993482376324, 'max_iter': 1900, 'fit_intercept': False}. Best is trial 0 with value: 0.5530205178691872.
[I 2025-08-23 08:45:22,890] Trial 3 finished with value: 0.5530205178608183 and parameters: {'epsilon': 2.3, 'alpha': 0.002768394832726396, 'max_iter': 650, 'fit_intercept': True}. Best is trial 3 with value: 0.5530205178608183.
[I 2025-08-23 08:45:23,069] Trial 4 finished with value: 0.572704291056257 a

[I 2025-08-23 08:45:34,034] A new study created in memory with name: huber_tuning_SEC-004


   ✅ Best MAE: 0.5526
   📊 Final metrics - MAE: 0.5469, RMSE: 0.6941, R²: 0.7624
   🎯 Best params: {'epsilon': 1.3, 'alpha': 5.002881133425486e-06, 'max_iter': 1550, 'fit_intercept': True}

🔧 Tuning HuberRegressor for SEC-004
   Training samples: 727


  0%|          | 0/100 [00:00<?, ?it/s]

[W 2025-08-23 08:45:34,206] Trial 0 failed with parameters: {'epsilon': 1.0, 'alpha': 4.103359599451903e-06, 'max_iter': 1400, 'fit_intercept': True} because of the following error: The value nan is not acceptable.
[W 2025-08-23 08:45:34,210] Trial 0 failed with value np.float64(nan).
[I 2025-08-23 08:45:34,412] Trial 1 finished with value: 0.5397480840307908 and parameters: {'epsilon': 1.45, 'alpha': 0.00017879181443825705, 'max_iter': 1750, 'fit_intercept': True}. Best is trial 1 with value: 0.5397480840307908.
[I 2025-08-23 08:45:34,677] Trial 2 finished with value: 0.5396133028313301 and parameters: {'epsilon': 1.15, 'alpha': 4.757022719596704e-06, 'max_iter': 1150, 'fit_intercept': True}. Best is trial 2 with value: 0.5396133028313301.
[I 2025-08-23 08:45:34,786] Trial 3 finished with value: 0.5395864534858525 and parameters: {'epsilon': 1.35, 'alpha': 0.0005774554203792191, 'max_iter': 900, 'fit_intercept': True}. Best is trial 3 with value: 0.5395864534858525.
[I 2025-08-23 08:4

[I 2025-08-23 08:45:49,614] A new study created in memory with name: huber_tuning_SEC-005


   ✅ Best MAE: 0.5389
   📊 Final metrics - MAE: 0.5375, RMSE: 0.6858, R²: 0.8131
   🎯 Best params: {'epsilon': 1.0, 'alpha': 0.002086668812396698, 'max_iter': 300, 'fit_intercept': True}

🔧 Tuning HuberRegressor for SEC-005
   Training samples: 727


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-08-23 08:45:49,706] Trial 0 finished with value: 0.5223582150065817 and parameters: {'epsilon': 2.4000000000000004, 'alpha': 0.03097057704816276, 'max_iter': 850, 'fit_intercept': True}. Best is trial 0 with value: 0.5223582150065817.
[I 2025-08-23 08:45:49,750] Trial 1 finished with value: 0.5590374307213382 and parameters: {'epsilon': 2.4000000000000004, 'alpha': 0.00569225295237109, 'max_iter': 750, 'fit_intercept': False}. Best is trial 0 with value: 0.5223582150065817.
[I 2025-08-23 08:45:49,822] Trial 2 finished with value: 0.5590340424678478 and parameters: {'epsilon': 2.5, 'alpha': 1.0851412816003068e-06, 'max_iter': 1400, 'fit_intercept': False}. Best is trial 0 with value: 0.5223582150065817.
[I 2025-08-23 08:45:49,890] Trial 3 finished with value: 0.5223536915964249 and parameters: {'epsilon': 2.45, 'alpha': 0.010949294228443878, 'max_iter': 1850, 'fit_intercept': True}. Best is trial 3 with value: 0.5223536915964249.
[I 2025-08-23 08:45:49,972] Trial 4 finished with

[I 2025-08-23 08:45:55,991] A new study created in memory with name: huber_tuning_SEC-006


[I 2025-08-23 08:45:55,838] Trial 96 finished with value: 0.5252825274387285 and parameters: {'epsilon': 1.45, 'alpha': 0.0018693573291791376, 'max_iter': 1450, 'fit_intercept': True}. Best is trial 87 with value: 0.5221716554904736.
[I 2025-08-23 08:45:55,877] Trial 97 finished with value: 0.5590046431894812 and parameters: {'epsilon': 1.5, 'alpha': 0.0006682401743019747, 'max_iter': 1350, 'fit_intercept': False}. Best is trial 87 with value: 0.5221716554904736.
[I 2025-08-23 08:45:55,921] Trial 98 finished with value: 0.5222054841055067 and parameters: {'epsilon': 1.55, 'alpha': 0.06228890141124404, 'max_iter': 1400, 'fit_intercept': True}. Best is trial 87 with value: 0.5221716554904736.
[I 2025-08-23 08:45:55,966] Trial 99 finished with value: 0.524990041363396 and parameters: {'epsilon': 1.25, 'alpha': 0.010548645348029998, 'max_iter': 1500, 'fit_intercept': True}. Best is trial 87 with value: 0.5221716554904736.
   ✅ Best MAE: 0.5222
   📊 Final metrics - MAE: 0.5205, RMSE: 0.6688

  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-08-23 08:45:56,055] Trial 0 finished with value: 0.5238463617828601 and parameters: {'epsilon': 1.45, 'alpha': 0.049375241445097064, 'max_iter': 1200, 'fit_intercept': True}. Best is trial 0 with value: 0.5238463617828601.
[I 2025-08-23 08:45:56,105] Trial 1 finished with value: 0.5237951027672046 and parameters: {'epsilon': 1.9, 'alpha': 1.22156397776692e-05, 'max_iter': 1700, 'fit_intercept': True}. Best is trial 1 with value: 0.5237951027672046.
[I 2025-08-23 08:45:56,149] Trial 2 finished with value: 0.5238262534747032 and parameters: {'epsilon': 1.75, 'alpha': 0.04013981140715864, 'max_iter': 850, 'fit_intercept': True}. Best is trial 1 with value: 0.5237951027672046.
[I 2025-08-23 08:45:56,175] Trial 3 finished with value: 0.5456836230378391 and parameters: {'epsilon': 2.25, 'alpha': 0.002380895389769627, 'max_iter': 450, 'fit_intercept': False}. Best is trial 1 with value: 0.5237951027672046.
[I 2025-08-23 08:45:56,200] Trial 4 finished with value: 0.5456753631455808 and