In [1]:
import os
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from tqdm import tqdm


import warnings
warnings.filterwarnings('ignore')

# Bayesian Optimizer

In [None]:


# Define models
def get_models(random_state=42):
    """Build the candidate regressors, keyed by their display name.

    Args:
        random_state: Seed handed to the estimators whose fitting is randomised
            (decision tree, random forest, gradient boosting) so repeated runs
            give the same models. The default matches the seed used for the CV
            splitter and the search in this notebook. Pass ``None`` to get
            unseeded estimators.

    Returns:
        dict mapping model name -> unfitted scikit-learn estimator. A fresh set
        of estimator instances is created on every call.
    """
    return {
        "Linear Regression": LinearRegression(n_jobs=1),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        # n_jobs=1: parallelism is left to the surrounding search object.
        "Random Forest": RandomForestRegressor(n_jobs=1, random_state=random_state),
        "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
        "Lasso Regression": Lasso(),
    }

# Define scoring metrics
def get_scoring():
    """Return the scorers used to evaluate regressors, keyed by metric label.

    The two error metrics are wrapped with ``greater_is_better=False`` so that
    scikit-learn negates them (higher scorer output is always better); R2 is
    wrapped with the default orientation.

    Returns:
        dict with keys 'MSE', 'MAE' and 'R2', each mapped to a scorer object.
    """
    lower_is_better = {
        'MSE': mean_squared_error,
        'MAE': mean_absolute_error,
    }
    scorers = {
        label: make_scorer(metric_fn, greater_is_better=False)
        for label, metric_fn in lower_is_better.items()
    }
    scorers['R2'] = make_scorer(r2_score)
    return scorers

# Bayesian search space
def get_param_spaces():
    """Return the hyper-parameter search space for each model name.

    Keys match the names produced by ``get_models``. Each value maps an
    estimator parameter to a skopt dimension; "Linear Regression" has an empty
    space because nothing is tuned for it.

    Returns:
        dict mapping model name -> dict of {parameter name: skopt dimension}.
    """
    spaces = {}

    # No tunable parameters for the plain linear model.
    spaces["Linear Regression"] = {}

    spaces["Decision Tree"] = {'max_depth': Integer(1, 20)}

    spaces["Random Forest"] = {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(3, 20),
    }

    # Learning rate and regularisation strength are sampled on a log scale.
    spaces["Gradient Boosting"] = {
        'n_estimators': Integer(10, 200),
        'learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    }

    spaces["Lasso Regression"] = {
        'alpha': Real(1e-4, 1e1, prior='log-uniform'),
    }

    return spaces

# Create Bayesian SearchCV setup
def create_bayesian_searches(X, y, k_values):
    """Create one unfitted hyper-parameter search per (model, K) combination.

    Models with a non-empty search space get a ``BayesSearchCV``. Models with
    an empty space (e.g. "Linear Regression") get a ``GridSearchCV`` over the
    empty grid instead: there is nothing for the Bayesian optimizer to sample,
    while the empty grid evaluates exactly one candidate (the estimator's
    current parameters) and exposes the same ``fit`` / ``best_estimator_`` /
    ``best_params_`` interface.

    Args:
        X: Unused here; kept so the signature stays stable for callers.
        y: Unused here; kept so the signature stays stable for callers.
        k_values: Iterable of fold counts; each becomes ``n_splits`` of a
            shuffled, seeded ``KFold``.

    Returns:
        dict mapping ``"<model name> (K=<k>)"`` -> unfitted search object.
    """
    searches = {}
    param_spaces = get_param_spaces()

    for model_name, model in get_models().items():
        space = param_spaces.get(model_name, {})

        for k in k_values:
            cv = KFold(n_splits=k, shuffle=True, random_state=42)

            if space:
                search = BayesSearchCV(
                    estimator=model,
                    search_spaces=space,
                    scoring='r2',
                    cv=cv,
                    n_iter=32,
                    n_jobs=-1,
                    verbose=2,
                    random_state=42
                )
            else:
                # Nothing to tune: cross-validate the single default candidate.
                search = GridSearchCV(
                    estimator=model,
                    param_grid={},
                    scoring='r2',
                    cv=cv,
                    n_jobs=-1,
                    verbose=2
                )

            searches[f"{model_name} (K={k})"] = search

    return searches


In [None]:
# Discrete hyper-parameter grids, keyed by model name.
# An empty grid means the model is evaluated with its current parameters only.
param_grids = {
    "Linear Regression": dict(),
    "Decision Tree": dict(max_depth=[3, 5, 7, None]),
    "Random Forest": dict(n_estimators=[50, 100], max_depth=[5, 10]),
    "Gradient Boosting": dict(n_estimators=[50, 100], learning_rate=[0.01, 0.1]),
    "Lasso Regression": dict(alpha=[0.01, 0.1, 1.0, 10.0]),
    # Disabled candidates (kept for reference):
    # "XGBoost":  dict(n_estimators=[50, 100], max_depth=[3, 5], learning_rate=[0.01, 0.1]),
    # "LightGBM": dict(n_estimators=[50, 100], learning_rate=[0.01, 0.1], num_leaves=[31, 50]),
    # "CatBoost": dict(iterations=[100, 200], learning_rate=[0.01, 0.1], depth=[4, 6]),
}

In [None]:
def train_with_progress_bayesian(searches, X, y, save_dir="saved_models_bayes",
                                 X_eval=None, y_eval=None):
    """Fit every search, save each best estimator, and tabulate its metrics.

    Args:
        searches: dict mapping a display name to an unfitted search object that
            exposes ``fit``, ``best_estimator_`` and ``best_params_``.
        X: Training features passed to ``search.fit``.
        y: Training targets passed to ``search.fit``.
        save_dir: Directory (created if missing) where each best estimator is
            written with ``joblib.dump`` as ``<sanitised name>.pkl``.
        X_eval: Optional held-out features used for the MSE/MAE/R2 columns.
        y_eval: Optional held-out targets; must be given together with X_eval.

    Returns:
        pandas.DataFrame with one row per search, sorted by "R2" descending.
        Columns: "Model", "Best Params", "MSE", "MAE", "R2",
        "Training Time (s)" and "CV Score" (the search's ``best_score_``, NaN
        if the search does not expose one).

    Raises:
        ValueError: If only one of ``X_eval`` / ``y_eval`` is supplied.

    Note:
        When no evaluation set is supplied, MSE/MAE/R2 are computed on the
        training data and are therefore optimistic (a deep tree can reach
        R2 = 1.0). Prefer passing a held-out set, or rank by "CV Score".
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")

    # Score on the held-out set when available; otherwise fall back to the
    # training data, which is the historical behaviour of this function.
    if X_eval is None:
        X_score, y_score = X, y
    else:
        X_score, y_score = X_eval, y_eval

    columns = ["Model", "Best Params", "MSE", "MAE", "R2",
               "Training Time (s)", "CV Score"]
    all_results = []

    # Create directory to save models
    os.makedirs(save_dir, exist_ok=True)

    # Context manager guarantees the bar is closed even if a fit raises.
    with tqdm(total=len(searches), desc="Bayesian Training", ncols=100) as pbar:
        for name, search in searches.items():
            print(f"\n🔍 Training with Bayesian Optimization: {name}")
            start_time = time.perf_counter()

            search.fit(X, y)

            best_model = search.best_estimator_
            best_params = search.best_params_

            y_pred = best_model.predict(X_score)
            mse = mean_squared_error(y_score, y_pred)
            mae = mean_absolute_error(y_score, y_pred)
            r2 = r2_score(y_score, y_pred)

            # Strip characters that are awkward in file names: "RF (K=3)" -> "RF_K3".
            safe_model_name = (
                name.replace(" ", "_").replace("(", "").replace(")", "").replace("=", "")
            )
            model_path = os.path.join(save_dir, f"{safe_model_name}.pkl")
            joblib.dump(best_model, model_path)

            all_results.append({
                "Model": name,
                "Best Params": best_params,
                "MSE": mse,
                "MAE": mae,
                "R2": r2,
                # Elapsed time covers fitting, prediction and saving.
                "Training Time (s)": round(time.perf_counter() - start_time, 2),
                "CV Score": getattr(search, "best_score_", float("nan")),
            })

            pbar.update(1)

    # Explicit columns keep the frame well-formed (and sortable) when
    # `searches` is empty.
    results_df = pd.DataFrame(all_results, columns=columns)
    return results_df.sort_values(by="R2", ascending=False)


In [None]:
# Presumably the cross-validation fold counts to compare (the recorded output
# below shows 3-, 5- and 7-fold fits) — confirm against create_grid_searches.
k_values = [3,5,7]
# Create grid searches
# NOTE(review): `create_grid_searches`, `X_train` and `y_train` are not defined
# in the visible cells; confirm they are defined earlier in the notebook,
# otherwise this cell fails on a fresh kernel (Restart & Run All).
searches = create_grid_searches(X_train, y_train, param_grids, k_values)

# Run training
# NOTE(review): `train_with_progress` is likewise not defined in the visible
# cells (only `train_with_progress_bayesian` is) — verify which one is intended.
results_df = train_with_progress(searches, X_train, y_train)


Training Models:   0%|                                                       | 0/15 [00:00<?, ?it/s]


🔍 Training: Linear Regression (K=3)
n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 119
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 1
n_resources: 119
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Training Models:   7%|███▏                                           | 1/15 [00:01<00:22,  1.61s/it]


🔍 Training: Linear Regression (K=5)
n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 119
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 1
n_resources: 119
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Training Models:  13%|██████▎                                        | 2/15 [00:03<00:23,  1.79s/it]


🔍 Training: Linear Regression (K=7)
n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 119
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 1
n_resources: 119
Fitting 7 folds for each of 1 candidates, totalling 7 fits


Training Models:  20%|█████████▍                                     | 3/15 [00:05<00:24,  2.07s/it]


🔍 Training: Decision Tree (K=3)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 3 folds for each of 2 candidates, totalling 6 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Training Models:  27%|████████████▌                                  | 4/15 [00:09<00:27,  2.50s/it]


🔍 Training: Decision Tree (K=5)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 5 folds for each of 2 candidates, totalling 10 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Training Models:  33%|███████████████▋                               | 5/15 [00:13<00:30,  3.06s/it]


🔍 Training: Decision Tree (K=7)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 7 folds for each of 4 candidates, totalling 28 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 7 folds for each of 2 candidates, totalling 14 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 7 folds for each of 1 candidates, totalling 7 fits


Training Models:  40%|██████████████████▊                            | 6/15 [00:17<00:32,  3.64s/it]


🔍 Training: Random Forest (K=3)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 3 folds for each of 2 candidates, totalling 6 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Training Models:  47%|█████████████████████▉                         | 7/15 [01:07<02:28, 18.61s/it]


🔍 Training: Random Forest (K=5)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 5 folds for each of 2 candidates, totalling 10 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Training Models:  53%|█████████████████████████                      | 8/15 [02:18<04:08, 35.45s/it]


🔍 Training: Random Forest (K=7)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 7 folds for each of 4 candidates, totalling 28 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 7 folds for each of 2 candidates, totalling 14 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 7 folds for each of 1 candidates, totalling 7 fits


Training Models:  60%|████████████████████████████▏                  | 9/15 [03:20<04:22, 43.75s/it]


🔍 Training: Gradient Boosting (K=3)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 3 folds for each of 2 candidates, totalling 6 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Training Models:  67%|██████████████████████████████▋               | 10/15 [03:36<02:55, 35.13s/it]


🔍 Training: Gradient Boosting (K=5)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 5 folds for each of 2 candidates, totalling 10 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Training Models:  73%|█████████████████████████████████▋            | 11/15 [03:56<02:01, 30.45s/it]


🔍 Training: Gradient Boosting (K=7)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 7 folds for each of 4 candidates, totalling 28 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 7 folds for each of 2 candidates, totalling 14 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 7 folds for each of 1 candidates, totalling 7 fits


Training Models:  80%|████████████████████████████████████▊         | 12/15 [04:19<01:24, 28.25s/it]


🔍 Training: Lasso Regression (K=3)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 3 folds for each of 2 candidates, totalling 6 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Training Models:  87%|███████████████████████████████████████▊      | 13/15 [04:22<00:41, 20.52s/it]


🔍 Training: Lasso Regression (K=5)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 5 folds for each of 2 candidates, totalling 10 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Training Models:  93%|██████████████████████████████████████████▉   | 14/15 [04:25<00:15, 15.37s/it]


🔍 Training: Lasso Regression (K=7)
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 29
max_resources_: 119
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 29
Fitting 7 folds for each of 4 candidates, totalling 28 fits
----------
iter: 1
n_candidates: 2
n_resources: 58
Fitting 7 folds for each of 2 candidates, totalling 14 fits
----------
iter: 2
n_candidates: 1
n_resources: 116
Fitting 7 folds for each of 1 candidates, totalling 7 fits


Training Models: 100%|██████████████████████████████████████████████| 15/15 [04:30<00:00, 18.03s/it]


In [None]:
from sklearn.preprocessing import MinMaxScaler

def get_best_overall_model(results_df):
    """Rank result rows by a combined normalised score and return the top row.

    R2, negated MSE and negated MAE are each min-max scaled to [0, 1] across
    the rows of ``results_df`` and summed with equal weight. The input frame is
    not modified; the winner's summary is printed as a side effect.

    Args:
        results_df: DataFrame with at least the columns "Model", "R2", "MSE"
            and "MAE".

    Returns:
        pandas.Series: the highest-scoring row, including the helper columns
        "Inv_MSE", "Inv_MAE", "Norm_R2", "Norm_MSE", "Norm_MAE" and
        "Combined_Score".
    """
    ranked = results_df.copy()

    # Negate the error metrics so that, like R2, larger means better.
    for metric in ("MSE", "MAE"):
        ranked[f"Inv_{metric}"] = -ranked[metric]

    # Bring the three criteria onto a common [0, 1] scale.
    raw_columns = ["R2", "Inv_MSE", "Inv_MAE"]
    norm_columns = ["Norm_R2", "Norm_MSE", "Norm_MAE"]
    ranked[norm_columns] = MinMaxScaler().fit_transform(ranked[raw_columns])

    # Equal-weight sum of the normalised criteria.
    ranked["Combined_Score"] = (
        ranked["Norm_R2"] + ranked["Norm_MSE"] + ranked["Norm_MAE"]
    )

    # Best row first; the index is reset so label 0 is the winner.
    ranked = ranked.sort_values(by="Combined_Score", ascending=False)
    ranked = ranked.reset_index(drop=True)

    print("🏆 Best Model Based on Combined R², MSE, and MAE:\n")
    summary_fields = ["Model", "R2", "MSE", "MAE", "Combined_Score"]
    print(ranked.loc[0, summary_fields])

    return ranked.loc[0]

# Note: despite its name, `best_model` holds the top-ranked results row
# (a pandas Series of metrics), not a fitted estimator.
best_model = get_best_overall_model(results_df)


🏆 Best Model Based on Combined R², MSE, and MAE:

Model             Decision Tree (K=5)
R2                                1.0
MSE                               0.0
MAE                               0.0
Combined_Score                    3.0
Name: 0, dtype: object
