In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
import os
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import joblib


import warnings
warnings.filterwarnings('ignore')

In [13]:
Path_Data = '3_merged_data3.txt'

In [14]:
df = pd.read_csv(f"{Path_Data}", sep='\t')
df.head()

Unnamed: 0,ID,avg7_calingiri,SNOO_500610_1,SNOO_500610_2,SNOO_505150_1,SNOO_104700A_1,SNOO_104700A_2,SNOO_129020A_1,SNOO_129020A_2,SNOO_414470A_1,...,SNOO_526990_1,SNOO_418580A_1,SNOO_508440_1,SNOO_508440_2,SNOO_103660AB_1,SNOO_152090A_1,SNOO_152090A_2,SNOO_007570A_1,SNOO_153580A_1,SNOO_153580A_2
0,14FG141,4.67,0,1,1,0,1,1,0,1,...,0,1,1,0,1,0,0,0,0,1
1,15FG033,4.83,1,0,1,1,0,0,0,1,...,1,1,0,1,1,1,0,0,1,0
2,15FG037,3.33,1,0,1,1,0,0,0,1,...,0,1,0,0,1,1,0,1,1,0
3,15FG038,5.0,1,0,0,1,0,1,0,0,...,1,0,1,0,1,1,0,0,1,0
4,15FG047,2.0,1,0,0,0,1,0,0,1,...,1,1,0,1,1,0,1,0,1,0


## Splitting Data intro Train/ Test

In [36]:
# Separate features and target
X = df.drop(columns=['ID', 'avg7_calingiri'])
y = df['avg7_calingiri']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2,  random_state=42
)

print(f"X_train shape: {X_train.shape}")  # 80%
print(f"X_test shape: {X_test.shape}")
    # 20%


X_train shape: (119, 33048)
X_test shape: (30, 33048)


# Bayesian Optimizer

In [None]:


# Define models
def get_models():
    return {
        "Linear Regression": LinearRegression(n_jobs=4),
        "Decision Tree": DecisionTreeRegressor(),
        #"Random Forest": RandomForestRegressor(n_jobs=2),
        "Gradient Boosting": GradientBoostingRegressor(),
        "Lasso Regression": Lasso(),
    }

# Define scoring metrics
def get_scoring():
    return {
        'MSE': make_scorer(mean_squared_error, greater_is_better=False),
        'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
        'R2': make_scorer(r2_score)
    }

# Create Bayesian SearchCV setup
def create_bayesian_searches(X, y, param_spaces, k_values):
    searches = {}
    models = get_models()

    for model_name, model in models.items():
        search_spaces = param_spaces.get(model_name, {})
        if not search_spaces:
            print(f"Skipping {model_name} because search space is empty.")
            continue

        for k in k_values:
            cv = KFold(n_splits=k, shuffle=True, random_state=42)
            search = BayesSearchCV(
                estimator=model,
                search_spaces=search_spaces,
                scoring='r2',
                cv=cv,
                n_iter=32,
                n_jobs=-1,
                verbose=2,
                random_state=42
            )
            key = f"{model_name} (K={k})"
            searches[key] = search

    return searches



In [None]:
param_spaces = {
    "Linear Regression": {},  # no params to tune
    "Decision Tree": {
        "max_depth": Integer(3, 20)  # continuous range from 3 to 20
    },
    #"Random Forest": {
     #   "n_estimators": Integer(50, 200),
      #  "max_depth": Integer(5, 20)
    #},
    "Gradient Boosting": {
        "n_estimators": Integer(50, 200),
        "learning_rate": Real(0.01, 0.1, prior='log-uniform')
    },
    "Lasso Regression": {
        "alpha": Real(0.01, 10.0, prior='log-uniform')
    }
}


In [39]:
from tqdm import tqdm
import time
import os
import joblib
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def train_with_progress_bayesian(searches, X, y, save_dir="saved_models_bayes"):
    all_results = []

    os.makedirs(save_dir, exist_ok=True)
    total_tasks = len(searches)
    pbar = tqdm(total=total_tasks, desc="Bayesian Training", ncols=100)

    for idx, (name, search) in enumerate(searches.items(), 1):
        print(f"\nüîç Starting training for: {name}")
        start_time = time.time()

        # Fit model quietly
        search.fit(X, y)

        best_model = search.best_estimator_
        best_params = search.best_params_

        y_pred = best_model.predict(X)

        mean_mse = mean_squared_error(y, y_pred)
        mean_mae = mean_absolute_error(y, y_pred)
        mean_r2  = r2_score(y, y_pred)

        safe_name = name.replace(" ", "_").replace("(", "").replace(")", "").replace("=", "")
        model_path = os.path.join(save_dir, f"{safe_name}.pkl")
        joblib.dump(best_model, model_path)

        elapsed = round(time.time() - start_time, 2)
        print(f"‚úÖ Finished {name} | Time: {elapsed}s | R2: {mean_r2:.4f} | MSE: {mean_mse:.4f}")

        all_results.append({
            "Model": name,
            "Best Params": best_params,
            "MSE": mean_mse,
            "MAE": mean_mae,
            "R2": mean_r2,
            "Training Time (s)": elapsed
        })

        pbar.update(1)

        # Optional: print every 2 or 3 models milestone
        if idx % 3 == 0 or idx == total_tasks:
            print(f"üöÄ Progress milestone: {idx}/{total_tasks} models trained.")

    pbar.close()

    results_df = pd.DataFrame(all_results)
    results_df = results_df.sort_values(by="R2", ascending=False)
    return results_df


In [None]:
k_values = [3,5,7]
# Create grid searches
searches = create_bayesian_searches(X_train, y_train, param_spaces, k_values)

# Run training
results_df = train_with_progress_bayesian(searches, X_train, y_train)


Bayesian Training:   0%|                                                      | 0/9 [00:00<?, ?it/s]


üîç Starting training for: Random Forest (K=3)
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Bayesian Training:   0%|                                                      | 0/9 [14:48<?, ?it/s]


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

Bayesian Training:  11%|‚ñà‚ñà‚ñà‚ñà‚ñà                                        | 1/9 [06:03<48:28, 363.59s/it]

‚úÖ Finished Random Forest (K=3) | Time: 363.59s | R2: 0.8566 | MSE: 0.2309

üîç Starting training for: Random Forest (K=5)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalli

In [None]:
from sklearn.preprocessing import MinMaxScaler

def get_best_overall_model(results_df):
    df = results_df.copy()

    # Invert MSE and MAE (since lower is better, we flip them for scoring)
    df["Inv_MSE"] = -df["MSE"]
    df["Inv_MAE"] = -df["MAE"]

    # Normalize all metrics to [0, 1] range
    scaler = MinMaxScaler()
    df[["Norm_R2", "Norm_MSE", "Norm_MAE"]] = scaler.fit_transform(
        df[["R2", "Inv_MSE", "Inv_MAE"]]
    )

    # Combine all three normalized scores
    df["Combined_Score"] = df["Norm_R2"] + df["Norm_MSE"] + df["Norm_MAE"]

    # Sort by combined score
    df = df.sort_values(by="Combined_Score", ascending=False).reset_index(drop=True)

    # Print the best model's stats
    print("üèÜ Best Model Based on Combined R¬≤, MSE, and MAE:\n")
    print(df.loc[0, ["Model", "R2", "MSE", "MAE", "Combined_Score"]])

    return df.loc[0]

best_model = get_best_overall_model(results_df)


NameError: name 'results_df' is not defined