<a href="https://colab.research.google.com/github/coding-cosmos/Radial-Gate-Cavitation-Index/blob/main/Radial_Gate_Non_Dim_Cavitation_Index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install optuna
!pip install scikit-learn
!pip install numpy



In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset is read from
# a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Training


## Utils

In [None]:
!pip install optuna
!pip install catboost



In [None]:
from sklearn.metrics import r2_score

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import optuna

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import optuna.visualization as vis

In [None]:
import pandas as pd
import numpy as np


# Load the dataset from the mounted Drive and replace its header with six
# short column names. The assignment to df.columns is positional, so the CSV
# must contain exactly six columns in this order.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/non_dim.csv")
df.columns = ['AR', 'phi', 'R/L', 'Fr', 'Qa/Qw', 'sigma']
# Features are every column except 'sigma'; 'sigma' is the regression target.
X = df.drop(columns=['sigma'])
y = df['sigma']


In [None]:
# Sanity check before modelling: list every feature column whose dtype is
# object (non-numeric). An empty Series means no such column exists.
print(X.dtypes[X.dtypes == 'object'])


Series([], dtype: object)


In [None]:
def split_data(df, target_col='sigma', test_size=0.2, random_state=42):
    """Separate `target_col` from `df` and split both into train/test parts.

    Returns whatever train_test_split returns for (features, target), i.e.
    X_train, X_test, y_train, y_test, using the given test_size and seed.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]
    parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return parts

In [None]:
def train_model(model_cls, X, y, split_ratio=0.2, tune_func=None, model_name="Model", n_trials=30, val_ratio=None, **kwargs):
    """
    Fit a regressor on a fixed train/test split, optionally tuning it with Optuna first.

    Args:
        model_cls: Estimator class (e.g. GradientBoostingRegressor). It is
            instantiated with keyword arguments and must provide fit/predict.
        X: Feature dataframe.
        y: Target series.
        split_ratio: Fraction of the rows held out as the test set. The split
            always uses random_state=42, so equal ratios give identical splits.
        tune_func: Optional Optuna objective, called as
            tune_func(trial, X_train, y_train, X_eval, y_eval, model_cls) -> rmse.
        model_name: Label used in the printed log lines.
        n_trials: Number of Optuna trials (only used if tune_func is provided).
        val_ratio: Optional fraction of the *training* rows to hold out as a
            validation set for tuning. When None (default) the objective is
            scored on the test split, exactly as before; note that this lets
            the test data influence the hyperparameter choice, so the final
            test RMSE is optimistic. Pass e.g. 0.2 to keep the test set unseen
            during tuning.
        **kwargs: Fixed keyword arguments for model_cls (seed, verbosity, ...).
            Without tuning they are the full model configuration; with tuning
            they are combined with the best parameters found, and the tuned
            values win on a name clash.

    Returns:
        dict with keys "model", "rmse", "X_train", "X_test", "y_train",
        "y_test" and "y_pred" (predictions of the fitted model on X_test).
    """

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=42)

    if tune_func is not None:
        if val_ratio is not None:
            # Score trials on a validation subset of the training rows so the
            # test split stays untouched until the final evaluation.
            X_fit, X_eval, y_fit, y_eval = train_test_split(
                X_train, y_train, test_size=val_ratio, random_state=42
            )
        else:
            # Backward-compatible default: trials are scored on the test split.
            X_fit, X_eval, y_fit, y_eval = X_train, X_test, y_train, y_test

        study = optuna.create_study(direction="minimize")
        study.optimize(lambda trial: tune_func(trial, X_fit, y_fit, X_eval, y_eval, model_cls), n_trials=n_trials)
        best_params = study.best_params
        print(f"âœ… Best params for {model_name}: {best_params}")
        # Keep caller-supplied fixed settings (seed, verbosity, ...) so the
        # refit model can match the configuration used inside the objective.
        model = model_cls(**{**kwargs, **best_params})

        fig = vis.plot_optimization_history(study)
        fig.show()
    else:
        model = model_cls(**kwargs)

    # The final model is always fitted on the full training split and scored
    # on the test split, regardless of how tuning was evaluated.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"ðŸ“Š {model_name} RMSE: {rmse:.4f}")

    return {
    "model": model,
    "rmse": rmse,
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
    "y_pred": y_pred
    }



## GBM

In [None]:
def objective_gbm(trial, X_train, y_train, X_test, y_test, model_cls):
    """Optuna objective for a gradient-boosting regressor.

    Samples one hyperparameter set from `trial`, fits `model_cls` on
    (X_train, y_train) and returns the RMSE of its predictions on
    (X_test, y_test); lower is better.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', [ 'sqrt', 'log2', None])
    }

    # Seed the model like the other objectives in this notebook do: with
    # subsample < 1 the fit is stochastic, and an unseeded trial score would
    # partly reflect random row sampling instead of the hyperparameters.
    model = model_cls(**params, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return rmse

### 80-20 split

In [None]:
# Tune GBM with 300 Optuna trials on the 80/20 train/test split; train_model
# then refits with the best parameters and prints the test RMSE.
gbm_20 = train_model(GradientBoostingRegressor,X,y,0.2,objective_gbm,"GBM",n_trials=300)

[I 2025-06-14 06:35:57,267] A new study created in memory with name: no-name-e76236e8-8fe7-40f8-aca8-60a428d7c050
[I 2025-06-14 06:35:57,798] Trial 0 finished with value: 2.8260277204866577 and parameters: {'n_estimators': 451, 'max_depth': 10, 'learning_rate': 0.01883281806348058, 'subsample': 0.8050727741511653, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 0 with value: 2.8260277204866577.
[I 2025-06-14 06:35:58,348] Trial 1 finished with value: 5.894656621230822 and parameters: {'n_estimators': 372, 'max_depth': 6, 'learning_rate': 0.03415663414045043, 'subsample': 0.8709461582195265, 'min_samples_split': 9, 'min_samples_leaf': 16, 'max_features': None}. Best is trial 0 with value: 2.8260277204866577.
[I 2025-06-14 06:35:58,520] Trial 2 finished with value: 16.85811753606746 and parameters: {'n_estimators': 116, 'max_depth': 6, 'learning_rate': 0.0008397724642403774, 'subsample': 0.6798757075240525, 'min_samples_split': 14, 'min_samples_leaf'

âœ… Best params for GBM: {'n_estimators': 465, 'max_depth': 6, 'learning_rate': 0.11441069333248899, 'subsample': 0.9698690613239506, 'min_samples_split': 17, 'min_samples_leaf': 6, 'max_features': None}


ðŸ“Š GBM RMSE: 0.2735


âœ… Best params for GBM: {'n_estimators': 465, 'max_depth': 6, 'learning_rate': 0.11441069333248899, 'subsample': 0.9698690613239506, 'min_samples_split': 17, 'min_samples_leaf': 6, 'max_features': None}


In [None]:
# Best hyperparameters copied from the Optuna run above, so the 80/20 GBM can
# be refit without repeating the 300-trial search.
params_gbm_20 = {'n_estimators': 465, 'max_depth': 6, 'learning_rate': 0.11441069333248899, 'subsample': 0.9698690613239506, 'min_samples_split': 17, 'min_samples_leaf': 6, 'max_features': None}
# Seed the refit: subsample < 1 makes each boosting stage draw a random subset
# of rows, so an unseeded model reports a different RMSE on every run.
gbm_20 = train_model(GradientBoostingRegressor,X,y,0.2,None,"GBM",n_trials=0,random_state=42,**params_gbm_20)

ðŸ“Š GBM RMSE: 0.2536


In [None]:
# Collect the 80/20 GBM split into two frames (features + true sigma + GBM
# prediction). Later model sections add their own prediction columns to them.
df_train = gbm_20['X_train'].copy()
df_test = gbm_20['X_test'].copy()

# y_train / y_test come from the same split as the X frames, so these Series
# assignments line up row by row.
df_train['sigma'] = gbm_20['y_train']
df_test['sigma']=gbm_20['y_test']

# y_pred holds the fitted model's predictions on X_test; the training-set
# predictions are computed here from the same fitted model.
df_test['gbm'] = gbm_20['y_pred']
df_train['gbm'] = gbm_20['model'].predict(gbm_20['X_train'])

In [None]:
# Test-set RMSE and R^2 of the 80/20 GBM model.
print(gbm_20['rmse'])
print(r2_score(gbm_20['y_test'],gbm_20['y_pred']))

0.2535641734751448
0.9998128723885918


### 75-25 split

In [None]:
# Tune GBM with 300 Optuna trials on the 75/25 train/test split.
gbm_25 = train_model(GradientBoostingRegressor,X,y,0.25,objective_gbm,"GBM",n_trials=300)

[I 2025-06-14 06:39:31,021] A new study created in memory with name: no-name-c902b0fc-90e2-44f4-961f-3e05d3ad1f80
[I 2025-06-14 06:39:31,353] Trial 0 finished with value: 9.136209938964617 and parameters: {'n_estimators': 206, 'max_depth': 10, 'learning_rate': 0.004356502365588571, 'subsample': 0.9163556379620832, 'min_samples_split': 20, 'min_samples_leaf': 11, 'max_features': None}. Best is trial 0 with value: 9.136209938964617.
[I 2025-06-14 06:39:31,752] Trial 1 finished with value: 6.226387130800863 and parameters: {'n_estimators': 363, 'max_depth': 10, 'learning_rate': 0.16722623955851892, 'subsample': 0.6826408689326329, 'min_samples_split': 16, 'min_samples_leaf': 20, 'max_features': 'sqrt'}. Best is trial 1 with value: 6.226387130800863.
[I 2025-06-14 06:39:32,147] Trial 2 finished with value: 6.528659199888438 and parameters: {'n_estimators': 311, 'max_depth': 9, 'learning_rate': 0.01070022298593184, 'subsample': 0.6437774311183118, 'min_samples_split': 13, 'min_samples_leaf'

âœ… Best params for GBM: {'n_estimators': 449, 'max_depth': 6, 'learning_rate': 0.06106496573021183, 'subsample': 0.7819150855414359, 'min_samples_split': 14, 'min_samples_leaf': 3, 'max_features': None}


ðŸ“Š GBM RMSE: 0.3098


In [None]:
# Test-set RMSE and R^2 of the 75/25 GBM model.
print(gbm_25['rmse'])
print(r2_score(gbm_25['y_test'],gbm_25['y_pred']))

0.3097654658865451
0.999728645563752


### 70-30 split

In [None]:
# Tune GBM with 300 Optuna trials on the 70/30 train/test split.
gbm_30 = train_model(GradientBoostingRegressor,X,y,0.3,objective_gbm,"GBM",n_trials=300)

[I 2025-06-14 06:43:24,022] A new study created in memory with name: no-name-2ea9110b-13d2-4ff6-8bfc-61dd56e961ff
[I 2025-06-14 06:43:24,523] Trial 0 finished with value: 7.1079630657554445 and parameters: {'n_estimators': 357, 'max_depth': 6, 'learning_rate': 0.18317072428253306, 'subsample': 0.9141841708091294, 'min_samples_split': 19, 'min_samples_leaf': 16, 'max_features': None}. Best is trial 0 with value: 7.1079630657554445.
[I 2025-06-14 06:43:24,998] Trial 1 finished with value: 1.0307362879310416 and parameters: {'n_estimators': 440, 'max_depth': 4, 'learning_rate': 0.03452949046003714, 'subsample': 0.5246294124598748, 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_features': None}. Best is trial 1 with value: 1.0307362879310416.
[I 2025-06-14 06:43:25,188] Trial 2 finished with value: 19.32283632449132 and parameters: {'n_estimators': 169, 'max_depth': 8, 'learning_rate': 0.00012295923458610094, 'subsample': 0.9680518972704611, 'min_samples_split': 3, 'min_samples_leaf'

âœ… Best params for GBM: {'n_estimators': 307, 'max_depth': 6, 'learning_rate': 0.08061153187376019, 'subsample': 0.888938375477266, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': None}


ðŸ“Š GBM RMSE: 0.4120


In [None]:
# Test-set RMSE and R^2 of the 70/30 GBM model.
print(gbm_30['rmse'])
print(r2_score(gbm_30['y_test'],gbm_30['y_pred']))

0.4120379134774273
0.9995535170393566


### 60-40 split

In [None]:
# Tune GBM with 300 Optuna trials on the 60/40 train/test split.
gbm_40 = train_model(GradientBoostingRegressor,X,y,0.4,objective_gbm,"GBM",n_trials=300)

[I 2025-06-14 06:47:27,355] A new study created in memory with name: no-name-a79af7ed-71c0-4f66-8ce6-355d79a315db
[I 2025-06-14 06:47:27,702] Trial 0 finished with value: 2.4274472060872534 and parameters: {'n_estimators': 292, 'max_depth': 9, 'learning_rate': 0.26561892540038473, 'subsample': 0.8701188660502037, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 0 with value: 2.4274472060872534.
[I 2025-06-14 06:47:28,008] Trial 1 finished with value: 18.70962531176309 and parameters: {'n_estimators': 298, 'max_depth': 5, 'learning_rate': 0.00015732898378281233, 'subsample': 0.7686918035271478, 'min_samples_split': 9, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 0 with value: 2.4274472060872534.
[I 2025-06-14 06:47:28,087] Trial 2 finished with value: 7.866207283337257 and parameters: {'n_estimators': 75, 'max_depth': 2, 'learning_rate': 0.06753310039458164, 'subsample': 0.5142862036828909, 'min_samples_split': 4, 'min_samples_leaf':

âœ… Best params for GBM: {'n_estimators': 235, 'max_depth': 9, 'learning_rate': 0.04751938291386739, 'subsample': 0.8867276354299711, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_features': None}


ðŸ“Š GBM RMSE: 0.9002


In [None]:
# Test-set RMSE and R^2 of the 60/40 GBM model.
print(gbm_40['rmse'])
print(r2_score(gbm_40['y_test'],gbm_40['y_pred']))

0.9001550936716726
0.9978207092093027


## XGBoost

In [None]:
def objective_xgb(trial, X_train, y_train, X_test, y_test,model_cls):
    """Optuna objective for an XGBoost-style regressor.

    Draws one hyperparameter set from `trial`, fits `model_cls` on the
    training data and returns the RMSE of its predictions on
    (X_test, y_test); lower is better.
    """
    # Settings held constant across all trials.
    fixed = dict(
        objective='reg:squarederror',
        verbosity=0,
        random_state=42,
        n_jobs=-1,
    )

    # Search space; the suggest_* calls run top to bottom.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 500),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 5),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 5),
    )

    regressor = model_cls(**sampled, **fixed)
    regressor.fit(X_train, y_train)

    squared_error = mean_squared_error(y_test, regressor.predict(X_test))
    return np.sqrt(squared_error)


### 80-20 split

In [None]:
# Tune XGBoost with 300 Optuna trials on the 80/20 train/test split.
xgb_20 = train_model(XGBRegressor,X,y,0.2,objective_xgb,"XGBoost",n_trials=300)

[I 2025-06-14 06:51:29,243] A new study created in memory with name: no-name-b5e5f179-8f55-48d2-bbc9-3cb57525bafc
[I 2025-06-14 06:51:29,452] Trial 0 finished with value: 15.21365349803083 and parameters: {'n_estimators': 382, 'max_depth': 8, 'learning_rate': 0.0009873427957131403, 'subsample': 0.8104436389877816, 'colsample_bytree': 0.5881333029609459, 'gamma': 0.14095743684141016, 'reg_alpha': 4.54116691362737, 'reg_lambda': 1.1491106505935023}. Best is trial 0 with value: 15.21365349803083.
[I 2025-06-14 06:51:29,512] Trial 1 finished with value: 4.710441622243292 and parameters: {'n_estimators': 102, 'max_depth': 9, 'learning_rate': 0.030923936404505036, 'subsample': 0.7052488300573976, 'colsample_bytree': 0.682148252615662, 'gamma': 1.287745472002157, 'reg_alpha': 4.819673897287739, 'reg_lambda': 3.2393104034228704}. Best is trial 1 with value: 4.710441622243292.
[I 2025-06-14 06:51:29,576] Trial 2 finished with value: 1.3361604895095514 and parameters: {'n_estimators': 199, 'max_

âœ… Best params for XGBoost: {'n_estimators': 385, 'max_depth': 4, 'learning_rate': 0.24391279082974093, 'subsample': 0.5918556835987723, 'colsample_bytree': 0.8427619844455038, 'gamma': 0.0969936996408106, 'reg_alpha': 0.6473301773166262, 'reg_lambda': 0.6151572928889338}


ðŸ“Š XGBoost RMSE: 2.2629


âœ… Best params for XGBoost: {'n_estimators': 385, 'max_depth': 4, 'learning_rate': 0.24391279082974093, 'subsample': 0.5918556835987723, 'colsample_bytree': 0.8427619844455038, 'gamma': 0.0969936996408106, 'reg_alpha': 0.6473301773166262, 'reg_lambda': 0.6151572928889338}


In [None]:
# Best hyperparameters copied from the Optuna run above, so the 80/20 XGBoost
# model can be refit without repeating the 300-trial search.
params_xgb_20 = {'n_estimators': 385, 'max_depth': 4, 'learning_rate': 0.24391279082974093, 'subsample': 0.5918556835987723, 'colsample_bytree': 0.8427619844455038, 'gamma': 0.0969936996408106, 'reg_alpha': 0.6473301773166262, 'reg_lambda': 0.6151572928889338}
# Pass the same fixed settings objective_xgb used during tuning (seed in
# particular), so the refit model is the configuration that was actually scored.
xgb_20 = train_model(
    XGBRegressor, X, y, 0.2, None, "XGBoost", n_trials=0,
    objective='reg:squarederror', verbosity=0, random_state=42, n_jobs=-1,
    **params_xgb_20
)
# train_model always splits with random_state=42, so this 0.2 split contains
# the same rows as the GBM split that df_test / df_train were built from.
df_test['xgb'] = xgb_20['y_pred']
df_train['xgb'] = xgb_20['model'].predict(xgb_20['X_train'])

ðŸ“Š XGBoost RMSE: 2.2629


In [None]:
# Test-set RMSE and R^2 of the 80/20 XGBoost model.
print(xgb_20['rmse'])
print(r2_score(xgb_20['y_test'],xgb_20['y_pred']))

2.2628747157625444
0.9850966770977682


### 75-25 split

In [None]:
# Tune XGBoost with 300 Optuna trials on the 75/25 train/test split.
xgb_25 = train_model(XGBRegressor,X,y,0.25,objective_xgb,"XGBoost",n_trials=300)

[I 2025-06-14 06:52:35,349] A new study created in memory with name: no-name-bf1611e6-1175-4ddb-849b-51272f556a2c
[I 2025-06-14 06:52:35,884] Trial 0 finished with value: 1.7383479602075267 and parameters: {'n_estimators': 358, 'max_depth': 10, 'learning_rate': 0.14979745956863882, 'subsample': 0.818748428766539, 'colsample_bytree': 0.9797925034691428, 'gamma': 0.31699215016444127, 'reg_alpha': 4.219540862927121, 'reg_lambda': 2.3944450746526846}. Best is trial 0 with value: 1.7383479602075267.
[I 2025-06-14 06:52:36,223] Trial 1 finished with value: 4.233111362610787 and parameters: {'n_estimators': 206, 'max_depth': 3, 'learning_rate': 0.015008710714615752, 'subsample': 0.6764421064324642, 'colsample_bytree': 0.759722629991112, 'gamma': 2.8494565716137066, 'reg_alpha': 4.28179334748863, 'reg_lambda': 0.2051735276682709}. Best is trial 0 with value: 1.7383479602075267.
[I 2025-06-14 06:52:36,562] Trial 2 finished with value: 17.159691932436267 and parameters: {'n_estimators': 264, 'ma

âœ… Best params for XGBoost: {'n_estimators': 465, 'max_depth': 3, 'learning_rate': 0.07044267895385936, 'subsample': 0.6082401343072665, 'colsample_bytree': 0.8811645865754034, 'gamma': 0.17732343174486007, 'reg_alpha': 0.9206266228643112, 'reg_lambda': 0.45093221911206827}


ðŸ“Š XGBoost RMSE: 1.0986


In [None]:
# Test-set RMSE and R^2 of the 75/25 XGBoost model.
print(xgb_25['rmse'])
print(r2_score(xgb_25['y_test'],xgb_25['y_pred']))

1.0986409975878557
0.9965866365719656


### 70-30 split

In [None]:
# Tune XGBoost with 300 Optuna trials on the 70/30 train/test split.
xgb_30 = train_model(XGBRegressor,X,y,0.3,objective_xgb,"XGBoost",n_trials=300)

[I 2025-06-14 06:55:16,500] A new study created in memory with name: no-name-85df0c93-07a6-4166-9b61-90dce1d3f9b8
[I 2025-06-14 06:55:16,546] Trial 0 finished with value: 1.9643963953696553 and parameters: {'n_estimators': 163, 'max_depth': 3, 'learning_rate': 0.26281033012518984, 'subsample': 0.8406817954499616, 'colsample_bytree': 0.773434481332195, 'gamma': 2.4817156182468376, 'reg_alpha': 2.6722923807827588, 'reg_lambda': 3.3540188482118753}. Best is trial 0 with value: 1.9643963953696553.
[I 2025-06-14 06:55:16,612] Trial 1 finished with value: 0.9848806067705617 and parameters: {'n_estimators': 183, 'max_depth': 4, 'learning_rate': 0.1312437490805247, 'subsample': 0.614672861105062, 'colsample_bytree': 0.9478252485011891, 'gamma': 0.10195039143656603, 'reg_alpha': 0.1309918622853612, 'reg_lambda': 1.343162466860715}. Best is trial 1 with value: 0.9848806067705617.
[I 2025-06-14 06:55:16,703] Trial 2 finished with value: 3.4467088781012998 and parameters: {'n_estimators': 276, 'ma

âœ… Best params for XGBoost: {'n_estimators': 183, 'max_depth': 4, 'learning_rate': 0.1312437490805247, 'subsample': 0.614672861105062, 'colsample_bytree': 0.9478252485011891, 'gamma': 0.10195039143656603, 'reg_alpha': 0.1309918622853612, 'reg_lambda': 1.343162466860715}


ðŸ“Š XGBoost RMSE: 1.4119


In [None]:
# Test-set RMSE and R^2 of the 70/30 XGBoost model.
print(xgb_30['rmse'])
print(r2_score(xgb_30['y_test'],xgb_30['y_pred']))

1.4119009631639892
0.9947574932324358


### 60-40 split

In [None]:
# Tune XGBoost with 300 Optuna trials on the 60/40 train/test split.
xgb_40 = train_model(XGBRegressor,X,y,0.4,objective_xgb,"XGBoost",n_trials=300)

[I 2025-06-14 06:56:11,324] A new study created in memory with name: no-name-99f29db9-3ca5-4b1e-b842-da7844a47918
[I 2025-06-14 06:56:11,465] Trial 0 finished with value: 4.711996385160978 and parameters: {'n_estimators': 318, 'max_depth': 7, 'learning_rate': 0.0209619950198368, 'subsample': 0.5078808838459408, 'colsample_bytree': 0.7896949002950077, 'gamma': 0.5228106991235976, 'reg_alpha': 4.356380763615379, 'reg_lambda': 4.737702472041907}. Best is trial 0 with value: 4.711996385160978.
[I 2025-06-14 06:56:11,622] Trial 1 finished with value: 2.590717515739344 and parameters: {'n_estimators': 493, 'max_depth': 7, 'learning_rate': 0.022018313260489624, 'subsample': 0.5217684629574904, 'colsample_bytree': 0.6704695993344287, 'gamma': 1.2301305404455216, 'reg_alpha': 2.8000435470376472, 'reg_lambda': 0.7823737848727008}. Best is trial 1 with value: 2.590717515739344.
[I 2025-06-14 06:56:11,751] Trial 2 finished with value: 17.98907711404923 and parameters: {'n_estimators': 279, 'max_de

âœ… Best params for XGBoost: {'n_estimators': 396, 'max_depth': 3, 'learning_rate': 0.1131009174884585, 'subsample': 0.9046474904991803, 'colsample_bytree': 0.900583323224076, 'gamma': 0.00125669237491835, 'reg_alpha': 0.7525527034195134, 'reg_lambda': 0.18682377653107315}


ðŸ“Š XGBoost RMSE: 1.5774


In [None]:
# Test-set RMSE and R^2 of the 60/40 XGBoost model.
print(xgb_40['rmse'])
print(r2_score(xgb_40['y_test'],xgb_40['y_pred']))

1.5774009421323207
0.9933078650726305


## CatBoost

In [None]:
def objective_catb(trial, X_train, y_train, X_test, y_test,model_cls):
    """Optuna objective for a CatBoost-style regressor.

    Draws one hyperparameter set from `trial`, fits `model_cls` silently with
    a fixed seed and returns the RMSE of its predictions on (X_test, y_test);
    lower is better.
    """
    sampled = {
        'iterations': trial.suggest_int('iterations', 100, 500),
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
    }
    # Remaining parameters are plain uniform floats, sampled in this order.
    uniform_ranges = (
        ('l2_leaf_reg', 1, 10),
        ('bagging_temperature', 0.0, 1.0),
        ('random_strength', 0.0, 1.0),
    )
    for name, low, high in uniform_ranges:
        sampled[name] = trial.suggest_float(name, low, high)

    regressor = model_cls(loss_function='RMSE', verbose=0, random_seed=42, **sampled)
    regressor.fit(X_train, y_train)

    predictions = regressor.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, predictions))


### 80-20 split

In [None]:
# Tune CatBoost with 300 Optuna trials on the 80/20 train/test split.
catb_20 = train_model(CatBoostRegressor,X,y,0.2,objective_catb,"CatBoost",n_trials=300)

[I 2025-06-14 06:58:38,830] A new study created in memory with name: no-name-68e45752-e9e8-4696-9205-b401c5d3da68
[I 2025-06-14 06:58:39,807] Trial 0 finished with value: 2.401654056798179 and parameters: {'iterations': 492, 'depth': 7, 'learning_rate': 0.0746089947955115, 'l2_leaf_reg': 9.506847405026026, 'bagging_temperature': 0.21025187853512928, 'random_strength': 0.0012703206913612375}. Best is trial 0 with value: 2.401654056798179.
[I 2025-06-14 06:58:39,925] Trial 1 finished with value: 1.3296118294537853 and parameters: {'iterations': 313, 'depth': 3, 'learning_rate': 0.13299747404260692, 'l2_leaf_reg': 3.3966805140384735, 'bagging_temperature': 0.2242999848556576, 'random_strength': 0.7781890231972681}. Best is trial 1 with value: 1.3296118294537853.
[I 2025-06-14 06:58:40,119] Trial 2 finished with value: 1.9377188743103317 and parameters: {'iterations': 209, 'depth': 6, 'learning_rate': 0.053140975745744036, 'l2_leaf_reg': 5.856498085353975, 'bagging_temperature': 0.22379391

âœ… Best params for CatBoost: {'iterations': 295, 'depth': 5, 'learning_rate': 0.2153257951221818, 'l2_leaf_reg': 1.4311480526560383, 'bagging_temperature': 0.38040268431334634, 'random_strength': 0.7566611066941972}


0:	learn: 13.9859573	total: 749us	remaining: 220ms
1:	learn: 11.9542824	total: 1.54ms	remaining: 225ms
2:	learn: 10.1480071	total: 2.04ms	remaining: 199ms
3:	learn: 8.6553456	total: 2.56ms	remaining: 186ms
4:	learn: 7.5172955	total: 3.06ms	remaining: 178ms
5:	learn: 6.6137839	total: 3.59ms	remaining: 173ms
6:	learn: 5.6300940	total: 4.12ms	remaining: 169ms
7:	learn: 4.8326383	total: 4.66ms	remaining: 167ms
8:	learn: 4.2590793	total: 5.13ms	remaining: 163ms
9:	learn: 3.8751917	total: 5.6ms	remaining: 160ms
10:	learn: 3.4418411	total: 6.23ms	remaining: 161ms
11:	learn: 3.0072405	total: 6.71ms	remaining: 158ms
12:	learn: 2.7064387	total: 7.21ms	remaining: 157ms
13:	learn: 2.4984379	total: 7.67ms	remaining: 154ms
14:	learn: 2.2747638	total: 8.16ms	remaining: 152ms
15:	learn: 2.0857050	total: 8.67ms	remaining: 151ms
16:	learn: 1.9541530	total: 9.17ms	remaining: 150ms
17:	learn: 1.8442813	total: 9.7ms	remaining: 149ms
18:	learn: 1.7422253	total: 10.2ms	remaining: 148ms
19:	learn: 1.6814022	t

âœ… Best params for CatBoost: {'iterations': 295, 'depth': 5, 'learning_rate': 0.2153257951221818, 'l2_leaf_reg': 1.4311480526560383, 'bagging_temperature': 0.38040268431334634, 'random_strength': 0.7566611066941972}


In [None]:
# Best hyperparameters copied from the Optuna run above, so the 80/20 CatBoost
# model can be refit without repeating the 300-trial search.
params_catb_20 =  {'iterations': 295, 'depth': 5, 'learning_rate': 0.2153257951221818, 'l2_leaf_reg': 1.4311480526560383, 'bagging_temperature': 0.38040268431334634, 'random_strength': 0.7566611066941972}
# Pass the same fixed settings objective_catb used during tuning: verbose=0
# stops the per-iteration training log from flooding the cell output, and
# random_seed=42 makes the refit match the configuration that was scored.
catb_20 = train_model(
    CatBoostRegressor, X, y, 0.2, None, "CatBoost", n_trials=0,
    loss_function='RMSE', verbose=0, random_seed=42,
    **params_catb_20
)
# train_model always splits with random_state=42, so this 0.2 split contains
# the same rows as the GBM split that df_test / df_train were built from.
df_test['catb'] = catb_20['y_pred']
df_train['catb'] = catb_20['model'].predict(catb_20['X_train'])

0:	learn: 13.9859573	total: 5.8ms	remaining: 1.71s
1:	learn: 11.9542824	total: 6.63ms	remaining: 971ms
2:	learn: 10.1480071	total: 7.12ms	remaining: 693ms
3:	learn: 8.6553456	total: 7.61ms	remaining: 554ms
4:	learn: 7.5172955	total: 8.12ms	remaining: 471ms
5:	learn: 6.6137839	total: 8.56ms	remaining: 413ms
6:	learn: 5.6300940	total: 9.04ms	remaining: 372ms
7:	learn: 4.8326383	total: 9.62ms	remaining: 345ms
8:	learn: 4.2590793	total: 10.1ms	remaining: 320ms
9:	learn: 3.8751917	total: 10.6ms	remaining: 301ms
10:	learn: 3.4418411	total: 11.1ms	remaining: 286ms
11:	learn: 3.0072405	total: 11.6ms	remaining: 273ms
12:	learn: 2.7064387	total: 12.1ms	remaining: 262ms
13:	learn: 2.4984379	total: 12.5ms	remaining: 251ms
14:	learn: 2.2747638	total: 13ms	remaining: 242ms
15:	learn: 2.0857050	total: 13.5ms	remaining: 235ms
16:	learn: 1.9541530	total: 14ms	remaining: 229ms
17:	learn: 1.8442813	total: 14.5ms	remaining: 224ms
18:	learn: 1.7422253	total: 15ms	remaining: 218ms
19:	learn: 1.6814022	total

In [None]:
# Test-set RMSE and R^2 of the 80/20 CatBoost model.
print(catb_20['rmse'])
print(r2_score(catb_20['y_test'],catb_20['y_pred']))

2.2701979895565967
0.9850000586518992


### 75-25 split

In [None]:
# Tune CatBoost with 300 Optuna trials on the 75/25 train/test split. The
# label only feeds train_model's log lines, so it must name the right model.
catb_25 = train_model(CatBoostRegressor,X,y,0.25,objective_catb,"CatBoost",n_trials=300)

[I 2025-06-14 07:01:49,966] A new study created in memory with name: no-name-460e8bd9-2b89-4574-b7e2-cbdf1ca33b61
[I 2025-06-14 07:01:50,051] Trial 0 finished with value: 2.0040646545204406 and parameters: {'iterations': 183, 'depth': 3, 'learning_rate': 0.07325646522914088, 'l2_leaf_reg': 8.55895788431939, 'bagging_temperature': 0.31401812710135424, 'random_strength': 0.04019766083028509}. Best is trial 0 with value: 2.0040646545204406.
[I 2025-06-14 07:01:51,166] Trial 1 finished with value: 18.091969047262793 and parameters: {'iterations': 300, 'depth': 9, 'learning_rate': 0.00019877126715989432, 'l2_leaf_reg': 2.9799301690046636, 'bagging_temperature': 0.6723380659710458, 'random_strength': 0.11914311635656072}. Best is trial 0 with value: 2.0040646545204406.
[I 2025-06-14 07:01:51,395] Trial 2 finished with value: 17.56505618662603 and parameters: {'iterations': 108, 'depth': 8, 'learning_rate': 0.0012352935253963024, 'l2_leaf_reg': 4.9662442823891, 'bagging_temperature': 0.387195

âœ… Best params for XGBoost: {'iterations': 500, 'depth': 3, 'learning_rate': 0.29957288766288986, 'l2_leaf_reg': 1.0089356680022186, 'bagging_temperature': 0.20021656521551484, 'random_strength': 0.5349694210560942}


0:	learn: 13.4945113	total: 437us	remaining: 219ms
1:	learn: 10.8875240	total: 790us	remaining: 197ms
2:	learn: 9.0118764	total: 1.13ms	remaining: 187ms
3:	learn: 7.4308006	total: 1.75ms	remaining: 217ms
4:	learn: 6.3368007	total: 2ms	remaining: 198ms
5:	learn: 5.3545678	total: 2.23ms	remaining: 184ms
6:	learn: 4.6354497	total: 2.45ms	remaining: 173ms
7:	learn: 4.2374608	total: 2.7ms	remaining: 166ms
8:	learn: 3.9162131	total: 2.99ms	remaining: 163ms
9:	learn: 3.6390919	total: 3.24ms	remaining: 159ms
10:	learn: 3.3722073	total: 3.5ms	remaining: 156ms
11:	learn: 3.0996373	total: 3.79ms	remaining: 154ms
12:	learn: 2.8992964	total: 4.01ms	remaining: 150ms
13:	learn: 2.7305964	total: 4.25ms	remaining: 147ms
14:	learn: 2.5718235	total: 4.5ms	remaining: 146ms
15:	learn: 2.4857019	total: 4.78ms	remaining: 145ms
16:	learn: 2.3533482	total: 5ms	remaining: 142ms
17:	learn: 2.2617597	total: 5.22ms	remaining: 140ms
18:	learn: 2.1828637	total: 5.48ms	remaining: 139ms
19:	learn: 2.1055018	total: 5.6

In [None]:
# Test-set RMSE and R^2 of the 75/25 CatBoost model.
print(catb_25['rmse'])
print(r2_score(catb_25['y_test'],catb_25['y_pred']))

1.1069537635461864
0.9965347873548718


### 70-30 split

In [None]:
# Tune CatBoost with 300 Optuna trials on the 70/30 train/test split. The
# label only feeds train_model's log lines, so it must name the right model.
catb_30 = train_model(CatBoostRegressor,X,y,0.3,objective_catb,"CatBoost",n_trials=300)

[I 2025-06-14 07:03:34,024] A new study created in memory with name: no-name-282b4deb-8db7-4715-bbb5-1bfe2068ee8d
[I 2025-06-14 07:03:34,615] Trial 0 finished with value: 18.832226140686654 and parameters: {'iterations': 467, 'depth': 5, 'learning_rate': 0.00014899271923869223, 'l2_leaf_reg': 2.9724925535780584, 'bagging_temperature': 0.18713854139967012, 'random_strength': 0.9742834305994184}. Best is trial 0 with value: 18.832226140686654.
[I 2025-06-14 07:03:35,020] Trial 1 finished with value: 12.854811392307184 and parameters: {'iterations': 486, 'depth': 4, 'learning_rate': 0.001367298282774015, 'l2_leaf_reg': 1.9927683388160387, 'bagging_temperature': 0.006243063660004933, 'random_strength': 0.8337062278501991}. Best is trial 1 with value: 12.854811392307184.
[I 2025-06-14 07:03:35,249] Trial 2 finished with value: 15.159428518749738 and parameters: {'iterations': 448, 'depth': 4, 'learning_rate': 0.0011324081679360857, 'l2_leaf_reg': 5.981005053559756, 'bagging_temperature': 0.

âœ… Best params for XGBoost: {'iterations': 460, 'depth': 3, 'learning_rate': 0.14816764110912456, 'l2_leaf_reg': 8.28586267189315, 'bagging_temperature': 0.533899159331603, 'random_strength': 0.931125567942684}


0:	learn: 15.0391796	total: 327us	remaining: 150ms
1:	learn: 13.8846112	total: 617us	remaining: 141ms
2:	learn: 12.8906965	total: 887us	remaining: 135ms
3:	learn: 12.0853517	total: 1.12ms	remaining: 128ms
4:	learn: 11.2905847	total: 1.35ms	remaining: 123ms
5:	learn: 10.8638603	total: 1.6ms	remaining: 121ms
6:	learn: 10.1462226	total: 1.89ms	remaining: 122ms
7:	learn: 9.6421186	total: 2.11ms	remaining: 119ms
8:	learn: 9.4747509	total: 2.33ms	remaining: 117ms
9:	learn: 8.9152890	total: 2.57ms	remaining: 116ms
10:	learn: 8.4039364	total: 2.83ms	remaining: 115ms
11:	learn: 7.8399307	total: 3.04ms	remaining: 114ms
12:	learn: 7.5731679	total: 3.27ms	remaining: 112ms
13:	learn: 7.2258758	total: 3.53ms	remaining: 113ms
14:	learn: 6.8642650	total: 3.85ms	remaining: 114ms
15:	learn: 6.5405619	total: 4.09ms	remaining: 114ms
16:	learn: 6.2918715	total: 4.35ms	remaining: 113ms
17:	learn: 6.0005955	total: 4.6ms	remaining: 113ms
18:	learn: 5.6688459	total: 4.83ms	remaining: 112ms
19:	learn: 5.3913862

In [None]:
# Test-set RMSE and R^2 of the 70/30 CatBoost model.
print(catb_30['rmse'])
print(r2_score(catb_30['y_test'],catb_30['y_pred']))

1.126182031507171
0.9966646015824928


### 60-40 split

In [None]:
# Tune CatBoost with 300 Optuna trials on the 60/40 train/test split. The
# label only feeds train_model's log lines, so it must name the right model.
catb_40 = train_model(CatBoostRegressor,X,y,0.4,objective_catb,"CatBoost",n_trials=300)

[I 2025-06-14 07:08:57,119] A new study created in memory with name: no-name-fb97908f-b496-4199-872c-cfca5fb3610b
[I 2025-06-14 07:08:57,505] Trial 0 finished with value: 17.79640478348706 and parameters: {'iterations': 450, 'depth': 6, 'learning_rate': 0.0002547899340258567, 'l2_leaf_reg': 1.1277924659424672, 'bagging_temperature': 0.10358091219583887, 'random_strength': 0.24773519456515314}. Best is trial 0 with value: 17.79640478348706.
[I 2025-06-14 07:08:58,116] Trial 1 finished with value: 2.414732257511789 and parameters: {'iterations': 490, 'depth': 7, 'learning_rate': 0.03976788012969304, 'l2_leaf_reg': 4.333309095424598, 'bagging_temperature': 0.512649768416472, 'random_strength': 0.4903620275831956}. Best is trial 1 with value: 2.414732257511789.
[I 2025-06-14 07:08:59,448] Trial 2 finished with value: 15.920531442385498 and parameters: {'iterations': 383, 'depth': 9, 'learning_rate': 0.0009658749346567386, 'l2_leaf_reg': 3.0249750901218695, 'bagging_temperature': 0.39405176

âœ… Best params for XGBoost: {'iterations': 398, 'depth': 3, 'learning_rate': 0.22307153555404363, 'l2_leaf_reg': 2.3037445840748596, 'bagging_temperature': 0.5072198482279355, 'random_strength': 0.9563802115556648}


0:	learn: 13.8640395	total: 312us	remaining: 124ms
1:	learn: 11.8680490	total: 841us	remaining: 167ms
2:	learn: 10.3204157	total: 1.07ms	remaining: 141ms
3:	learn: 9.2860466	total: 1.29ms	remaining: 127ms
4:	learn: 8.4300998	total: 1.52ms	remaining: 120ms
5:	learn: 7.4011078	total: 1.77ms	remaining: 116ms
6:	learn: 6.5201117	total: 2.06ms	remaining: 115ms
7:	learn: 5.9652420	total: 2.28ms	remaining: 111ms
8:	learn: 5.5018926	total: 2.52ms	remaining: 109ms
9:	learn: 5.1101970	total: 2.76ms	remaining: 107ms
10:	learn: 4.6574241	total: 3.01ms	remaining: 106ms
11:	learn: 4.1758203	total: 3.25ms	remaining: 104ms
12:	learn: 3.8601306	total: 3.56ms	remaining: 105ms
13:	learn: 3.6462576	total: 3.82ms	remaining: 105ms
14:	learn: 3.4780589	total: 4.05ms	remaining: 103ms
15:	learn: 3.3357010	total: 4.3ms	remaining: 103ms
16:	learn: 3.0824641	total: 4.56ms	remaining: 102ms
17:	learn: 2.8538179	total: 4.81ms	remaining: 102ms
18:	learn: 2.7402827	total: 5.05ms	remaining: 101ms
19:	learn: 2.6410677	t

In [None]:
# Test-set RMSE and R^2 of the 60/40 CatBoost model.
print(catb_40['rmse'])
print(r2_score(catb_40['y_test'],catb_40['y_pred']))

1.2714488813474472
0.9956521124337117


## XRT

In [None]:
def objective_xrt(trial, X_train, y_train, X_test, y_test,model_cls):
    """Optuna objective: fit one tree-ensemble candidate and return its RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train, y_train: Data the candidate model is fitted on.
        X_test, y_test: Data the candidate model is scored on.
        model_cls: Estimator class; it is constructed with the sampled
            parameters plus ``random_state=42`` and ``n_jobs=-1``.

    Returns:
        Root-mean-squared error of the predictions on ``X_test`` (lower is better).

    NOTE(review): the score is computed on ``X_test``/``y_test``; if the caller
    passes the final hold-out split here, hyperparameters are selected on it and
    the reported RMSE will be optimistic -- confirm against ``train_model``.
    """
    # Integer hyperparameters as (name, low, high); sampled in this order.
    int_ranges = (
        ('n_estimators', 100, 500),
        ('max_depth', 3, 20),
        ('min_samples_split', 2, 10),
        ('min_samples_leaf', 1, 10),
    )
    search_space = {
        name: trial.suggest_int(name, low, high)
        for name, low, high in int_ranges
    }
    search_space['max_features'] = trial.suggest_categorical(
        'max_features', ['sqrt', 'log2', None]
    )

    # Fixed seed so that identical parameters give identical trial scores.
    estimator = model_cls(**search_space, random_state=42, n_jobs=-1)
    estimator.fit(X_train, y_train)
    return np.sqrt(mean_squared_error(y_test, estimator.predict(X_test)))


### 80-20 split

In [None]:
# Tune Extra Trees with Optuna on the 80-20 split (test ratio 0.2), 300 trials.
xrt_20 = train_model(
    ExtraTreesRegressor,
    X,
    y,
    split_ratio=0.2,
    tune_func=objective_xrt,
    model_name="XRT",
    n_trials=300,
)

[I 2025-06-14 07:11:05,197] A new study created in memory with name: no-name-012d8d26-70ca-4b5d-a4ed-70ecc55d8f66
[I 2025-06-14 07:11:05,957] Trial 0 finished with value: 8.655430701475924 and parameters: {'n_estimators': 417, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 0 with value: 8.655430701475924.
[I 2025-06-14 07:11:06,663] Trial 1 finished with value: 8.242995695082552 and parameters: {'n_estimators': 490, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 1 with value: 8.242995695082552.
[I 2025-06-14 07:11:07,072] Trial 2 finished with value: 9.646996337829322 and parameters: {'n_estimators': 259, 'max_depth': 19, 'min_samples_split': 3, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 1 with value: 8.242995695082552.
[I 2025-06-14 07:11:07,513] Trial 3 finished with value: 10.521251013945925 and parameters: {'n_estimators': 297, 'max_depth': 15, 'min_sa

✅ Best params for XRT: {'n_estimators': 337, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 XRT RMSE: 2.0804


✅ Best params for XRT: {'n_estimators': 337, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Refit Extra Trees on the 80-20 split with the best parameters reported by the
# tuning run (tune_func=None and n_trials=0, so presumably no search is run),
# then store its predictions as the 'xrt' column of the test and train frames.
# NOTE(review): the recorded RMSE of this refit (2.1851) differs from the tuning
# run with the same parameters (2.0804); presumably the refit is not seeded like
# the objective (random_state=42) -- confirm in train_model.
params_xrt_20 = dict(
    n_estimators=337,
    max_depth=16,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
)
xrt_20 = train_model(
    ExtraTreesRegressor,
    X,
    y,
    split_ratio=0.2,
    tune_func=None,
    model_name="XRT",
    n_trials=0,
    **params_xrt_20,
)
df_test['xrt'] = xrt_20['y_pred']
df_train['xrt'] = xrt_20['model'].predict(xrt_20['X_train'])

📊 XRT RMSE: 2.1851


In [None]:
# Show the stored RMSE and the R² of predictions vs. targets for the 80-20 Extra Trees run.
print(
    xrt_20['rmse'],
    r2_score(xrt_20['y_test'], xrt_20['y_pred']),
    sep="\n",
)

2.1851336404977655
0.9861030947119207


### 75-25 split

In [None]:
# Tune Extra Trees with Optuna on the 75-25 split (test ratio 0.25), 300 trials.
xrt_25 = train_model(
    ExtraTreesRegressor,
    X,
    y,
    split_ratio=0.25,
    tune_func=objective_xrt,
    model_name="XRT",
    n_trials=300,
)

[I 2025-06-14 07:19:24,355] A new study created in memory with name: no-name-55aca24b-4987-4aa0-aeb7-b0eea613c9ab
[I 2025-06-14 07:19:24,641] Trial 0 finished with value: 5.6389128075982216 and parameters: {'n_estimators': 183, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 5.6389128075982216.
[I 2025-06-14 07:19:25,344] Trial 1 finished with value: 4.182622584625054 and parameters: {'n_estimators': 479, 'max_depth': 11, 'min_samples_split': 8, 'min_samples_leaf': 6, 'max_features': None}. Best is trial 1 with value: 4.182622584625054.
[I 2025-06-14 07:19:25,928] Trial 2 finished with value: 10.966541751546046 and parameters: {'n_estimators': 394, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 1 with value: 4.182622584625054.
[I 2025-06-14 07:19:26,332] Trial 3 finished with value: 11.037072664912646 and parameters: {'n_estimators': 271, 'max_depth': 10, 'min_sa

✅ Best params for XRT: {'n_estimators': 336, 'max_depth': 13, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 XRT RMSE: 2.0199


In [None]:
# Show the stored RMSE and the R² of predictions vs. targets for the 75-25 Extra Trees run.
print(
    xrt_25['rmse'],
    r2_score(xrt_25['y_test'], xrt_25['y_pred']),
    sep="\n",
)

2.0199389780344883
0.9884615511921205


### 70-30 split

In [None]:
# Tune Extra Trees with Optuna on the 70-30 split (test ratio 0.3), 300 trials.
xrt_30 = train_model(
    ExtraTreesRegressor,
    X,
    y,
    split_ratio=0.3,
    tune_func=objective_xrt,
    model_name="XRT",
    n_trials=300,
)

[I 2025-06-14 07:22:52,093] A new study created in memory with name: no-name-5177e922-f382-4799-86ed-c6faa70cfe88
[I 2025-06-14 07:22:52,352] Trial 0 finished with value: 8.519761443226143 and parameters: {'n_estimators': 165, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 8.519761443226143.
[I 2025-06-14 07:22:52,853] Trial 1 finished with value: 6.4292128274739015 and parameters: {'n_estimators': 327, 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 1 with value: 6.4292128274739015.
[I 2025-06-14 07:22:53,247] Trial 2 finished with value: 5.054718362911412 and parameters: {'n_estimators': 263, 'max_depth': 15, 'min_samples_split': 4, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 2 with value: 5.054718362911412.
[I 2025-06-14 07:22:53,539] Trial 3 finished with value: 4.0911469660150965 and parameters: {'n_estimators': 175, 'max_depth': 16, 'min_samp

✅ Best params for XRT: {'n_estimators': 450, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 XRT RMSE: 1.8797


In [None]:
# Show the stored RMSE and the R² of predictions vs. targets for the 70-30 Extra Trees run.
print(
    xrt_30['rmse'],
    r2_score(xrt_30['y_test'], xrt_30['y_pred']),
    sep="\n",
)

1.8796964870938275
0.9907080686962657


### 60-40 split

In [None]:
# Tune Extra Trees with Optuna on the 60-40 split (test ratio 0.4), 300 trials.
xrt_40 = train_model(
    ExtraTreesRegressor,
    X,
    y,
    split_ratio=0.4,
    tune_func=objective_xrt,
    model_name="XRT",
    n_trials=300,
)

[I 2025-06-14 07:27:13,018] A new study created in memory with name: no-name-6c16c9a7-e4bd-43f9-96c3-6bb5cbd7b99a
[I 2025-06-14 07:27:13,317] Trial 0 finished with value: 2.500032226971723 and parameters: {'n_estimators': 180, 'max_depth': 15, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 0 with value: 2.500032226971723.
[I 2025-06-14 07:27:13,959] Trial 1 finished with value: 7.7404397031377465 and parameters: {'n_estimators': 453, 'max_depth': 18, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 0 with value: 2.500032226971723.
[I 2025-06-14 07:27:14,253] Trial 2 finished with value: 11.041159988290124 and parameters: {'n_estimators': 201, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 0 with value: 2.500032226971723.
[I 2025-06-14 07:27:14,428] Trial 3 finished with value: 10.01928029462742 and parameters: {'n_estimators': 108, 'max_depth': 3, 'min_sampl

✅ Best params for XRT: {'n_estimators': 335, 'max_depth': 13, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': None}


📊 XRT RMSE: 1.6150


In [None]:
# Show the stored RMSE and the R² of predictions vs. targets for the 60-40 Extra Trees run.
print(
    xrt_40['rmse'],
    r2_score(xrt_40['y_test'], xrt_40['y_pred']),
    sep="\n",
)

1.6149784383826478
0.9929852216366082


## Random Forest

In [None]:
def objective_rf(trial, X_train, y_train, X_test, y_test,model_cls):
    """Optuna objective for the Random Forest runs: fit one candidate, return RMSE.

    Args:
        trial: Optuna trial that supplies the sampled hyperparameters.
        X_train, y_train: Data used to fit the candidate model.
        X_test, y_test: Data used to score the candidate model.
        model_cls: Estimator class, instantiated with the sampled parameters
            together with ``random_state=42`` and ``n_jobs=-1``.

    Returns:
        Root-mean-squared error of the predictions on ``X_test``.

    NOTE(review): candidates are scored on ``X_test``/``y_test``; if that is the
    final hold-out split, the tuned RMSE is an optimistic estimate -- confirm
    against ``train_model``.
    """
    # Sample each hyperparameter explicitly (the order of the calls is kept).
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    forest = model_cls(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,  # fixed seed: same parameters -> same score
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)

    mse = mean_squared_error(y_test, forest.predict(X_test))
    return np.sqrt(mse)


### 80-20 split

In [None]:
# Tune Random Forest with Optuna on the 80-20 split (test ratio 0.2), 300 trials.
rf_20 = train_model(
    RandomForestRegressor,
    X,
    y,
    split_ratio=0.2,
    tune_func=objective_rf,
    model_name="RF",
    n_trials=300,
)

[I 2025-06-14 07:31:12,448] A new study created in memory with name: no-name-cefb9496-93f2-47aa-a884-20b609371fde
[I 2025-06-14 07:31:12,967] Trial 0 finished with value: 4.722272615021845 and parameters: {'n_estimators': 127, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 0 with value: 4.722272615021845.
[I 2025-06-14 07:31:13,433] Trial 1 finished with value: 6.302679404174394 and parameters: {'n_estimators': 221, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 0 with value: 4.722272615021845.
[I 2025-06-14 07:31:13,819] Trial 2 finished with value: 4.285290180837671 and parameters: {'n_estimators': 176, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 2 with value: 4.285290180837671.
[I 2025-06-14 07:31:14,853] Trial 3 finished with value: 3.8812372181780352 and parameters: {'n_estimators': 500, 'max_depth': 6, 'min_sample

✅ Best params for RF: {'n_estimators': 167, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 RF RMSE: 0.7528


✅ Best params for RF: {'n_estimators': 167, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Refit Random Forest on the 80-20 split with the best parameters reported by
# the tuning run (tune_func=None and n_trials=0, so presumably no search is run),
# then store its predictions as the 'rf' column of the test and train frames.
# NOTE(review): the recorded RMSE of this refit (0.8091) differs from the tuning
# run with the same parameters (0.7528); presumably the refit is not seeded like
# the objective (random_state=42) -- confirm in train_model.
params_rf_20 = dict(
    n_estimators=167,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
)
rf_20 = train_model(
    RandomForestRegressor,
    X,
    y,
    split_ratio=0.2,
    tune_func=None,
    model_name="rf",
    n_trials=0,
    **params_rf_20,
)
df_test['rf'] = rf_20['y_pred']
df_train['rf'] = rf_20['model'].predict(rf_20['X_train'])


📊 rf RMSE: 0.8091


In [None]:
# Show the stored RMSE and the R² of predictions vs. targets for the 80-20 Random Forest run.
# NOTE(review): the recorded output of this cell (0.7528) matches the tuning run,
# not the refit cell before it (0.8091) -- the cells look to have been executed
# out of order; re-run the notebook top to bottom before quoting these numbers.
print(
    rf_20['rmse'],
    r2_score(rf_20['y_test'], rf_20['y_pred']),
    sep="\n",
)

0.7527901835466577
0.9983506607815842


### 75-25 split

In [None]:
# Tune Random Forest with Optuna on the 75-25 split (test ratio 0.25), 300 trials.
rf_25 = train_model(
    RandomForestRegressor,
    X,
    y,
    split_ratio=0.25,
    tune_func=objective_rf,
    model_name="RF",
    n_trials=300,
)

[I 2025-06-14 07:35:08,886] A new study created in memory with name: no-name-9972064e-ec9b-40b6-af66-1f5372b59865
[I 2025-06-14 07:35:09,765] Trial 0 finished with value: 1.2476498039806345 and parameters: {'n_estimators': 405, 'max_depth': 11, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': None}. Best is trial 0 with value: 1.2476498039806345.
[I 2025-06-14 07:35:10,492] Trial 1 finished with value: 7.745919200735437 and parameters: {'n_estimators': 359, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 0 with value: 1.2476498039806345.
[I 2025-06-14 07:35:10,946] Trial 2 finished with value: 7.983397122324196 and parameters: {'n_estimators': 226, 'max_depth': 12, 'min_samples_split': 4, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 0 with value: 1.2476498039806345.
[I 2025-06-14 07:35:11,573] Trial 3 finished with value: 1.1990754685330867 and parameters: {'n_estimators': 289, 'max_depth': 19, 'min_s

✅ Best params for RF: {'n_estimators': 363, 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 RF RMSE: 0.8321


In [None]:
# Show the stored RMSE and the R² of predictions vs. targets for the 75-25 Random Forest run.
print(
    rf_25['rmse'],
    r2_score(rf_25['y_test'], rf_25['y_pred']),
    sep="\n",
)

0.8321493443205844
0.9980417259052343


### 70-30 split

In [None]:
# Tune Random Forest with Optuna on the 70-30 split (test ratio 0.3), 300 trials.
rf_30 = train_model(
    RandomForestRegressor,
    X,
    y,
    split_ratio=0.3,
    tune_func=objective_rf,
    model_name="RF",
    n_trials=300,
)

[I 2025-06-14 07:39:11,614] A new study created in memory with name: no-name-499eb22b-5b2b-4ba8-af86-81954ed45bb0
[I 2025-06-14 07:39:12,654] Trial 0 finished with value: 7.297173426826169 and parameters: {'n_estimators': 340, 'max_depth': 13, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 0 with value: 7.297173426826169.
[I 2025-06-14 07:39:13,851] Trial 1 finished with value: 5.706230834344228 and parameters: {'n_estimators': 420, 'max_depth': 8, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 1 with value: 5.706230834344228.
[I 2025-06-14 07:39:14,154] Trial 2 finished with value: 7.440267933081043 and parameters: {'n_estimators': 107, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 1 with value: 5.706230834344228.
[I 2025-06-14 07:39:14,704] Trial 3 finished with value: 5.79426096965523 and parameters: {'n_estimators': 184, 'max_depth': 8, 'min_samples_s

✅ Best params for RF: {'n_estimators': 228, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 RF RMSE: 1.0066


In [None]:
# Show the stored RMSE and the R² of predictions vs. targets for the 70-30 Random Forest run.
print(
    rf_30['rmse'],
    r2_score(rf_30['y_test'], rf_30['y_pred']),
    sep="\n",
)

1.006621817016953
0.997335208659067


### 60-40 split

In [None]:
# Tune Random Forest with Optuna on the 60-40 split (test ratio 0.4), 300 trials.
rf_40 = train_model(
    RandomForestRegressor,
    X,
    y,
    split_ratio=0.4,
    tune_func=objective_rf,
    model_name="RF",
    n_trials=300,
)

[I 2025-06-14 07:43:37,652] A new study created in memory with name: no-name-86886f61-e79f-4426-96ca-830a7c63fa8c
[I 2025-06-14 07:43:38,076] Trial 0 finished with value: 9.222989207708858 and parameters: {'n_estimators': 201, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 0 with value: 9.222989207708858.
[I 2025-06-14 07:43:39,028] Trial 1 finished with value: 2.467084722001972 and parameters: {'n_estimators': 435, 'max_depth': 16, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 1 with value: 2.467084722001972.
[I 2025-06-14 07:43:39,370] Trial 2 finished with value: 9.288937014105578 and parameters: {'n_estimators': 166, 'max_depth': 8, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 1 with value: 2.467084722001972.
[I 2025-06-14 07:43:40,138] Trial 3 finished with value: 6.949171823413235 and parameters: {'n_estimators': 371, 'max_depth': 15, 'min_sampl

✅ Best params for RF: {'n_estimators': 424, 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 RF RMSE: 1.7178


In [None]:
# Show the stored RMSE and the R² of predictions vs. targets for the 60-40 Random Forest run.
print(
    rf_40['rmse'],
    r2_score(rf_40['y_test'], rf_40['y_pred']),
    sep="\n",
)

1.7177590834898422
0.99206393887044


## Saving Files

In [None]:
# Write the train and test frames (including the 'xrt' and 'rf' prediction
# columns added earlier) to CSV files in the working directory, without the index.
df_train.to_csv(path_or_buf='ndim_training.csv', index=False)
df_test.to_csv(path_or_buf='ndim_testing.csv', index=False)