<a href="https://colab.research.google.com/github/coding-cosmos/Radial-Gate-Cavitation-Index/blob/main/Radial_Gate_Dim_Cavitation_Index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Archive

## Data Loading

In [None]:
import pandas as pd
import optuna
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the dataset and rename its columns.
# NOTE(review): the absolute path presumably relies on Google Drive being
# mounted at /content/drive — confirm before running on a fresh runtime.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/dim.csv")
# Positional rename: this assignment requires the CSV to have exactly nine
# columns; 'sigma' is the default target column of split_data.
df.columns = ['AR', 'L', 'phi', 'QW', 'Qa', 'h', 'Va', 'R', 'sigma']


In [None]:
def split_data(df, target_col='sigma', test_size=0.2, random_state=42):
    """Separate ``df`` into features and target, then split into train/test parts.

    Every column except ``target_col`` is treated as a feature.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_col: Name of the column to predict.
        test_size: Forwarded unchanged to ``train_test_split``.
        random_state: Forwarded unchanged to ``train_test_split``.

    Returns:
        The result of ``train_test_split(features, target, ...)``; callers in
        this notebook unpack it as ``X_train, X_test, y_train, y_test``.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(features, target, **split_options)

In [None]:
# Preview the first rows to sanity-check the renamed columns.
df.head()

Unnamed: 0,AR,L,phi,QW,Qa,h,Va,R,sigma
0,0.6,1,10,1.6,0.0,0.94,0.0,0.004,19.936623
1,0.6,1,10,3.2,3.46714,0.94,2.18,0.004,4.780871
2,0.6,1,10,4.8,10.846741,0.94,6.82,0.004,1.330924
3,0.6,1,10,6.4,16.683624,0.94,10.49,0.004,0.069292
4,0.6,1,10,8.0,23.458861,0.94,14.75,0.004,-0.69154


## GBM

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
def objective_gbr(trial, X_train, y_train, X_test, y_test):
    """Optuna objective for ``GradientBoostingRegressor``.

    Samples one hyperparameter set from ``trial``, fits a model on the training
    data and returns the RMSE of its predictions on ``X_test``/``y_test``.

    The model is seeded with ``random_state=42``, like the other objectives in
    this notebook. Without a seed, ``subsample < 1`` and the 'sqrt'/'log2'
    ``max_features`` options make each fit random, so identical parameters
    would score differently from one fit to the next.

    Note: the score is computed on the test split that is passed in, so a study
    driven by this objective selects hyperparameters on that split.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the RMSE is computed on.

    Returns:
        The RMSE (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    }

    # Fixed seed: the trial value then depends only on the sampled parameters.
    model = GradientBoostingRegressor(**params, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return rmse


In [None]:
def tune_and_train_gbr(X_train, y_train, X_test, y_test, n_trials=50):
    """Tune a ``GradientBoostingRegressor`` with Optuna and refit the best setup.

    Runs a minimising study over ``objective_gbr``, prints the best trial's
    parameters, and fits a fresh model with them on the training data. The
    refit is seeded with ``random_state=42`` (as the other ``tune_and_train_*``
    helpers do) so that it is reproducible instead of varying run to run.

    Args:
        X_train, y_train: Data used to fit every candidate and the final model.
        X_test, y_test: Data the objective scores candidates on.
        n_trials: Number of Optuna trials.

    Returns:
        ``(model, study)``: the refitted model and the finished Optuna study.
    """
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective_gbr(trial, X_train, y_train, X_test, y_test), n_trials=n_trials)

    best_params = study.best_trial.params
    print("Best Trial Parameters (GBR):", best_params)

    # Seed the refit; an unseeded GBR with subsample < 1 is stochastic.
    model = GradientBoostingRegressor(**best_params, random_state=42)
    model.fit(X_train, y_train)

    return model, study


In [None]:
def evaluate_split_gbr(df, test_ratio, n_trials=50):
    """Tune, refit and score a GBR model for a single test-split ratio.

    Note: ``tune_and_train_gbr`` receives the same test split that is scored
    here, so the printed RMSE is not an independent estimate.

    Args:
        df: Full dataset, split via ``split_data``.
        test_ratio: Fraction of rows held out as the test set.
        n_trials: Number of Optuna trials for tuning.

    Returns:
        ``(rmse, best_params)``: test RMSE of the refitted model and the best
        trial's parameter dict.
    """
    X_train, X_test, y_train, y_test = split_data(df, test_size=test_ratio)
    fitted_model, study = tune_and_train_gbr(
        X_train, y_train, X_test, y_test, n_trials=n_trials
    )

    test_mse = mean_squared_error(y_test, fitted_model.predict(X_test))
    rmse = np.sqrt(test_mse)
    print(f"âœ… [GBR] Test Ratio: {test_ratio} â†’ Final Test RMSE: {rmse:.4f}")
    return rmse, study.best_trial.params


In [None]:
def compare_splits_gbr(df, split_list=[0.1, 0.2, 0.3], n_trials=30):
    """Run ``evaluate_split_gbr`` for each test ratio and tabulate the outcome.

    Args:
        df: Full dataset.
        split_list: Test-set fractions to evaluate, in order.
        n_trials: Optuna trials per split.

    Returns:
        DataFrame with one row per split and columns ``split``, ``rmse``, ``params``.
    """
    def _run_one(split):
        # Announce the split, then tune and score it.
        print(f"\nðŸ§ª [GBR] Evaluating Test Split: {split}")
        rmse, params = evaluate_split_gbr(df, test_ratio=split, n_trials=n_trials)
        return {'split': split, 'rmse': rmse, 'params': params}

    return pd.DataFrame([_run_one(split) for split in split_list])


In [None]:
# Tune and score GBR at four test ratios (100 Optuna trials each), then show RMSE per split.
results_gbr = compare_splits_gbr(
    df,
    split_list=[0.2, 0.25, 0.3, 0.4],
    n_trials=100,
)
print("\nðŸ“ˆ [GBR] Results Summary:")
print(results_gbr.loc[:, ['split', 'rmse']])


[I 2025-06-09 16:43:35,873] A new study created in memory with name: no-name-71077bcc-3440-465b-9e15-90f7b78a70d4



ðŸ§ª [GBR] Evaluating Test Split: 0.2


[I 2025-06-09 16:43:36,246] Trial 0 finished with value: 0.5928612373697129 and parameters: {'n_estimators': 185, 'max_depth': 7, 'learning_rate': 0.13695261080212182, 'subsample': 0.8597076714084161, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_features': None}. Best is trial 0 with value: 0.5928612373697129.
[I 2025-06-09 16:43:36,652] Trial 1 finished with value: 10.406254812509275 and parameters: {'n_estimators': 369, 'max_depth': 5, 'learning_rate': 0.002973949598197249, 'subsample': 0.7092871272327844, 'min_samples_split': 2, 'min_samples_leaf': 14, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5928612373697129.
[I 2025-06-09 16:43:36,972] Trial 2 finished with value: 1.4955060785277103 and parameters: {'n_estimators': 218, 'max_depth': 5, 'learning_rate': 0.2530625322255241, 'subsample': 0.7649814593835482, 'min_samples_split': 4, 'min_samples_leaf': 19, 'max_features': None}. Best is trial 0 with value: 0.5928612373697129.
[I 2025-06-09 16:43:37,150] Trial 3 fi

Best Trial Parameters (GBR): {'n_estimators': 488, 'max_depth': 9, 'learning_rate': 0.030526105174335707, 'subsample': 0.9118171027555896, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_features': None}


[I 2025-06-09 16:44:57,850] A new study created in memory with name: no-name-9ee5726f-e2b3-4ba9-9b11-647690e601c0


âœ… [GBR] Test Ratio: 0.2 â†’ Final Test RMSE: 0.4100

ðŸ§ª [GBR] Evaluating Test Split: 0.25


[I 2025-06-09 16:44:58,184] Trial 0 finished with value: 1.4939237799478122 and parameters: {'n_estimators': 271, 'max_depth': 3, 'learning_rate': 0.21315552620750342, 'subsample': 0.6977330718111907, 'min_samples_split': 18, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 0 with value: 1.4939237799478122.
[I 2025-06-09 16:44:59,066] Trial 1 finished with value: 8.828265763260223 and parameters: {'n_estimators': 486, 'max_depth': 6, 'learning_rate': 0.0016575051818014106, 'subsample': 0.837077168195244, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': None}. Best is trial 0 with value: 1.4939237799478122.
[I 2025-06-09 16:44:59,406] Trial 2 finished with value: 3.0789171110041473 and parameters: {'n_estimators': 335, 'max_depth': 5, 'learning_rate': 0.05530536152810521, 'subsample': 0.8833301527707436, 'min_samples_split': 2, 'min_samples_leaf': 11, 'max_features': 'sqrt'}. Best is trial 0 with value: 1.4939237799478122.
[I 2025-06-09 16:44:59,525] Trial 3 fi

Best Trial Parameters (GBR): {'n_estimators': 143, 'max_depth': 5, 'learning_rate': 0.19256503107836104, 'subsample': 0.9277753803970199, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': None}


[I 2025-06-09 16:45:39,415] A new study created in memory with name: no-name-84427794-2c30-49f9-87b2-41401a089ac3


âœ… [GBR] Test Ratio: 0.25 â†’ Final Test RMSE: 0.5793

ðŸ§ª [GBR] Evaluating Test Split: 0.3


[I 2025-06-09 16:45:39,754] Trial 0 finished with value: 7.29877007859747 and parameters: {'n_estimators': 342, 'max_depth': 5, 'learning_rate': 0.007786119562498568, 'subsample': 0.5024050360143695, 'min_samples_split': 10, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 0 with value: 7.29877007859747.
[I 2025-06-09 16:45:40,204] Trial 1 finished with value: 3.040732787002013 and parameters: {'n_estimators': 298, 'max_depth': 6, 'learning_rate': 0.27531694674618973, 'subsample': 0.8236909724798087, 'min_samples_split': 4, 'min_samples_leaf': 15, 'max_features': None}. Best is trial 1 with value: 3.040732787002013.
[I 2025-06-09 16:45:40,444] Trial 2 finished with value: 18.276467997518424 and parameters: {'n_estimators': 235, 'max_depth': 4, 'learning_rate': 0.0004593525223348803, 'subsample': 0.9140547715898273, 'min_samples_split': 16, 'min_samples_leaf': 16, 'max_features': 'log2'}. Best is trial 1 with value: 3.040732787002013.
[I 2025-06-09 16:45:40,760] Trial 3 fin

Best Trial Parameters (GBR): {'n_estimators': 104, 'max_depth': 7, 'learning_rate': 0.12302867401017932, 'subsample': 0.991251750051151, 'min_samples_split': 13, 'min_samples_leaf': 2, 'max_features': None}


[I 2025-06-09 16:46:33,349] A new study created in memory with name: no-name-f66df0f5-952d-49fa-9ac2-f1a5f428cbac


âœ… [GBR] Test Ratio: 0.3 â†’ Final Test RMSE: 0.8250

ðŸ§ª [GBR] Evaluating Test Split: 0.4


[I 2025-06-09 16:46:33,936] Trial 0 finished with value: 11.501557039522927 and parameters: {'n_estimators': 264, 'max_depth': 4, 'learning_rate': 0.003933446160576122, 'subsample': 0.8859707776132849, 'min_samples_split': 14, 'min_samples_leaf': 13, 'max_features': 'log2'}. Best is trial 0 with value: 11.501557039522927.
[I 2025-06-09 16:46:34,836] Trial 1 finished with value: 16.287532732684948 and parameters: {'n_estimators': 358, 'max_depth': 9, 'learning_rate': 0.0007083165570817755, 'subsample': 0.8707127436397735, 'min_samples_split': 20, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 0 with value: 11.501557039522927.
[I 2025-06-09 16:46:35,388] Trial 2 finished with value: 2.83108903179839 and parameters: {'n_estimators': 299, 'max_depth': 2, 'learning_rate': 0.06781607583827615, 'subsample': 0.5066150389014852, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 2 with value: 2.83108903179839.
[I 2025-06-09 16:46:36,162] Trial 3

Best Trial Parameters (GBR): {'n_estimators': 487, 'max_depth': 9, 'learning_rate': 0.019028953389915445, 'subsample': 0.9265276748450405, 'min_samples_split': 16, 'min_samples_leaf': 1, 'max_features': None}
âœ… [GBR] Test Ratio: 0.4 â†’ Final Test RMSE: 1.0112

ðŸ“ˆ [GBR] Results Summary:
   split      rmse
0   0.20  0.410044
1   0.25  0.579292
2   0.30  0.825048
3   0.40  1.011173


## 📈 Gradient Boosting Regressor (GBR) Results Summary

| Split Ratio |   RMSE    |
|-------------|-----------|
| 0.20        | 0.387702  |
| 0.25        | 0.625173  |
| 0.30        | 0.943514  |
| 0.40        | 1.126326  |


## XGBoost

In [None]:
!pip install xgboost --quiet


In [None]:
from xgboost import XGBRegressor

In [None]:
def objective_xgb(trial, X_train, y_train, X_test, y_test):
    """Optuna objective for ``XGBRegressor``: fit one sampled setup, return test RMSE.

    Hyperparameters are sampled from ``trial``; the squared-error objective,
    verbosity, seed (42) and ``n_jobs=-1`` are fixed.

    Note: the RMSE is computed on the ``X_test``/``y_test`` passed in.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the RMSE is computed on.

    Returns:
        The RMSE (lower is better).
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 500),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 5),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 5),
    )
    fixed_settings = {
        'objective': 'reg:squarederror',
        'verbosity': 0,
        'random_state': 42,
        'n_jobs': -1,
    }

    regressor = XGBRegressor(**search_space, **fixed_settings)
    regressor.fit(X_train, y_train)
    return np.sqrt(mean_squared_error(y_test, regressor.predict(X_test)))


In [None]:
def tune_and_train_xgb(X_train, y_train, X_test, y_test, n_trials=50):
    """Tune an ``XGBRegressor`` with Optuna, then refit it with the best parameters.

    Args:
        X_train, y_train: Data used to fit every candidate and the final model.
        X_test, y_test: Data ``objective_xgb`` scores candidates on.
        n_trials: Number of Optuna trials.

    Returns:
        ``(model, study)``: the refitted model and the finished Optuna study.
    """
    def _objective(trial):
        return objective_xgb(trial, X_train, y_train, X_test, y_test)

    study = optuna.create_study(direction='minimize')
    study.optimize(_objective, n_trials=n_trials)

    best_params = study.best_trial.params
    print("Best Trial Parameters (XGB):", best_params)

    # Same fixed settings as the objective, combined with the tuned values.
    model = XGBRegressor(
        objective='reg:squarederror',
        verbosity=0,
        random_state=42,
        n_jobs=-1,
        **best_params,
    )
    model.fit(X_train, y_train)
    return model, study


In [None]:
def evaluate_split_xgb(df, test_ratio, n_trials=50):
    """Tune, refit and score an XGBoost model for a single test-split ratio.

    Note: tuning is scored on the same test split that is reported here, so the
    printed RMSE is not an independent estimate.

    Args:
        df: Full dataset, split via ``split_data``.
        test_ratio: Fraction of rows held out as the test set.
        n_trials: Number of Optuna trials for tuning.

    Returns:
        ``(rmse, best_params)``.
    """
    X_train, X_test, y_train, y_test = split_data(df, test_size=test_ratio)
    fitted_model, study = tune_and_train_xgb(
        X_train, y_train, X_test, y_test, n_trials=n_trials
    )

    test_mse = mean_squared_error(y_test, fitted_model.predict(X_test))
    rmse = np.sqrt(test_mse)
    print(f"âœ… [XGB] Test Ratio: {test_ratio} â†’ Final Test RMSE: {rmse:.4f}")
    return rmse, study.best_trial.params


In [None]:
def compare_splits_xgb(df, split_list=[0.1, 0.2, 0.3], n_trials=30):
    """Run ``evaluate_split_xgb`` for each test ratio and tabulate the outcome.

    Args:
        df: Full dataset.
        split_list: Test-set fractions to evaluate, in order.
        n_trials: Optuna trials per split.

    Returns:
        DataFrame with one row per split and columns ``split``, ``rmse``, ``params``.
    """
    def _run_one(split):
        # Announce the split, then tune and score it.
        print(f"\nðŸ§ª [XGB] Evaluating Test Split: {split}")
        rmse, params = evaluate_split_xgb(df, test_ratio=split, n_trials=n_trials)
        return {'split': split, 'rmse': rmse, 'params': params}

    return pd.DataFrame([_run_one(split) for split in split_list])


In [None]:
# Tune and score XGBoost at four test ratios (100 Optuna trials each), then show RMSE per split.
results_xgb = compare_splits_xgb(
    df,
    split_list=[0.2, 0.25, 0.3, 0.4],
    n_trials=100,
)
print("\nðŸ“ˆ [XGB] Results Summary:")
print(results_xgb.loc[:, ['split', 'rmse']])


## 📈 XGBoost Regressor (XGB) Results Summary

| Split Ratio |   RMSE    |
|-------------|-----------|
| 0.20        | 0.876111  |
| 0.25        | 0.874201  |
| 0.30        | 0.818592  |
| 0.40        | 1.216925  |


## CatBoost

In [None]:
!pip install catboost --quiet


In [None]:
from catboost import CatBoostRegressor

In [None]:
def objective_catboost(trial, X_train, y_train, X_test, y_test):
    """Optuna objective for ``CatBoostRegressor``: fit one sampled setup, return test RMSE.

    Hyperparameters are sampled from ``trial``; the RMSE loss, silent logging
    and seed (42) are fixed.

    Note: the RMSE is computed on the ``X_test``/``y_test`` passed in.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the RMSE is computed on.

    Returns:
        The RMSE (lower is better).
    """
    sampled = dict(
        iterations=trial.suggest_int('iterations', 100, 500),
        depth=trial.suggest_int('depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0.0, 1.0),
        random_strength=trial.suggest_float('random_strength', 0.0, 1.0),
    )
    fixed_settings = {'loss_function': 'RMSE', 'verbose': 0, 'random_seed': 42}

    regressor = CatBoostRegressor(**sampled, **fixed_settings)
    regressor.fit(X_train, y_train)
    return np.sqrt(mean_squared_error(y_test, regressor.predict(X_test)))


In [None]:
def tune_and_train_catboost(X_train, y_train, X_test, y_test, n_trials=50):
    """Tune a ``CatBoostRegressor`` with Optuna, then refit it with the best parameters.

    Args:
        X_train, y_train: Data used to fit every candidate and the final model.
        X_test, y_test: Data ``objective_catboost`` scores candidates on.
        n_trials: Number of Optuna trials.

    Returns:
        ``(model, study)``: the refitted model and the finished Optuna study.
    """
    def _objective(trial):
        return objective_catboost(trial, X_train, y_train, X_test, y_test)

    study = optuna.create_study(direction='minimize')
    study.optimize(_objective, n_trials=n_trials)

    best_params = study.best_trial.params
    print("Best Trial Parameters (CatBoost):", best_params)

    # Same fixed settings as the objective, combined with the tuned values.
    model = CatBoostRegressor(
        loss_function='RMSE',
        verbose=0,
        random_seed=42,
        **best_params,
    )
    model.fit(X_train, y_train)
    return model, study


In [None]:
def evaluate_split_catboost(df, test_ratio, n_trials=50):
    """Tune, refit and score a CatBoost model for a single test-split ratio.

    Note: tuning is scored on the same test split that is reported here, so the
    printed RMSE is not an independent estimate.

    Args:
        df: Full dataset, split via ``split_data``.
        test_ratio: Fraction of rows held out as the test set.
        n_trials: Number of Optuna trials for tuning.

    Returns:
        ``(rmse, best_params)``.
    """
    X_train, X_test, y_train, y_test = split_data(df, test_size=test_ratio)
    fitted_model, study = tune_and_train_catboost(
        X_train, y_train, X_test, y_test, n_trials=n_trials
    )

    test_mse = mean_squared_error(y_test, fitted_model.predict(X_test))
    rmse = np.sqrt(test_mse)
    print(f"âœ… [CatBoost] Test Ratio: {test_ratio} â†’ Final Test RMSE: {rmse:.4f}")
    return rmse, study.best_trial.params


In [None]:
def compare_splits_catboost(df, split_list=[0.1, 0.2, 0.3], n_trials=30):
    """Run ``evaluate_split_catboost`` for each test ratio and tabulate the outcome.

    Args:
        df: Full dataset.
        split_list: Test-set fractions to evaluate, in order.
        n_trials: Optuna trials per split.

    Returns:
        DataFrame with one row per split and columns ``split``, ``rmse``, ``params``.
    """
    def _run_one(split):
        # Announce the split, then tune and score it.
        print(f"\nðŸ§ª [CatBoost] Evaluating Test Split: {split}")
        rmse, params = evaluate_split_catboost(df, test_ratio=split, n_trials=n_trials)
        return {'split': split, 'rmse': rmse, 'params': params}

    return pd.DataFrame([_run_one(split) for split in split_list])


In [None]:
# Tune and score CatBoost at four test ratios (100 Optuna trials each), then show RMSE per split.
results_catboost = compare_splits_catboost(
    df,
    split_list=[0.2, 0.25, 0.3, 0.4],
    n_trials=100,
)
print("\nðŸ“ˆ [CatBoost] Results Summary:")
print(results_catboost.loc[:, ['split', 'rmse']])


## 📈 CatBoost Regressor Results Summary

| Split Ratio |   RMSE    |
|-------------|-----------|
| 0.20        | 0.370242  |
| 0.25        | 1.242720  |
| 0.30        | 0.904113  |
| 0.40        | 1.131831  |


## XRT

In [None]:
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

In [None]:
def objective_xrt(trial, X_train, y_train, X_test, y_test):
    """Optuna objective for ``ExtraTreesRegressor``: fit one sampled setup, return test RMSE.

    Hyperparameters are sampled from ``trial``; ``random_state=42`` and
    ``n_jobs=-1`` are fixed.

    Note: the RMSE is computed on the ``X_test``/``y_test`` passed in.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the RMSE is computed on.

    Returns:
        The RMSE (lower is better).
    """
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    )

    forest = ExtraTreesRegressor(random_state=42, n_jobs=-1, **sampled)
    forest.fit(X_train, y_train)
    return np.sqrt(mean_squared_error(y_test, forest.predict(X_test)))


In [None]:
def tune_and_train_xrt(X_train, y_train, X_test, y_test, n_trials=50):
    """Tune an ``ExtraTreesRegressor`` with Optuna, then refit it with the best parameters.

    Args:
        X_train, y_train: Data used to fit every candidate and the final model.
        X_test, y_test: Data ``objective_xrt`` scores candidates on.
        n_trials: Number of Optuna trials.

    Returns:
        ``(model, study)``: the refitted model and the finished Optuna study.
    """
    def _objective(trial):
        return objective_xrt(trial, X_train, y_train, X_test, y_test)

    study = optuna.create_study(direction='minimize')
    study.optimize(_objective, n_trials=n_trials)

    best_params = study.best_trial.params
    print("Best Trial Parameters (XRT):", best_params)

    model = ExtraTreesRegressor(random_state=42, n_jobs=-1, **best_params)
    model.fit(X_train, y_train)
    return model, study


In [None]:
def evaluate_split_xrt(df, test_ratio, n_trials=50):
    """Tune, refit and score an Extra Trees model for a single test-split ratio.

    Note: tuning is scored on the same test split that is reported here, so the
    printed RMSE is not an independent estimate.

    Args:
        df: Full dataset, split via ``split_data``.
        test_ratio: Fraction of rows held out as the test set.
        n_trials: Number of Optuna trials for tuning.

    Returns:
        ``(rmse, best_params)``.
    """
    X_train, X_test, y_train, y_test = split_data(df, test_size=test_ratio)
    fitted_model, study = tune_and_train_xrt(
        X_train, y_train, X_test, y_test, n_trials=n_trials
    )

    test_mse = mean_squared_error(y_test, fitted_model.predict(X_test))
    rmse = np.sqrt(test_mse)
    print(f"âœ… [XRT] Test Ratio: {test_ratio} â†’ Final Test RMSE: {rmse:.4f}")
    return rmse, study.best_trial.params


In [None]:
def compare_splits_xrt(df, split_list=[0.1, 0.2, 0.3], n_trials=30):
    """Run ``evaluate_split_xrt`` for each test ratio and tabulate the outcome.

    Args:
        df: Full dataset.
        split_list: Test-set fractions to evaluate, in order.
        n_trials: Optuna trials per split.

    Returns:
        DataFrame with one row per split and columns ``split``, ``rmse``, ``params``.
    """
    def _run_one(split):
        # Announce the split, then tune and score it.
        print(f"\nðŸ§ª [XRT] Evaluating Test Split: {split}")
        rmse, params = evaluate_split_xrt(df, test_ratio=split, n_trials=n_trials)
        return {'split': split, 'rmse': rmse, 'params': params}

    return pd.DataFrame([_run_one(split) for split in split_list])


In [None]:
# Tune and score Extra Trees at four test ratios (100 Optuna trials each), then show RMSE per split.
results_xrt = compare_splits_xrt(
    df,
    split_list=[0.2, 0.25, 0.3, 0.4],
    n_trials=100,
)
print(
    "\nðŸ“Š XRT Results:\n",
    results_xrt.loc[:, ['split', 'rmse']],
)

## 📊 Extra Trees Regressor (XRT) Results Summary

| Split Ratio |   RMSE    |
|-------------|-----------|
| 0.20        | 2.178530  |
| 0.25        | 2.270443  |
| 0.30        | 1.921216  |
| 0.40        | 1.839868  |


## Random Forest

In [None]:
def objective_rf(trial, X_train, y_train, X_test, y_test):
    """Optuna objective for ``RandomForestRegressor``: fit one sampled setup, return test RMSE.

    Hyperparameters are sampled from ``trial``; ``random_state=42`` and
    ``n_jobs=-1`` are fixed.

    Note: the RMSE is computed on the ``X_test``/``y_test`` passed in.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the RMSE is computed on.

    Returns:
        The RMSE (lower is better).
    """
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    )

    forest = RandomForestRegressor(random_state=42, n_jobs=-1, **sampled)
    forest.fit(X_train, y_train)
    return np.sqrt(mean_squared_error(y_test, forest.predict(X_test)))


In [None]:
def tune_and_train_rf(X_train, y_train, X_test, y_test, n_trials=50):
    """Tune a ``RandomForestRegressor`` with Optuna, then refit it with the best parameters.

    Args:
        X_train, y_train: Data used to fit every candidate and the final model.
        X_test, y_test: Data ``objective_rf`` scores candidates on.
        n_trials: Number of Optuna trials.

    Returns:
        ``(model, study)``: the refitted model and the finished Optuna study.
    """
    def _objective(trial):
        return objective_rf(trial, X_train, y_train, X_test, y_test)

    study = optuna.create_study(direction='minimize')
    study.optimize(_objective, n_trials=n_trials)

    best_params = study.best_trial.params
    print("Best Trial Parameters (RF):", best_params)

    model = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params)
    model.fit(X_train, y_train)
    return model, study


In [None]:
def evaluate_split_rf(df, test_ratio, n_trials=50):
    """Tune, refit and score a Random Forest model for a single test-split ratio.

    Note: tuning is scored on the same test split that is reported here, so the
    printed RMSE is not an independent estimate.

    Args:
        df: Full dataset, split via ``split_data``.
        test_ratio: Fraction of rows held out as the test set.
        n_trials: Number of Optuna trials for tuning.

    Returns:
        ``(rmse, best_params)``.
    """
    X_train, X_test, y_train, y_test = split_data(df, test_size=test_ratio)
    fitted_model, study = tune_and_train_rf(
        X_train, y_train, X_test, y_test, n_trials=n_trials
    )

    test_mse = mean_squared_error(y_test, fitted_model.predict(X_test))
    rmse = np.sqrt(test_mse)
    print(f"âœ… [RF] Test Ratio: {test_ratio} â†’ Final Test RMSE: {rmse:.4f}")
    return rmse, study.best_trial.params


In [None]:
def compare_splits_rf(df, split_list=[0.1, 0.2, 0.3], n_trials=30):
    """Run ``evaluate_split_rf`` for each test ratio and tabulate the outcome.

    Args:
        df: Full dataset.
        split_list: Test-set fractions to evaluate, in order.
        n_trials: Optuna trials per split.

    Returns:
        DataFrame with one row per split and columns ``split``, ``rmse``, ``params``.
    """
    def _run_one(split):
        # Announce the split, then tune and score it.
        print(f"\nðŸ§ª [RF] Evaluating Test Split: {split}")
        rmse, params = evaluate_split_rf(df, test_ratio=split, n_trials=n_trials)
        return {'split': split, 'rmse': rmse, 'params': params}

    return pd.DataFrame([_run_one(split) for split in split_list])


In [None]:
# Tune and score Random Forest at four test ratios (100 Optuna trials each), then show RMSE per split.
results_rf = compare_splits_rf(
    df,
    split_list=[0.2, 0.25, 0.3, 0.4],
    n_trials=100,
)
print(
    "\nðŸ“Š RF Results:\n",
    results_rf.loc[:, ['split', 'rmse']],
)

## 📊 Random Forest Regressor (RF) Results Summary

| Split Ratio |   RMSE    |
|-------------|-----------|
| 0.20        | 1.065148  |
| 0.25        | 1.733187  |
| 0.30        | 1.749576  |
| 0.40        | 2.334590  |


## Plots


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_feature_importance(model, feature_names=None, top_n=20, title="Feature Importance"):
    """Draw a horizontal bar chart of a fitted model's largest feature importances.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Optional sequence of names indexed like
            ``model.feature_importances_``; defaults to ``"Feature i"``.
        top_n: Maximum number of features to show. It is clamped to the number
            of available features, so the default also works for models with
            fewer than 20 features.
        title: Plot title.
    """
    importances = np.asarray(model.feature_importances_)

    # Fallback feature names
    if feature_names is None:
        feature_names = [f"Feature {i}" for i in range(len(importances))]

    # Bar positions and bar widths must have the same length, so never ask for
    # more bars than there are features (and never fewer than zero).
    n_shown = max(0, min(top_n, len(importances)))
    # Most important first. Slicing after the reversal keeps n_shown == 0 empty
    # (a trailing [-0:] slice would select every feature instead).
    indices = np.argsort(importances)[::-1][:n_shown]
    positions = range(n_shown)

    plt.figure(figsize=(10, 6))
    plt.barh(positions, importances[indices], align='center')
    plt.yticks(positions, [feature_names[i] for i in indices])
    plt.xlabel("Importance Score")
    plt.title(title)
    plt.gca().invert_yaxis()  # largest importance at the top
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()


# Training


## Utils

In [None]:
!pip install optuna
!pip install catboost



In [None]:
from sklearn.metrics import r2_score

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import optuna

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import optuna.visualization as vis

In [None]:
import pandas as pd
import numpy as np


# Load the dataset and rename its columns.
# NOTE(review): the absolute path presumably relies on Google Drive being
# mounted at /content/drive — confirm before running on a fresh runtime.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/dim.csv")
# Positional rename: this assignment requires the CSV to have exactly nine columns.
df.columns = ['AR', 'L', 'phi', 'QW', 'Qa', 'h', 'Va', 'R', 'sigma']
# Features are every column except the target 'sigma'.
X = df.drop(columns=['sigma'])
y = df['sigma']


In [None]:
def split_data(df, target_col='sigma', test_size=0.2, random_state=42):
    """Separate ``df`` into features and target, then split into train/test parts.

    Every column except ``target_col`` is treated as a feature.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_col: Name of the column to predict.
        test_size: Forwarded unchanged to ``train_test_split``.
        random_state: Forwarded unchanged to ``train_test_split``.

    Returns:
        The result of ``train_test_split(features, target, ...)``.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(features, target, **split_options)

In [None]:
def train_model(model_cls, X, y, split_ratio=0.2, tune_func=None, model_name="Model", n_trials=30, **kwargs):
    """
    Train a model on a fixed train/test split, optionally tuning it with Optuna first.

    The data is split once with ``random_state=42``. If ``tune_func`` is given,
    an Optuna study minimises it, the best parameters are printed and the
    optimisation history is plotted. The final model is then fitted on the
    training part and scored (RMSE) on the test part.

    Note: ``tune_func`` is given the same test split that the final RMSE is
    computed on, so with tuning the reported RMSE is not an independent estimate.

    Args:
        model_cls: A model class like GradientBoostingRegressor
        X: Feature dataframe
        y: Target series
        split_ratio: Float ratio for test set
        tune_func: Optional objective, called as
            ``tune_func(trial, X_train, y_train, X_test, y_test, model_cls)``
            and returning the RMSE to minimise
        model_name: Name for logging
        n_trials: Optuna tuning trials (only used if tune_func is provided)
        **kwargs: Fixed constructor arguments for ``model_cls``. Without tuning
            they are the full model configuration. With tuning they are
            combined with the best Optuna parameters (tuned values win on
            conflict), so settings such as ``random_state`` reach the final model.

    Returns:
        dict with keys ``model``, ``rmse``, ``X_train``, ``X_test``,
        ``y_train``, ``y_test`` and ``y_pred``.
    """

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=42)

    if tune_func:
        study = optuna.create_study(direction="minimize")
        study.optimize(lambda trial: tune_func(trial, X_train, y_train, X_test, y_test, model_cls), n_trials=n_trials)
        best_params = study.best_params
        print(f"âœ… Best params for {model_name}: {best_params}")
        # Fixed kwargs go first so that tuned values take precedence on overlap.
        model = model_cls(**{**kwargs, **best_params})

        fig = vis.plot_optimization_history(study)
        fig.show()
    else:
        model = model_cls(**kwargs)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"ðŸ“Š {model_name} RMSE: {rmse:.4f}")

    return {
        "model": model,
        "rmse": rmse,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
        "y_pred": y_pred,
    }



## GBM

In [None]:
def objective_gbm(trial, X_train, y_train, X_test, y_test, model_cls):
    """Optuna objective for a gradient-boosting regressor class.

    Samples one hyperparameter set from ``trial``, fits ``model_cls`` on the
    training data and returns the RMSE of its predictions on the test data.

    The model is seeded with ``random_state=42``, as ``objective_xgb`` does.
    Without a seed, ``subsample < 1`` and the 'sqrt'/'log2' ``max_features``
    options make each fit random, so identical parameters would score
    differently from one fit to the next.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the RMSE is computed on.
        model_cls: Estimator class accepting the sampled parameters plus
            ``random_state`` (the notebook passes GradientBoostingRegressor).

    Returns:
        The RMSE (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    }

    # Fixed seed: the trial value then depends only on the sampled parameters.
    model = model_cls(**params, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return rmse

### 80-20 split

In [None]:
# 80/20 split: tune GBM with 300 Optuna trials, then fit and score the best configuration.
gbm_20 = train_model(
    GradientBoostingRegressor,
    X,
    y,
    split_ratio=0.2,
    tune_func=objective_gbm,
    model_name="GBM",
    n_trials=300,
)

[I 2025-06-10 10:35:26,710] A new study created in memory with name: no-name-a67ce725-83b9-4933-9835-8f90b074c134
[I 2025-06-10 10:35:26,947] Trial 0 finished with value: 3.54485016692805 and parameters: {'n_estimators': 277, 'max_depth': 2, 'learning_rate': 0.12249114004207073, 'subsample': 0.5248634864707018, 'min_samples_split': 13, 'min_samples_leaf': 14, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.54485016692805.
[I 2025-06-10 10:35:27,122] Trial 1 finished with value: 2.4045936190725516 and parameters: {'n_estimators': 151, 'max_depth': 7, 'learning_rate': 0.11396397186272961, 'subsample': 0.9132801633320546, 'min_samples_split': 9, 'min_samples_leaf': 17, 'max_features': 'log2'}. Best is trial 1 with value: 2.4045936190725516.
[I 2025-06-10 10:35:27,349] Trial 2 finished with value: 6.985495729523642 and parameters: {'n_estimators': 223, 'max_depth': 8, 'learning_rate': 0.017147332446286367, 'subsample': 0.88384757287026, 'min_samples_split': 3, 'min_samples_leaf': 18

âœ… Best params for GBM: {'n_estimators': 470, 'max_depth': 6, 'learning_rate': 0.05148810384435254, 'subsample': 0.939879106935373, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None}


ðŸ“Š GBM RMSE: 0.3708


 Best params for GBM:
 {'n_estimators': 470, 'max_depth': 6, 'learning_rate': 0.05148810384435254, 'subsample': 0.939879106935373, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Refit on the 80/20 split with the recorded best parameters, skipping the Optuna search.
params_gbm_20 = dict(
    n_estimators=470,
    max_depth=6,
    learning_rate=0.05148810384435254,
    subsample=0.939879106935373,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features=None,
)
gbm_20 = train_model(
    GradientBoostingRegressor,
    X,
    y,
    split_ratio=0.2,
    tune_func=None,
    model_name="GBM",
    n_trials=0,
    **params_gbm_20,
)

ðŸ“Š GBM RMSE: 0.3632


In [None]:
# Build train/test frames holding the features, the true target ('sigma')
# and the GBM prediction ('gbm') side by side.
df_train = gbm_20['X_train'].assign(
    sigma=gbm_20['y_train'],
    gbm=gbm_20['model'].predict(gbm_20['X_train']),
)
df_test = gbm_20['X_test'].assign(
    sigma=gbm_20['y_test'],
    gbm=gbm_20['y_pred'],
)

In [None]:
# Test-set RMSE, then R^2 of the same predictions (80/20 split).
print(gbm_20['rmse'])
print(r2_score(y_true=gbm_20['y_test'], y_pred=gbm_20['y_pred']))

0.363177557043869
0.9996161159240194


### 75-25 split

In [None]:
# 75/25 split: tune GBM with 300 Optuna trials, then fit and score the best configuration.
gbm_25 = train_model(
    GradientBoostingRegressor,
    X,
    y,
    split_ratio=0.25,
    tune_func=objective_gbm,
    model_name="GBM",
    n_trials=300,
)

[I 2025-06-10 10:39:40,321] A new study created in memory with name: no-name-d4cb00f8-60f4-40bd-b486-c464f2623b7e
[I 2025-06-10 10:39:40,611] Trial 0 finished with value: 18.458934010524874 and parameters: {'n_estimators': 52, 'max_depth': 9, 'learning_rate': 0.00043163519958431065, 'subsample': 0.5709492765465257, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 18.458934010524874.
[I 2025-06-10 10:39:41,307] Trial 1 finished with value: 11.045589624317826 and parameters: {'n_estimators': 150, 'max_depth': 7, 'learning_rate': 0.003923036744306018, 'subsample': 0.7213078799368995, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 1 with value: 11.045589624317826.
[I 2025-06-10 10:39:41,817] Trial 2 finished with value: 4.908290494803182 and parameters: {'n_estimators': 121, 'max_depth': 7, 'learning_rate': 0.02120181115962228, 'subsample': 0.8000434619101651, 'min_samples_split': 2, 'min_samples_leaf

âœ… Best params for GBM: {'n_estimators': 245, 'max_depth': 8, 'learning_rate': 0.21716690864560365, 'subsample': 0.8161012700934703, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': None}


ðŸ“Š GBM RMSE: 0.8438


Best params for GBM: {'n_estimators': 245, 'max_depth': 8, 'learning_rate': 0.21716690864560365, 'subsample': 0.8161012700934703, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Test-set RMSE, then R^2 of the same predictions (75/25 split).
print(gbm_25['rmse'])
print(r2_score(y_true=gbm_25['y_test'], y_pred=gbm_25['y_pred']))

0.8438356373644588
0.9979863376323415


### 70-30 split

In [None]:
# 70/30 split: tune GBM with 300 Optuna trials, then fit and score the best configuration.
gbm_30 = train_model(
    GradientBoostingRegressor,
    X,
    y,
    split_ratio=0.3,
    tune_func=objective_gbm,
    model_name="GBM",
    n_trials=300,
)

[I 2025-06-10 10:45:51,568] A new study created in memory with name: no-name-43172231-2ff5-4c50-96c6-e7958f3cbf30
[I 2025-06-10 10:45:51,854] Trial 0 finished with value: 7.623905841052336 and parameters: {'n_estimators': 76, 'max_depth': 8, 'learning_rate': 0.0703009960003349, 'subsample': 0.5009142327238466, 'min_samples_split': 20, 'min_samples_leaf': 15, 'max_features': 'log2'}. Best is trial 0 with value: 7.623905841052336.
[I 2025-06-10 10:45:52,222] Trial 1 finished with value: 3.5933068862554305 and parameters: {'n_estimators': 117, 'max_depth': 5, 'learning_rate': 0.13484758792713328, 'subsample': 0.6251021814223949, 'min_samples_split': 18, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 1 with value: 3.5933068862554305.
[I 2025-06-10 10:45:53,374] Trial 2 finished with value: 2.4776965655757466 and parameters: {'n_estimators': 304, 'max_depth': 7, 'learning_rate': 0.015022538771459948, 'subsample': 0.5328664657191984, 'min_samples_split': 18, 'min_samples_leaf':

âœ… Best params for GBM: {'n_estimators': 268, 'max_depth': 5, 'learning_rate': 0.1300039682896726, 'subsample': 0.9735309515633264, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': None}


ðŸ“Š GBM RMSE: 0.5609


 Best params for GBM: {'n_estimators': 268, 'max_depth': 5, 'learning_rate': 0.1300039682896726, 'subsample': 0.9735309515633264, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': None}


In [None]:
# Test-set RMSE, then R^2 of the same predictions (70/30 split).
print(gbm_30['rmse'])
print(r2_score(y_true=gbm_30['y_test'], y_pred=gbm_30['y_pred']))

0.5608845863306797
0.999172672345476


### 60-40 split

In [None]:
# 60/40 split: tune GBM with 300 Optuna trials, then fit and score the best configuration.
gbm_40 = train_model(
    GradientBoostingRegressor,
    X,
    y,
    split_ratio=0.4,
    tune_func=objective_gbm,
    model_name="GBM",
    n_trials=300,
)

[I 2025-06-10 10:49:03,607] A new study created in memory with name: no-name-239f2069-1de7-40b9-8130-723458cad7bc
[I 2025-06-10 10:49:04,141] Trial 0 finished with value: 14.314399268325758 and parameters: {'n_estimators': 166, 'max_depth': 3, 'learning_rate': 0.0033459507685838765, 'subsample': 0.8134324331666954, 'min_samples_split': 16, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 0 with value: 14.314399268325758.
[I 2025-06-10 10:49:05,586] Trial 1 finished with value: 3.6152792659883093 and parameters: {'n_estimators': 394, 'max_depth': 10, 'learning_rate': 0.2982956250111022, 'subsample': 0.5027352152176731, 'min_samples_split': 20, 'min_samples_leaf': 8, 'max_features': None}. Best is trial 1 with value: 3.6152792659883093.
[I 2025-06-10 10:49:06,779] Trial 2 finished with value: 7.301428343344192 and parameters: {'n_estimators': 377, 'max_depth': 5, 'learning_rate': 0.008457796743173398, 'subsample': 0.505952153060902, 'min_samples_split': 14, 'min_samples_lea

âœ… Best params for GBM: {'n_estimators': 438, 'max_depth': 5, 'learning_rate': 0.022634858676799895, 'subsample': 0.9999114789075334, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': None}


ðŸ“Š GBM RMSE: 0.7856


Best params for GBM: {'n_estimators': 438, 'max_depth': 5, 'learning_rate': 0.022634858676799895, 'subsample': 0.9999114789075334, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Test-set RMSE, then R^2 of the same predictions (60/40 split).
print(gbm_40['rmse'])
print(r2_score(y_true=gbm_40['y_test'], y_pred=gbm_40['y_pred']))

0.7855542438646939
0.9983402875366336


## XGBoost

In [None]:
def objective_xgb(trial, X_train, y_train, X_test, y_test,model_cls):
    """Score one Optuna trial for an XGBoost-style regressor.

    Eight hyper-parameters are sampled from ``trial`` (in a fixed order), a
    ``model_cls`` instance is built from them plus a squared-error objective,
    ``verbosity=0``, ``random_state=42`` and ``n_jobs=-1``, fitted on the
    training data, and evaluated on the supplied test data.

    Args:
        trial: object exposing ``suggest_int`` / ``suggest_float``.
        X_train, y_train: data passed to ``model.fit``.
        X_test, y_test: data used to compute the returned score.
        model_cls: regressor class accepting the sampled keyword arguments.

    Returns:
        Root-mean-squared error of the predictions on ``X_test``.

    NOTE(review): the score is computed on the X_test/y_test passed in; if the
    caller hands over its final hold-out set, the tuned RMSE is optimistically
    biased - confirm against train_model.
    """
    # Sampling order is kept stable so a seeded sampler draws the same values.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 500),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 5),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 5),
    )
    fixed_kwargs = {
        'objective': 'reg:squarederror',
        'verbosity': 0,
        'random_state': 42,
        'n_jobs': -1,
    }

    regressor = model_cls(**search_space, **fixed_kwargs)
    regressor.fit(X_train, y_train)

    squared_error = mean_squared_error(y_test, regressor.predict(X_test))
    return np.sqrt(squared_error)


### 80-20 split

In [None]:
# XGBoost hyper-parameter search for the 80-20 split, 300 trials driven by objective_xgb.
# NOTE(review): 0.2 is presumably train_model's test fraction - confirm against its definition.
xgb_20 = train_model(
    XGBRegressor,
    X,
    y,
    0.2,
    objective_xgb,
    "XGBoost",
    n_trials=300,
)

[I 2025-06-10 10:54:23,688] A new study created in memory with name: no-name-239e6aea-3df3-4a02-a6f3-a4500254532c
[I 2025-06-10 10:54:24,339] Trial 0 finished with value: 18.393053257308825 and parameters: {'n_estimators': 91, 'max_depth': 4, 'learning_rate': 0.00013551442864688686, 'subsample': 0.5654013568255851, 'colsample_bytree': 0.8193605560073338, 'gamma': 4.103760048675415, 'reg_alpha': 2.3931552330620933, 'reg_lambda': 1.667920842293034}. Best is trial 0 with value: 18.393053257308825.
[I 2025-06-10 10:54:24,820] Trial 1 finished with value: 18.14938355115149 and parameters: {'n_estimators': 219, 'max_depth': 10, 'learning_rate': 0.00015154012153207144, 'subsample': 0.5736125298099581, 'colsample_bytree': 0.8904152503403873, 'gamma': 0.37983925729273615, 'reg_alpha': 4.098814251619087, 'reg_lambda': 4.363201362056555}. Best is trial 1 with value: 18.14938355115149.
[I 2025-06-10 10:54:25,260] Trial 2 finished with value: 14.024629815709192 and parameters: {'n_estimators': 148,

✅ Best params for XGBoost: {'n_estimators': 474, 'max_depth': 9, 'learning_rate': 0.29787739449223777, 'subsample': 0.9615035752763753, 'colsample_bytree': 0.7162874323190875, 'gamma': 0.479575241737767, 'reg_alpha': 0.02176590366042399, 'reg_lambda': 0.0828552712757255}


📊 XGBoost RMSE: 3.0582


Best params for XGBoost: {'n_estimators': 474, 'max_depth': 9, 'learning_rate': 0.29787739449223777, 'subsample': 0.9615035752763753, 'colsample_bytree': 0.7162874323190875, 'gamma': 0.479575241737767, 'reg_alpha': 0.02176590366042399, 'reg_lambda': 0.0828552712757255}


In [None]:
# Refit XGBoost on the 80-20 split with pinned hyper-parameters (no objective, n_trials=0),
# then store its test-set and training-set predictions in the 'xgb' column of each frame.
# NOTE(review): df_test/df_train are assumed to be row-aligned with this split - confirm.
params_xgb_20 = dict(
    n_estimators=474,
    max_depth=9,
    learning_rate=0.29787739449223777,
    subsample=0.9615035752763753,
    colsample_bytree=0.7162874323190875,
    gamma=0.479575241737767,
    reg_alpha=0.02176590366042399,
    reg_lambda=0.0828552712757255,
)
xgb_20 = train_model(
    XGBRegressor, X, y, 0.2, None, "XGBoost",
    n_trials=0,
    **params_xgb_20,
)
df_test['xgb'] = xgb_20['y_pred']
df_train['xgb'] = xgb_20['model'].predict(xgb_20['X_train'])


📊 XGBoost RMSE: 3.0582


In [None]:
# Hold-out metrics for xgb_20: the stored RMSE, then R^2 of y_pred against y_test.
print(
    xgb_20['rmse'],
    r2_score(xgb_20['y_test'], xgb_20['y_pred']),
    sep='\n',
)

3.0581532420794537
0.9727804726859727


### 75-25 split

In [None]:
# XGBoost hyper-parameter search for the 75-25 split, 300 trials driven by objective_xgb.
# NOTE(review): 0.25 is presumably train_model's test fraction - confirm against its definition.
xgb_25 = train_model(
    XGBRegressor,
    X,
    y,
    0.25,
    objective_xgb,
    "XGBoost",
    n_trials=300,
)

[I 2025-06-10 10:56:35,837] A new study created in memory with name: no-name-246225b2-9f03-4c80-92e2-f8db4252b826
[I 2025-06-10 10:56:35,926] Trial 0 finished with value: 1.5624230014607237 and parameters: {'n_estimators': 478, 'max_depth': 7, 'learning_rate': 0.22434710843533295, 'subsample': 0.7350324960002189, 'colsample_bytree': 0.7553834106467852, 'gamma': 3.904547710538329, 'reg_alpha': 4.846752843953191, 'reg_lambda': 0.2041658455916523}. Best is trial 0 with value: 1.5624230014607237.
[I 2025-06-10 10:56:36,080] Trial 1 finished with value: 17.453937782116817 and parameters: {'n_estimators': 408, 'max_depth': 3, 'learning_rate': 0.0002776262421361295, 'subsample': 0.7786039800446514, 'colsample_bytree': 0.8984584051216179, 'gamma': 0.7494909904884295, 'reg_alpha': 0.7045171589512161, 'reg_lambda': 3.2338193336533703}. Best is trial 0 with value: 1.5624230014607237.
[I 2025-06-10 10:56:36,164] Trial 2 finished with value: 1.8883923983980917 and parameters: {'n_estimators': 208, 

✅ Best params for XGBoost: {'n_estimators': 282, 'max_depth': 10, 'learning_rate': 0.29822983013279014, 'subsample': 0.9723479839889592, 'colsample_bytree': 0.9853016095269126, 'gamma': 0.35130109188669056, 'reg_alpha': 0.0007089589188676824, 'reg_lambda': 0.3606999776384528}


📊 XGBoost RMSE: 3.4731


 Best params for XGBoost: {'n_estimators': 282, 'max_depth': 10, 'learning_rate': 0.29822983013279014, 'subsample': 0.9723479839889592, 'colsample_bytree': 0.9853016095269126, 'gamma': 0.35130109188669056, 'reg_alpha': 0.0007089589188676824, 'reg_lambda': 0.3606999776384528}


In [None]:
# Hold-out metrics for xgb_25: the stored RMSE, then R^2 of y_pred against y_test.
print(
    xgb_25['rmse'],
    r2_score(xgb_25['y_test'], xgb_25['y_pred']),
    sep='\n',
)

3.4731474997323457
0.9658871987956715


### 70-30 split

In [None]:
# XGBoost hyper-parameter search for the 70-30 split, 300 trials driven by objective_xgb.
# NOTE(review): 0.3 is presumably train_model's test fraction - confirm against its definition.
xgb_30 = train_model(
    XGBRegressor,
    X,
    y,
    0.3,
    objective_xgb,
    "XGBoost",
    n_trials=300,
)

[I 2025-06-10 10:57:28,359] A new study created in memory with name: no-name-62ad4c8f-0055-423b-8034-78b741686b3b
[I 2025-06-10 10:57:28,403] Trial 0 finished with value: 2.2684178019929546 and parameters: {'n_estimators': 86, 'max_depth': 10, 'learning_rate': 0.14030207711058956, 'subsample': 0.75041591646634, 'colsample_bytree': 0.738199134841788, 'gamma': 2.62137114431233, 'reg_alpha': 4.761216494904972, 'reg_lambda': 4.856002289299951}. Best is trial 0 with value: 2.2684178019929546.
[I 2025-06-10 10:57:28,443] Trial 1 finished with value: 2.138801699745684 and parameters: {'n_estimators': 63, 'max_depth': 9, 'learning_rate': 0.18446840414901758, 'subsample': 0.995381218037791, 'colsample_bytree': 0.5813954729117641, 'gamma': 1.6741219581662992, 'reg_alpha': 3.54702769933564, 'reg_lambda': 0.745890797390607}. Best is trial 1 with value: 2.138801699745684.
[I 2025-06-10 10:57:28,572] Trial 2 finished with value: 18.079077051896906 and parameters: {'n_estimators': 291, 'max_depth': 3

✅ Best params for XGBoost: {'n_estimators': 61, 'max_depth': 7, 'learning_rate': 0.2967085650085825, 'subsample': 0.8930713069551885, 'colsample_bytree': 0.9114019015420918, 'gamma': 0.5811756521277528, 'reg_alpha': 0.39906006797945187, 'reg_lambda': 0.09917585203830015}


📊 XGBoost RMSE: 3.2002


 Best params for XGBoost: {'n_estimators': 61, 'max_depth': 7, 'learning_rate': 0.2967085650085825, 'subsample': 0.8930713069551885, 'colsample_bytree': 0.9114019015420918, 'gamma': 0.5811756521277528, 'reg_alpha': 0.39906006797945187, 'reg_lambda': 0.09917585203830015}


In [None]:
# Hold-out metrics for xgb_30: the stored RMSE, then R^2 of y_pred against y_test.
print(
    xgb_30['rmse'],
    r2_score(xgb_30['y_test'], xgb_30['y_pred']),
    sep='\n',
)

3.2001856853771606
0.9730672383737788


### 60-40 split

In [None]:
# XGBoost hyper-parameter search for the 60-40 split, 300 trials driven by objective_xgb.
# NOTE(review): 0.4 is presumably train_model's test fraction - confirm against its definition.
xgb_40 = train_model(
    XGBRegressor,
    X,
    y,
    0.4,
    objective_xgb,
    "XGBoost",
    n_trials=300,
)

[I 2025-06-10 11:01:44,911] A new study created in memory with name: no-name-6ceb6f85-b3e0-484a-b381-0d7dafc6bb9e
[I 2025-06-10 11:01:45,007] Trial 0 finished with value: 17.895973510240747 and parameters: {'n_estimators': 298, 'max_depth': 3, 'learning_rate': 0.0004211769639795837, 'subsample': 0.9437914252590436, 'colsample_bytree': 0.9407922513226521, 'gamma': 2.126115947228098, 'reg_alpha': 3.961206554381887, 'reg_lambda': 3.3447977530149964}. Best is trial 0 with value: 17.895973510240747.
[I 2025-06-10 11:01:45,106] Trial 1 finished with value: 2.9378465427342593 and parameters: {'n_estimators': 461, 'max_depth': 3, 'learning_rate': 0.049064597103606014, 'subsample': 0.7728650423723638, 'colsample_bytree': 0.5867828598391345, 'gamma': 2.970478055123183, 'reg_alpha': 0.5881766093441665, 'reg_lambda': 2.928760016623581}. Best is trial 1 with value: 2.9378465427342593.
[I 2025-06-10 11:01:45,302] Trial 2 finished with value: 11.677007263366834 and parameters: {'n_estimators': 458, '

✅ Best params for XGBoost: {'n_estimators': 185, 'max_depth': 8, 'learning_rate': 0.21932501868055337, 'subsample': 0.9479944529057306, 'colsample_bytree': 0.9609027901120029, 'gamma': 0.15345884455941475, 'reg_alpha': 0.10847615507514398, 'reg_lambda': 2.952482260788569}


📊 XGBoost RMSE: 2.5937


Best params for XGBoost: {'n_estimators': 185, 'max_depth': 8, 'learning_rate': 0.21932501868055337, 'subsample': 0.9479944529057306, 'colsample_bytree': 0.9609027901120029, 'gamma': 0.15345884455941475, 'reg_alpha': 0.10847615507514398, 'reg_lambda': 2.952482260788569}


In [None]:
# Write the xgb_40 test-set predictions to a comma-delimited text file.
np.savetxt(
    'xgb_40_pred.csv',
    xgb_40['y_pred'],
    delimiter=',',
)

In [None]:
# Hold-out metrics for xgb_40: the stored RMSE, then R^2 of y_pred against y_test.
print(
    xgb_40['rmse'],
    r2_score(xgb_40['y_test'], xgb_40['y_pred']),
    sep='\n',
)

2.593712721067415
0.9819064309959284


## CatBoost

In [None]:
def objective_catb(trial, X_train, y_train, X_test, y_test,model_cls):
    """Score one Optuna trial for a CatBoost-style regressor.

    Six hyper-parameters are sampled from ``trial`` (in a fixed order), a
    ``model_cls`` instance is built from them plus ``loss_function='RMSE'``,
    ``verbose=0`` and ``random_seed=42``, fitted on the training data, and
    evaluated on the supplied test data.

    Args:
        trial: object exposing ``suggest_int`` / ``suggest_float``.
        X_train, y_train: data passed to ``model.fit``.
        X_test, y_test: data used to compute the returned score.
        model_cls: regressor class accepting the sampled keyword arguments.

    Returns:
        Root-mean-squared error of the predictions on ``X_test``.

    NOTE(review): the score is computed on the X_test/y_test passed in; if the
    caller hands over its final hold-out set, the tuned RMSE is optimistically
    biased - confirm against train_model.
    """
    # Sampling order is kept stable so a seeded sampler draws the same values.
    search_space = dict(
        iterations=trial.suggest_int('iterations', 100, 500),
        depth=trial.suggest_int('depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0.0, 1.0),
        random_strength=trial.suggest_float('random_strength', 0.0, 1.0),
    )
    fixed_kwargs = {'loss_function': 'RMSE', 'verbose': 0, 'random_seed': 42}

    regressor = model_cls(**search_space, **fixed_kwargs)
    regressor.fit(X_train, y_train)

    squared_error = mean_squared_error(y_test, regressor.predict(X_test))
    return np.sqrt(squared_error)


### 80-20 split

In [None]:
# CatBoost hyper-parameter search for the 80-20 split, 300 trials driven by objective_catb.
# NOTE(review): 0.2 is presumably train_model's test fraction - confirm against its definition.
catb_20 = train_model(
    CatBoostRegressor,
    X,
    y,
    0.2,
    objective_catb,
    "CatBoost",
    n_trials=300,
)

[I 2025-06-10 11:05:41,668] A new study created in memory with name: no-name-19f2aede-69e7-496f-9ce9-0d9cd5bc10d9
[I 2025-06-10 11:05:43,269] Trial 0 finished with value: 17.880162032474136 and parameters: {'iterations': 382, 'depth': 9, 'learning_rate': 0.00018518301857711758, 'l2_leaf_reg': 7.36566216340755, 'bagging_temperature': 0.8982730337000819, 'random_strength': 0.5626072698900614}. Best is trial 0 with value: 17.880162032474136.
[I 2025-06-10 11:05:43,381] Trial 1 finished with value: 1.3899582801986894 and parameters: {'iterations': 193, 'depth': 4, 'learning_rate': 0.19183383729891404, 'l2_leaf_reg': 4.142171904131428, 'bagging_temperature': 0.21839539443096012, 'random_strength': 0.027243653385479516}. Best is trial 1 with value: 1.3899582801986894.
[I 2025-06-10 11:05:44,508] Trial 2 finished with value: 18.025794178431646 and parameters: {'iterations': 250, 'depth': 9, 'learning_rate': 0.00022056652541490557, 'l2_leaf_reg': 9.216758730290508, 'bagging_temperature': 0.085

✅ Best params for CatBoost: {'iterations': 475, 'depth': 8, 'learning_rate': 0.12760508328206074, 'l2_leaf_reg': 6.147645152771797, 'bagging_temperature': 0.9768427864973606, 'random_strength': 0.028282687916864738}


0:	learn: 15.5917076	total: 2.62ms	remaining: 1.24s
1:	learn: 14.3710005	total: 5.25ms	remaining: 1.24s
2:	learn: 13.2643552	total: 7.56ms	remaining: 1.19s
3:	learn: 12.2437981	total: 9.82ms	remaining: 1.16s
4:	learn: 11.2849820	total: 12.1ms	remaining: 1.14s
5:	learn: 10.4011200	total: 14.3ms	remaining: 1.12s
6:	learn: 9.6113402	total: 16.6ms	remaining: 1.11s
7:	learn: 8.8903487	total: 18.9ms	remaining: 1.1s
8:	learn: 8.2270519	total: 21.2ms	remaining: 1.1s
9:	learn: 7.5987795	total: 23.5ms	remaining: 1.09s
10:	learn: 7.0228308	total: 25.8ms	remaining: 1.09s
11:	learn: 6.4929026	total: 28.1ms	remaining: 1.08s
12:	learn: 6.0070519	total: 30.3ms	remaining: 1.08s
13:	learn: 5.5537681	total: 32.5ms	remaining: 1.07s
14:	learn: 5.1585455	total: 34.8ms	remaining: 1.07s
15:	learn: 4.7995215	total: 37.1ms	remaining: 1.06s
16:	learn: 4.4489586	total: 39.4ms	remaining: 1.06s
17:	learn: 4.1286004	total: 45.5ms	remaining: 1.16s
18:	learn: 3.8322781	total: 49.6ms	remaining: 1.19s
19:	learn: 3.56209

Best params for CatBoost: {'iterations': 475, 'depth': 8, 'learning_rate': 0.12760508328206074, 'l2_leaf_reg': 6.147645152771797, 'bagging_temperature': 0.9768427864973606, 'random_strength': 0.028282687916864738}


In [None]:
# Refit CatBoost on the 80-20 split with pinned hyper-parameters (no objective, n_trials=0),
# then store its test-set and training-set predictions in the 'catb' column of each frame.
# NOTE(review): df_test/df_train are assumed to be row-aligned with this split - confirm.
params_catb_20 = dict(
    iterations=475,
    depth=8,
    learning_rate=0.12760508328206074,
    l2_leaf_reg=6.147645152771797,
    bagging_temperature=0.9768427864973606,
    random_strength=0.028282687916864738,
)
catb_20 = train_model(
    CatBoostRegressor, X, y, 0.2, None, "CatBoost",
    n_trials=0,
    **params_catb_20,
)
df_test['catb'] = catb_20['y_pred']
df_train['catb'] = catb_20['model'].predict(catb_20['X_train'])

0:	learn: 15.5917076	total: 52.7ms	remaining: 25s
1:	learn: 14.3710005	total: 55.2ms	remaining: 13s
2:	learn: 13.2643552	total: 57.6ms	remaining: 9.06s
3:	learn: 12.2437981	total: 60ms	remaining: 7.07s
4:	learn: 11.2849820	total: 62.3ms	remaining: 5.86s
5:	learn: 10.4011200	total: 64.8ms	remaining: 5.06s
6:	learn: 9.6113402	total: 67.1ms	remaining: 4.49s
7:	learn: 8.8903487	total: 69.5ms	remaining: 4.06s
8:	learn: 8.2270519	total: 71.8ms	remaining: 3.72s
9:	learn: 7.5987795	total: 74.1ms	remaining: 3.45s
10:	learn: 7.0228308	total: 76.7ms	remaining: 3.23s
11:	learn: 6.4929026	total: 79ms	remaining: 3.05s
12:	learn: 6.0070519	total: 81.2ms	remaining: 2.88s
13:	learn: 5.5537681	total: 83.5ms	remaining: 2.75s
14:	learn: 5.1585455	total: 85.9ms	remaining: 2.63s
15:	learn: 4.7995215	total: 88.3ms	remaining: 2.53s
16:	learn: 4.4489586	total: 90.6ms	remaining: 2.44s
17:	learn: 4.1286004	total: 92.9ms	remaining: 2.36s
18:	learn: 3.8322781	total: 95.1ms	remaining: 2.28s
19:	learn: 3.5620941	tot

In [None]:
# Hold-out metrics for catb_20: the stored RMSE, then R^2 of y_pred against y_test.
print(
    catb_20['rmse'],
    r2_score(catb_20['y_test'], catb_20['y_pred']),
    sep='\n',
)

0.4644953003721656
0.9993720504695821


### 75-25 split

In [None]:
# CatBoost hyper-parameter search for the 75-25 split, 300 trials driven by objective_catb.
# The label is echoed in train_model's printed summary, so it must name the model
# actually being tuned (CatBoost, not XGBoost).
# NOTE(review): 0.25 is presumably train_model's test fraction - confirm against its definition.
catb_25 = train_model(
    CatBoostRegressor,
    X,
    y,
    0.25,
    objective_catb,
    "CatBoost",
    n_trials=300,
)

[I 2025-06-10 11:15:09,077] A new study created in memory with name: no-name-ed2e3644-06a4-465c-aa4d-2e9c2c56b00d
[I 2025-06-10 11:15:09,572] Trial 0 finished with value: 12.562431862099755 and parameters: {'iterations': 497, 'depth': 6, 'learning_rate': 0.0016272600846948275, 'l2_leaf_reg': 7.621694371226833, 'bagging_temperature': 0.03017380089993482, 'random_strength': 0.7060641721508808}. Best is trial 0 with value: 12.562431862099755.
[I 2025-06-10 11:15:15,397] Trial 1 finished with value: 2.4960280156083448 and parameters: {'iterations': 380, 'depth': 10, 'learning_rate': 0.019293252585141484, 'l2_leaf_reg': 9.864263920889575, 'bagging_temperature': 0.041809731044647136, 'random_strength': 0.2987301234714709}. Best is trial 1 with value: 2.4960280156083448.
[I 2025-06-10 11:15:16,037] Trial 2 finished with value: 13.19998260358515 and parameters: {'iterations': 253, 'depth': 8, 'learning_rate': 0.0023462030037695596, 'l2_leaf_reg': 2.6786970444890663, 'bagging_temperature': 0.09

✅ Best params for XGBoost: {'iterations': 284, 'depth': 7, 'learning_rate': 0.29502578244224387, 'l2_leaf_reg': 6.487300955051108, 'bagging_temperature': 0.40895182336708574, 'random_strength': 0.02163534274496521}


0:	learn: 13.7070734	total: 1.55ms	remaining: 440ms
1:	learn: 11.3405526	total: 3.08ms	remaining: 434ms
2:	learn: 9.4106120	total: 4.34ms	remaining: 406ms
3:	learn: 7.8892396	total: 5.68ms	remaining: 397ms
4:	learn: 6.5555415	total: 6.94ms	remaining: 387ms
5:	learn: 5.4862942	total: 8.26ms	remaining: 383ms
6:	learn: 4.6208683	total: 9.56ms	remaining: 378ms
7:	learn: 3.8937510	total: 10.8ms	remaining: 372ms
8:	learn: 3.3519205	total: 12.1ms	remaining: 369ms
9:	learn: 2.8723561	total: 13.5ms	remaining: 370ms
10:	learn: 2.5005618	total: 14.9ms	remaining: 370ms
11:	learn: 2.1852879	total: 16.2ms	remaining: 368ms
12:	learn: 1.9033762	total: 17.5ms	remaining: 366ms
13:	learn: 1.6785680	total: 18.8ms	remaining: 363ms
14:	learn: 1.4746328	total: 20.1ms	remaining: 360ms
15:	learn: 1.3278060	total: 21.4ms	remaining: 358ms
16:	learn: 1.2041601	total: 22.6ms	remaining: 356ms
17:	learn: 1.1134589	total: 23.9ms	remaining: 353ms
18:	learn: 1.0400918	total: 25.2ms	remaining: 352ms
19:	learn: 0.9594676

Best params for XGBoost: {'iterations': 284, 'depth': 7, 'learning_rate': 0.29502578244224387, 'l2_leaf_reg': 6.487300955051108, 'bagging_temperature': 0.40895182336708574, 'random_strength': 0.02163534274496521}


In [None]:
# Hold-out metrics for catb_25: the stored RMSE, then R^2 of y_pred against y_test.
print(
    catb_25['rmse'],
    r2_score(catb_25['y_test'], catb_25['y_pred']),
    sep='\n',
)

1.026613707256386
0.997019528005229


### 70-30 split

In [None]:
# CatBoost hyper-parameter search for the 70-30 split, 300 trials driven by objective_catb.
# The label is echoed in train_model's printed summary, so it must name the model
# actually being tuned (CatBoost, not XGBoost).
# NOTE(review): 0.3 is presumably train_model's test fraction - confirm against its definition.
catb_30 = train_model(
    CatBoostRegressor,
    X,
    y,
    0.3,
    objective_catb,
    "CatBoost",
    n_trials=300,
)

[I 2025-06-10 11:20:55,397] A new study created in memory with name: no-name-75d44387-c457-4fc9-b82b-939b80f3bd66
[I 2025-06-10 11:20:57,502] Trial 0 finished with value: 14.936841489357924 and parameters: {'iterations': 250, 'depth': 10, 'learning_rate': 0.002086393048181334, 'l2_leaf_reg': 3.786432569852468, 'bagging_temperature': 0.300117243680011, 'random_strength': 0.7155773674356485}. Best is trial 0 with value: 14.936841489357924.
[I 2025-06-10 11:20:57,574] Trial 1 finished with value: 2.290549851297682 and parameters: {'iterations': 124, 'depth': 3, 'learning_rate': 0.23813024531258947, 'l2_leaf_reg': 5.706287061753196, 'bagging_temperature': 0.3901918217836662, 'random_strength': 0.9538943179777971}. Best is trial 1 with value: 2.290549851297682.
[I 2025-06-10 11:20:57,896] Trial 2 finished with value: 18.750257378582244 and parameters: {'iterations': 458, 'depth': 5, 'learning_rate': 0.0001485681216332315, 'l2_leaf_reg': 4.06170726463866, 'bagging_temperature': 0.27720969134

✅ Best params for XGBoost: {'iterations': 432, 'depth': 7, 'learning_rate': 0.10951107707069647, 'l2_leaf_reg': 4.975540962749977, 'bagging_temperature': 0.2899365498132961, 'random_strength': 0.0010724605829358148}


0:	learn: 15.0700935	total: 2.53ms	remaining: 1.09s
1:	learn: 14.0873185	total: 3.91ms	remaining: 841ms
2:	learn: 13.1715698	total: 5.23ms	remaining: 747ms
3:	learn: 12.3089238	total: 6.48ms	remaining: 693ms
4:	learn: 11.5139003	total: 7.76ms	remaining: 663ms
5:	learn: 10.7616789	total: 9.04ms	remaining: 642ms
6:	learn: 10.0660513	total: 12.1ms	remaining: 733ms
7:	learn: 9.4214190	total: 13.6ms	remaining: 720ms
8:	learn: 8.8168050	total: 14.9ms	remaining: 700ms
9:	learn: 8.2585563	total: 16.2ms	remaining: 683ms
10:	learn: 7.7298716	total: 17.5ms	remaining: 671ms
11:	learn: 7.2262733	total: 18.8ms	remaining: 658ms
12:	learn: 6.7741074	total: 20.1ms	remaining: 647ms
13:	learn: 6.3472979	total: 21.4ms	remaining: 638ms
14:	learn: 5.9407156	total: 22.6ms	remaining: 629ms
15:	learn: 5.5705108	total: 23.9ms	remaining: 621ms
16:	learn: 5.2068306	total: 25.1ms	remaining: 614ms
17:	learn: 4.8804258	total: 26.4ms	remaining: 607ms
18:	learn: 4.5718789	total: 27.7ms	remaining: 603ms
19:	learn: 4.29

Best params for XGBoost: {'iterations': 432, 'depth': 7, 'learning_rate': 0.10951107707069647, 'l2_leaf_reg': 4.975540962749977, 'bagging_temperature': 0.2899365498132961, 'random_strength': 0.0010724605829358148}


In [None]:
# Hold-out metrics for catb_30: the stored RMSE, then R^2 of y_pred against y_test.
print(
    catb_30['rmse'],
    r2_score(catb_30['y_test'], catb_30['y_pred']),
    sep='\n',
)

1.143669760443151
0.9965602109701764


### 60-40 split

In [None]:
# CatBoost hyper-parameter search for the 60-40 split, 300 trials driven by objective_catb.
# The label is echoed in train_model's printed summary, so it must name the model
# actually being tuned (CatBoost, not XGBoost).
# NOTE(review): 0.4 is presumably train_model's test fraction - confirm against its definition.
catb_40 = train_model(
    CatBoostRegressor,
    X,
    y,
    0.4,
    objective_catb,
    "CatBoost",
    n_trials=300,
)

[I 2025-06-10 11:26:49,472] A new study created in memory with name: no-name-be8a2984-69e0-4d98-bfd3-544a6b72e9cc
[I 2025-06-10 11:26:49,648] Trial 0 finished with value: 17.329600289367622 and parameters: {'iterations': 353, 'depth': 4, 'learning_rate': 0.0005134399030716602, 'l2_leaf_reg': 2.761858422287292, 'bagging_temperature': 0.9439942302228551, 'random_strength': 0.5943627771561347}. Best is trial 0 with value: 17.329600289367622.
[I 2025-06-10 11:26:49,845] Trial 1 finished with value: 15.919283833499389 and parameters: {'iterations': 412, 'depth': 4, 'learning_rate': 0.0007782080517950374, 'l2_leaf_reg': 2.4879056913696367, 'bagging_temperature': 0.022866476213864173, 'random_strength': 0.6275725679506}. Best is trial 1 with value: 15.919283833499389.
[I 2025-06-10 11:26:49,979] Trial 2 finished with value: 1.9620038422804296 and parameters: {'iterations': 263, 'depth': 4, 'learning_rate': 0.25800861168429995, 'l2_leaf_reg': 8.941211993567876, 'bagging_temperature': 0.6129688

✅ Best params for XGBoost: {'iterations': 308, 'depth': 7, 'learning_rate': 0.18048101611327508, 'l2_leaf_reg': 4.45312371645208, 'bagging_temperature': 0.12993641515551854, 'random_strength': 0.04542843622792515}


0:	learn: 13.7778139	total: 1.5ms	remaining: 461ms
1:	learn: 12.3500308	total: 3.13ms	remaining: 479ms
2:	learn: 11.0484470	total: 4.46ms	remaining: 453ms
3:	learn: 9.9266772	total: 5.7ms	remaining: 433ms
4:	learn: 8.8917596	total: 6.93ms	remaining: 420ms
5:	learn: 7.9796692	total: 8.21ms	remaining: 413ms
6:	learn: 7.1813910	total: 9.55ms	remaining: 411ms
7:	learn: 6.4707548	total: 10.8ms	remaining: 405ms
8:	learn: 5.8405046	total: 12.1ms	remaining: 403ms
9:	learn: 5.2494360	total: 13.4ms	remaining: 398ms
10:	learn: 4.7526760	total: 14.7ms	remaining: 397ms
11:	learn: 4.3456784	total: 16ms	remaining: 396ms
12:	learn: 3.9374244	total: 17.3ms	remaining: 393ms
13:	learn: 3.5781817	total: 18.6ms	remaining: 390ms
14:	learn: 3.2470501	total: 19.8ms	remaining: 387ms
15:	learn: 2.9537652	total: 21.2ms	remaining: 387ms
16:	learn: 2.7225797	total: 22.5ms	remaining: 385ms
17:	learn: 2.4876735	total: 23.7ms	remaining: 382ms
18:	learn: 2.2700953	total: 24.9ms	remaining: 380ms
19:	learn: 2.0833894	to

 Best params for XGBoost: {'iterations': 308, 'depth': 7, 'learning_rate': 0.18048101611327508, 'l2_leaf_reg': 4.45312371645208, 'bagging_temperature': 0.12993641515551854, 'random_strength': 0.04542843622792515}


In [None]:
# Hold-out metrics for catb_40: the stored RMSE, then R^2 of y_pred against y_test.
print(
    catb_40['rmse'],
    r2_score(catb_40['y_test'], catb_40['y_pred']),
    sep='\n',
)

1.520809847646709
0.9937794278884887


## XRT

In [None]:
def objective_xrt(trial, X_train, y_train, X_test, y_test,model_cls):
    """Score one Optuna trial for an extra-trees style regressor.

    Five hyper-parameters are sampled from ``trial`` (in a fixed order), a
    ``model_cls`` instance is built from them plus ``random_state=42`` and
    ``n_jobs=-1``, fitted on the training data, and evaluated on the supplied
    test data.

    Args:
        trial: object exposing ``suggest_int`` / ``suggest_categorical``.
        X_train, y_train: data passed to ``model.fit``.
        X_test, y_test: data used to compute the returned score.
        model_cls: regressor class accepting the sampled keyword arguments.

    Returns:
        Root-mean-squared error of the predictions on ``X_test``.

    NOTE(review): the score is computed on the X_test/y_test passed in; if the
    caller hands over its final hold-out set, the tuned RMSE is optimistically
    biased - confirm against train_model.
    """
    # Sampling order is kept stable so a seeded sampler draws the same values.
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    regressor = model_cls(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1,
    )
    regressor.fit(X_train, y_train)

    squared_error = mean_squared_error(y_test, regressor.predict(X_test))
    return np.sqrt(squared_error)


### 80-20 split

In [None]:
# Extra-trees hyper-parameter search for the 80-20 split, 300 trials driven by objective_xrt.
# NOTE(review): 0.2 is presumably train_model's test fraction - confirm against its definition.
xrt_20 = train_model(
    ExtraTreesRegressor,
    X,
    y,
    0.2,
    objective_xrt,
    "XRT",
    n_trials=300,
)

[I 2025-06-10 11:34:51,872] A new study created in memory with name: no-name-692ad18a-7ca9-40aa-8b90-577c00ebe809
[I 2025-06-10 11:34:52,041] Trial 0 finished with value: 6.556589495744115 and parameters: {'n_estimators': 107, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': None}. Best is trial 0 with value: 6.556589495744115.
[I 2025-06-10 11:34:52,278] Trial 1 finished with value: 10.4192511161507 and parameters: {'n_estimators': 160, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 0 with value: 6.556589495744115.
[I 2025-06-10 11:34:52,796] Trial 2 finished with value: 6.6836599706990825 and parameters: {'n_estimators': 347, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 0 with value: 6.556589495744115.
[I 2025-06-10 11:34:53,334] Trial 3 finished with value: 6.827349757187714 and parameters: {'n_estimators': 370, 'max_depth': 16, 'min_samples

✅ Best params for XRT: {'n_estimators': 500, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': None}


📊 XRT RMSE: 2.5147


 Best params for XRT: {'n_estimators': 500, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Refit extra-trees on the 80-20 split with pinned hyper-parameters (no objective, n_trials=0),
# then store its test-set and training-set predictions in the 'xrt' column of each frame.
# The label is echoed in train_model's printed summary, so it uses "XRT" like the
# search cell for this model rather than "XGBoost".
# NOTE(review): df_test/df_train are assumed to be row-aligned with this split - confirm.
params_xrt_20 = {
    'n_estimators': 500,
    'max_depth': 15,
    'min_samples_split': 3,
    'min_samples_leaf': 1,
    'max_features': None,
}
xrt_20 = train_model(
    ExtraTreesRegressor, X, y, 0.2, None, "XRT",
    n_trials=0,
    **params_xrt_20,
)
df_test['xrt'] = xrt_20['y_pred']
df_train['xrt'] = xrt_20['model'].predict(xrt_20['X_train'])

📊 XGBoost RMSE: 2.3566


In [None]:
# Display the training frame, which now carries one prediction column per refitted model.
df_train

Unnamed: 0,AR,L,phi,QW,Qa,h,Va,R,sigma,gbm,xgb,rf,catb,xrt
75,0.600000,4,20,13.8,31.999477,1.8800,20.12,0.006970,-2.652349,-2.625629,-1.805584,-2.460530,-2.658484,-2.579392
442,1.000000,6,30,7.2,4.166930,7.6656,2.62,0.009108,8.551258,8.524340,8.208776,8.438880,8.556668,8.518850
15,0.600000,4,10,3.2,4.007887,0.9400,2.52,0.004000,4.712516,4.774893,4.730857,4.856677,4.734766,4.710889
325,1.666667,4,60,18.4,3.292193,3.2400,2.07,0.012050,5.227411,5.220242,5.210557,5.468330,5.193445,5.229596
388,1.000000,6,15,10.0,23.681522,6.5610,14.89,0.006653,-1.063138,-1.073187,-1.118477,-0.994750,-1.118550,-1.046462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.600000,6,30,6.6,2.671925,2.8200,1.68,0.009260,10.289565,10.311876,10.445992,10.307833,10.203969,10.274233
270,1.666667,4,30,16.5,21.184545,1.6200,13.32,0.006910,-0.881867,-0.863431,-1.368138,-0.809074,-0.914021,-0.879191
348,1.000000,2,10,10.8,50.257628,5.8785,31.60,0.005542,-3.396751,-3.381058,-3.094707,-3.243805,-3.302909,-3.337046
435,1.000000,4,30,7.2,3.991983,7.6656,2.51,0.009108,8.595263,8.666544,8.814178,8.620855,8.580244,8.577713


In [None]:
# Hold-out metrics for xrt_20: the stored RMSE, then R^2 of y_pred against y_test.
print(
    xrt_20['rmse'],
    r2_score(xrt_20['y_test'], xrt_20['y_pred']),
    sep='\n',
)

2.3566426952006303
0.9838359727300842


### 75-25 split

In [None]:
# Extra-trees hyper-parameter search for the 75-25 split, 300 trials driven by objective_xrt.
# NOTE(review): 0.25 is presumably train_model's test fraction - confirm against its definition.
xrt_25 = train_model(
    ExtraTreesRegressor,
    X,
    y,
    0.25,
    objective_xrt,
    "XRT",
    n_trials=300,
)

[I 2025-06-10 11:40:15,299] A new study created in memory with name: no-name-f0bb4e11-e43d-41cc-8068-ea0dfa06e72a
[I 2025-06-10 11:40:15,633] Trial 0 finished with value: 5.765816528299789 and parameters: {'n_estimators': 239, 'max_depth': 16, 'min_samples_split': 5, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 0 with value: 5.765816528299789.
[I 2025-06-10 11:40:16,208] Trial 1 finished with value: 8.194388140231283 and parameters: {'n_estimators': 414, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 0 with value: 5.765816528299789.
[I 2025-06-10 11:40:16,789] Trial 2 finished with value: 6.603649355175614 and parameters: {'n_estimators': 399, 'max_depth': 15, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 5.765816528299789.
[I 2025-06-10 11:40:17,372] Trial 3 finished with value: 4.4892127890643705 and parameters: {'n_estimators': 411, 'max_depth': 18, 'min_samp

✅ Best params for XRT: {'n_estimators': 133, 'max_depth': 19, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': None}


📊 XRT RMSE: 2.5143


Best params for XRT: {'n_estimators': 133, 'max_depth': 19, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Hold-out metrics for xrt_25: the stored RMSE, then R^2 of y_pred against y_test.
print(
    xrt_25['rmse'],
    r2_score(xrt_25['y_test'], xrt_25['y_pred']),
    sep='\n',
)

2.514268347399359
0.9821230187687087


### 70-30 split

In [None]:
# Extra-trees hyper-parameter search for the 70-30 split, 300 trials driven by objective_xrt.
# NOTE(review): 0.3 is presumably train_model's test fraction - confirm against its definition.
xrt_30 = train_model(
    ExtraTreesRegressor,
    X,
    y,
    0.3,
    objective_xrt,
    "XRT",
    n_trials=300,
)

[I 2025-06-10 11:42:56,082] A new study created in memory with name: no-name-cdbc7959-bf9f-4f36-a079-5120fcd87d7b
[I 2025-06-10 11:42:56,394] Trial 0 finished with value: 9.119034078623018 and parameters: {'n_estimators': 203, 'max_depth': 19, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 0 with value: 9.119034078623018.
[I 2025-06-10 11:42:56,987] Trial 1 finished with value: 5.649333752537292 and parameters: {'n_estimators': 405, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 1 with value: 5.649333752537292.
[I 2025-06-10 11:42:57,154] Trial 2 finished with value: 11.857191462550862 and parameters: {'n_estimators': 105, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 1 with value: 5.649333752537292.
[I 2025-06-10 11:42:57,707] Trial 3 finished with value: 6.001382146609643 and parameters: {'n_estimators': 360, 'max_depth': 12, 'min_samp

✅ Best params for XRT: {'n_estimators': 109, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 XRT RMSE: 2.1309


Best params for XRT: {'n_estimators': 109, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Hold-out metrics for xrt_30: the stored RMSE, then R^2 of y_pred against y_test.
print(
    xrt_30['rmse'],
    r2_score(xrt_30['y_test'], xrt_30['y_pred']),
    sep='\n',
)

2.130884627783374
0.988058733149753


### 60-40 split

In [None]:
# Extra-trees hyper-parameter search for the 60-40 split, 300 trials driven by objective_xrt.
# NOTE(review): 0.4 is presumably train_model's test fraction - confirm against its definition.
xrt_40 = train_model(
    ExtraTreesRegressor,
    X,
    y,
    0.4,
    objective_xrt,
    "XRT",
    n_trials=300,
)

[I 2025-06-10 11:46:31,954] A new study created in memory with name: no-name-601b9e12-6aed-46d4-9d7c-b2f012675282
[I 2025-06-10 11:46:32,604] Trial 0 finished with value: 7.636629052574809 and parameters: {'n_estimators': 449, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 0 with value: 7.636629052574809.
[I 2025-06-10 11:46:33,245] Trial 1 finished with value: 9.717030717227132 and parameters: {'n_estimators': 469, 'max_depth': 3, 'min_samples_split': 10, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 0 with value: 7.636629052574809.
[I 2025-06-10 11:46:33,907] Trial 2 finished with value: 5.719784906026775 and parameters: {'n_estimators': 440, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 2 with value: 5.719784906026775.
[I 2025-06-10 11:46:34,195] Trial 3 finished with value: 3.065184534200874 and parameters: {'n_estimators': 183, 'max_depth': 8, 'min_samples_s

✅ Best params for XRT: {'n_estimators': 495, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None}


📊 XRT RMSE: 2.0911


 Best params for XRT: {'n_estimators': 495, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Hold-out metrics for xrt_40: the stored RMSE, then R^2 of y_pred against y_test.
print(
    xrt_40['rmse'],
    r2_score(xrt_40['y_test'], xrt_40['y_pred']),
    sep='\n',
)

2.0911477595510464
0.9882388459414551


In [None]:
# Write the xrt_40 test-set predictions to a comma-delimited text file.
np.savetxt(
    'xrt_40_pred.csv',
    xrt_40['y_pred'],
    delimiter=',',
)

## Random Forest

In [None]:
def objective_rf(trial, X_train, y_train, X_test, y_test, model_cls):
    """Optuna objective: sample forest hyperparameters, fit once, return RMSE.

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_categorical``
            (an Optuna trial in this notebook).
        X_train, y_train: Data passed to the estimator's ``fit``.
        X_test, y_test: Data the fitted estimator is scored on.
        model_cls: Estimator class. It is instantiated with the sampled
            hyperparameters plus ``random_state=42`` and ``n_jobs=-1``.

    Returns:
        Root-mean-squared error between ``y_test`` and the predictions
        for ``X_test``.
    """
    # Suggestions are requested in the same order as the original dict
    # literal so the sequence of calls made on the trial is unchanged.
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    estimator = model_cls(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1,
    )
    estimator.fit(X_train, y_train)

    # NOTE(review): the score is computed on the split passed in as the test
    # set, so whatever the caller passes here is what the search optimises.
    mse = mean_squared_error(y_test, estimator.predict(X_test))
    return np.sqrt(mse)


### 80-20 split

In [None]:
# Random Forest hyperparameter search for the 80-20 split section.
# NOTE(review): 0.2 is presumably the test fraction — confirm against train_model.
# `rf_20` is rebound by the fixed-parameter refit in a later cell.
rf_20 = train_model(
    RandomForestRegressor,
    X,
    y,
    0.2,
    objective_rf,
    "RF",
    n_trials=300,
)

[I 2025-06-10 11:50:52,043] A new study created in memory with name: no-name-2b747a27-5eee-4b2f-8937-c29ed0124667
[I 2025-06-10 11:50:52,434] Trial 0 finished with value: 5.800361879511057 and parameters: {'n_estimators': 176, 'max_depth': 4, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 5.800361879511057.
[I 2025-06-10 11:50:53,142] Trial 1 finished with value: 6.748541794951315 and parameters: {'n_estimators': 354, 'max_depth': 4, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 0 with value: 5.800361879511057.
[I 2025-06-10 11:50:53,398] Trial 2 finished with value: 4.648561615577745 and parameters: {'n_estimators': 116, 'max_depth': 13, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 2 with value: 4.648561615577745.
[I 2025-06-10 11:50:55,002] Trial 3 finished with value: 2.583513015185168 and parameters: {'n_estimators': 457, 'max_depth': 20, 'min_sample

✅ Best params for RF: {'n_estimators': 475, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 RF RMSE: 1.0437


Best params for RF: {'n_estimators': 475, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Refit RF for the 80-20 split with fixed hyperparameters (the values reported
# as "Best params for RF" by the search cell) and no search (n_trials=0).
# NOTE(review): the recorded RMSE of this refit (1.0440) differs slightly from
# the search result (1.0437); presumably the refit is seeded differently from
# objective_rf — confirm inside train_model.
params_rf_20 = dict(
    n_estimators=475,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
)
rf_20 = train_model(
    RandomForestRegressor,
    X,
    y,
    0.2,
    None,
    "rf",
    n_trials=0,
    **params_rf_20,
)
# Attach RF predictions as an 'rf' column on the test and train frames.
df_test['rf'] = rf_20['y_pred']
df_train['rf'] = rf_20['model'].predict(rf_20['X_train'])


📊 rf RMSE: 1.0440


In [None]:
# Hold-out metrics for the 80-20 RF refit: stored RMSE, then R².
print(
    rf_20['rmse'],
    r2_score(rf_20['y_test'], rf_20['y_pred']),
    sep='\n',
)

1.0439802540141483
0.9968279015310852


### 75-25 split

In [None]:
# Random Forest hyperparameter search for the 75-25 split section.
# NOTE(review): 0.25 is presumably the test fraction — confirm against train_model.
rf_25 = train_model(
    RandomForestRegressor,
    X,
    y,
    0.25,
    objective_rf,
    "RF",
    n_trials=300,
)

[I 2025-06-10 11:56:37,539] A new study created in memory with name: no-name-0e76a160-a056-498a-ad0f-9fb9a9ededbb
[I 2025-06-10 11:56:38,587] Trial 0 finished with value: 2.246030229028692 and parameters: {'n_estimators': 456, 'max_depth': 14, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': None}. Best is trial 0 with value: 2.246030229028692.
[I 2025-06-10 11:56:39,458] Trial 1 finished with value: 3.7961421778229623 and parameters: {'n_estimators': 417, 'max_depth': 11, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 0 with value: 2.246030229028692.
[I 2025-06-10 11:56:40,500] Trial 2 finished with value: 10.818167230060569 and parameters: {'n_estimators': 490, 'max_depth': 19, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 0 with value: 2.246030229028692.
[I 2025-06-10 11:56:41,279] Trial 3 finished with value: 3.6610926669286386 and parameters: {'n_estimators': 179, 'max_depth': 6, 'min_sampl

✅ Best params for RF: {'n_estimators': 223, 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 RF RMSE: 1.9210


Best params for RF: {'n_estimators': 223, 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Hold-out metrics for the 75-25 RF run: stored RMSE, then R².
print(
    rf_25['rmse'],
    r2_score(rf_25['y_test'], rf_25['y_pred']),
    sep='\n',
)

1.920963738292659
0.9895645961271149


### 70-30 split

In [None]:
# Random Forest hyperparameter search for the 70-30 split section.
# NOTE(review): 0.3 is presumably the test fraction — confirm against train_model.
rf_30 = train_model(
    RandomForestRegressor,
    X,
    y,
    0.3,
    objective_rf,
    "RF",
    n_trials=300,
)

[I 2025-06-10 12:01:24,272] A new study created in memory with name: no-name-2757c5ea-edce-40aa-9f33-ee17f9ece819
[I 2025-06-10 12:01:25,137] Trial 0 finished with value: 3.8060000623660164 and parameters: {'n_estimators': 350, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': None}. Best is trial 0 with value: 3.8060000623660164.
[I 2025-06-10 12:01:25,858] Trial 1 finished with value: 2.722653090667343 and parameters: {'n_estimators': 291, 'max_depth': 17, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 1 with value: 2.722653090667343.
[I 2025-06-10 12:01:26,590] Trial 2 finished with value: 10.333829578825194 and parameters: {'n_estimators': 322, 'max_depth': 18, 'min_samples_split': 4, 'min_samples_leaf': 9, 'max_features': None}. Best is trial 1 with value: 2.722653090667343.
[I 2025-06-10 12:01:26,914] Trial 3 finished with value: 10.724530328336805 and parameters: {'n_estimators': 119, 'max_depth': 8, 'min_samples_

✅ Best params for RF: {'n_estimators': 494, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 RF RMSE: 1.7094


Best params for RF: {'n_estimators': 494, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Hold-out metrics for the 70-30 RF run: stored RMSE, then R².
print(
    rf_30['rmse'],
    r2_score(rf_30['y_test'], rf_30['y_pred']),
    sep='\n',
)

1.709389289149332
0.9923155553361421


### 60-40 split

In [None]:
# Random Forest hyperparameter search for the 60-40 split section.
# NOTE(review): 0.4 is presumably the test fraction — confirm against train_model.
rf_40 = train_model(
    RandomForestRegressor,
    X,
    y,
    0.4,
    objective_rf,
    "RF",
    n_trials=300,
)

[I 2025-06-10 12:07:20,600] A new study created in memory with name: no-name-bf2cd9c3-05a8-4769-beed-46026bd098f2
[I 2025-06-10 12:07:21,022] Trial 0 finished with value: 10.087619869201056 and parameters: {'n_estimators': 139, 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 8, 'max_features': None}. Best is trial 0 with value: 10.087619869201056.
[I 2025-06-10 12:07:21,465] Trial 1 finished with value: 7.108025613043324 and parameters: {'n_estimators': 155, 'max_depth': 17, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 1 with value: 7.108025613043324.
[I 2025-06-10 12:07:21,874] Trial 2 finished with value: 8.543342146254693 and parameters: {'n_estimators': 143, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 1 with value: 7.108025613043324.
[I 2025-06-10 12:07:22,632] Trial 3 finished with value: 6.254012235818292 and parameters: {'n_estimators': 228, 'max_depth': 18, 'min_sampl

✅ Best params for RF: {'n_estimators': 208, 'max_depth': 18, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


📊 RF RMSE: 2.2609


Best params for RF: {'n_estimators': 208, 'max_depth': 18, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}


In [None]:
# Hold-out metrics for the 60-40 RF run: stored RMSE, then R².
print(
    rf_40['rmse'],
    r2_score(rf_40['y_test'], rf_40['y_pred']),
    sep='\n',
)

2.260893527087054
0.986251962190871


## Saving files


In [None]:
# Write the train and test frames to CSV in the working directory,
# omitting the index column.
df_train.to_csv(path_or_buf='dim_training.csv', index=False)
df_test.to_csv(path_or_buf='dim_testing.csv', index=False)