In [1]:
import pandas as pd
import numpy as np
import os
import sys

sys.path.append("../..")
from analysis_functions import *

In [2]:
# Загрузка данных
X_stress_components_new = opener("X_stress_components_new", path_import="../../resourses")
X_strain_components_new = opener("X_strain_components_new", path_import="../../resourses")
y_stress_components_new = opener("y_stress_components_new", path_import="../../resourses")
y_strain_components_new = opener("y_strain_components_new", path_import="../../resourses")
# X_stress_components_other = opener('X_stress_components_other')
# X_strain_components_other = opener('X_strain_components_other')
# y_stress_components_other = opener('y_stress_components_other')
# y_strain_components_other = opener('y_strain_components_other')


../../resourses/X_stress_components_new.pkl
../../resourses/X_strain_components_new.pkl
../../resourses/y_stress_components_new.pkl
../../resourses/y_strain_components_new.pkl


In [3]:
component_num = 2

X, y = (X_stress_components_new[component_num], 
        y_stress_components_new[component_num])

## Простое обучение

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Разделение данных на обучающую, валидационную и тестовую выборки
cur_X_train, cur_X_test, cur_y_train, cur_y_test = split_transform_one_comp_train_test(
    X, y
)

# Обучение модели
model = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    loss="squared_error",
    random_state=RANDOM_STATE,
)
model.fit(cur_X_train, cur_y_train)

# Предсказание
cur_y_pred = model.predict(cur_X_test)

# Оценка качества
rmse = np.sqrt(mean_squared_error(cur_y_test, cur_y_pred))
print(f"RMSE test: {rmse:.2f}")

# На трейне
cur_y_pred_train = model.predict(cur_X_train)
rmse_train = np.sqrt(mean_squared_error(cur_y_train, cur_y_pred_train))
print(f"RMSE train: {rmse_train:.2f}")


RMSE test: 8.69
RMSE train: 8.66


## С оптимизацией гиперпараметров

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import cross_val_score

n_splits = 3
# Preparing datasets
cur_X_test, cur_y_test, val_list_X, val_list_y, train_list_X, train_list_y = (
    split_transform_one_comp_cv(X, y, n_splits=n_splits)
)

def do_optuna_for_boosting(X, y, n_trials=100, **kwargs):
    n_splits = kwargs.get("n_splits", 3)

    def optuna_boosting_val(trial):
        n_estimators = trial.suggest_int("n_estimators", 50, 300)
        max_depth = trial.suggest_int("max_depth", 1, 10)
        learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True)

        params = {
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "learning_rate": learning_rate,
            "random_state": RANDOM_STATE,
        }

        # Fitting and scoring `n_split` times
        errors = np.zeros((n_splits, 9))

        for split_idx in range(n_splits):
            regr = GradientBoostingRegressor(**params)

            cur_X_train = train_list_X[split_idx]
            cur_y_train = train_list_y[split_idx]

            cur_X_val = val_list_X[split_idx]
            cur_y_val = val_list_y[split_idx]

            cur_X_train, cur_y_train = clean_input_array(cur_X_train, cur_y_train)
            cur_X_val, cur_y_val = clean_input_array(cur_X_val, cur_y_val)

            regr.fit(cur_X_train, cur_y_train)

            #######  Validation  ########
            #  Prediction
            cur_prediction = regr.predict(cur_X_val)
            # Scoring
            errors[split_idx] = scorer(cur_y_val, cur_prediction, regr, cur_X_train)

        # Collect validation result
        val_metrics = choose_worst(errors)
        return_value = (
            val_metrics[-1] if pd.notnull(val_metrics[-1]) else +1e6
        )  # для rmse
        return return_value

    # Create a study object to optimize the objective
    study = optuna.create_study(direction="minimize")  # rmse
    study.optimize(optuna_boosting_val, n_trials=n_trials, n_jobs=-1)

    # Print the best hyperparameters found by Optuna
    best_params = study.best_params
    best_value = study.best_value
    print("Best Hyperparameters:", best_params)

    return best_params, cur_X_test, cur_y_test, best_value

# Use the function
best_params, cur_X_test, cur_y_test, best_value = do_optuna_for_boosting(X, y, n_splits=n_splits)

# Предсказание
model = GradientBoostingRegressor(**best_params, random_state=RANDOM_STATE)
model.fit(cur_X_train, cur_y_train)
cur_y_pred = model.predict(cur_X_test)

# Оценка качества
rmse = np.sqrt(mean_squared_error(cur_y_test, cur_y_pred))
print(f"RMSE test: {rmse:.2f}")

# На трейне
cur_y_pred_train = model.predict(cur_X_train)
rmse_train = np.sqrt(mean_squared_error(cur_y_train, cur_y_pred_train))
print(f"RMSE train: {rmse_train:.2f}")

[I 2025-03-02 00:27:03,149] A new study created in memory with name: no-name-801e570f-81fb-4805-8eb2-832af10c9dcc
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-1)
[I 2025-03-02 00:27:08,007] Trial 4 finished with value: 30.140614541444045 and parameters: {'n_estimators': 251, 'max_depth': 1, 'learning_rate': 0.0562816623488786}. Best is trial 4 with value: 30.140614541444045.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-1)
[I 2025-03-02 00:27:08,484] Trial 3 finished with value: 45.3705798446044 and parameters: {'n_estimators': 274, 'max_depth': 1, 'learning_rate': 0.0025043889423723583}. Best is trial 4 with value: 30.140614541444045.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-1)
[I 2025-03-02 00:27:08,955] Trial 6 finished with value: 45.9086041290469 and parameters: {'n_estimators': 87, 'max_depth': 5, 'learning_rate': 0.003251172923466756}. Best is trial 4 with value: 30.140614541444045.
  learning_rate = tria

Best Hyperparameters: {'n_estimators': 254, 'max_depth': 10, 'learning_rate': 0.07123668319491888}
RMSE test: 1.06
RMSE train: 0.38
