# Import bibliotek

In [1]:
import itertools
import typing
import warnings

import numpy as np
import optuna
import pandas as pd
import plotly.graph_objs as go
from scipy.special import digamma
from sklearn.base import clone
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import NearestNeighbors, KDTree
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
warnings.filterwarnings('ignore')
SEED = 17

# Ładowanie danych

In [2]:
# Load the dataset (path is relative to the notebook; presumably the output of the EDA step).
data=pd.read_csv("../data/data_eda.csv")
# Drop the 'Unnamed: 0' column so it is not picked up as a model feature below.
data=data.drop(columns=['Unnamed: 0'])
# Every remaining column except the target 'stars' is used as an input feature.
features=data.columns.tolist()
features.remove('stars')
target='stars'
data

Unnamed: 0,pages,stars,reviews,series,mix,character,plot,funny,lighthearted,emotional,...,author_stars,Fiction,Nonfiction,Literary,Fantasy,Crime,Social,Children,Romans,Realism
0,273,4.00,2017,0,0.44,0.51,0.02,0.27,0.37,0.91,...,4.305000,1,1,0,0,0,1,0,1,1
1,302,3.78,7330,0,0.39,0.42,0.17,0.03,0.01,0.18,...,3.670000,1,0,0,0,1,0,0,0,0
2,400,4.15,16761,0,0.51,0.39,0.08,0.02,0.01,0.88,...,0.000000,1,0,1,0,0,0,0,0,0
3,459,4.16,2128,1,0.48,0.10,0.40,0.04,0.02,0.07,...,0.000000,1,0,0,1,0,0,0,0,0
4,160,3.65,6634,1,0.28,0.16,0.54,0.92,0.73,0.00,...,4.115000,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6925,432,4.15,30643,0,0.48,0.05,0.46,0.00,0.00,0.40,...,3.856667,1,0,0,1,1,0,0,0,0
6926,352,3.62,1058,0,0.55,0.13,0.30,0.15,0.10,0.25,...,3.700000,1,0,0,1,0,1,0,0,0
6927,535,3.88,30975,1,0.45,0.08,0.45,0.14,0.19,0.31,...,3.870000,1,0,0,1,0,0,1,0,0
6928,472,3.88,5914,1,0.64,0.12,0.22,0.07,0.00,0.36,...,3.660000,1,0,0,1,0,0,1,0,0


$\text{Podział danych na zbiór treningowy i testowy}$

In [3]:
# Hold out 20% of the rows as the test set; SEED makes the split reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=SEED)

In [4]:
def perform_cv(X: pd.DataFrame, y: pd.Series, algorithm: typing.Any, cv: typing.Any = KFold(n_splits=5, shuffle=True, random_state=SEED), metric: typing.Any = mean_squared_error) -> typing.Tuple[float, float]:
    """
    Run cross-validation and return the mean train and validation scores.

    The algorithm is refitted on every training fold. The square root of
    `metric` is recorded for each fold, so with the default
    `mean_squared_error` the reported values are RMSE.

    Args:
        X (pd.DataFrame): input data
        y (pd.Series): target data
        algorithm (typing.Any): estimator exposing `fit` and `predict`
        cv (typing.Any): cross-validation strategy exposing `split(X, y)`
        metric (typing.Any): squared-error metric called as `metric(y_true, y_pred)`

    Returns:
        typing.Tuple[float, float]: (mean train score, mean validation score)
    """
    train_scores = []
    validation_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        algorithm.fit(X_train, y_train)
        y_train_pred = algorithm.predict(X_train)
        y_val_pred = algorithm.predict(X_val)
        # The root is taken here instead of passing `squared=False`: that keyword
        # is deprecated in scikit-learn 1.4 and scheduled for removal in 1.6.
        train_scores.append(np.sqrt(metric(y_train, y_train_pred)))
        validation_scores.append(np.sqrt(metric(y_val, y_val_pred)))
    return np.mean(train_scores), np.mean(validation_scores)

def test_evaluation(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, algorithm: typing.Any, metric: typing.Any = mean_squared_error) -> float:
    """
    Fit the algorithm on the train data and score it on the test data.

    The square root of `metric` is returned, so with the default
    `mean_squared_error` the result is RMSE.

    Args:
        X_train (pd.DataFrame): input train data
        y_train (pd.Series): target train data
        X_test (pd.DataFrame): input test data
        y_test (pd.Series): target test data
        algorithm (typing.Any): estimator exposing `fit` and `predict`
        metric (typing.Any): squared-error metric called as `metric(y_true, y_pred)`

    Returns:
        float: test score
    """
    algorithm.fit(X_train, y_train)
    y_test_pred = algorithm.predict(X_test)
    # The root is taken here instead of passing `squared=False`: that keyword
    # is deprecated in scikit-learn 1.4 and scheduled for removal in 1.6.
    return np.sqrt(metric(y_test, y_test_pred))

## Base score

In [5]:
# Baseline: MLPRegressor with default hyperparameters, trained on unscaled features.
model = MLPRegressor(random_state=SEED)
# Mean train / validation score over the CV folds of the training split.
train_scores, validation_scores = perform_cv(train_data[features], train_data[target], model)
# Refit on the whole training split and score once on the held-out test split.
test_score = test_evaluation(train_data[features], train_data[target], test_data[features], test_data[target], model)
print("Średni błąd RMSE na zbiorze treningowym: {:.5f}".format(train_scores))
print("Średni błąd RMSE na zbiorze walidacyjnym: {:.5f}".format(validation_scores))
print("Błąd RMSE na zbiorze testowym: {:.5f}".format(test_score))

Średni błąd RMSE na zbiorze treningowym: 2.16058
Średni błąd RMSE na zbiorze walidacyjnym: 2.06503
Błąd RMSE na zbiorze testowym: 1.87393


$\text{Wyniki dla modelu bazowego wyglądają bardzo słabo w porównaniu do pozostałych algorytmów.}$<p>
$\text{Warto jednak zaznaczyć, że w przypadku sieci neuronowych, podobnie jak w KNN, należy znormalizować dane, ponieważ algorytmy te są wrażliwe na skalę danych.}$<p>
$\text{W celu normalizacji danych przetestujemy kilka różnych narzędzi.}$<p>
$\text{Transformacja zostanie przeprowadzona jedynie na zmiennych ciągłych.}$<p>
$\text{Ponieważ informacje na temat rozkładu zmiennych powinny być znane tylko dla danych treningowych, to zmodyfikujemy funkcje ewaluacyjne.}$<p>
$\text{Aby była możliwość porównania wyników z tymi uzyskanymi dla pozostałych algorytmów, przed porównaniem predykcji z wartościami prawdziwymi, dokonamy odwrotnej transformacji, w celu uzyskania pierwotnej skali danych.}$

In [6]:
def perform_cv_scaling(X: pd.DataFrame, y: pd.Series, algorithm: typing.Any, cv: typing.Any = KFold(n_splits=5, shuffle=True, random_state=SEED), metric: typing.Any = mean_squared_error, scaler: typing.Any = StandardScaler(), features_to_scale: typing.Optional[typing.List[str]] = None, target_to_scale: bool = False) -> typing.Tuple[float, float]:
    """
    Run cross-validation with per-fold scaling and return the mean scores.

    For every fold the scaler is fitted on the training part only and then
    applied to the validation part. When the target is scaled, predictions
    are mapped back to the original target scale before scoring and are
    compared with the untouched targets. The square root of `metric` is
    recorded, so the default `mean_squared_error` yields RMSE.

    Args:
        X (pd.DataFrame): input data (not modified)
        y (pd.Series): target data
        algorithm (typing.Any): estimator exposing `fit` and `predict`
        cv (typing.Any): cross-validation strategy exposing `split(X, y)`
        metric (typing.Any): squared-error metric called as `metric(y_true, y_pred)`
        scaler (typing.Any): scikit-learn scaler used as a template; it is cloned,
            so the passed object is never fitted
        features_to_scale (typing.List[str]): columns to scale; None or an empty
            list leaves all features unscaled
        target_to_scale (bool): whether to scale target data

    Returns:
        typing.Tuple[float, float]: (mean train score, mean validation score)
    """
    train_scores = []
    validation_scores = []
    for train_idx, val_idx in cv.split(X, y):
        # Explicit copies: the scaled columns are written into fold-local frames.
        X_train, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        if features_to_scale:
            # Separate clone for the features so it is not refitted on the target.
            feature_scaler = clone(scaler)
            X_train[features_to_scale] = feature_scaler.fit_transform(X_train[features_to_scale])
            X_val[features_to_scale] = feature_scaler.transform(X_val[features_to_scale])
        y_train_fit = y_train
        if target_to_scale:
            target_scaler = clone(scaler)
            y_train_fit = target_scaler.fit_transform(y_train.values.reshape(-1, 1)).ravel()
        algorithm.fit(X_train, y_train_fit)
        y_train_pred = algorithm.predict(X_train)
        y_val_pred = algorithm.predict(X_val)
        if target_to_scale:
            # Only predictions are inverse-transformed; the true targets are never
            # round-tripped through the scaler (lossy for QuantileTransformer).
            y_train_pred = target_scaler.inverse_transform(y_train_pred.reshape(-1, 1)).ravel()
            y_val_pred = target_scaler.inverse_transform(y_val_pred.reshape(-1, 1)).ravel()
        # Root taken here instead of `squared=False` (deprecated in scikit-learn 1.4).
        train_scores.append(np.sqrt(metric(y_train, y_train_pred)))
        validation_scores.append(np.sqrt(metric(y_val, y_val_pred)))
    return np.mean(train_scores), np.mean(validation_scores)

def test_evaluation_scaling(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, algorithm: typing.Any, metric: typing.Any = mean_squared_error, scaler: typing.Any = StandardScaler(), features_to_scale: typing.Optional[typing.List[str]] = None, target_to_scale: bool = False) -> float:
    """
    Fit the algorithm on scaled train data and score it on the test data.

    The scaler is fitted on the train data only and then applied to the test
    data. When the target is scaled, predictions are mapped back to the
    original target scale and compared with the untouched test targets.
    The square root of `metric` is returned, so the default
    `mean_squared_error` yields RMSE.

    Args:
        X_train (pd.DataFrame): input train data (not modified)
        y_train (pd.Series): target train data
        X_test (pd.DataFrame): input test data (not modified)
        y_test (pd.Series): target test data
        algorithm (typing.Any): estimator exposing `fit` and `predict`
        metric (typing.Any): squared-error metric called as `metric(y_true, y_pred)`
        scaler (typing.Any): scikit-learn scaler used as a template; it is cloned,
            so the passed object is never fitted
        features_to_scale (typing.List[str]): columns to scale; None or an empty
            list leaves all features unscaled
        target_to_scale (bool): whether to scale target data

    Returns:
        float: test score
    """
    # Copy so the caller's frames are not overwritten with scaled values.
    X_train, X_test = X_train.copy(), X_test.copy()
    if features_to_scale:
        # Separate clone for the features so it is not refitted on the target.
        feature_scaler = clone(scaler)
        X_train[features_to_scale] = feature_scaler.fit_transform(X_train[features_to_scale])
        X_test[features_to_scale] = feature_scaler.transform(X_test[features_to_scale])
    y_train_fit = y_train
    if target_to_scale:
        target_scaler = clone(scaler)
        y_train_fit = target_scaler.fit_transform(y_train.values.reshape(-1, 1)).ravel()
    algorithm.fit(X_train, y_train_fit)
    y_test_pred = algorithm.predict(X_test)
    if target_to_scale:
        # Only predictions are inverse-transformed; y_test stays in its original scale.
        y_test_pred = target_scaler.inverse_transform(y_test_pred.reshape(-1, 1)).ravel()
    # Root taken here instead of `squared=False` (deprecated in scikit-learn 1.4).
    return np.sqrt(metric(y_test, y_test_pred))

In [7]:
# Columns treated as continuous; only these are handed to the scaler.
continous_features = ["pages", "reviews", "mix", "character", "plot", "funny", "lighthearted", "emotional", "hopeful", "inspiring", "relaxing", "tense", "sad", "reflective", "adventurous", "challenging", "informative", "mysterious", "dark", "author_count", "author_stars"]
# Candidate scalers and their display names, kept in the same order for zip().
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler(), QuantileTransformer()]
scalers_names = ["StandardScaler", "MinMaxScaler", "RobustScaler", "QuantileTransformer"]
# NOTE: `model` is the baseline MLPRegressor created in the base-score cell above.
# After the loop the global `scaler` is left bound to the last scaler in the list.
for scaler, scaler_name in zip(scalers, scalers_names):
    # The target is scaled too (target_to_scale=True); scores are computed after mapping back to the original scale.
    train_scores, validation_scores = perform_cv_scaling(train_data[features], train_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
    print("Średni błąd RMSE na zbiorze treningowym z użyciem skaler: {} wynosi: {:.5f}".format(scaler_name, train_scores))
    print("Średni błąd RMSE na zbiorze walidacyjnym z użyciem skaler: {} wynosi: {:.5f}".format(scaler_name, validation_scores))
    print()

Średni błąd RMSE na zbiorze treningowym z użyciem skaler: StandardScaler wynosi: 0.13254
Średni błąd RMSE na zbiorze walidacyjnym z użyciem skaler: StandardScaler wynosi: 0.20105

Średni błąd RMSE na zbiorze treningowym z użyciem skaler: MinMaxScaler wynosi: 0.20128
Średni błąd RMSE na zbiorze walidacyjnym z użyciem skaler: MinMaxScaler wynosi: 0.21464

Średni błąd RMSE na zbiorze treningowym z użyciem skaler: RobustScaler wynosi: 0.13630
Średni błąd RMSE na zbiorze walidacyjnym z użyciem skaler: RobustScaler wynosi: 0.19931

Średni błąd RMSE na zbiorze treningowym z użyciem skaler: QuantileTransformer wynosi: 0.19140
Średni błąd RMSE na zbiorze walidacyjnym z użyciem skaler: QuantileTransformer wynosi: 0.23032



$\text{Wyniki walidacji krzyżowej sugerują wykorzystanie narzędzia RobustScaler do transformacji zmiennych ciągłych.}$<p>
$\text{Zaletą RobustScaler jest zwiększona odporność na obserwacje odstające, co jest istotne w przypadku niektórych zmiennych ciągłych występujących w zbiorze danych (np.: author\_count, reviews, czy pages).}$<p>
$\text{Sprawdźmy wyniki na zbiorze testowym po dokonaniu standaryzacji.}$

In [8]:
# Fresh default model evaluated with RobustScaler; `scaler` is reused by the cells below.
model = MLPRegressor(random_state=SEED)
scaler = RobustScaler()
# Cross-validated scores on the training split, with scaling done inside each fold.
train_scores, validation_scores = perform_cv_scaling(train_data[features], train_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
# Single evaluation on the held-out test split after fitting on the full training split.
test_score = test_evaluation_scaling(train_data[features], train_data[target], test_data[features], test_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
print("Średni błąd RMSE na zbiorze treningowym: {:.5f}".format(train_scores))
print("Średni błąd RMSE na zbiorze walidacyjnym: {:.5f}".format(validation_scores))
print("Błąd RMSE na zbiorze testowym: {:.5f}".format(test_score))

Średni błąd RMSE na zbiorze treningowym: 0.13630
Średni błąd RMSE na zbiorze walidacyjnym: 0.19931
Błąd RMSE na zbiorze testowym: 0.20221


$\text{Wyniki uległy znacznej poprawie, natomiast warto odnotować, że sieć została przetrenowana.}$<p>
$\text{W kolejnych etapach będziemy starali się dokonać pewnej regularyzacji w celu możliwego zredukowania overfittingu.}$

# Optymalizacja

## hidden_layer_sizes

$\text{W przypadku sieci neuronowych, jednym z najważniejszych parametrów jest liczba warstw ukrytych oraz liczba neuronów w każdej z warstw.}$<p>
$\text{Zbyt mała liczba neuronów może prowadzić do underfittingu, natomiast zbyt duża do overfittingu oraz długiego czasu uczenia.}$<p>
$\text{Oczywiście dokładne znalezienie optymalnego rozwiązania jest niemal niemożliwe, dlatego spróbujemy zaledwie z kilkoma różnymi wartościami tego parametru.}$

In [9]:
# Candidates: one layer of 50/100/150 units, plus every ordered 2- and 3-layer
# arrangement of those sizes without repetition (itertools.permutations).
hidden_layer_sizes = [50,100,150] + list(itertools.permutations([50,100,150],2)) + list(itertools.permutations([50,100,150],3))
for hidden_layer_size in hidden_layer_sizes:
    # All other hyperparameters stay at their defaults; `scaler` comes from the cell above.
    model = MLPRegressor(hidden_layer_sizes=hidden_layer_size, random_state=SEED)
    train_scores, validation_scores = perform_cv_scaling(train_data[features], train_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
    print("Hidden layer sizes: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(hidden_layer_size, train_scores, validation_scores))

Hidden layer sizes: 50; RMSE train: 0.15853; RMSE validation: 0.20207
Hidden layer sizes: 100; RMSE train: 0.13630; RMSE validation: 0.19931
Hidden layer sizes: 150; RMSE train: 0.12169; RMSE validation: 0.20307
Hidden layer sizes: (50, 100); RMSE train: 0.09975; RMSE validation: 0.21803
Hidden layer sizes: (50, 150); RMSE train: 0.08948; RMSE validation: 0.22231
Hidden layer sizes: (100, 50); RMSE train: 0.08854; RMSE validation: 0.22334
Hidden layer sizes: (100, 150); RMSE train: 0.06749; RMSE validation: 0.22186
Hidden layer sizes: (150, 50); RMSE train: 0.07765; RMSE validation: 0.22214
Hidden layer sizes: (150, 100); RMSE train: 0.06510; RMSE validation: 0.21836
Hidden layer sizes: (50, 100, 150); RMSE train: 0.04911; RMSE validation: 0.23418
Hidden layer sizes: (50, 150, 100); RMSE train: 0.04165; RMSE validation: 0.23364
Hidden layer sizes: (100, 50, 150); RMSE train: 0.05390; RMSE validation: 0.23866
Hidden layer sizes: (100, 150, 50); RMSE train: 0.04763; RMSE validation: 0.22

$\text{Widzimy, że wraz ze zwiększaniem liczby neuronów oraz warstw, wyniki na zbiorze treningowym ulegają poprawie.}$<p>
$\text{Niestety, powoduje to również zwiększenie overfittingu.}$<p>
$\text{Wygląda na to, że optymalna będzie zaledwie jedna warstwa w sieci.}$<p>
$\text{Na dalszym etapie, podczas tuningowania całej sieci, testować będziemy wyniki dla wartości w okolicach 100 neuronów w pojedynczej warstwie.}$

## activation

$\text{Kolejnym ważnym parametrem jest funkcja aktywacji.}$<p>
$\text{Odpowiada ona za to, jaka transformacja zostanie zastosowana na danych wychodzących z warstwy ukrytej.}$<p>
$\text{Dla sieci neuronowych dostępne są różne funkcje aktywacji, poniżej przetestujemy następujące: identity, logistic, tanh, relu.}$

In [10]:
# Compare activation functions with the hidden layer fixed at 100 units.
activations = ['identity', 'logistic', 'tanh', 'relu']
for activation in activations:
    model = MLPRegressor(hidden_layer_sizes=100, activation=activation, random_state=SEED)
    train_scores, validation_scores = perform_cv_scaling(train_data[features], train_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
    print("Activation: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(activation, train_scores, validation_scores))

Activation: identity; RMSE train: 0.22218; RMSE validation: 0.22367
Activation: logistic; RMSE train: 0.19598; RMSE validation: 0.20402
Activation: tanh; RMSE train: 0.15694; RMSE validation: 0.19697
Activation: relu; RMSE train: 0.13630; RMSE validation: 0.19931


$\text{Najlepsze wyniki na zbiorze walidacyjnym uzyskano w przypadku funkcji aktywacji tanh.}$

## solver

$\text{Parametr solver odpowiada za algorytm optymalizacji wag - za to jak będą one aktualizowane w trakcie uczenia.}$<p>
$\text{Podobnie jak w przypadku funkcji aktywacji, dostępne są różne algorytmy, poniżej przetestujemy następujące: lbfgs, sgd, adam.}$

In [11]:
# Compare weight optimisers with 100 hidden units and tanh activation fixed.
solvers = ['lbfgs', 'sgd', 'adam']
for solver in solvers:
    model = MLPRegressor(hidden_layer_sizes=100, activation='tanh', solver=solver, random_state=SEED)
    train_scores, validation_scores = perform_cv_scaling(train_data[features], train_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
    print("Solver: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(solver, train_scores, validation_scores))

Solver: lbfgs; RMSE train: 0.15194; RMSE validation: 0.20313
Solver: sgd; RMSE train: 0.20379; RMSE validation: 0.20946
Solver: adam; RMSE train: 0.15694; RMSE validation: 0.19697


$\text{Najlepszy wynik na zbiorze walidacyjnym uzyskano dla solvera adam; na zbiorze treningowym nieco niższy błąd ma lbfgs (0.15194 wobec 0.15694).}$

## alpha

$\text{Parametr alpha odpowiada za regularyzację sieci.}$<p>
$\text{alpha służy do ograniczenia rozmiaru wag, co może pomóc w redukcji overfittingu.}$

In [12]:
# Regularisation strengths on a log grid: 1e-5, 1e-4, ..., 1e4.
alphas = [10**i for i in range(-5, 5)]
for alpha in alphas:
    model = MLPRegressor(hidden_layer_sizes=100, activation='tanh', solver='adam', alpha=alpha, random_state=SEED)
    train_scores, validation_scores = perform_cv_scaling(train_data[features], train_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
    print("Alpha: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(alpha, train_scores, validation_scores))

Alpha: 1e-05; RMSE train: 0.15691; RMSE validation: 0.19698
Alpha: 0.0001; RMSE train: 0.15694; RMSE validation: 0.19697
Alpha: 0.001; RMSE train: 0.15720; RMSE validation: 0.19690
Alpha: 0.01; RMSE train: 0.15915; RMSE validation: 0.19636
Alpha: 0.1; RMSE train: 0.17174; RMSE validation: 0.19542
Alpha: 1; RMSE train: 0.20312; RMSE validation: 0.20787
Alpha: 10; RMSE train: 0.22935; RMSE validation: 0.22987
Alpha: 100; RMSE train: 0.29388; RMSE validation: 0.29417
Alpha: 1000; RMSE train: 0.30130; RMSE validation: 0.30147
Alpha: 10000; RMSE train: 0.30130; RMSE validation: 0.30143


$\text{Wraz ze wzrostem wartości parametru alpha, wyniki na zbiorze treningowym ulegają pogorszeniu.}$<p>
$\text{W przypadku zbioru walidacyjnego najniższy błąd uzyskano dla wartości 0.1 (0.19542); wynik dla 0.01 jest zbliżony (0.19636) i to tę wartość stosujemy w kolejnych krokach.}$

## learning_rate

$\text{Parametr learning\_rate odpowiada za to jak silnie aktualizowane są wagi w trakcie uczenia.}$

In [13]:
# Initial learning rates on a log grid: 1e-5, 1e-4, ..., 1 (passed as learning_rate_init).
learning_rates = [10**i for i in range(-5, 1)]
for learning_rate in learning_rates:
    model = MLPRegressor(hidden_layer_sizes=100, activation='tanh', solver='adam', alpha=0.01, learning_rate_init=learning_rate, random_state=SEED)
    train_scores, validation_scores = perform_cv_scaling(train_data[features], train_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
    print("Learning rate: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(learning_rate, train_scores, validation_scores))

Learning rate: 1e-05; RMSE train: 0.22565; RMSE validation: 0.22739
Learning rate: 0.0001; RMSE train: 0.19920; RMSE validation: 0.20637
Learning rate: 0.001; RMSE train: 0.15915; RMSE validation: 0.19636
Learning rate: 0.01; RMSE train: 0.13676; RMSE validation: 0.21779
Learning rate: 0.1; RMSE train: 0.22215; RMSE validation: 0.25893
Learning rate: 1; RMSE train: 0.45501; RMSE validation: 0.46085


$\text{Dla wartości domyślnej: 0.001, wyniki na zbiorze walidacyjnym są najlepsze.}$

## max_iter

$\text{Parametr max\_iter odpowiada za maksymalną liczbę iteracji - czyli ile razy algorytm będzie uczył się na danych i optymalizował wagi.}$

In [14]:
# Iteration caps from 100 to 1000 in steps of 100.
max_iters = [i for i in range(100, 1001, 100)]
for max_iter in max_iters:
    model = MLPRegressor(hidden_layer_sizes=100, activation='tanh', solver='adam', alpha=0.01, learning_rate_init=0.001, max_iter=max_iter, random_state=SEED)
    train_scores, validation_scores = perform_cv_scaling(train_data[features], train_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
    print("Max iter: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(max_iter, train_scores, validation_scores))

Max iter: 100; RMSE train: 0.17769; RMSE validation: 0.19859
Max iter: 200; RMSE train: 0.15915; RMSE validation: 0.19636
Max iter: 300; RMSE train: 0.14982; RMSE validation: 0.19679
Max iter: 400; RMSE train: 0.14903; RMSE validation: 0.19764
Max iter: 500; RMSE train: 0.14903; RMSE validation: 0.19764
Max iter: 600; RMSE train: 0.14903; RMSE validation: 0.19764
Max iter: 700; RMSE train: 0.14903; RMSE validation: 0.19764
Max iter: 800; RMSE train: 0.14903; RMSE validation: 0.19764
Max iter: 900; RMSE train: 0.14903; RMSE validation: 0.19764
Max iter: 1000; RMSE train: 0.14903; RMSE validation: 0.19764


$\text{Od okolic 400 iteracji, model przestaje się uczyć, a wyniki na zbiorze walidacyjnym są stałe.}$<p>
$\text{Warto jednak zwrócić uwagę, że model zaczyna być przeuczony od około 200 iteracji.}$

# Tuning

In [15]:
class RandomSearchCV:
    """
    Optuna-based hyperparameter search for an estimator with a scikit-learn style API.

    Note: despite the class name, the study built in `create_study` uses Optuna's
    TPESampler rather than a purely random sampler.
    """

    def __init__(
        self,
        algorithm: typing.Any,
        metric: str,
        cv: typing.Any = KFold(n_splits=5, shuffle=True, random_state=17),
        n_trials: int = 100,
        seed: int = 17,
    ) -> None:
        """
        Initializes the RandomSearchCV class.

        Args:
            algorithm (typing.Any): algorithm to optimize (must expose `set_params`, `fit`
                and `predict`, plus `predict_proba` for the "roc_auc" metric).
            metric (str): one of "accuracy", "roc_auc", "mse", "rmse", "mae".
            cv (typing.Any): cross-validation strategy exposing `split(X)`.
            n_trials (int): number of trials to perform.
            seed (int): random seed passed to the Optuna sampler.

        Raises:
            ValueError: if `metric` is not one of the supported names.
        """
        self.algorithm = algorithm
        # metric name -> [scoring callable, required model output, optimisation direction]
        metrics = {
            "accuracy": [lambda y, y_pred: accuracy_score(y, y_pred), "preds", "maximize"],
            "roc_auc": [lambda y, y_pred: roc_auc_score(y, y_pred), "probs", "maximize"],
            "mse": [lambda y, y_pred: mean_squared_error(y, y_pred), "preds", "minimize"],
            "rmse": [
                # Root taken explicitly: `squared=False` is deprecated in scikit-learn 1.4.
                lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)),
                "preds", "minimize"
            ],
            "mae": [lambda y, y_pred: mean_absolute_error(y, y_pred), "preds", "minimize"],
        }
        if metric not in metrics:
            raise ValueError("Unsupported metric: {}".format(metric))
        self.eval_metric = metrics[metric][0]
        self.metric_type = metrics[metric][1]
        self.direction = metrics[metric][2]
        self.cv = cv
        self.n_trials = n_trials
        self.seed = seed

    def check_X(
        self, X: typing.Union[pd.DataFrame, pd.Series, np.ndarray]
    ) -> np.ndarray:
        """Check if X is pandas DataFrame, pandas Series or numpy array and convert it to numpy array.

        Args:
            X: (Union[pd.DataFrame, pd.Series, np.ndarray]): input data.

        Returns:
            X: (np.ndarray): converted input data.

        Raises:
            TypeError: if X is none of the accepted types.
        """
        if (
            not isinstance(X, pd.DataFrame)
            and not isinstance(X, pd.Series)
            and not isinstance(X, np.ndarray)
        ):
            raise TypeError(
                "Wrong type of X. It should be pandas DataFrame, pandas Series, numpy array."
            )
        X = np.array(X)
        if X.ndim == 1:
            # A 1-D input becomes a single row of shape (1, n).
            # NOTE(review): confirm a single sample (not a single feature column) is intended.
            X = X[None, :]
        return X

    def check_y(
        self, y: typing.Union[pd.DataFrame, pd.Series, np.ndarray]
    ) -> np.ndarray:
        """Check if y is pandas DataFrame, pandas Series or numpy array and convert it to numpy array.

        Args:
            y: (Union[pd.DataFrame, pd.Series, np.ndarray]): target data.

        Returns:
            y: (np.ndarray): converted target data with length-1 axes squeezed out.

        Raises:
            TypeError: if y is none of the accepted types.
        """
        if (
            not isinstance(y, pd.DataFrame)
            and not isinstance(y, pd.Series)
            and not isinstance(y, np.ndarray)
        ):
            raise TypeError(
                "Wrong type of y. It should be pandas DataFrame, pandas Series, numpy array."
            )
        y = np.array(y)
        if y.ndim != 1:
            y = y.squeeze()
        return y

    def check_for_object_columns(self, X: np.ndarray) -> np.ndarray:
        """Verify that every column of X is numeric.

        Args:
            X: (np.ndarray): input data.

        Returns:
            X: (np.ndarray): the input data as a numpy array.

        Raises:
            TypeError: if at least one column is not numeric.
        """
        X = pd.DataFrame(X)
        if X.select_dtypes(include=np.number).shape[1] != X.shape[1]:
            raise TypeError(
                "Your data contains object or string columns. Numeric data is obligated."
            )
        return np.array(X)

    def tune(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        params_grid: typing.Dict[str, typing.Tuple[str, typing.List[typing.Any]]],
        X_valid: pd.DataFrame = None,
        y_valid: pd.Series = None,
    ) -> typing.Dict[str, typing.Any]:
        """
        This method tunes the algorithm hyperparameters.

        When both `X_valid` and `y_valid` are given, every trial is scored on that
        hold-out set; otherwise every trial is scored with cross-validation on (X, y).

        Args:
            X (pd.DataFrame): input data.
            y (pd.Series): target data.
            params_grid (typing.Dict[str, typing.Tuple[str, typing.List[typing.Any]]]): hyperparameters grid
                mapping parameter name to (type, values); see `get_param` for the supported types.
            X_valid (pd.DataFrame): validation data (default is None).
            y_valid (pd.Series): validation labels (default is None).

        Returns:
            typing.Dict[str, typing.Any]: best hyperparameters.
        """
        self.params_grid = params_grid
        study = self.create_study()
        X = self.check_X(X)
        X = self.check_for_object_columns(X)
        y = self.check_y(y)
        if X_valid is not None and y_valid is not None:
            X_valid = self.check_X(X_valid)
            y_valid = self.check_y(y_valid)
            X_valid = self.check_for_object_columns(X_valid)
            study.optimize(
                lambda trial: self.objective(trial, X, y, X_valid, y_valid),
                n_trials=self.n_trials,
            )
        else:
            study.optimize(
                lambda trial: self.objective_cv(trial, X, y),
                n_trials=self.n_trials,
            )
        return study.best_params

    def create_study(self,) -> optuna.study.Study:
        """This method creates an optuna study object with a seeded TPE sampler.

        Returns:
            optuna.study.Study: optuna study object.
        """
        sampler = optuna.samplers.TPESampler(seed=self.seed)
        return optuna.create_study(direction=self.direction, sampler=sampler)

    def _suggest_params(self, trial: optuna.Trial) -> typing.Dict[str, typing.Any]:
        """Draw one value for every entry of the parameter grid for this trial."""
        return {
            param_name: self.get_param(trial, param_name, param_values)
            for param_name, param_values in self.params_grid.items()
        }

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """Return what the chosen metric needs: predictions, or positive-class probabilities."""
        if self.metric_type == "preds":
            return self.algorithm.predict(X)
        return self.algorithm.predict_proba(X)[:, 1]

    def objective(self, trial: optuna.Trial, X_train: np.ndarray, y_train: np.ndarray, X_valid: np.ndarray, y_valid: np.ndarray) -> float:
        """
        This method defines the objective function for optimization when validation data is provided.

        Args:
            trial (optuna.Trial): trial object.
            X_train (np.ndarray): input data.
            y_train (np.ndarray): target data.
            X_valid (np.ndarray): validation data.
            y_valid (np.ndarray): validation labels.

        Returns:
            float: Validation score.
        """
        params = self._suggest_params(trial)
        self.algorithm = self.algorithm.set_params(**params)
        # Fit and predict with the configured estimator, not a notebook-level global.
        self.algorithm.fit(X_train, y_train)
        y_valid_pred = self._predict(X_valid)
        return self.eval_metric(y_valid, y_valid_pred)

    def objective_cv(self, trial: optuna.Trial, X: np.ndarray, y: np.ndarray) -> float:
        """
        This method defines the objective function for optimization when validation data is not provided.

        Args:
            trial (optuna.Trial): trial object.
            X (np.ndarray): input data.
            y (np.ndarray): target data.

        Returns:
            float: Cross-validation score.
        """
        params = self._suggest_params(trial)
        self.algorithm = self.algorithm.set_params(**params)
        return self.perform_cv(X, y)

    def get_param(self, trial: optuna.Trial, param_name: str, param_values: typing.Tuple[str, typing.List[typing.Any]]) -> typing.Any:
        """
        This method converts the parameter values to the optuna parameter suggestion.

        Supported types: "int" and "float" take [low, high] bounds, "categorical"
        takes a list of choices, "constant" takes a single fixed value.

        Args:
            trial (optuna.Trial): trial object.
            param_name (str): parameter name.
            param_values (typing.Tuple[str, typing.List[typing.Any]]): parameter values.

        Returns:
            typing.Any: parameter suggestion.

        Raises:
            ValueError: if the parameter type is not one of the supported ones.
        """
        param_type, param_value = param_values
        if param_type == "int":
            return trial.suggest_int(param_name, low=param_value[0], high=param_value[1])
        elif param_type == "float":
            return trial.suggest_float(param_name, low=param_value[0], high=param_value[1])
        elif param_type == "categorical":
            return trial.suggest_categorical(param_name, param_value)
        elif param_type == "constant":
            # A one-element categorical, so the constant is included in study.best_params.
            return trial.suggest_categorical(param_name, [param_value])
        # Fail loudly instead of silently handing None to set_params.
        raise ValueError("Unsupported parameter type: {}".format(param_type))

    def perform_cv(self, X: np.ndarray, y: np.ndarray) -> float:
        """This method performs cross-validation.

        Args:
            X: (np.ndarray): input data.
            y: (np.ndarray): target data.

        Returns:
            float: mean validation score over the folds.
        """
        valid_scores = []
        for train_idx, valid_idx in self.cv.split(X):
            X_train_cv, X_valid_cv = X[train_idx], X[valid_idx]
            y_train_cv, y_valid_cv = y[train_idx], y[valid_idx]
            self.algorithm.fit(X_train_cv, y_train_cv)
            y_valid_pred = self._predict(X_valid_cv)
            valid_scores.append(self.eval_metric(y_valid_cv, y_valid_pred))
        return np.mean(valid_scores)

In [16]:
# Search space in the (type, spec) format read by RandomSearchCV.get_param:
# "int"/"float" give [low, high] bounds, "categorical" a list of choices,
# "constant" a single fixed value.
params_dict = {
    "hidden_layer_sizes": ("int", [50, 150]),
    "activation": ("categorical", ["tanh", "relu"]),
    "solver": ("categorical", ["sgd", "adam"]),
    "alpha": ("float", [0.005, 0.5]),
    "learning_rate_init": ("float", [0.0005, 0.005]),
    "max_iter": ("int", [150, 350]),
    "random_state": ("constant", SEED),
}
CV = RandomSearchCV(
    algorithm=MLPRegressor(random_state=SEED),
    metric="rmse",
    cv=KFold(n_splits=5, shuffle=True, random_state=SEED),
    n_trials=100,
    seed=SEED,
)
# NOTE(review): the scaler is fitted on the whole training split before tune() runs
# its own K-fold, so each validation fold is scaled with statistics that include it.
train_data_scaled = train_data.copy()
train_data_scaled[continous_features] = scaler.fit_transform(train_data_scaled[continous_features])
# The same `scaler` object is refitted here on the target (features are already transformed).
# Because the target is scaled, the RMSE values Optuna logs are in scaled units, not in stars.
train_data_scaled[target] = scaler.fit_transform(train_data_scaled[target].values.reshape(-1, 1)).ravel()
best_params = CV.tune(train_data_scaled[features], train_data_scaled[target], params_dict)

[I 2024-05-02 13:35:40,741] A new study created in memory with name: no-name-9ed01e7b-a9cd-4015-9a9a-a0690f3e2996


[I 2024-05-02 13:35:50,061] Trial 0 finished with value: 0.48971971643022466 and parameters: {'hidden_layer_sizes': 79, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.32988509327904847, 'learning_rate_init': 0.0033688440321963616, 'max_iter': 265, 'random_state': 17}. Best is trial 0 with value: 0.48971971643022466.
[I 2024-05-02 13:36:09,875] Trial 1 finished with value: 0.47838451048494984 and parameters: {'hidden_layer_sizes': 53, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.43925881044327497, 'learning_rate_init': 0.0007303714952920416, 'max_iter': 281, 'random_state': 17}. Best is trial 1 with value: 0.47838451048494984.
[I 2024-05-02 13:36:20,991] Trial 2 finished with value: 0.4865709381022877 and parameters: {'hidden_layer_sizes': 105, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.28294690813654877, 'learning_rate_init': 0.0022822134622116196, 'max_iter': 308, 'random_state': 17}. Best is trial 1 with value: 0.47838451048494984.
[I 2024-05-02 13:36:38,022] Trial 

In [17]:
#Model bazowy
model = MLPRegressor(random_state=SEED)
base_train_scores, base_validation_scores = perform_cv_scaling(train_data[features], train_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
base_test_score = test_evaluation_scaling(train_data[features], train_data[target], test_data[features], test_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
#Tuning
model = MLPRegressor(**best_params)
tuning_train_scores, tuning_validation_scores = perform_cv_scaling(train_data[features], train_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
tuning_test_score = test_evaluation_scaling(train_data[features], train_data[target], test_data[features], test_data[target], model, scaler=scaler, features_to_scale=continous_features, target_to_scale=True)
results = pd.DataFrame({
    "Model": ["Base", "Tuning"],
    "Train RMSE": [base_train_scores, tuning_train_scores],
    "Validation RMSE": [base_validation_scores, tuning_validation_scores],
    "Test RMSE": [base_test_score, tuning_test_score]
})
results.style.background_gradient(cmap='Reds', axis=0)

Unnamed: 0,Model,Train RMSE,Validation RMSE,Test RMSE
0,Base,0.136297,0.199309,0.202214
1,Tuning,0.157285,0.192559,0.196598


$\text{Optymalizacja hiperparametrów modelu pozwoliła na uzyskanie lepszych wyników na zbiorze walidacyjnym oraz testowym.}$<p>
$\text{Co prawda rezultaty są nieco gorsze niż dla Ensemblingu, ale różnica nie jest duża (Sieć neuronowa: 0.196598; Ensembling: 0.196046).}$