# Rumination prediction - averaged participants' epochs 

### Vectorization with defined channels

### Imports

In [None]:
%load_ext lab_black
import os
import pickle
from time import time
import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import cesium.featurize
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.decomposition import FastICA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

import sys

sys.path.append("..")
from utils import *

### Loading data

Load the EEG data together with the rumination questionnaire data. By default, `create_df_data` loads all information from the given file; to load only a subset, pass a list of the desired column labels from the CSV file.

In [None]:
# Epoch window bounds (presumably seconds relative to the event - TODO confirm).
tmin = -0.1
tmax = 0.6

# Sampling rate; combined with a duration in milliseconds to get a sample count.
signal_frequency = 256

# Values of the "marker" column used to pick the error / correct subsets.
ERROR, CORRECT = 0, 1

# Seed handed to the stochastic estimators (e.g. PCA) for reproducibility.
random_state = 0

In [None]:
df_name = "go_nogo_df_mean"
pickled_data_filename = f"../../data/{df_name}.pkl"
info_filename = "../../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# Build the DataFrame only when no cached pickle exists; otherwise reuse the cache.
# NOTE: unpickling executes arbitrary code - only load pickle files you created.
if not os.path.isfile(pickled_data_filename):
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(
        test_participants=False,
        info="all",
        personal=False,
        info_filename=info_filename,
    )
    epochs_df.name = df_name
    # Cache the freshly loaded data so the next run can skip the slow path.
    epochs_df.to_pickle("../../data/" + epochs_df.name + ".pkl")
    print("Done. Pickle file created")
else:
    print("Pickled file found. Loading pickled data...")
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")

#### Average participants' error and correct epochs

In [None]:
def _average_participant_group(group_df):
    """Reduce one (id, marker) group to its mean epoch and mean questionnaire score."""
    return pd.Series(
        {
            "epoch": np.mean(group_df["epoch"]),
            "Rumination Full Scale": np.mean(group_df["Rumination Full Scale"]),
        }
    )


# One row per (participant id, marker), keeping the groups in order of first appearance.
epochs_by_participant_and_marker = epochs_df.groupby(["id", "marker"], sort=False)
averaged_epochs_df = epochs_by_participant_and_marker.apply(
    _average_participant_group
).reset_index()

## Training and predictions

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from tempfile import mkdtemp


from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


import numpy as np
import scipy.stats

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

- For each band (frequency) of the CWT, the features given in the `feature_dict` parameter (e.g. std or mean) are computed.
- PCA is then computed on the flattened EEG channels and features (outer_components = N).
- The final feature vector has shape: outer_components, reduced from (channels * len(feature_dict) * frequencies).

#### Standard features for EEG analysis provided by Guo et al. (2012)

In [None]:
def std_signal(t, m, e):
    """Return the standard deviation of the values ``m``; ``t`` and ``e`` are unused."""
    spread = np.std(m)
    return spread


def abs_diffs_signal(t, m, e):
    """Return the sum of absolute differences between consecutive values of ``m``.

    ``t`` and ``e`` are unused.
    """
    consecutive_steps = np.diff(m)
    step_sizes = np.abs(consecutive_steps)
    return np.sum(step_sizes)


def mean_energy_signal(t, m, e):
    """Return the mean of the squared values of ``m``; ``t`` and ``e`` are unused."""
    squared_values = m ** 2
    return np.mean(squared_values)


def skew_signal(t, m, e):
    """Return the sample skewness of ``m`` via ``scipy.stats.skew``; ``t`` and ``e`` are unused."""
    skewness = scipy.stats.skew(m)
    return skewness


def mean_signal(t, m, e):
    """Return the arithmetic mean of the values ``m``; ``t`` and ``e`` are unused."""
    average = np.mean(m)
    return average

#### Additional score functions 

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Notes
    -----
    Entries of ``y_true`` equal to zero are not special-cased: the division
    produces ``inf``/``nan`` for them, which propagates into the result.
    """
    # Coerce to float arrays so plain Python lists are accepted as well.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

The adjusted R-squared increases only if the new term improves the model more than would be expected by chance. It decreases when a predictor improves the model by less than expected by chance.

https://www.statisticshowto.com/adjusted-r2/

In [None]:
def r2_adjusted_scorer(y_test, y_pred, p, n):
    """Return the adjusted R-squared of ``y_pred`` against ``y_test``.

    ``p`` is the number of predictors and ``n`` the number of samples; the
    plain R-squared is penalised by the factor ``(n - 1) / (n - p - 1)``.
    """
    plain_r2 = r2_score(y_test, y_pred)
    penalty = (n - 1) / (n - p - 1)
    return 1 - (1 - plain_r2) * penalty

### Regressions grid search

The pipeline allows the vectorization parameters to be tuned. The `base_steps` list contains all vectorization steps, including standardization of the data.

In the `rate_regression` function, which uses GridSearchCV, the cross-validation splitting strategy can be specified (default `cv=5`).
The results of the cross-validated search are in **grid_search.cv_results_** and the chosen model is in **grid_search.best_estimator_**.

In [None]:
# Marker value of the subset to train on, plus a readable label for the results table.
dataset = ERROR
if dataset == CORRECT:
    dataset_name = "correct"
else:
    dataset_name = "error"

In [None]:
# Rows of the averaged table whose marker matches the selected dataset.
selected_rows = averaged_epochs_df[averaged_epochs_df["marker"] == dataset]

# Features are the averaged epochs; the target is the rumination score.
X_train = np.array(selected_rows["epoch"].to_list())
y_train = np.array(selected_rows["Rumination Full Scale"].to_list())

In [None]:
# No held-out test set is defined in this notebook; both containers start empty.
X_test, y_test = [], []

#### Defined data transformers - custom data transformation steps

In [None]:
def ChannelExtractionTransformer(channel_list):
    """Build a transformer that keeps only the channels listed in ``channel_list``.

    The wrapped function expects 3-D input and selects along axis 1, keeping
    the order given in ``channel_list`` (assumed layout: epoch, channel,
    sample - as the variable names suggest; TODO confirm).
    """

    def _select_channels(X):
        # Bring the channel axis to the front so channels can be picked by index.
        by_channel = np.transpose(X, (1, 0, 2))
        picked = np.array([by_channel[channel] for channel in channel_list])
        # Restore the original axis order.
        return np.transpose(picked, (1, 0, 2))

    return FunctionTransformer(func=_select_channels)


def ChannelWiseTransformer():
    """Build a transformer that swaps the first two axes of 3-D input."""

    def _swap_leading_axes(X):
        return np.transpose(X, (1, 0, 2))

    return FunctionTransformer(func=_swap_leading_axes)


def BinTransformer(step):
    """Build a transformer that averages each channel over bins of ``step`` samples.

    Parameters
    ----------
    step : int
        Number of consecutive samples averaged into one bin; must be >= 1.

    Returns
    -------
    FunctionTransformer
        Wraps a function that iterates ``X`` as epochs -> channels -> samples
        and returns an array of per-bin means. Trailing samples that do not
        fill a complete bin are discarded.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1 (the previous loop never terminated
        for such values).
    """
    if step < 1:
        raise ValueError(f"step must be a positive number of samples, got {step!r}")

    def bin_channel(channel):
        # The range end is chosen so every bin is complete; unlike the former
        # `index + step < len(channel)` test, it also keeps a last bin that
        # ends exactly on the final sample.
        return [
            np.mean(channel[start : start + step])
            for start in range(0, len(channel) - step + 1, step)
        ]

    def bin_epoch(epoch):
        return [bin_channel(channel) for channel in epoch]

    def transform(X):
        return np.array([bin_epoch(epoch) for epoch in X])

    return FunctionTransformer(func=transform)


def CwtVectorizer(mwt="morl", cwt_density=2):
    """Build a transformer that applies ``cwt`` to every signal in the input.

    ``X`` is iterated on two levels; each innermost item is passed to
    ``cwt(item, mwt, cwt_density)``. ``cwt`` is defined outside this cell
    (presumably in ``utils`` - TODO confirm).
    """

    def _apply_cwt(X):
        return np.array(
            [
                np.array([cwt(epoch, mwt, cwt_density) for epoch in data])
                for data in X
            ]
        )

    return FunctionTransformer(func=_apply_cwt)


def CwtFeatureVectorizer(feature_dict):
    """Build a transformer that featurizes CWT output with cesium.

    ``feature_dict`` maps feature names to custom functions; every key is
    requested as a feature and the whole dict is passed as ``custom_functions``.
    """

    def _featurize_one(data_cwt):
        # cesium computes the requested custom features for this item.
        feature_set = cesium.featurize.featurize_time_series(
            times=None,
            values=data_cwt,
            errors=None,
            features_to_use=list(feature_dict.keys()),
            custom_functions=feature_dict,
        )
        return feature_set.to_numpy()

    def _featurize_all(X):
        return np.array([_featurize_one(data_cwt) for data_cwt in X])

    return FunctionTransformer(func=_featurize_all)


def RelativeEnergyTransformer():
    """Build a transformer turning each sub-band energy into a relative energy.

    Every entry of an epoch is divided by the sum over that whole epoch.
    """

    def _to_relative(epoch):
        epoch_total = np.sum(epoch)
        return np.array([sub_band_energy / epoch_total for sub_band_energy in epoch])

    def _transform_all(X):
        return np.array([_to_relative(epoch) for epoch in X])

    return FunctionTransformer(func=_transform_all)


def PostprocessingTransformer():
    """Build a transformer that flattens the data to one feature row per epoch.

    The input is reordered from (channels x epochs x features) to
    (epochs x channels x features) and then flattened to
    (epochs x channels*features).
    """

    def _flatten_per_epoch(X):
        # Stacking the per-channel entries along axis 1 puts the epoch axis first.
        epoch_major = np.stack(X, axis=1)
        n_epochs = epoch_major.shape[0]
        return epoch_major.reshape(n_epochs, -1)

    return FunctionTransformer(func=_flatten_per_epoch)

# Experiments

In [None]:
# Accumulates one row per (pipeline, channel set, model) evaluated in the experiments below.
results_df = pd.DataFrame()

### Experiment 1
- Models: KNN, GBR, Lasso, SVR
- without feature functions

In [None]:
# Label written to the "pipeline_name" column of results_df for this experiment.
pipeline_name = "channels_cwt"

In [None]:
# (step name, estimator) pair plus the grid searched for it; the "knn__" prefix
# routes the parameter to the pipeline step named "knn".
knn = ("knn", KNeighborsRegressor())
knn_params = {"knn__n_neighbors": np.arange(5, 45, 3)}

In [None]:
# (step name, estimator) pair plus the grid searched for it; the "gbr__" prefix
# routes the parameter to the pipeline step named "gbr".
gbr = ("gbr", GradientBoostingRegressor())
gbr_params = {"gbr__n_estimators": np.arange(1, 45, 5)}

In [None]:
# (step name, estimator) pair plus the grid of regularisation strengths searched for it.
lasso = ("lasso", Lasso())
lasso_params = {"lasso__alpha": np.arange(0.1, 0.5, 0.1)}

In [None]:
# (step name, estimator) pair plus the kernel / C grid searched for it.
svr = ("svr", SVR())
svr_params = {
    "svr__kernel": ["rbf", "linear", "sigmoid"],
    "svr__C": [0.001, 0.01, 0.1, 1],
}

In [None]:
# Grid shared by every model: number of components kept by the "pca" pipeline step.
regressor_params = {"pca__n_components": np.arange(15, 37, 2)}

In [None]:
# Models to evaluate: each entry pairs a (step name, estimator) tuple with its parameter grid.
tested_regressors = [
    (lasso, lasso_params),
    (gbr, gbr_params),
    (knn, knn_params),
    (svr, svr_params),
]

In [None]:
import warnings

# NOTE(review): this silences every warning category for the rest of the session,
# so library warnings raised during the grid searches will not be shown.
warnings.filterwarnings("ignore")

In [None]:
# Channel index sets to compare: all 64 indices versus a hand-picked subset.
all_channels = np.arange(64)
most_important = [31, 46, 48, 30]
# Alternative subset kept for reference:
# red_board = [3, 10, 11, 18, 19, 30, 31, 37, 38, 45, 46, 48, 55]

channels_options = [all_channels, most_important]

In [None]:
def rate_regression(
    X_train,
    y_train,
    X_test,
    y_test,
    regressor,
    regressor_params,
    base_steps,
    cv=5,
    n_jobs=10,
    verbose=10,
):
    """Grid-search a regressor appended to the given preprocessing steps.

    Parameters
    ----------
    X_train, y_train
        Data the grid search is fitted on.
    X_test, y_test
        Unused; kept so existing call sites keep working.
    regressor : tuple
        ``(step name, estimator)`` appended as the last pipeline step.
    regressor_params : dict
        Parameter grid, keyed as ``<step name>__<parameter>``.
    base_steps : list of tuple
        Pipeline steps placed before the regressor.
    cv
        Cross-validation strategy forwarded to ``GridSearchCV``.
    n_jobs, verbose
        Forwarded to ``GridSearchCV``; the defaults match the previously
        hard-coded values.

    Returns
    -------
    GridSearchCV
        The fitted search, refitted on the configuration with the best mean r2.
    """
    pipeline = Pipeline(steps=list(base_steps) + [regressor])
    grid_search = GridSearchCV(
        pipeline,
        regressor_params,
        cv=cv,
        # A list instead of a set: the documented multi-metric types are
        # list, tuple and dict.
        scoring=["r2", "neg_mean_absolute_error"],
        refit="r2",
        n_jobs=n_jobs,
        verbose=verbose,
    )
    grid_search.fit(X_train, y_train)

    return grid_search

In [None]:
for channel_list in channels_options:
    print(f"Channels used in vectorization: {channel_list}\n")

    # Vectorization steps shared by every regressor evaluated on this channel set.
    # NOTE(review): the scaler runs after PCA, so it standardizes the principal
    # components rather than the raw features - confirm this order is intended.
    this_base_steps = [
        ("channel_extraction", ChannelExtractionTransformer(channel_list=channel_list)),
        ("channel_postprocessing", ChannelWiseTransformer()),
        ("cwt", CwtVectorizer()),
        ("postprocessing", PostprocessingTransformer()),
        ("pca", PCA(random_state=random_state)),
        ("scaler", StandardScaler()),
    ]

    # Grid-search every candidate model on top of the shared steps.
    for regressor, params in tested_regressors:
        print(f"Rating {regressor}\n")
        tested_params = {**regressor_params, **params}
        grid_result = rate_regression(
            X_train,
            y_train,
            X_test,
            y_test,
            regressor,
            tested_params,
            base_steps=this_base_steps,
            cv=2,
        )

        # Only cross-validation scores are reported: X_test / y_test are empty
        # in this notebook, so no held-out evaluation is run here.
        best_estimator_index = grid_result.best_index_
        cv_results = grid_result.cv_results_
        mean_cv_r2 = cv_results["mean_test_r2"][best_estimator_index]
        std_cv_r2 = cv_results["std_test_r2"][best_estimator_index]
        mean_cv_neg_mean_absolute_error = cv_results[
            "mean_test_neg_mean_absolute_error"
        ][best_estimator_index]
        std_cv_neg_mean_absolute_error = cv_results[
            "std_test_neg_mean_absolute_error"
        ][best_estimator_index]

        print(f"     Best parameters: {grid_result.best_params_}")
        print(f"     mean r2: {mean_cv_r2}           ± {round(std_cv_r2,3)}\n")

        # NOTE: "mean_cv_mae" stores the *negated* MAE, as returned by the
        # "neg_mean_absolute_error" scorer.
        data = {
            "data_set": dataset_name,
            "pipeline_name": pipeline_name,
            "function": "-",
            "model": regressor,
            "parameters": grid_result.best_params_,
            "channels": channel_list,
            "mean_cv_r2": mean_cv_r2,
            "std_cv_r2": std_cv_r2,
            "mean_cv_mae": mean_cv_neg_mean_absolute_error,
            "std_cv_mae": std_cv_neg_mean_absolute_error,
        }

        # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
        results_df = pd.concat(
            [results_df, pd.DataFrame([data])], ignore_index=True
        )

In [None]:
# Display the results collected so far.
results_df

In [None]:
# Persist the accumulated results. The path depends only on dataset_name, so
# every save for the same dataset overwrites the previous file.
results_df.to_pickle("../../data/regression_channels_" + dataset_name + ".pkl")

### Experiment 1' - bins

In [None]:
# Width of one averaging bin, in milliseconds.
step_in_ms = 50

In [None]:
# Bin width converted from milliseconds to a whole number of samples;
# int() truncates, so 256 * 50 / 1000 = 12.8 becomes 12.
step = int(signal_frequency * step_in_ms / 1000)

#### Bins without cwt

In [None]:
# Label written to the "pipeline_name" column of results_df for this experiment.
pipeline_name = "channels_bins"

In [None]:
for channel_list in channels_options:
    print(f"Channels used in vectorization: {channel_list}\n")

    # Vectorization steps shared by every regressor evaluated on this channel set:
    # the selected channels are binned and flattened, without a CWT step.
    # NOTE(review): the scaler runs after PCA, so it standardizes the principal
    # components rather than the raw features - confirm this order is intended.
    this_base_steps = [
        ("channel_extraction", ChannelExtractionTransformer(channel_list=channel_list)),
        ("binning", BinTransformer(step=step)),
        ("data_channel_swap", ChannelWiseTransformer()),
        ("postprocessing", PostprocessingTransformer()),
        ("pca", PCA(random_state=random_state)),
        ("scaler", StandardScaler()),
    ]

    # Grid-search every candidate model on top of the shared steps.
    for regressor, params in tested_regressors:
        print(f"Rating {regressor}\n")
        tested_params = {**regressor_params, **params}
        grid_result = rate_regression(
            X_train,
            y_train,
            X_test,
            y_test,
            regressor,
            tested_params,
            base_steps=this_base_steps,
            cv=2,
        )

        # Only cross-validation scores are reported: X_test / y_test are empty
        # in this notebook, so no held-out evaluation is run here.
        best_estimator_index = grid_result.best_index_
        cv_results = grid_result.cv_results_
        mean_cv_r2 = cv_results["mean_test_r2"][best_estimator_index]
        std_cv_r2 = cv_results["std_test_r2"][best_estimator_index]
        mean_cv_neg_mean_absolute_error = cv_results[
            "mean_test_neg_mean_absolute_error"
        ][best_estimator_index]
        std_cv_neg_mean_absolute_error = cv_results[
            "std_test_neg_mean_absolute_error"
        ][best_estimator_index]

        print(f"     Best parameters: {grid_result.best_params_}")
        print(f"     mean r2: {mean_cv_r2}           ± {round(std_cv_r2,3)}\n")

        # NOTE: "mean_cv_mae" stores the *negated* MAE, as returned by the
        # "neg_mean_absolute_error" scorer.
        data = {
            "data_set": dataset_name,
            "pipeline_name": pipeline_name,
            "function": "-",
            "model": regressor,
            "parameters": grid_result.best_params_,
            "channels": channel_list,
            "mean_cv_r2": mean_cv_r2,
            "std_cv_r2": std_cv_r2,
            "mean_cv_mae": mean_cv_neg_mean_absolute_error,
            "std_cv_mae": std_cv_neg_mean_absolute_error,
        }

        # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
        results_df = pd.concat(
            [results_df, pd.DataFrame([data])], ignore_index=True
        )

In [None]:
# Display the results collected so far.
results_df

In [None]:
# Persist the accumulated results. The path depends only on dataset_name, so
# every save for the same dataset overwrites the previous file.
results_df.to_pickle("../../data/regression_channels_" + dataset_name + ".pkl")

#### Bins with cwt

In [None]:
# Label written to the "pipeline_name" column of results_df for this experiment.
pipeline_name = "channels_bins_cwt"

In [None]:
for channel_list in channels_options:
    print(f"Channels used in vectorization: {channel_list}\n")

    # Vectorization steps shared by every regressor evaluated on this channel set:
    # the selected channels are binned first, then passed through the CWT step.
    # NOTE(review): the scaler runs after PCA, so it standardizes the principal
    # components rather than the raw features - confirm this order is intended.
    this_base_steps = [
        ("channel_extraction", ChannelExtractionTransformer(channel_list=channel_list)),
        ("binning", BinTransformer(step=step)),
        ("channel_postprocessing", ChannelWiseTransformer()),
        ("cwt", CwtVectorizer()),
        ("postprocessing", PostprocessingTransformer()),
        ("pca", PCA(random_state=random_state)),
        ("scaler", StandardScaler()),
    ]

    # Grid-search every candidate model on top of the shared steps.
    for regressor, params in tested_regressors:
        print(f"Rating {regressor}\n")
        tested_params = {**regressor_params, **params}
        grid_result = rate_regression(
            X_train,
            y_train,
            X_test,
            y_test,
            regressor,
            tested_params,
            base_steps=this_base_steps,
            cv=2,
        )

        # Only cross-validation scores are reported: X_test / y_test are empty
        # in this notebook, so no held-out evaluation is run here.
        best_estimator_index = grid_result.best_index_
        cv_results = grid_result.cv_results_
        mean_cv_r2 = cv_results["mean_test_r2"][best_estimator_index]
        std_cv_r2 = cv_results["std_test_r2"][best_estimator_index]
        mean_cv_neg_mean_absolute_error = cv_results[
            "mean_test_neg_mean_absolute_error"
        ][best_estimator_index]
        std_cv_neg_mean_absolute_error = cv_results[
            "std_test_neg_mean_absolute_error"
        ][best_estimator_index]

        print(f"     Best parameters: {grid_result.best_params_}")
        print(f"     mean r2: {mean_cv_r2}           ± {round(std_cv_r2,3)}\n")

        # NOTE: "mean_cv_mae" stores the *negated* MAE, as returned by the
        # "neg_mean_absolute_error" scorer.
        data = {
            "data_set": dataset_name,
            "pipeline_name": pipeline_name,
            "function": "-",
            "model": regressor,
            "parameters": grid_result.best_params_,
            "channels": channel_list,
            "mean_cv_r2": mean_cv_r2,
            "std_cv_r2": std_cv_r2,
            "mean_cv_mae": mean_cv_neg_mean_absolute_error,
            "std_cv_mae": std_cv_neg_mean_absolute_error,
        }

        # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
        results_df = pd.concat(
            [results_df, pd.DataFrame([data])], ignore_index=True
        )

In [None]:
# Persist the accumulated results. The path depends only on dataset_name, so
# every save for the same dataset overwrites the previous file.
results_df.to_pickle("../../data/regression_channels_" + dataset_name + ".pkl")

### Experiment 2
- Models: KNN, GBR, Lasso, SVR
- Iterate through functions listed in guo_features list
- Iterate through different channel lists

In [None]:
# Prefix of the "pipeline_name" label in results_df; the feature name is appended per run.
pipeline_name = "channels_function"

In [None]:
# Feature sets to try, one single-entry dict (name -> function) per run.
# The commented-out entries (std, skew) are currently excluded.
guo_features = [
    #     {"std": std_signal},
    {"abs_diffs": abs_diffs_signal},
    {"energy": mean_energy_signal},
    #     {"skew": skew_signal},
    {"mean": mean_signal},
]

In [None]:
# Re-declares the same KNN step and grid as earlier in the notebook.
knn = ("knn", KNeighborsRegressor())
knn_params = {"knn__n_neighbors": np.arange(5, 45, 3)}

In [None]:
# Re-declares the same gradient-boosting step and grid as earlier in the notebook.
gbr = ("gbr", GradientBoostingRegressor())
gbr_params = {"gbr__n_estimators": np.arange(1, 45, 5)}

In [None]:
# Re-declares the same Lasso step and grid as earlier in the notebook.
lasso = ("lasso", Lasso())
lasso_params = {"lasso__alpha": np.arange(0.1, 0.5, 0.1)}

In [None]:
# Re-declares the same SVR step and grid as earlier in the notebook.
svr = ("svr", SVR())
svr_params = {
    "svr__kernel": ["rbf", "linear", "sigmoid"],
    "svr__C": [0.001, 0.01, 0.1, 1],
}

In [None]:
# Grid shared by every model: number of components kept by the "pca" pipeline step.
regressor_params = {"pca__n_components": np.arange(15, 37, 2)}

In [None]:
# Models to evaluate: each entry pairs a (step name, estimator) tuple with its parameter grid.
tested_regressors = [
    (lasso, lasso_params),
    (gbr, gbr_params),
    (knn, knn_params),
    (svr, svr_params),
]

In [None]:
import warnings

# NOTE(review): this silences every warning category for the rest of the session,
# so library warnings raised during the grid searches will not be shown.
warnings.filterwarnings("ignore")

In [None]:
def rate_regression2(
    X_train,
    y_train,
    X_test,
    y_test,
    regressor,
    regressor_params,
    base_steps,
    cv=5,
    n_jobs=10,
    verbose=10,
):
    """Grid-search a regressor appended to the given preprocessing steps.

    Same procedure as ``rate_regression`` defined earlier in the notebook.

    Parameters
    ----------
    X_train, y_train
        Data the grid search is fitted on.
    X_test, y_test
        Unused; kept so existing call sites keep working.
    regressor : tuple
        ``(step name, estimator)`` appended as the last pipeline step.
    regressor_params : dict
        Parameter grid, keyed as ``<step name>__<parameter>``.
    base_steps : list of tuple
        Pipeline steps placed before the regressor.
    cv
        Cross-validation strategy forwarded to ``GridSearchCV``.
    n_jobs, verbose
        Forwarded to ``GridSearchCV``; the defaults match the previously
        hard-coded values.

    Returns
    -------
    GridSearchCV
        The fitted search, refitted on the configuration with the best mean r2.
    """
    pipeline = Pipeline(steps=list(base_steps) + [regressor])
    grid_search = GridSearchCV(
        pipeline,
        regressor_params,
        cv=cv,
        # A list instead of a set: the documented multi-metric types are
        # list, tuple and dict.
        scoring=["r2", "neg_mean_absolute_error"],
        refit="r2",
        n_jobs=n_jobs,
        verbose=verbose,
    )
    grid_search.fit(X_train, y_train)

    return grid_search

In [None]:
# Channel index sets to compare: all 64 indices versus a hand-picked subset.
all_channels = np.arange(64)
most_important = [31, 46, 48, 30]
# Alternative subset kept for reference:
# red_board = [3, 10, 11, 18, 19, 30, 31, 37, 38, 45, 46, 48, 55]

channels_options = [all_channels, most_important]

In [None]:
for feature_function_dict in guo_features:
    print(f"Featurize with {feature_function_dict.keys()} function")
    feature_names = list(feature_function_dict.keys())

    for channel_list in channels_options:
        print(f"Channels used in vectorization: {channel_list}\n")

        # Vectorization steps shared by every regressor for this feature / channel set.
        # NOTE(review): the scaler runs after PCA, so it standardizes the principal
        # components rather than the raw features - confirm this order is intended.
        this_base_steps = [
            (
                "channel_extraction",
                ChannelExtractionTransformer(channel_list=channel_list),
            ),
            ("channel_postprocessing", ChannelWiseTransformer()),
            ("cwt", CwtVectorizer()),
            ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
            ("postprocessing", PostprocessingTransformer()),
            ("pca", PCA(random_state=random_state)),
            ("scaler", StandardScaler()),
        ]

        # Grid-search every candidate model on top of the shared steps.
        for regressor, params in tested_regressors:
            print(f"Rating {regressor}\n")
            tested_params = {**regressor_params, **params}
            grid_result = rate_regression2(
                X_train,
                y_train,
                X_test,
                y_test,
                regressor,
                tested_params,
                base_steps=this_base_steps,
                cv=2,
            )

            # Only cross-validation scores are reported: X_test / y_test are empty
            # in this notebook, so no held-out evaluation is run here.
            best_estimator_index = grid_result.best_index_
            cv_results = grid_result.cv_results_
            mean_cv_r2 = cv_results["mean_test_r2"][best_estimator_index]
            std_cv_r2 = cv_results["std_test_r2"][best_estimator_index]
            mean_cv_neg_mean_absolute_error = cv_results[
                "mean_test_neg_mean_absolute_error"
            ][best_estimator_index]
            std_cv_neg_mean_absolute_error = cv_results[
                "std_test_neg_mean_absolute_error"
            ][best_estimator_index]

            print(f"     Best parameters: {grid_result.best_params_}")
            print(f"     mean r2: {mean_cv_r2}           ± {round(std_cv_r2,3)}\n")

            # NOTE: "mean_cv_mae" stores the *negated* MAE, as returned by the
            # "neg_mean_absolute_error" scorer.
            data = {
                "data_set": dataset_name,
                "pipeline_name": pipeline_name + "_" + feature_names[0],
                "function": feature_names,
                "model": regressor,
                "parameters": grid_result.best_params_,
                "channels": channel_list,
                "mean_cv_r2": mean_cv_r2,
                "std_cv_r2": std_cv_r2,
                "mean_cv_mae": mean_cv_neg_mean_absolute_error,
                "std_cv_mae": std_cv_neg_mean_absolute_error,
            }

            # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
            results_df = pd.concat(
                [results_df, pd.DataFrame([data])], ignore_index=True
            )

In [None]:
# Export the accumulated results as CSV, next to the pickle written with the same stem.
results_df.to_csv("../../data/regression_channels_" + dataset_name + ".csv")

In [None]:
# Persist the accumulated results. The path depends only on dataset_name, so
# every save for the same dataset overwrites the previous file.
results_df.to_pickle("../../data/regression_channels_" + dataset_name + ".pkl")

#### Experiment 2' - bins (adds the `("binning", BinTransformer(step=step))` step)

In [None]:
# Prefix of the "pipeline_name" label in results_df; the feature name is appended per run.
pipeline_name = "channels_bins_function"

In [None]:
for feature_function_dict in guo_features:
    print(f"Featurize with {feature_function_dict.keys()} function")
    feature_names = list(feature_function_dict.keys())

    for channel_list in channels_options:
        print(f"Channels used in vectorization: {channel_list}\n")

        # Vectorization steps shared by every regressor for this feature / channel set;
        # the selected channels are binned before the CWT and feature steps.
        # NOTE(review): the scaler runs after PCA, so it standardizes the principal
        # components rather than the raw features - confirm this order is intended.
        this_base_steps = [
            (
                "channel_extraction",
                ChannelExtractionTransformer(channel_list=channel_list),
            ),
            ("binning", BinTransformer(step=step)),
            ("channel_postprocessing", ChannelWiseTransformer()),
            ("cwt", CwtVectorizer()),
            ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
            ("postprocessing", PostprocessingTransformer()),
            ("pca", PCA(random_state=random_state)),
            ("scaler", StandardScaler()),
        ]

        # Grid-search every candidate model on top of the shared steps.
        for regressor, params in tested_regressors:
            print(f"Rating {regressor}\n")
            tested_params = {**regressor_params, **params}
            grid_result = rate_regression2(
                X_train,
                y_train,
                X_test,
                y_test,
                regressor,
                tested_params,
                base_steps=this_base_steps,
                cv=2,
            )

            # Only cross-validation scores are reported: X_test / y_test are empty
            # in this notebook, so no held-out evaluation is run here.
            best_estimator_index = grid_result.best_index_
            cv_results = grid_result.cv_results_
            mean_cv_r2 = cv_results["mean_test_r2"][best_estimator_index]
            std_cv_r2 = cv_results["std_test_r2"][best_estimator_index]
            mean_cv_neg_mean_absolute_error = cv_results[
                "mean_test_neg_mean_absolute_error"
            ][best_estimator_index]
            std_cv_neg_mean_absolute_error = cv_results[
                "std_test_neg_mean_absolute_error"
            ][best_estimator_index]

            print(f"     Best parameters: {grid_result.best_params_}")
            print(f"     mean r2: {mean_cv_r2}           ± {round(std_cv_r2,3)}\n")

            # NOTE: "mean_cv_mae" stores the *negated* MAE, as returned by the
            # "neg_mean_absolute_error" scorer.
            data = {
                "data_set": dataset_name,
                "pipeline_name": pipeline_name + "_" + feature_names[0],
                "function": feature_names,
                "model": regressor,
                "parameters": grid_result.best_params_,
                "channels": channel_list,
                "mean_cv_r2": mean_cv_r2,
                "std_cv_r2": std_cv_r2,
                "mean_cv_mae": mean_cv_neg_mean_absolute_error,
                "std_cv_mae": std_cv_neg_mean_absolute_error,
            }

            # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
            results_df = pd.concat(
                [results_df, pd.DataFrame([data])], ignore_index=True
            )

Dummy regressor for baseline:

In [None]:
# Baseline: a regressor that always predicts the mean of the training targets.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)

# X_test / y_test are initialised as empty lists in this notebook, and the
# metrics below cannot be computed on an empty set, so they are only evaluated
# when a held-out set has actually been provided.
if len(X_test) > 0:
    y_pred = dummy_regr.predict(X_test)
    print(mean_absolute_percentage_error(y_test, y_pred))
    print(mean_absolute_error(y_test, y_pred))
    print(mean_squared_error(y_test, y_pred))
    print(dummy_regr.score(X_test, y_test))
else:
    print("No held-out test set (X_test is empty); skipping baseline test metrics.")

# NOTE(review): the original printed np.std(y), but no `y` is assigned anywhere
# in the visible notebook; the training targets are assumed to be intended - TODO confirm.
print(np.std(y_train))