# Rumination classification - averaged participants' epochs 

### Vectorization with ICA

### Imports

In [None]:
%load_ext lab_black
import os
import os.path as op

import pickle
from time import time
import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import cesium.featurize
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.decomposition import FastICA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA


import sys

sys.path.append("..")

from utils import *

### Loading data

Load the EEG data together with the data from the rumination questionnaire. By default `create_df_data` loads all info from the given file; to load only part of it, pass a list of the desired labels from the CSV file.

In [None]:
# Epoch window bounds (presumably seconds relative to the event — TODO confirm).
tmin = -0.1
tmax = 0.6
# Sampling rate of the recordings (presumably in Hz — TODO confirm).
signal_frequency = 256
# Values of the "marker" column used to pick the error / correct epochs.
ERROR, CORRECT = 0, 1
# Seed shared by the seeded estimators (FastICA, PCA, decision tree).
random_state = 0

In [None]:
df_name = "go_nogo_df_mean"
pickled_data_filename = "../../data/" + df_name + ".pkl"
info_filename = "../../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# Build the dataframe from the raw files only when no pickled copy exists yet;
# otherwise reuse the pickled copy.
if not op.isfile(pickled_data_filename):
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(
        test_participants=False, info="all", personal=False, info_filename=info_filename
    )
    epochs_df.name = df_name
    # store the freshly loaded data so later runs can take the fast path
    epochs_df.to_pickle("../../data/" + epochs_df.name + ".pkl")
    print("Done. Pickle file created")
else:
    print("Pickled file found. Loading pickled data...")
    # NOTE(review): unpickling can execute arbitrary code — only load trusted files.
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")

The data is now loaded into a dataframe in which each epoch is a single record.

## Training and classification

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from tempfile import mkdtemp
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score
from sklearn.metrics import classification_report


import numpy as np
import scipy.stats

- Computes ICA (`ica_n_components = N`) and then computes the CWT of each ICA channel.
- For each band (frequency) of the CWT, it computes the features given in the `feature_dict` parameter (e.g. std or mean).
- Then it computes PCA (`outer_components = N`) on the flattened ICA channels and features.
- The final feature vector has `outer_components` elements, reduced from the `ica_n_components * len(feature_dict) * frequencies` flattened features.

#### Standard features for EEG analysis provided by Guo et al. (2012)

In [None]:
def std_signal(t, m, e):
    """Return the population standard deviation of ``m``.

    ``t`` and ``e`` are accepted but not used; the three-argument form is
    presumably the custom-feature signature expected by cesium (TODO confirm).
    """
    spread = np.std(m, ddof=0)
    return spread


def abs_diffs_signal(t, m, e):
    """Return the sum of absolute differences between consecutive values of ``m``.

    ``t`` and ``e`` are accepted but not used.
    """
    steps = np.diff(m)
    return np.abs(steps).sum()


def mean_energy_signal(t, m, e):
    """Return the mean of the squared values of ``m``.

    ``t`` and ``e`` are accepted but not used.

    ``np.square`` is used instead of ``m ** 2`` so that plain sequences
    (lists, tuples) work as well as arrays, matching the other feature
    functions in this file, which all accept any array-like input.
    """
    return np.mean(np.square(m))


def skew_signal(t, m, e):
    """Return the sample skewness of ``m`` as computed by ``scipy.stats.skew``.

    ``t`` and ``e`` are accepted but not used. The keyword arguments spell out
    scipy's defaults (first axis, no bias correction).
    """
    asymmetry = scipy.stats.skew(m, axis=0, bias=True)
    return asymmetry


def mean_signal(t, m, e):
    """Return the arithmetic mean of all values in ``m``.

    ``t`` and ``e`` are accepted but not used.
    """
    average = np.mean(m, axis=None)
    return average

### Average participants' error and correct epochs

In [None]:
# Collapse the per-epoch records into one averaged record per ("id", "marker")
# pair; sort=False keeps the groups in order of first appearance.
averaged_epochs_df = (
    epochs_df.groupby(
        ["id", "marker"],
        sort=False,
    )
    .apply(
        lambda group_df: pd.Series(
            {
                # NOTE(review): presumably every "epoch" cell holds an equally
                # shaped array, making this an element-wise average — TODO confirm.
                "epoch": np.mean(group_df["epoch"]),
                "Rumination Full Scale": np.mean(group_df["Rumination Full Scale"]),
            }
        )
    )
    # turn the "id" / "marker" group keys back into ordinary columns
    .reset_index()
)

### Classification grid search

In [None]:
# Marker value of the epochs used in this run, plus a readable name for it
# that ends up in the results table and the output file names.
dataset = CORRECT
if dataset == CORRECT:
    dataset_name = "correct"
else:
    dataset_name = "error"

In [None]:
# Select the averaged records of the chosen marker once, then pull out the
# signals (X) and the questionnaire scores (y) from that same selection.
_marker_mask = averaged_epochs_df["marker"] == dataset
X_train = np.array(averaged_epochs_df.loc[_marker_mask, "epoch"].to_list())
y_train = np.array(
    averaged_epochs_df.loc[_marker_mask, "Rumination Full Scale"].to_list()
)

In [None]:
# No held-out test set is built in this notebook; these empty placeholders are
# only passed through to the rating functions.
X_test, y_test = [], []

#### Split data by median into two groups: high/low rumination

In [None]:
# Class labels for the two rumination groups.
LOW, HIGH = 0, 1
# Threshold separating the groups: the median questionnaire score.
rumination_median = np.median(y_train)

In [None]:
# Replace every score by its class label in place: strictly below the median
# becomes LOW, everything else (including the median itself) becomes HIGH.
# The mask is computed before any value is overwritten, so the labels already
# written can never be mistaken for scores.
_below_median = y_train < rumination_median
y_train[_below_median] = LOW
y_train[~_below_median] = HIGH

#### Defined data transformers - custom data transformation steps

In [None]:
def IcaPreprocessingTransformer():
    """Build the transformer that prepares epoched data for the ICA step.

    The wrapped function joins the items of ``X`` along axis 1 and transposes
    the result (presumably turning ``(epochs, channels, timepoints)`` into
    ``(epochs * timepoints, channels)`` — TODO confirm input layout).
    """

    def _join_epochs(X):
        return np.concatenate(X, axis=1).T

    return FunctionTransformer(func=_join_epochs)


def IcaPostprocessingTransformer(timepoints_count):
    """Build the transformer that cuts the ICA output back into epochs.

    Parameters
    ----------
    timepoints_count : int
        Number of timepoints in a single epoch.

    The wrapped function transposes ``X`` and reshapes it to
    ``(components, epochs, timepoints_count)``, where ``components`` is
    ``X.shape[1]`` and the number of epochs is derived from the sample count.
    """

    def _split_into_epochs(X):
        n_components = X.shape[1]
        components_by_time = X.T
        n_epochs = int(components_by_time.shape[1] / timepoints_count)
        return components_by_time.reshape(n_components, n_epochs, timepoints_count)

    return FunctionTransformer(func=_split_into_epochs)


def CwtVectorizer(mwt="morl", cwt_density=2):
    """Build the transformer that applies ``cwt`` to every epoch of every channel.

    Parameters
    ----------
    mwt : str
        Passed to ``cwt`` as its second argument (presumably the mother
        wavelet name — TODO confirm against ``utils.cwt``).
    cwt_density : int
        Passed to ``cwt`` as its third argument.

    The wrapped function iterates over the first axis of ``X`` and then over
    the epochs of each item, returning all results as one array.
    """

    def _cwt_every_epoch(X):
        return np.array(
            [
                np.array([cwt(epoch, mwt, cwt_density) for epoch in channel_epochs])
                for channel_epochs in X
            ]
        )

    return FunctionTransformer(func=_cwt_every_epoch)


# def MovingAverageCwt():
#     def transform(X):


def CwtFeatureVectorizer(feature_dict):
    """Build the transformer that featurizes CWT data with cesium.

    Parameters
    ----------
    feature_dict : dict
        Maps feature names to custom feature functions; the keys select the
        features to compute and the dict itself is handed to cesium as
        ``custom_functions``.

    The wrapped function featurizes each item of ``X`` separately (without
    times or errors) and returns the stacked feature tables as one array.
    """

    def _featurize_each(X):
        return np.array(
            [
                cesium.featurize.featurize_time_series(
                    times=None,
                    values=channel_cwt,
                    errors=None,
                    features_to_use=list(feature_dict),
                    custom_functions=feature_dict,
                ).to_numpy()
                for channel_cwt in X
            ]
        )

    return FunctionTransformer(func=_featurize_each)


# transforms energy of each sub-band into relative energy of sub-band
def RelativeEnergyTransformer():
    """Build the transformer that rescales each epoch by its own total.

    The wrapped function divides every value of an epoch by the sum of all
    values in that epoch, so each sub-band energy becomes a share of the
    epoch's total energy.
    """

    def _to_relative(X):
        return np.array([np.asarray(epoch) / np.sum(epoch) for epoch in X])

    return FunctionTransformer(func=_to_relative)


# reshape data from (channels x epochs x features) to (epochs x channels x features)
# and then flatten it to (epochs x channels*features)
def PostprocessingTransformer():
    """Build the transformer that produces one flat feature row per epoch.

    The wrapped function stacks the items of ``X`` along a new axis 1, which
    moves the original first axis behind the epoch axis, and then flattens
    everything after the first axis.
    """

    def _flatten_per_epoch(X):
        epochs_first = np.stack(X, axis=1)
        n_epochs = epochs_first.shape[0]
        return epochs_first.reshape(n_epochs, -1)

    return FunctionTransformer(func=_flatten_per_epoch)

# Experiments

In [None]:
# Accumulates one row per (pipeline, model) with the best cross-validated
# scores; both experiments below add their rows to this same table.
results_df = pd.DataFrame()

### Experiment 1
- Models: KNN, SVC, DecisionTree, LogisticRegression
- without feature function

In [None]:
# Label stored in the "pipeline_name" column of the Experiment 1 result rows.
pipeline_name = "ICA_saute"

In [None]:
# Each model is a (step name, estimator) pair ready to be appended to a
# pipeline, accompanied by its own hyper-parameter grid. Grid keys use the
# "<step name>__<parameter>" form so they address that pipeline step.
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(5, 45, 3)}

svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "poly"],
    "svc__C": [0.1, 1],
}

decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [4, 6, 8],
}

# logistic regression is rated with its default settings only
lr = ("lr", LogisticRegression())
lr_params = {}

In [None]:
# Grid shared by every model: number of ICA components and number of PCA
# components kept by the "ica" and "pca" pipeline steps.
classifier_params = {
    "ica__n_components": np.arange(10, 37, 3),
    "pca__n_components": np.arange(9, 30, 2),
}

In [None]:
# Models rated in this experiment, each paired with its own parameter grid;
# the grids are merged with classifier_params before the search.
tested_classifiers = [
    (lr, lr_params),
    (decision_tree, decision_tree_params),
    (knn, knn_params),
    (svc, svc_params),
]

In [None]:
import warnings

# NOTE(review): this silences every warning in this process, not only the
# noisy ones from the grid search — real problems reported as warnings are
# hidden too.
warnings.filterwarnings("ignore")

In [None]:
# Feature-extraction steps shared by every Experiment 1 pipeline; the model
# under test is appended as the final step in rate_classification.
base_steps = [
    # rearrange the epochs into the 2-D layout the ICA step is fitted on
    ("ica_preprocessing", IcaPreprocessingTransformer()),
    # n_components is supplied by the grid search (ica__n_components)
    ("ica", FastICA(random_state=random_state)),
    (
        "ica_postprocessing",
        # cut the ICA output back into epochs of the original length
        IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
    ),
    # CWT of every epoch, with the transformer's default arguments
    ("cwt", CwtVectorizer()),
    # one flat feature row per epoch
    ("postprocessing", PostprocessingTransformer()),
    ("scaler", StandardScaler()),
    # n_components is supplied by the grid search (pca__n_components)
    ("pca", PCA(random_state=random_state)),
]

In [None]:
def rate_classification(
    X_train, y_train, X_test, y_test, classifier, classifier_params, cv=5
):
    """Grid-search one classifier on top of the module-level ``base_steps``.

    Parameters
    ----------
    X_train, y_train
        Data and labels the grid search is fitted on.
    X_test, y_test
        Accepted for signature compatibility; not used.
    classifier : tuple
        ``(step name, estimator)`` pair appended to ``base_steps`` as the
        final pipeline step.
    classifier_params : dict
        Parameter grid handed to ``GridSearchCV``.
    cv
        Cross-validation strategy handed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search, refitted on the best balanced accuracy.
    """
    pipeline = Pipeline(steps=base_steps + [classifier])
    grid_search = GridSearchCV(
        pipeline,
        classifier_params,
        cv=cv,
        # A list, not a set: GridSearchCV documents list/tuple/dict for
        # multi-metric scoring, and recent scikit-learn versions reject a set
        # during parameter validation.
        scoring=["balanced_accuracy", "precision"],
        refit="balanced_accuracy",
        n_jobs=10,
        verbose=10,
    )
    grid_search.fit(X_train, y_train)

    return grid_search

In [None]:
# define cross-validation method (the splitter is created once and reused for
# every model, since its settings never change)
cv = StratifiedKFold(n_splits=2)

# Rate every model: run the grid search, read the cross-validated scores of
# the best parameter combination and add them as one row to results_df.
for (classifier, params) in tested_classifiers:
    print(f"Rating {classifier}")
    tested_params = {**classifier_params, **params}

    grid_result = rate_classification(
        X_train, y_train, X_test, y_test, classifier, tested_params, cv=cv
    )

    # scores of the parameter combination selected by refit
    best_estimator_index = grid_result.best_index_
    cv_results = grid_result.cv_results_
    mean_cv_balanced_accuracy = cv_results["mean_test_balanced_accuracy"][
        best_estimator_index
    ]
    std_cv_balanced_accuracy = cv_results["std_test_balanced_accuracy"][
        best_estimator_index
    ]
    mean_cv_precision = cv_results["mean_test_precision"][best_estimator_index]
    std_cv_precision = cv_results["std_test_precision"][best_estimator_index]

    print(f"     Best parameters: {grid_result.best_params_}")
    print(
        f"     mean accuracy: {mean_cv_balanced_accuracy}           ± {round(std_cv_balanced_accuracy,3)}\n"
    )

    data = {
        "data_set": dataset_name,
        "pipeline_name": pipeline_name,
        "function": "-",
        "model": classifier,
        "parameters": grid_result.best_params_,
        "mean_cv_balanced_accuracy": mean_cv_balanced_accuracy,
        "std_cv_balanced_accuracy": std_cv_balanced_accuracy,
        "mean_cv_precision": mean_cv_precision,
        "std_cv_precision": std_cv_precision,
    }
    # DataFrame.append was removed in pandas 2.0; concatenating a one-row
    # frame adds the same row and works on old and new pandas alike.
    results_df = pd.concat([results_df, pd.DataFrame([data])], ignore_index=True)

In [None]:
# Persist the results gathered so far, in a file named after the data set.
results_df.to_pickle("../../data/ICA_" + dataset_name + ".pkl")

### Experiment 2
- Models: KNN, SVC, DecisionTree, LogisticRegression
- Iterate through functions listed in guo_features list

In [None]:
# Prefix of the "pipeline_name" column for the Experiment 2 result rows; the
# name of the feature function is appended to it for each row.
pipeline_name = "ICA_function"

In [None]:
# One single-entry dict per feature function, so that each pipeline is built
# with exactly one feature; the key is the feature name reported in the results.
guo_features = [
    {"std": std_signal},
    {"abs_diffs": abs_diffs_signal},
    {"energy": mean_energy_signal},
    {"skew": skew_signal},
    {"mean": mean_signal},
]

In [None]:
# Each model is a (step name, estimator) pair ready to be appended to a
# pipeline, accompanied by its own hyper-parameter grid. Grid keys use the
# "<step name>__<parameter>" form so they address that pipeline step.
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(5, 45, 3)}

svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "poly"],
    "svc__C": [0.1, 1],
}

decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=random_state))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [4, 6, 8],
}

# logistic regression is rated with its default settings only
lr = ("lr", LogisticRegression())
lr_params = {}

In [None]:
# Grid shared by every model: number of ICA components and number of PCA
# components kept by the "ica" and "pca" pipeline steps.
classifier_params = {
    "ica__n_components": np.arange(10, 37, 3),
    "pca__n_components": np.arange(9, 30, 2),
}

In [None]:
# Models rated in this experiment, each paired with its own parameter grid;
# the grids are merged with classifier_params before the search.
tested_classifiers = [
    (lr, lr_params),
    (decision_tree, decision_tree_params),
    (knn, knn_params),
    (svc, svc_params),
]

In [None]:
import warnings

# NOTE(review): this silences every warning in this process, not only the
# noisy ones from the grid search — real problems reported as warnings are
# hidden too.
warnings.filterwarnings("ignore")

In [None]:
def rate_classification2(
    X_train, y_train, X_test, y_test, classifier, classifier_params, base_steps, cv=5
):
    """Grid-search one classifier on top of the given feature-extraction steps.

    Parameters
    ----------
    X_train, y_train
        Data and labels the grid search is fitted on.
    X_test, y_test
        Accepted for signature compatibility; not used.
    classifier : tuple
        ``(step name, estimator)`` pair appended to ``base_steps`` as the
        final pipeline step.
    classifier_params : dict
        Parameter grid handed to ``GridSearchCV``.
    base_steps : list
        Pipeline steps that precede the classifier.
    cv
        Cross-validation strategy handed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search, refitted on the best balanced accuracy.
    """
    pipeline = Pipeline(steps=base_steps + [classifier])
    grid_search = GridSearchCV(
        pipeline,
        classifier_params,
        cv=cv,
        # A list, not a set: GridSearchCV documents list/tuple/dict for
        # multi-metric scoring, and recent scikit-learn versions reject a set
        # during parameter validation.
        scoring=["balanced_accuracy", "precision"],
        refit="balanced_accuracy",
        n_jobs=10,
        verbose=10,
    )
    grid_search.fit(X_train, y_train)

    return grid_search

In [None]:
# For every feature function build a dedicated pipeline, rate every model on
# it and add the best cross-validated scores as one row to results_df.
for feature_function_dict in guo_features:
    print(f"Featurize with {feature_function_dict.keys()} function")

    # each dict holds exactly one entry; its key names the feature
    function_name = list(feature_function_dict.keys())[0]

    # define base steps
    this_base_steps = [
        ("ica_preprocessing", IcaPreprocessingTransformer()),
        ("ica", FastICA(random_state=random_state)),
        (
            "ica_postprocessing",
            IcaPostprocessingTransformer(timepoints_count=X_train.shape[-1]),
        ),
        ("cwt", CwtVectorizer()),
        ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
        ("postprocessing", PostprocessingTransformer()),
        ("scaler", StandardScaler()),
        ("pca", PCA(random_state=random_state)),
    ]

    # define cross-validation method
    cv = StratifiedKFold(n_splits=2)

    # rate different models
    for (classifier, params) in tested_classifiers:
        print(f"Rating {classifier} ")
        tested_params = {**classifier_params, **params}
        grid_result = rate_classification2(
            X_train,
            y_train,
            X_test,
            y_test,
            classifier,
            tested_params,
            base_steps=this_base_steps,
            cv=cv,
        )

        # calculate results from cross-validation
        best_estimator_index = grid_result.best_index_
        cv_results = grid_result.cv_results_
        mean_cv_balanced_accuracy = cv_results["mean_test_balanced_accuracy"][
            best_estimator_index
        ]
        std_cv_balanced_accuracy = cv_results["std_test_balanced_accuracy"][
            best_estimator_index
        ]
        mean_cv_precision = cv_results["mean_test_precision"][best_estimator_index]
        std_cv_precision = cv_results["std_test_precision"][best_estimator_index]

        print(f"     Best parameters: {grid_result.best_params_}")
        print(
            f"     mean accuracy: {mean_cv_balanced_accuracy}           ± {round(std_cv_balanced_accuracy,3)}\n"
        )

        data = {
            "data_set": dataset_name,
            "pipeline_name": pipeline_name + "_" + function_name,
            "function": function_name,
            "model": classifier,
            "parameters": grid_result.best_params_,
            "mean_cv_balanced_accuracy": mean_cv_balanced_accuracy,
            "std_cv_balanced_accuracy": std_cv_balanced_accuracy,
            "mean_cv_precision": mean_cv_precision,
            "std_cv_precision": std_cv_precision,
        }
        # DataFrame.append was removed in pandas 2.0; concatenating a one-row
        # frame adds the same row and works on old and new pandas alike.
        results_df = pd.concat(
            [results_df, pd.DataFrame([data])], ignore_index=True
        )

In [None]:
# Export the accumulated results as CSV, in a file named after the data set.
results_df.to_csv("../../data/ICA_" + dataset_name + ".csv")

In [None]:
# Pickle the accumulated results. NOTE(review): the path expression is the
# same as the one used after Experiment 1, so that earlier file is
# presumably overwritten — confirm this is intended.
results_df.to_pickle("../../data/ICA_" + dataset_name + ".pkl")