# Rumination classification - averaged participants' epochs 

### Vectorization with defined channels

### Imports

In [None]:
%load_ext lab_black
import os
import pickle
from time import time
import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import cesium.featurize
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.decomposition import FastICA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA


from utils import *

### Loading data

Loads the EEG data and the data from the rumination questionnaire. By default `create_df_data` loads only the "Rumination Full Scale" column from the given .csv file; other columns can be requested by passing a list of the desired column labels as `info`.

In [None]:
# Epoch window around each response event; passed to mne.Epochs as tmin/tmax.
tmin, tmax = -0.1, 0.6
# NOTE(review): presumably the sampling rate in Hz; it is not referenced in the
# visible cells of this notebook - confirm.
signal_frequency = 256
# Values written to the "marker" column of the epochs DataFrame. They are
# independent of the MNE event ids used inside load_epochs_from_file.
ERROR = 0
CORRECT = 1

In [None]:
def load_epochs_from_file(file, reject_bad_segments="auto", mask=None):
    """Load response-locked epochs from a BrainVision header file.

    The recording and its marker file are read from ``"../data/" + file``.
    The eight response markers are merged into ``correct_response`` and
    ``error_response`` events, and epochs covering the module-level
    ``tmin``..``tmax`` window are cut around them (no baseline correction).

    Args:
        file: path to a header file (.vhdr), relative to ``../data/``.
        reject_bad_segments: 'auto' | 'annot' | 'peak-to-peak'

            'auto': epochs are created with ``reject_by_annotation=True`` and
            returned as they are; ``mask`` is ignored.
            'annot': bad channels are derived from the annotations of the
            .vmrk file (``get_annotations`` / ``get_bads_by_annotation``).
            'peak-to-peak': bad channels are derived from the peak-to-peak
            amplitude of the channels
            (``get_bads_by_peak_to_peak_amplitude``).

            According to the original notes, channels rejected in the 'annot'
            and 'peak-to-peak' modes are zeroed; this is done by the
            ``clear_bads`` / ``reject_with_mask`` helpers, which are defined
            outside this notebook.

            Any other value disables ``reject_by_annotation`` and returns the
            epochs without any rejection.
        mask: optional sequence of length 64 forwarded to
            ``reject_with_mask`` (presumably one entry per channel - TODO
            confirm). Only used in the 'annot' and 'peak-to-peak' modes.

    Returns:
        mne Epochs

    Raises:
        ValueError: if ``mask`` is given in the 'annot' or 'peak-to-peak'
            mode and its length is not 64. Previously this case only printed
            a message and returned an empty list.

    """
    # Import the BrainVision data into an MNE Raw object
    raw = mne.io.read_raw_brainvision("../data/" + file)

    # The marker file shares the header's name, with the extension swapped
    annot_file = file[:-4] + "vmrk"

    # Read in the event information as MNE annotations and attach it to the
    # raw object so it can be used together with the data
    annotations = mne.read_annotations("../data/" + annot_file)
    raw.set_annotations(annotations)

    # Map with response markers only
    event_dict = {
        "Stimulus/RE*ex*1_n*1_c_1*R*FB": 10004,
        "Stimulus/RE*ex*1_n*1_c_1*R*FG": 10005,
        "Stimulus/RE*ex*1_n*1_c_2*R": 10006,
        "Stimulus/RE*ex*1_n*2_c_1*R": 10007,
        "Stimulus/RE*ex*2_n*1_c_1*R": 10008,
        "Stimulus/RE*ex*2_n*2_c_1*R*FB": 10009,
        "Stimulus/RE*ex*2_n*2_c_1*R*FG": 10010,
        "Stimulus/RE*ex*2_n*2_c_2*R": 10011,
    }

    # Map for merged correct/error response markers
    merged_event_dict = {"correct_response": 0, "error_response": 1}

    # Reconstruct the original events from Raw object
    events, event_ids = mne.events_from_annotations(raw, event_id=event_dict)

    # Merge correct/error response events
    merged_events = mne.merge_events(
        events,
        [10004, 10005, 10009, 10010],
        merged_event_dict["correct_response"],
        replace_events=True,
    )
    merged_events = mne.merge_events(
        merged_events,
        [10006, 10007, 10008, 10011],
        merged_event_dict["error_response"],
        replace_events=True,
    )

    # Only the 'auto' mode lets MNE drop epochs by annotation
    this_reject_by_annotation = reject_bad_segments == "auto"

    # Read epochs
    temp_epochs = mne.Epochs(
        raw=raw,
        events=merged_events,
        event_id=merged_event_dict,
        tmin=tmin,
        tmax=tmax,
        baseline=None,
        reject_by_annotation=this_reject_by_annotation,
        preload=True,
    )

    if reject_bad_segments == "annot":
        custom_annotations = get_annotations(annot_file)
        bads = get_bads_by_annotation(custom_annotations)
    elif reject_bad_segments == "peak-to-peak":
        bads = get_bads_by_peak_to_peak_amplitude(temp_epochs)
    else:
        return temp_epochs

    if mask is None:
        return clear_bads(temp_epochs, bads)

    if len(mask) != 64:
        # Fail loudly: returning an empty list here would only surface later
        # as an unrelated error in the caller.
        raise ValueError(
            "Given mask has wrong shape. Expected len of 64 but got {}".format(
                len(mask)
            )
        )

    return reject_with_mask(temp_epochs, mask, bads)

In [None]:
def create_df_data(
    test_participants=False,
    test_epochs=False,
    info_filename=None,
    info=None,
):
    """Load data for all participants into one DataFrame.

    Every ``../data/responses/*.vhdr`` file is loaded with
    ``load_epochs_from_file`` and converted with ``create_df_from_epochs``,
    optionally joined with additional info from the given .csv file.
    Participants with less than 10 epochs per condition are rejected.

    Parameters
    ----------
    test_participants: bool
        whether load data for training or final testing.
        NOTE: currently not used by this function.
    test_epochs: bool
        whether load data for training or final testing.
        NOTE: currently not used by this function.
    info_filename: String | None
        path to .csv file with additional data.
    info: list | None
        listed parameters from the info file to be loaded.
        ``None`` (the default) means ``["Rumination Full Scale"]``.

    Returns
    -------
    go_nogo_data_df : pandas.DataFrame
        one row per epoch; empty if no participant qualified.

    """
    # NOTE(review): `glob` and `re` are not imported in the visible cells of
    # this notebook; presumably they come from `from utils import *` - confirm.
    if info is None:
        info = ["Rumination Full Scale"]

    header_files = sorted(glob.glob("../data/responses/*.vhdr"))
    participant_frames = []

    for file in header_files:
        #  load eeg data for given participant
        participant_epochs = load_epochs_from_file(file)

        # and compute participant's id from file_name
        participant_id = re.match(r".*_(\w+).*", file).group(1)

        # NOTE: `_data` is a private attribute of the MNE Epochs object
        error = participant_epochs["error_response"]._data
        correct = participant_epochs["correct_response"]._data

        # exclude those participants who have too few samples
        if len(error) < 10 or len(correct) < 10:
            continue

        # construct dataframe for participant with:
        # id|epoch_data|response_type|additional info...
        participant_df = create_df_from_epochs(
            participant_id, correct, error, info_filename, info
        )
        print(participant_id)
        participant_frames.append(participant_df)

    if not participant_frames:
        return pd.DataFrame()

    # A single concat replaces the repeated DataFrame.append calls, which
    # copied the whole frame every iteration and were removed in pandas 2.0.
    return pd.concat(participant_frames, ignore_index=True)

In [None]:
def create_df_from_epochs(id, correct, error, info_filename, info):
    """Create df for one participant with one row per epoch.

    DF structure is like: {id: String ; epoch: epoch_data ; marker: CORRECT|ERROR}
    where the module-level constants CORRECT and ERROR mark correct and error
    responses. Rows for correct responses come first, then error responses.

    When ``info_filename`` is given, the rows of the .csv file whose 'File'
    column equals ``id`` are joined (by index) to every epoch row, so the
    structure becomes e.g.:
    {id: String ; epoch: epoch_data ; marker: CORRECT|ERROR ; File: id ; 'Rumination Full Scale': value}

    Parameters
    ----------
    id: String
        participant's id extracted from filename
    correct: array
        correct responses' data
    error: array
        error responses' data
    info_filename: String | None
        path to .csv file with additional data; None skips the join.
    info: list
        listed parameters from the info file to be loaded.

    Returns
    -------
    participant_df : pandas.DataFrame
        empty if both ``correct`` and ``error`` are empty.

    """
    info_df = pd.DataFrame()

    # get additional info from file
    if info_filename is not None:
        rumination_df = pd.read_csv(info_filename, usecols=["File"] + info)
        info_df = (
            rumination_df.loc[rumination_df["File"] == id]
            .reset_index()
            .drop("index", axis=1)
        )

    # Build all single-epoch frames first and concatenate once;
    # DataFrame.append copied the growing frame on every call and was
    # removed in pandas 2.0.
    epoch_frames = [
        pd.DataFrame({"id": [id], "epoch": [epoch], "marker": [marker]}).join(
            info_df
        )
        for marker, epochs in ((CORRECT, correct), (ERROR, error))
        for epoch in epochs
    ]

    if not epoch_frames:
        return pd.DataFrame()

    return pd.concat(epoch_frames, ignore_index=True)

In [None]:
df_name = "go_nogo_df_mean"
info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"
pickled_data_filename = "../data/" + df_name + ".pkl"

# Build the DataFrame from the recordings only when no pickled copy exists;
# otherwise reuse the pickled copy.
if not os.path.isfile(pickled_data_filename):
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(info_filename=info_filename)
    epochs_df.name = df_name
    # save loaded data into a pickle file
    epochs_df.to_pickle("../data/" + epochs_df.name + ".pkl")
    print("Done. Pickle file created")
else:
    print("Pickled file found. Loading pickled data...")
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")

## Training and classification

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from tempfile import mkdtemp
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score
from sklearn.metrics import classification_report


import numpy as np
import scipy.stats

- For each band (frequency) of the CWT, the pipeline computes the features given in the `feature_dict` parameter (e.g. std or mean).
- It then computes PCA on the flattened EEG channels and features (outer_components = N).
- The final feature vector has `outer_components` values, reduced from (channels * len(feature_dict) * frequencies).

#### Standard features for EEG analysis provided by Guo et al. (2012)

In [None]:
def std_signal(t, m, e):
    """Return the standard deviation of the values ``m``.

    ``t`` and ``e`` are accepted but not used.
    """
    spread = np.std(m)
    return spread


def abs_diffs_signal(t, m, e):
    """Return the sum of absolute differences between consecutive values of ``m``.

    ``t`` and ``e`` are accepted but not used.
    """
    steps = np.diff(m)
    return np.abs(steps).sum()


def mean_energy_signal(t, m, e):
    """Return the mean of the squared values of ``m``.

    ``t`` and ``e`` are accepted but not used.
    """
    squared = m ** 2
    return np.mean(squared)


def skew_signal(t, m, e):
    """Return the skewness of ``m`` as computed by ``scipy.stats.skew``.

    ``t`` and ``e`` are accepted but not used.
    """
    skewness = scipy.stats.skew(m)
    return skewness


def mean_signal(t, m, e):
    """Return the arithmetic mean of the values ``m``.

    ``t`` and ``e`` are accepted but not used.
    """
    average = np.mean(m)
    return average

### Classification grid search

In [None]:
# Response type (marker value) whose epochs are used in the experiments below
dataset = CORRECT
if dataset == CORRECT:
    dataset_name = "correct"
else:
    dataset_name = "error"

In [None]:
# Rows of the selected response type: epochs become X, questionnaire scores become y
_selected_rows = epochs_df[epochs_df["marker"] == dataset]
X = np.array(_selected_rows["epoch"].to_list())
y = np.array(_selected_rows["Rumination Full Scale"].to_list())

In [None]:
# Median of the questionnaire scores; used below as the threshold that
# separates the LOW and HIGH classes.
rumination_median = np.median(y)
# Binary class labels written into y
HIGH = 1
LOW = 0

In [None]:
# Binarise the scores in place: below the median -> LOW, otherwise HIGH
y[:] = np.where(y < rumination_median, LOW, HIGH)

In [None]:
# One row per epoch: the epoch data (as nested lists) and its binary label
data_df = pd.DataFrame(dict(X=X.tolist(), y=y))

In [None]:
# Split the epochs by label (1 and 0)
_labels = data_df["y"]
high_X = np.array(data_df.loc[_labels == 1, "X"].to_list())
low_X = np.array(data_df.loc[_labels == 0, "X"].to_list())

In [None]:
# Average over the first axis (the epochs) of each group
mean_high = high_X.mean(axis=0)
mean_low = low_X.mean(axis=0)

In [None]:
# Fz=37; Fcz=46; Cz=47; Cpz=31; Pz=30

import matplotlib.pyplot as plt

channel = 37
# Draw the high-group average first, then the low-group average, on one axis
for _group_mean in (mean_high, mean_low):
    plt.plot(_group_mean[channel])

plt.show()

In [None]:
# shuffle=False keeps the original row order, so the last 20% of the samples
# form the test set.
# NOTE(review): train_test_split is not imported in the visible cells;
# presumably it is provided by `from utils import *` - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

Defined data transformers - custom data transformation steps

In [None]:
def ChannelExtractionTransformer(channel_list):
    """Build a FunctionTransformer that keeps only the channels in ``channel_list``.

    The wrapped function swaps the first two axes of a 3-D input, picks the
    entries indexed by ``channel_list`` (in that order) and swaps the axes back.
    """

    def _select_channels(X):
        # bring the second axis to the front so it can be indexed directly
        by_channel = np.transpose(X, (1, 0, 2))
        picked = np.array([by_channel[index] for index in channel_list])
        # restore the original axis order
        return np.transpose(picked, (1, 0, 2))

    return FunctionTransformer(func=_select_channels)


def ChannelWiseTransformer():
    """Build a FunctionTransformer that swaps the first two axes of a 3-D input."""

    def _swap_leading_axes(X):
        return np.transpose(X, (1, 0, 2))

    return FunctionTransformer(func=_swap_leading_axes)


def CwtVectorizer(mwt="morl", cwt_density=2):
    """Build a FunctionTransformer applying ``cwt`` to every epoch.

    The input is iterated over its first axis; for each entry, every epoch in
    it is passed to ``cwt(epoch, mwt, cwt_density)`` and the results are
    collected into one array.
    """

    def _cwt_all(X):
        return np.array(
            [
                np.array([cwt(epoch, mwt, cwt_density) for epoch in data])
                for data in X
            ]
        )

    return FunctionTransformer(func=_cwt_all)


def CwtFeatureVectorizer(feature_dict):
    """Build a FunctionTransformer that featurizes CWT data with cesium.

    ``feature_dict`` maps feature names to custom functions; all of its keys
    are requested from ``cesium.featurize.featurize_time_series`` for every
    entry along the first axis of the input.
    """

    def _featurize(X):
        def _features_of(data_cwt):
            # cesium functions
            feature_set = cesium.featurize.featurize_time_series(
                times=None,
                values=data_cwt,
                errors=None,
                features_to_use=list(feature_dict),
                custom_functions=feature_dict,
            )
            return feature_set.to_numpy()

        return np.array([_features_of(data_cwt) for data_cwt in X])

    return FunctionTransformer(func=_featurize)


# transforms energy of each sub-band into relative energy of sub-band
def RelativeEnergyTransformer():
    """Build a FunctionTransformer dividing each entry of an epoch by the epoch's sum."""

    def _to_relative(X):
        relative_energies = []
        for epoch in X:
            epoch_total = np.sum(epoch)
            shares = [sub_band_energy / epoch_total for sub_band_energy in epoch]
            relative_energies.append(np.array(shares))
        return np.array(relative_energies)

    return FunctionTransformer(func=_to_relative)


# reshape data from (channels x epochs x features) to (epochs x channels x features)
# and then flatten it to (epochs x channels*features)
def PostprocessingTransformer():
    """Build a FunctionTransformer that stacks the input along axis 1 and flattens each row."""

    def _stack_and_flatten(X):
        stacked = np.stack(X, axis=1)
        n_rows = stacked.shape[0]
        return stacked.reshape(n_rows, -1)

    return FunctionTransformer(func=_stack_and_flatten)

# Experiments

In [None]:
# Collects one row per evaluated (channel set, classifier) combination from
# the experiments below.
results_df = pd.DataFrame()

### Experiment 1
- Models: KNN, SVC, DecisionTree, LogisticRegression
- without feature functions

In [None]:
# Label stored in the "pipeline_name" column of the results for this experiment
pipeline_name = "channels_saute"

In [None]:
# Each classifier is a (step name, estimator) pair that is appended to the
# pipeline; its grid uses "<step name>__<parameter>" keys.
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(5, 45, 3)}

svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "poly"],
    "svc__C": [0.1, 1],
}

decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=5))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [4, 6, 8],
}

# Logistic regression is evaluated with its default settings only
lr = ("lr", LogisticRegression())
lr_params = {}

In [None]:
# Grid shared by all classifiers: number of components kept by the "pca" step
classifier_params = {"pca__n_components": np.arange(3, 37, 3)}

In [None]:
# (classifier step, parameter grid) pairs; the experiment loop evaluates them
# in this order.
tested_classifiers = [
    (svc, svc_params),
    (lr, lr_params),
    (decision_tree, decision_tree_params),
    (knn, knn_params),
]

In [None]:
import warnings

# NOTE: installs an "ignore" filter without a category or message restriction,
# so it matches every warning raised afterwards in this process.
warnings.filterwarnings("ignore")

In [None]:
# Channel index sets compared in the experiments: all 64 indices vs. a short list
all_channels = np.arange(64)
# red_board = [3, 10, 11, 18, 19, 30, 31, 37, 38, 45, 46, 48, 55]
most_important = [30, 31, 37, 46, 47]

channels_options = [all_channels, most_important]

In [None]:
# base_steps = [
#     ("channel_extraction", ChannelExtractionTransformer(channel_list = all_channels)),
#     ("channel_postprocessing", ChannelWiseTransformer()),
#     ("cwt", CwtVectorizer()),
#     ("cwt_feature", CwtFeatureVectorizer(feature_dict=guo_features)),
#     ("postprocessing", PostprocessingTransformer()),
#     ("pca", PCA(random_state=5)),
#     ("scaler", StandardScaler()),
# ]

In [None]:
def rate_classification(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier,
    classifier_params,
    base_steps,
    cv=5,
    n_jobs=10,
):
    """Grid-search a pipeline made of ``base_steps`` followed by ``classifier``.

    Parameters
    ----------
    X_train, y_train:
        data the grid search is fitted on.
    X_test, y_test:
        accepted for interface compatibility; not used by this function.
    classifier: tuple
        (step name, estimator) pair appended to ``base_steps``.
    classifier_params: dict
        parameter grid passed to ``GridSearchCV``.
    base_steps: list
        pipeline steps placed before the classifier.
    cv: int
        cross-validation setting passed to ``GridSearchCV``.
    n_jobs: int
        number of parallel jobs passed to ``GridSearchCV``.

    Returns
    -------
    grid_search : GridSearchCV
        the fitted search, scored with balanced accuracy and precision and
        refitted on balanced accuracy.

    """
    pipeline = Pipeline(steps=base_steps + [classifier])
    grid_search = GridSearchCV(
        pipeline,
        classifier_params,
        cv=cv,
        # Multi-metric scoring is documented as a list/tuple of unique strings
        # (or a dict); a set is rejected by scikit-learn versions that
        # validate parameters.
        scoring=["balanced_accuracy", "precision"],
        refit="balanced_accuracy",
        n_jobs=n_jobs,
        verbose=10,
    )
    grid_search.fit(X_train, y_train)

    return grid_search

In [None]:
# Experiment 1: for every channel set, grid-search each classifier on the
# CWT output (no feature functions) and record the cross-validation scores
# of the best parameter combination.
for channel_list in channels_options:
    print(f"Channels used in vectorization: {channel_list}\n")

    this_base_steps = [
        ("channel_extraction", ChannelExtractionTransformer(channel_list=channel_list)),
        ("channel_postprocessing", ChannelWiseTransformer()),
        ("cwt", CwtVectorizer()),
        ("postprocessing", PostprocessingTransformer()),
        ("scaler", StandardScaler()),
        ("pca", PCA(random_state=5)),
    ]

    for (classifier, params) in tested_classifiers:
        print(f"Rating {classifier}\n")
        # shared PCA grid merged with the classifier-specific grid
        tested_params = {**classifier_params, **params}
        grid_result = rate_classification(
            X_train,
            y_train,
            X_test,
            y_test,
            classifier,
            tested_params,
            base_steps=this_base_steps,
            cv=2,
        )

        #         predictions = grid_result.predict(X_test)
        #         accuracy = grid_result.score(X_test, y_test)
        #         precision = precision_score(y_test, predictions)
        #         report = classification_report(y_test, predictions)

        # cross-validation scores of the best parameter combination
        best_estimator_index = grid_result.best_index_
        mean_cv_balanced_accuracy = grid_result.cv_results_[
            "mean_test_balanced_accuracy"
        ][best_estimator_index]
        std_cv_balanced_accuracy = grid_result.cv_results_[
            "std_test_balanced_accuracy"
        ][best_estimator_index]
        mean_cv_precision = grid_result.cv_results_["mean_test_precision"][
            best_estimator_index
        ]
        std_cv_precision = grid_result.cv_results_["std_test_precision"][
            best_estimator_index
        ]

        print(f"     Best parameters: {grid_result.best_params_}")
        print(
            f"     mean accuracy: {mean_cv_balanced_accuracy}           ± {round(std_cv_balanced_accuracy,3)}\n"
        )

        data = {
            "data_set": dataset_name,
            "pipeline_name": pipeline_name,
            "function": "-",
            "model": classifier,
            "parameters": grid_result.best_params_,
            "channels": channel_list,
            "mean_cv_balanced_accuracy": mean_cv_balanced_accuracy,
            "std_cv_balanced_accuracy": std_cv_balanced_accuracy,
            "mean_cv_precision": mean_cv_precision,
            "std_cv_precision": std_cv_precision,
        }
        # DataFrame.append was removed in pandas 2.0; concatenating a one-row
        # frame adds the same record.
        results_df = pd.concat(
            [results_df, pd.DataFrame([data])], ignore_index=True
        )

In [None]:
# Persist the results collected so far for the selected data set
results_df.to_pickle(f"../data/channels_{dataset_name}.pkl")

### Experiment 2
- Models: KNN, SVC, DecisionTree, LogisticRegression
- Iterate through functions listed in guo_features list
- Iterate through different channel lists

In [None]:
# Prefix of the "pipeline_name" column for this experiment; the feature
# function's name is appended to it in the experiment loop.
pipeline_name = "channels_function"

In [None]:
# One single-entry {feature name: function} dict per feature; the experiment
# loop passes each dict separately as `feature_dict` to CwtFeatureVectorizer.
guo_features = [
    {"std": std_signal},
    {"abs_diffs": abs_diffs_signal},
    {"energy": mean_energy_signal},
    {"skew": skew_signal},
    {"mean": mean_signal},
]

In [None]:
# Fresh (step name, estimator) pairs and "<step name>__<parameter>" grids for
# this experiment.
knn = ("knn", KNeighborsClassifier())
knn_params = {"knn__n_neighbors": np.arange(5, 45, 3)}

svc = ("svc", SVC())
svc_params = {
    "svc__kernel": ["linear", "poly"],
    "svc__C": [0.1, 1],
}

decision_tree = ("decision_tree", DecisionTreeClassifier(random_state=5))
decision_tree_params = {
    "decision_tree__criterion": ["gini", "entropy"],
    "decision_tree__max_depth": [4, 6, 8],
}

# Logistic regression is evaluated with its default settings only
lr = ("lr", LogisticRegression())
lr_params = {}

In [None]:
# Grid shared by all classifiers: number of components kept by the "pca" step
classifier_params = {"pca__n_components": np.arange(3, 37, 3)}

In [None]:
# (classifier step, parameter grid) pairs; the experiment loop evaluates them
# in this order.
tested_classifiers = [
    (lr, lr_params),
    (decision_tree, decision_tree_params),
    (knn, knn_params),
    (svc, svc_params),
]

In [None]:
import warnings

# NOTE: installs an "ignore" filter without a category or message restriction,
# so it matches every warning raised afterwards in this process.
warnings.filterwarnings("ignore")

In [None]:
def rate_classification2(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier,
    classifier_params,
    base_steps,
    cv=5,
    n_jobs=10,
):
    """Grid-search a pipeline made of ``base_steps`` followed by ``classifier``.

    Parameters
    ----------
    X_train, y_train:
        data the grid search is fitted on.
    X_test, y_test:
        accepted for interface compatibility; not used by this function.
    classifier: tuple
        (step name, estimator) pair appended to ``base_steps``.
    classifier_params: dict
        parameter grid passed to ``GridSearchCV``.
    base_steps: list
        pipeline steps placed before the classifier.
    cv: int
        cross-validation setting passed to ``GridSearchCV``.
    n_jobs: int
        number of parallel jobs passed to ``GridSearchCV``.

    Returns
    -------
    grid_search : GridSearchCV
        the fitted search, scored with balanced accuracy and precision and
        refitted on balanced accuracy.

    """
    pipeline = Pipeline(steps=base_steps + [classifier])
    grid_search = GridSearchCV(
        pipeline,
        classifier_params,
        cv=cv,
        # Multi-metric scoring is documented as a list/tuple of unique strings
        # (or a dict); a set is rejected by scikit-learn versions that
        # validate parameters.
        scoring=["balanced_accuracy", "precision"],
        refit="balanced_accuracy",
        n_jobs=n_jobs,
        verbose=10,
    )
    grid_search.fit(X_train, y_train)

    return grid_search

In [None]:
# Channel index sets compared in the experiments: all 64 indices vs. a short list
all_channels = np.arange(64)
# red_board = [3, 10, 11, 18, 19, 30, 31, 37, 38, 45, 46, 48, 55]
most_important = [30, 31, 37, 46, 47]

channels_options = [all_channels, most_important]

In [None]:
# Experiment 2: for every feature function and channel set, grid-search each
# classifier on the featurized CWT output and record the cross-validation
# scores of the best parameter combination.
for feature_function_dict in guo_features:
    print(f"Featurize with {feature_function_dict.keys()} function\n")
    # each dict holds exactly one feature; its key names the result rows
    feature_name = next(iter(feature_function_dict))

    for channel_list in channels_options:
        print(f"Channels used in vectorization: {channel_list}\n")

        # define base steps
        this_base_steps = [
            (
                "channel_extraction",
                ChannelExtractionTransformer(channel_list=channel_list),
            ),
            ("channel_postprocessing", ChannelWiseTransformer()),
            ("cwt", CwtVectorizer()),
            ("cwt_feature", CwtFeatureVectorizer(feature_dict=feature_function_dict)),
            ("postprocessing", PostprocessingTransformer()),
            ("scaler", StandardScaler()),
            ("pca", PCA(random_state=5)),
        ]

        # rate different models
        for (classifier, params) in tested_classifiers:
            print(f"Rating {classifier} \n")
            # shared PCA grid merged with the classifier-specific grid
            tested_params = {**classifier_params, **params}
            grid_result = rate_classification2(
                X_train,
                y_train,
                X_test,
                y_test,
                classifier,
                tested_params,
                base_steps=this_base_steps,
                cv=2,
            )
            #             predictions = grid_result.predict(X_test)
            #             accuracy = grid_result.score(X_test, y_test)
            #             precision = precision_score(y_test, predictions)
            #             report = classification_report(y_test, predictions)

            # calculate results from cross-validation
            best_estimator_index = grid_result.best_index_
            mean_cv_balanced_accuracy = grid_result.cv_results_[
                "mean_test_balanced_accuracy"
            ][best_estimator_index]
            std_cv_balanced_accuracy = grid_result.cv_results_[
                "std_test_balanced_accuracy"
            ][best_estimator_index]
            mean_cv_precision = grid_result.cv_results_["mean_test_precision"][
                best_estimator_index
            ]
            std_cv_precision = grid_result.cv_results_["std_test_precision"][
                best_estimator_index
            ]

            print(f"     Best parameters: {grid_result.best_params_}")
            print(
                f"     mean accuracy: {mean_cv_balanced_accuracy}           ± {round(std_cv_balanced_accuracy,3)}\n"
            )

            data = {
                "data_set": dataset_name,
                "pipeline_name": pipeline_name + "_" + feature_name,
                "function": feature_name,
                "model": classifier,
                "parameters": grid_result.best_params_,
                "channels": channel_list,
                "mean_cv_balanced_accuracy": mean_cv_balanced_accuracy,
                "std_cv_balanced_accuracy": std_cv_balanced_accuracy,
                "mean_cv_precision": mean_cv_precision,
                "std_cv_precision": std_cv_precision,
            }
            # DataFrame.append was removed in pandas 2.0; concatenating a
            # one-row frame adds the same record.
            results_df = pd.concat(
                [results_df, pd.DataFrame([data])], ignore_index=True
            )

In [None]:
# Persist the collected results for the selected data set as a pickle
results_df.to_pickle(f"../data/channels_{dataset_name}.pkl")

In [None]:
# Also export the collected results as CSV
results_df.to_csv(f"../data/channels_{dataset_name}.csv")