# Rumination prediction with pre-selected channels

- Using pre-selected channels instead of ICA does not increase the effectiveness of the model:

    For channels Fz, Fcz, Cz, Cpz, Pz:
    
    R^2 = 0.02502976580629168
- Using epochs averaged per person with pre-selected channels does not increase the effectiveness of the model either.

### Imports

In [None]:
%load_ext lab_black
import glob
import os
import pickle
import re
from time import time

import mne
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pywt
import scipy
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from plotly.subplots import make_subplots
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from utils import *

### Loading data

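Loading the EEG data and the data from the rumination questionnaire. By default `create_df_data` loads only the `Rumination Full Scale` column from the given csv file (and only when `info_filename` is provided); other columns can be requested by passing a list of the desired csv column labels as `info`.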

In [None]:
# Epoch window boundaries handed to mne.Epochs as tmin/tmax
# (mne interprets them as seconds relative to the event).
tmin = -0.1
tmax = 0.6

# NOTE(review): presumably the EEG sampling rate in Hz - it is not referenced
# by the code visible in this notebook section; confirm before relying on it.
signal_frequency = 256

# Values stored in the "marker" column of the epochs DataFrame.
ERROR, CORRECT = 0, 1

In [None]:
def load_epochs_from_file(file, reject_bad_segments="auto", mask=None):
    """Load response-locked epochs from a BrainVision header file.

    Args:
        file: path to a header file (.vhdr), relative to ``../data/``.
            The marker file is expected next to it with a ``.vmrk`` extension.
        reject_bad_segments: 'auto' | 'annot' | 'peak-to-peak'

            How bad segments / channels are handled.

            'auto' lets MNE drop the epochs that overlap bad annotations
            (``reject_by_annotation=True``); the epochs are returned as-is and
            ``mask`` is ignored.
            'annot' takes the bad channels from the annotations of the .vmrk
            file (``get_annotations`` / ``get_bads_by_annotation``).
            'peak-to-peak' takes the bad channels from the peak-to-peak
            amplitude of the epochs (``get_bads_by_peak_to_peak_amplitude``).

            For 'annot' and 'peak-to-peak' the bad channels are then cleared
            with ``clear_bads`` (or ``reject_with_mask`` when ``mask`` is
            given); according to the original notes the rejected channels are
            zeroed - confirm against the helpers in ``utils``.

            Any other value returns the epochs without any rejection at all.
        mask: optional sequence of length 64 forwarded to ``reject_with_mask``
            (presumably one entry per EEG channel - confirm against the
            helper). Only used for 'annot' and 'peak-to-peak'.

    Returns:
        mne Epochs

    Raises:
        ValueError: if ``mask`` is given for 'annot' / 'peak-to-peak' and its
            length is not 64. This is checked before any file is read.

    """
    clears_bad_channels = reject_bad_segments in ("annot", "peak-to-peak")

    # Fail fast: previously a wrong mask was only reported with print() after
    # all the data had been loaded, and an empty list was returned instead of
    # Epochs.
    if clears_bad_channels and mask is not None and len(mask) != 64:
        raise ValueError(
            "Given mask has wrong shape. Expected len of 64 but got {}".format(
                len(mask)
            )
        )

    # Import the BrainVision data into an MNE Raw object
    raw = mne.io.read_raw_brainvision("../data/" + file)

    # Construct annotation filename ("<name>.vhdr" -> "<name>.vmrk")
    annot_file = file[:-4] + "vmrk"

    # Read in the event information as MNE annotations
    annotations = mne.read_annotations("../data/" + annot_file)

    # Add the annotations to our raw object so we can use them with the data
    raw.set_annotations(annotations)

    # Map with response markers only
    event_dict = {
        "Stimulus/RE*ex*1_n*1_c_1*R*FB": 10004,
        "Stimulus/RE*ex*1_n*1_c_1*R*FG": 10005,
        "Stimulus/RE*ex*1_n*1_c_2*R": 10006,
        "Stimulus/RE*ex*1_n*2_c_1*R": 10007,
        "Stimulus/RE*ex*2_n*1_c_1*R": 10008,
        "Stimulus/RE*ex*2_n*2_c_1*R*FB": 10009,
        "Stimulus/RE*ex*2_n*2_c_1*R*FG": 10010,
        "Stimulus/RE*ex*2_n*2_c_2*R": 10011,
    }

    # Map for merged correct/error response markers.
    # NOTE: these are MNE event ids; they are not the ERROR/CORRECT values
    # used for the "marker" column elsewhere in this notebook.
    merged_event_dict = {"correct_response": 0, "error_response": 1}

    # Reconstruct the original events from Raw object
    events, _ = mne.events_from_annotations(raw, event_id=event_dict)

    # Merge correct/error response events
    merged_events = mne.merge_events(
        events,
        [10004, 10005, 10009, 10010],
        merged_event_dict["correct_response"],
        replace_events=True,
    )
    merged_events = mne.merge_events(
        merged_events,
        [10006, 10007, 10008, 10011],
        merged_event_dict["error_response"],
        replace_events=True,
    )

    # Read epochs; only the 'auto' mode delegates the rejection to MNE
    temp_epochs = mne.Epochs(
        raw=raw,
        events=merged_events,
        event_id=merged_event_dict,
        tmin=tmin,
        tmax=tmax,
        baseline=None,
        reject_by_annotation=(reject_bad_segments == "auto"),
        preload=True,
    )

    if reject_bad_segments == "annot":
        custom_annotations = get_annotations(annot_file)
        bads = get_bads_by_annotation(custom_annotations)
    elif reject_bad_segments == "peak-to-peak":
        bads = get_bads_by_peak_to_peak_amplitude(temp_epochs)
    else:
        # 'auto' (already handled by MNE) or an unrecognised mode
        return temp_epochs

    if mask is None:
        return clear_bads(temp_epochs, bads)

    return reject_with_mask(temp_epochs, mask, bads)

In [None]:
def create_df_data(
    test_participants=False,
    test_epochs=False,
    info_filename=None,
    info=None,
    min_epochs_per_condition=5,
):
    """Loads data for all participants and create DataFrame with optional additional info from given .csv file.
    Participants with less than `min_epochs_per_condition` error or correct epochs are rejected.

    Parameters
    ----------
    test_participants: bool
        whether load data for training or final testing.
        Accepted for interface compatibility; currently not used by this function.
    test_epochs: bool
        whether load data for training or final testing.
        Accepted for interface compatibility; currently not used by this function.
    info_filename: String | None
        path to .csv file with additional data.
    info: list | None
        listed parameters from the info file to be loaded.
        None (default) means ["Rumination Full Scale"].
    min_epochs_per_condition: int
        minimal number of epochs a participant needs in each condition
        (error and correct) to be included. Default 5.

    Returns
    -------
    go_nogo_data_df : pandas.DataFrame
        one row per epoch of every accepted participant; empty when no
        participant was accepted (or no header file was found).

    """
    # A fresh list per call instead of a shared mutable default argument.
    if info is None:
        info = ["Rumination Full Scale"]

    header_files = sorted(glob.glob("../data/responses/*.vhdr"))
    participant_frames = []

    for file in header_files:
        #  load eeg data for given participant
        participant_epochs = load_epochs_from_file(file)

        # and compute participant's id from file_name
        participant_id = re.match(r".*_(\w+).*", file).group(1)

        error = participant_epochs["error_response"]._data
        correct = participant_epochs["correct_response"]._data

        # exclude those participants who have too few samples
        if (
            len(error) < min_epochs_per_condition
            or len(correct) < min_epochs_per_condition
        ):
            continue

        # construct dataframe for participant with: id|epoch_data|response_type|additional info...
        participant_df = create_df_from_epochs(
            participant_id, correct, error, info_filename, info
        )
        print(participant_id)
        participant_frames.append(participant_df)

    # DataFrame.append was removed in pandas 2.0 (and copied the whole frame
    # on every call); concatenate all participants once instead.
    if not participant_frames:
        return pd.DataFrame()

    return pd.concat(participant_frames, ignore_index=True)

In [None]:
def create_df_from_epochs(id, correct, error, info_filename, info):
    """Create df for each participant. DF structure is like: {id: String ; epoch: epoch_data ; marker: CORRECT|ERROR}
    Rows of correct responses come first, followed by rows of error responses.
    When `info_filename` is given, the columns 'File' and those listed in `info`
    are read from the .csv file and the row whose 'File' equals the participant's
    id is attached to every epoch row. With the default info the structure is like:
    {id: String ; epoch: epoch_data ; marker: CORRECT|ERROR ; File: id ; 'Rumination Full Scale': int}

    Parameters
    ----------
    id: String
        participant's id extracted from filename
    correct: array
        correct responses' data (iterated epoch by epoch)
    error: array
        error responses' data (iterated epoch by epoch)
    info_filename: String | None
        path to .csv file with additional data. None means no additional columns.
    info: list
        listed parameters from the info file to be loaded.
        Only read when `info_filename` is not None.

    Returns
    -------
    participant_df : pandas.DataFrame
        one row per epoch; an empty DataFrame when there are no epochs at all.

    """
    # get additional info from file
    info_df = None
    if info_filename is not None:
        rumination_df = pd.read_csv(info_filename, usecols=["File"] + list(info))
        info_df = rumination_df.loc[rumination_df["File"] == id].reset_index(
            drop=True
        )

    def _epoch_row(epoch, marker):
        # One-row frame for a single epoch. The left join on index 0 attaches
        # the first matching info row (NaN when the participant is missing).
        row = pd.DataFrame({"id": [id], "epoch": [epoch], "marker": [marker]})
        return row if info_df is None else row.join(info_df)

    epoch_frames = [_epoch_row(epoch, CORRECT) for epoch in correct]
    epoch_frames.extend(_epoch_row(epoch, ERROR) for epoch in error)

    # DataFrame.append was removed in pandas 2.0 (and copied the whole frame
    # for every epoch); build all rows first and concatenate once.
    if not epoch_frames:
        return pd.DataFrame()

    return pd.concat(epoch_frames, ignore_index=True)

In [None]:
df_name = "go_nogo_df"
info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"
pickled_data_filename = "../data/" + df_name + ".pkl"

# Build the DataFrame from the raw recordings only when no cached pickle
# exists yet; otherwise reuse the cached one.
if not os.path.isfile(pickled_data_filename):
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(info_filename=info_filename)
    epochs_df.name = df_name
    # cache the freshly loaded data; the path is rebuilt from the name
    # that was just attached to the DataFrame
    epochs_df.to_pickle("../data/" + epochs_df.name + ".pkl")
    print("Done. Pickle file created")
else:
    print("Pickled file found. Loading pickled data...")
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")

Sorting participants by their number of error epochs, in descending order. This way the participants with the most error epochs come first.

In [None]:
# Per-participant number of error / correct epochs, repeated on every row
# of that participant.
grouped_df = epochs_df.groupby("id")
marker_groups = grouped_df[["marker"]]

epochs_df["error_sum"] = marker_groups.transform(
    lambda markers: (markers.values == ERROR).sum()
)
epochs_df["correct_sum"] = marker_groups.transform(
    lambda markers: (markers.values == CORRECT).sum()
)

# Most error epochs first; mergesort is stable, so rows with an equal
# error count keep their previous relative order.
epochs_df = epochs_df.sort_values(by="error_sum", kind="mergesort", ascending=False)

## Vectorization

- Channels are pre-selected EEG electrodes
- The Continuous Wavelet Transform decomposes the signal of each channel in each epoch into a set of wavelet functions
- PCA reduces the dimensionality of the features (wavelets) to the best computed components

In [None]:
# Only the error-response epochs are used: X holds their signals and y the
# questionnaire score stored on the same rows.
error_epochs_df = epochs_df.loc[epochs_df["marker"] == ERROR]
X = np.array(error_epochs_df["epoch"].to_list())
y = np.array(error_epochs_df["Rumination Full Scale"].to_list())

Selecting the specified channels

In [None]:
# Alternative, larger selection kept for reference:
# channels = [3, 10, 11, 18, 19, 30, 31, 37, 38, 45, 46, 48, 55]
# Indices of the pre-selected channels to keep; they are used below to index
# the channel axis of X.
channels = [31, 46, 48, 30]

In [None]:
# Swap the first two axes so that a channel can be picked with a plain index
# (X is assumed to be laid out as epochs x channels x samples).
channels_per_epochs = X.transpose(1, 0, 2)

# Keep only the pre-selected channels, in the order given by `channels`.
selected_channels_per_epochs = np.array(
    [channels_per_epochs[channel] for channel in channels]
)

# Swap the axes back to the original layout.
X = selected_channels_per_epochs.transpose(1, 0, 2)

In [None]:
def vectorize(
    X,
    mwt="mexh",
    cwt_density=2,
    ica_n_components=3,
    wv_weighting="PCA",
    extracted_n_components=3,
):
    """Turn epochs into flat feature vectors: per-channel CWT followed by PCA.

    For every channel the signal of each epoch is passed through ``cwt``, the
    result is flattened to one row per epoch and reduced with a PCA fitted on
    that channel only. The components of all channels are then laid out side
    by side, channel after channel.

    Parameters
    ----------
    X : numpy.ndarray
        3-D array; axis 0 is treated as epochs and axis 1 as channels
        (the first two axes are swapped to iterate channel by channel).
    mwt : str
        forwarded to ``cwt`` as its second argument
        (presumably the mother wavelet name - confirm against ``cwt``).
    cwt_density : int
        forwarded to ``cwt`` as its third argument.
    ica_n_components : int
        accepted but not used by this implementation.
    wv_weighting : str
        accepted but not used by this implementation.
    extracted_n_components : int
        ``n_components`` of the PCA fitted for each channel.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per epoch.
    """
    print("X shape: {}".format(X.shape))

    components_per_channel = []

    for channel_epochs in np.transpose(X, (1, 0, 2)):
        transformed = np.array(
            [cwt(epoch, mwt, cwt_density) for epoch in channel_epochs]
        )

        # PCA expects (n_samples, n_features): one flattened CWT result per epoch
        flat_per_epoch = transformed.reshape(transformed.shape[0], -1)

        reducer = PCA(n_components=extracted_n_components)
        components_per_channel.append(reducer.fit_transform(flat_per_epoch))

    # (epoch, channel, component) -> (epoch, channel * component)
    stacked = np.stack(components_per_channel, axis=1)
    epochs_per_channel_feature = stacked.reshape(stacked.shape[0], -1)

    print("Vectorized X shape: {}".format(epochs_per_channel_feature.shape))

    return epochs_per_channel_feature

In [None]:
# Vectorize the selected channels: "morl" is forwarded to cwt (presumably the
# Morlet wavelet - confirm against utils.cwt) and 5 PCA components are kept
# per channel.
vectorized_X = vectorize(X, mwt="morl", extracted_n_components=5)

Creating a DataFrame from the vectorized X set for future use

In [None]:
# One integer-labelled column per feature plus the target in column "y".
vectorized_X_df = pd.DataFrame(
    data=vectorized_X, columns=np.arange(vectorized_X.shape[1])
).assign(y=y)

## Training and prediction

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR


from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        ground-truth values (list, tuple, numpy array, pandas Series, ...).
    y_pred : array-like
        predicted values, broadcastable against `y_true`.

    Returns
    -------
    float
        the error in percent. Entries of `y_true` equal to zero make the
        result inf or nan, because the error is relative to `y_true`.
    """
    # Convert first so plain Python sequences work, and so pandas objects are
    # compared position by position instead of being aligned on their index.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
# Hold out 20% of the epochs for testing (fixed seed for reproducibility).
# NOTE(review): the split is random over individual epochs, while y is the
# questionnaire score attached to every epoch row of a participant, so epochs
# of one participant can end up in both the train and the test set. Consider
# a participant-level (grouped) split - confirm the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_X, y, test_size=0.2, random_state=42
)

#### Quick vectorization test on SVR model

In [None]:
# Standardize with statistics of the training set only, then fit the SVR.
scaler = StandardScaler()
rescaled_X_train = scaler.fit_transform(X_train)
model = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1).fit(rescaled_X_train, y_train)

# Apply the same scaling to the held-out set and report the metrics.
rescaled_X_test = scaler.transform(X_test)
predictions = model.predict(rescaled_X_test)
for metric in (
    mean_absolute_percentage_error,
    mean_absolute_error,
    mean_squared_error,
):
    print(metric(y_test, predictions))
print(model.score(rescaled_X_test, y_test))

### Regression grid search

Parameters for models:

In [None]:
# Candidate hyper-parameter values, keyed by the estimator argument name.
param_grid_gbm = {"n_estimators": np.arange(20, 100, 5)}
param_grid_knn = {"n_neighbors": np.arange(5, 100, 5)}
param_grid_svr = {
    "C": np.arange(1, 5, 1),
    "gamma": np.arange(0.1, 0.5, 0.1),
    "epsilon": np.arange(0.1, 0.5, 0.1),
}

In [None]:
# (pipeline name, name of the regressor step, regressor) for every candidate
regressor_specs = [
    ("ScaledLR", "LR", LinearRegression()),
    ("ScaledKNN", "KNN", KNeighborsRegressor()),
    ("ScaledGBM", "GBM", GradientBoostingRegressor()),
    ("ScaledSVR", "SVR", SVR(kernel="rbf")),
]

# Every candidate is preceded by its own StandardScaler step.
pipelines = [
    (label, Pipeline([("Scaler", StandardScaler()), (step, regressor)]))
    for label, step, regressor in regressor_specs
]

# 10-fold cross-validation of each pipeline on the training set,
# scored with the negated mean squared error.
kfold = KFold(n_splits=10)
results, names = [], []
for name, model in pipelines:
    cv_results = cross_val_score(
        model, X_train, y_train, scoring="neg_mean_squared_error", cv=kfold
    )
    results.append(cv_results)
    names.append(name)
    msg = "{}: {:f} ({:f})".format(name, cv_results.mean(), cv_results.std())
    print(msg)

Grid search for the best model:

In [None]:
# Standardize the training set and search the number of boosting stages
# with 10-fold cross-validation.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X_train)

model = GradientBoostingRegressor()
kfold = KFold(n_splits=10)
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid_gbm,
    cv=kfold,
    scoring="neg_mean_squared_error",
    verbose=3,
)
grid_result = grid.fit(rescaledX, y_train)

# Report the score of every candidate, then the best one.
cv_summary = grid_result.cv_results_
means, stds, params = (
    cv_summary[key] for key in ("mean_test_score", "std_test_score", "params")
)
for mean, stdev, param in zip(means, stds, params):
    print("{:f} ({:f}) with: {!r}".format(mean, stdev, param))

print("Best: {:f} using {}".format(grid_result.best_score_, grid_result.best_params_))

Dummy regressor for baseline:

In [None]:
# Baseline: always predict the mean of the training targets.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)

y_pred = dummy_regr.predict(X_test)
print(mean_absolute_percentage_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
# Score the baseline itself. `model` is at this point the estimator that was
# handed to GridSearchCV (which fits clones of it), not the dummy regressor.
print(dummy_regr.score(X_test, y_test))