# Rumination prediction with statistical feature functions

Surprisingly good. The best statistical functions are **abs_diffs** and std, although std performs worse than abs_diffs.

The feature function is calculated separately for each band from the CWT, and then PCA is computed to extract the principal components. Everything is done separately for each ICA channel.

**Research for the best amount of ICA and PCA components is needed.**

Results for ICA=6, PCA=5 (each block lists MAPE, MAE, MSE and R², in the order printed by the SVR training cell below):

- Vectorization with mean function: 

        22.262470602898563
        0.6553535562156957
        0.6701843820323544
        0.1253460744825673


- Vectorization with std function: 

        22.356530287958208
        0.6528160797078707
        0.6622073864400982
        0.1357568072535158


- Vectorization with mean2 function: 

        23.522736451248715
        0.6898912347618341
        0.7282693285547807
        0.049539732449879414
        
        
- **Vectorization with abs_diffs function:**

        20.316528317106584
        0.598258644924345
        0.5699238437388514
        0.256195547767771


- Vectorization with skew function: 

        24.617083295052065
        0.7180929985106101
        0.7818632257758061
        -0.02040536601092713


- Vectorization with kurtosis function: 

        24.84531814461336
        0.7272395445327172
        0.8003162367100916
        -0.04448828838871188
        
        
- Vectorization with variation function: 

        26.167193910667653
        0.7467452978933837
        0.8497374027769138
        -0.10898757852870222
        
        
- Vectorization with median_abs_deviation function: 

        22.938335987122773
        0.6675833664351787
        0.6887154337895299
        0.1011613014590178

### Imports

In [None]:
%load_ext lab_black
import glob
import os
import pickle
import re
from time import time

import mne
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pywt
import scipy
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from plotly.subplots import make_subplots
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split


from utils import *

### Loading data

Loading EEG data and data from the rumination questionnaire. By default `create_df_data` loads only the `Rumination Full Scale` column from the given CSV file, but one can request other columns by passing a list of the desired column labels from the CSV file.

In [None]:
# Epoch window (seconds) around each response marker; passed to mne.Epochs below.
tmin, tmax = -0.1, 0.6
# presumably the EEG sampling rate in Hz; not referenced in the visible code (TODO confirm)
signal_frequency = 256
# Values stored in the "marker" column to label error and correct response epochs.
ERROR = 0
CORRECT = 1

In [None]:
def load_epochs_from_file(file, reject_bad_segments="auto", mask=None):
    """Load response-locked epochs from a BrainVision header file.

    Args:
        file: path to a header file (.vhdr); it is appended to ``"../data/"``.
        reject_bad_segments: 'auto' | 'annot' | 'peak-to-peak'

            How epochs overlapping bad segments are handled.

            'auto' means that bad segments are rejected automatically by MNE
            (``reject_by_annotation=True``); the epochs are returned as they are.
            'annot' rejection based on annotations and reject only channels
            annotated in .vmrk file as 'bad'.
            'peak-to-peak' rejection based on peak-to-peak amplitude of channels.

            Channels rejected with 'annot' and 'peak-to-peak' are zeroed.

            Any other value builds the epochs with ``reject_by_annotation=False``
            and returns them without further rejection.
        mask: optional sequence of length 64 (presumably one entry per channel,
            TODO confirm) forwarded to ``reject_with_mask``. Only used with
            'annot' and 'peak-to-peak'.

    Returns:
        mne Epochs

    Raises:
        ValueError: if ``mask`` is given with a length other than 64 in the
            'annot' or 'peak-to-peak' mode.

    """
    # Import the BrainVision data into an MNE Raw object
    raw = mne.io.read_raw_brainvision("../data/" + file)

    # Construct annotation filename: swap the "vhdr" extension for "vmrk"
    annot_file = file[:-4] + "vmrk"

    # Read in the event information as MNE annotations and attach it to the
    # raw object so it can be used with the data
    annotations = mne.read_annotations("../data/" + annot_file)
    raw.set_annotations(annotations)

    # Map with response markers only
    event_dict = {
        "Stimulus/RE*ex*1_n*1_c_1*R*FB": 10004,
        "Stimulus/RE*ex*1_n*1_c_1*R*FG": 10005,
        "Stimulus/RE*ex*1_n*1_c_2*R": 10006,
        "Stimulus/RE*ex*1_n*2_c_1*R": 10007,
        "Stimulus/RE*ex*2_n*1_c_1*R": 10008,
        "Stimulus/RE*ex*2_n*2_c_1*R*FB": 10009,
        "Stimulus/RE*ex*2_n*2_c_1*R*FG": 10010,
        "Stimulus/RE*ex*2_n*2_c_2*R": 10011,
    }

    # Map for merged correct/error response markers
    merged_event_dict = {"correct_response": 0, "error_response": 1}

    # Reconstruct the original events from Raw object
    events, _ = mne.events_from_annotations(raw, event_id=event_dict)

    # Merge correct/error response events
    merged_events = mne.merge_events(
        events,
        [10004, 10005, 10009, 10010],
        merged_event_dict["correct_response"],
        replace_events=True,
    )
    merged_events = mne.merge_events(
        merged_events,
        [10006, 10007, 10008, 10011],
        merged_event_dict["error_response"],
        replace_events=True,
    )

    # Only the 'auto' mode lets MNE drop epochs by annotation
    temp_epochs = mne.Epochs(
        raw=raw,
        events=merged_events,
        event_id=merged_event_dict,
        tmin=tmin,
        tmax=tmax,
        baseline=None,
        reject_by_annotation=(reject_bad_segments == "auto"),
        preload=True,
    )

    if reject_bad_segments == "annot":
        bads = get_bads_by_annotation(get_annotations(annot_file))
    elif reject_bad_segments == "peak-to-peak":
        bads = get_bads_by_peak_to_peak_amplitude(temp_epochs)
    else:
        return temp_epochs

    if mask is None:
        return clear_bads(temp_epochs, bads)

    if len(mask) != 64:
        # Fail loudly: previously this only printed the message and returned an
        # empty list, which broke callers much later with an unrelated error.
        raise ValueError(
            "Given mask has wrong shape. Expected len of 64 but got {}".format(
                len(mask)
            )
        )

    return reject_with_mask(temp_epochs, mask, bads)

In [None]:
def create_df_data(
    test_participants=False,
    test_epochs=False,
    info_filename=None,
    info=None,
):
    """Loads data for all participants and create DataFrame with optional additional info from given .csv file.
    Participants with less than 10 epochs per condition are rejected.

    Parameters
    ----------
    test_participants: bool
        whether load data for training or final testing.
        If true load participants data for testing.
        Currently not used by this function.
    test_epochs: bool
        whether load data for training or final testing.
        If true load epochs of each participants data for testing.
        Currently not used by this function.
    info_filename: String | None
        path to .csv file with additional data.
    info: array | None
        listed parameters from the info file to be loaded.
        ``None`` (the default) means ``["Rumination Full Scale"]``.


    Returns
    -------
    go_nogo_data_df : pandas.DataFrame
        One row per epoch; empty when no participant qualifies.

    """
    # None sentinel instead of a shared mutable default list
    if info is None:
        info = ["Rumination Full Scale"]

    header_files = sorted(glob.glob("../data/responses/*.vhdr"))
    participant_frames = []

    for file in header_files:
        # load eeg data for given participant
        # NOTE: load_epochs_from_file prepends "../data/" to this path again
        participant_epochs = load_epochs_from_file(file)

        # and compute participant's id from file_name
        participant_id = re.match(r".*_(\w+).*", file).group(1)

        error = participant_epochs["error_response"]._data
        correct = participant_epochs["correct_response"]._data

        # exclude those participants who have too few samples
        if len(error) < 10 or len(correct) < 10:
            continue

        # construct dataframe for participant with: id|epoch_data|response_type|additional info...
        participant_frames.append(
            create_df_from_epochs(participant_id, correct, error, info_filename, info)
        )
        print(participant_id)

    # pd.concat raises on an empty list, so keep the original empty-frame result
    if not participant_frames:
        return pd.DataFrame()

    # One concatenation instead of DataFrame.append in the loop
    # (append was removed in pandas 2.0 and copies the frame on every call)
    return pd.concat(participant_frames, ignore_index=True)

In [None]:
def create_df_from_epochs(id, correct, error, info_filename, info):
    """Create df for each participant. DF structure is like: {id: String ; epoch: epoch_data ; marker: CORRECT|ERROR}
    CORRECT marks a correct response and ERROR marks an error response.
    Default info extracted form .csv file is 'Rumination Full Scale' and participants' ids.
    With this info df structure is like:
    {id: String ; epoch: epoch_data ; marker: CORRECT|ERROR ; File: id ; 'Rumination Full Scale': int}

    Parameters
    ----------
    id: String
        participant's id extracted from filename
    correct: array
        correct responses' data
    error: array
        error responses' data
    info_filename: String | None
        path to .csv file with additional data; None skips the extra columns.
    info: array
        listed parameters from the info file to be loaded.

    Returns
    -------
    participant_df : pandas.DataFrame
        One row per epoch, correct epochs first, then error epochs.
        Empty when both ``correct`` and ``error`` are empty.

    """
    info_df = pd.DataFrame()

    # get additional info from file: the rows whose "File" column equals this id
    if info_filename is not None:
        rumination_df = pd.read_csv(info_filename, usecols=["File"] + list(info))
        info_df = rumination_df.loc[rumination_df["File"] == id].reset_index(
            drop=True
        )

    labelled_epochs = [(epoch, CORRECT) for epoch in correct]
    labelled_epochs += [(epoch, ERROR) for epoch in error]

    # Each one-row frame has index 0, so the join picks up row 0 of info_df
    epoch_frames = [
        pd.DataFrame({"id": [id], "epoch": [epoch], "marker": [marker]}).join(info_df)
        for epoch, marker in labelled_epochs
    ]

    # pd.concat raises on an empty list, so keep the original empty-frame result
    if not epoch_frames:
        return pd.DataFrame()

    # One concatenation instead of DataFrame.append per epoch
    # (append was removed in pandas 2.0 and copies the frame on every call)
    return pd.concat(epoch_frames, ignore_index=True)

In [None]:
df_name = "go_nogo_df"
pickled_data_filename = "../data/" + df_name + ".pkl"
info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# Build the DataFrame from the raw recordings only when no cached pickle
# exists; otherwise reuse the cache.
if not os.path.isfile(pickled_data_filename):
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(info_filename=info_filename)
    epochs_df.name = df_name
    # cache the loaded data; the path is rebuilt from the name stored on the frame
    epochs_df.to_pickle("../data/" + epochs_df.name + ".pkl")
    print("Done. Pickle file created")
else:
    print("Pickled file found. Loading pickled data...")
    # NOTE: unpickling can execute arbitrary code; only load caches you created
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")

Data is now read into dataframe and each epoch is a single record.

Sorting participants by the number of errors, descending. This way the best participants are first.

In [None]:
# add new columns with info about error/correct responses amount
# add new columns with info about error/correct responses amount
# Rows are grouped by participant id; transform() writes each group's count
# back onto every row of that group, so the new columns align with epochs_df.
grouped_df = epochs_df.groupby("id")
# number of epochs of the participant whose marker equals ERROR
epochs_df["error_sum"] = grouped_df[["marker"]].transform(
    lambda x: (x.values == ERROR).sum()
)
# number of epochs of the participant whose marker equals CORRECT
epochs_df["correct_sum"] = grouped_df[["marker"]].transform(
    lambda x: (x.values == CORRECT).sum()
)

# mergesort for stable sorting: rows with equal error_sum keep their previous order
epochs_df = epochs_df.sort_values("error_sum", ascending=False, kind="mergesort")

### Vectorization

- ICA reduces the channels from 64 to the given number of independent components
- The Continuous Wavelet Transform decomposes the signal of a channel from each epoch into a set of wavelet functions
- The feature function given in the `feature_function` parameter is calculated for each band from the CWT
- PCA reduces the dimension of the features to the requested number of principal components

In [None]:
import scipy.stats


def vectorize(
    X,
    feature_function,
    mwt="mexh",
    cwt_density=2,
    ica_n_components=3,
    wv_weighting="PCA",
    extracted_n_components=3,
):
    """Turn epochs into flat feature vectors: ICA -> CWT -> statistic -> PCA.

    Parameters
    ----------
    X: numpy.ndarray
        epochs; the code treats it as (epochs, channels, timepoints).
    feature_function: callable
        called on every 1-D row of CWT output (one scale of one epoch) and
        expected to return a single number.
    mwt, cwt_density:
        forwarded unchanged to ``cwt`` (defined outside this block).
    ica_n_components: int
        number of FastICA components the channels are reduced to.
    wv_weighting: str
        not used by this function.
    extracted_n_components: int
        number of PCA components kept per ICA component.

    Returns
    -------
    numpy.ndarray
        shape (epochs, ica_n_components * extracted_n_components).

    """
    print("X shape: {}".format(X.shape))

    n_epochs, n_timepoints = X.shape[0], X.shape[-1]

    # FastICA expects (n_samples, n_features): glue the epochs together along
    # time, then transpose to (epochs * timepoints, channels).
    channels_by_time = np.concatenate(X, axis=1)
    unmixer = FastICA(n_components=ica_n_components)
    sources = unmixer.fit_transform(channels_by_time.T)

    # back to (component, epoch, timepoints)
    components = sources.T.reshape(ica_n_components, n_epochs, n_timepoints)

    features_per_component = []

    for component_epochs in components:
        coefficients = np.array(
            [cwt(epoch, mwt, cwt_density) for epoch in component_epochs]
        )

        # (epoch, scale, times) -> (scale, epoch, times) so that the statistic
        # is evaluated scale by scale
        by_scale = np.transpose(coefficients, (1, 0, 2))
        stats_by_scale = np.array(
            [[feature_function(epoch) for epoch in scale] for scale in by_scale]
        )

        # one row per epoch, one column per scale
        stats_by_epoch = np.transpose(stats_by_scale, (1, 0))

        reducer = PCA(n_components=extracted_n_components)
        features_per_component.append(reducer.fit_transform(stats_by_epoch))

    # (component, epoch, pca) -> (epoch, component, pca) -> (epoch, component * pca)
    stacked = np.stack(features_per_component, axis=1)
    epochs_per_channel_feature = stacked.reshape(stacked.shape[0], -1)

    print("Vectorized X shape: {}".format(epochs_per_channel_feature.shape))

    return epochs_per_channel_feature

In [None]:
# Only error-response epochs are used; the regression target is the
# "Rumination Full Scale" value stored on each of those rows.
error_rows = epochs_df[epochs_df["marker"] == ERROR]
X = np.array(error_rows["epoch"].to_list())
y = np.array(error_rows["Rumination Full Scale"].to_list())

In [None]:
# Result table: starts with the target column; feature columns are added by the cells below.
vectorized_X_df = pd.DataFrame(data={"y": y})

#### Vectorization with different statistical functions

In [None]:
import numpy as np
import scipy.stats


def mean_signal(m):
    """Return the arithmetic mean of all values in ``m``."""
    average = np.mean(m, axis=None)
    return average


def std_signal(m):
    """Return the population standard deviation (``ddof=0``) of all values in ``m``."""
    spread = np.std(m, axis=None, ddof=0)
    return spread


def mean_square_signal(m):
    """Return the mean of the squared values of ``m``."""
    squared = m ** 2
    return np.mean(squared)


def abs_diffs_signal(m):
    """Return the sum of absolute differences between consecutive values of ``m``."""
    steps = np.diff(m)
    return np.sum(np.abs(steps))


def skew_signal(m):
    """Return the (biased) sample skewness of ``m``, computed along axis 0."""
    skewness = scipy.stats.skew(m, axis=0, bias=True)
    return skewness


def kurtosis_signal(m):
    """Return the (biased) Fisher excess kurtosis of ``m``, computed along axis 0."""
    excess_kurtosis = scipy.stats.kurtosis(m, axis=0, fisher=True, bias=True)
    return excess_kurtosis


def variation_signal(m):
    """Return ``scipy.stats.variation`` of ``m`` (the coefficient of variation)."""
    coefficient = scipy.stats.variation(m)
    return coefficient


def median_abs_deviation_signal(m):
    """Return ``scipy.stats.median_abs_deviation`` of ``m`` with scipy's default settings."""
    deviation = scipy.stats.median_abs_deviation(m)
    return deviation

In [None]:
# Registry of feature functions: the key is used as the column name in
# vectorized_X_df, the value is passed to vectorize() as feature_function.
stat_features_dict = {
    "mean": mean_signal,
    "std": std_signal,
    "mean2": mean_square_signal,
    "abs_diffs": abs_diffs_signal,
    "skew": skew_signal,
    "kurtosis": kurtosis_signal,
    "variation": variation_signal,
    "median_abs_deviation": median_abs_deviation_signal,
}

Vectorization. Results added to dataframe. Column_name = func_name

In [None]:
# Number of ICA components and of PCA components per ICA component.
ica = 9
pca = 2
# Vectorize X once per registered feature function and store each result in
# its own column (one list of features per epoch).
# NOTE(review): vectorize() builds a fresh FastICA on every call and no
# random_state is passed, so the columns may come from different ICA
# decompositions - confirm this is acceptable.
for func_name, func in stat_features_dict.items():
    vectorized_X = vectorize(
        X,
        feature_function=func,
        ica_n_components=ica,
        mwt="morl",
        extracted_n_components=pca,
    )
    vectorized_X_df[func_name] = vectorized_X.tolist()
    # the settings are the same for every function; rewritten on each iteration
    vectorized_X_df["ica_n_components"] = ica
    vectorized_X_df["pca_n_components"] = pca

Quick vectorization with only one function

In [None]:
# Same settings as the full loop, but only one feature function is computed.
ica = 9
pca = 2
func_name = "std"
func = stat_features_dict[func_name]

# `vectorized_X` is left in the global scope and reused by the
# "Quick training and prediction" cells.
vectorized_X = vectorize(
    X,
    feature_function=func,
    ica_n_components=ica,
    mwt="morl",
    extracted_n_components=pca,
)

# store (or overwrite) the column for this function together with the settings used
vectorized_X_df[func_name] = vectorized_X.tolist()
vectorized_X_df["ica_n_components"] = ica
vectorized_X_df["pca_n_components"] = pca

In [None]:
# Display the feature table built so far.
vectorized_X_df

## Training and prediction

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR


from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are converted to flat float arrays first, so plain lists are
    accepted and mixed 1-D representations such as shape ``(n, 1)`` versus
    ``(n,)`` are compared element by element instead of being broadcast to
    an ``(n, n)`` matrix.

    Parameters
    ----------
    y_true: array-like
        ground-truth values.
    y_pred: array-like
        predicted values, with the same number of elements as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``. Zeros in ``y_true`` are
        not special-cased: they yield ``inf`` or ``nan`` as numpy division does.

    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

#### Training and prediction with the SVR model for all types of vectorization

In [None]:
# Train and evaluate one SVR per feature function, on the features stored in
# the column of the same name.
# NOTE(review): the split is made over individual epochs, while `y` is the
# questionnaire value attached to every epoch of a participant, so epochs of
# the same participant can presumably end up in both train and test -
# confirm whether a participant-wise split is intended.
for func_name in stat_features_dict:
    vectorized_X = np.array(vectorized_X_df[func_name].to_list())

    X_train, X_test, y_train, y_test = train_test_split(
        vectorized_X, y, test_size=0.2, random_state=42
    )
    print("Vectorization with {} function: ".format(func_name))

    # the scaler is fitted on the training part only
    scaler = StandardScaler().fit(X_train)
    rescaled_X_train = scaler.transform(X_train)
    model = SVR(kernel="rbf", C=2, gamma=0.1, epsilon=0.1)
    model.fit(rescaled_X_train, y_train)

    # transform the validation dataset
    rescaled_X_test = scaler.transform(X_test)
    predictions = model.predict(rescaled_X_test)
    # printed in this order: MAPE, MAE, MSE, R^2
    print(mean_absolute_percentage_error(y_test, predictions))
    print(mean_absolute_error(y_test, predictions))
    print(mean_squared_error(y_test, predictions))
    print(model.score(rescaled_X_test, y_test))

Quick training and prediction

In [None]:
# Split whatever `vectorized_X` was assigned last (by the quick vectorization
# cell or by the last iteration of a loop above) into 80% train / 20% test.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_X, y, test_size=0.2, random_state=42
)

In [None]:
# Single SVR run on the current split; the scaler is fitted on the training part only.
scaler = StandardScaler().fit(X_train)
rescaled_X_train = scaler.transform(X_train)
model = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1)
model.fit(rescaled_X_train, y_train)

# transform the validation dataset
rescaled_X_test = scaler.transform(X_test)
predictions = model.predict(rescaled_X_test)
# printed in this order: MAPE, MAE, MSE, R^2
print(mean_absolute_percentage_error(y_test, predictions))
print(mean_absolute_error(y_test, predictions))
print(mean_squared_error(y_test, predictions))
print(model.score(rescaled_X_test, y_test))

### Regressions grid search

In [None]:
# Pipeline steps placed in front of every regressor in rate_regression().
basic_steps = [("scaler", StandardScaler())]


def rate_regression(
    X_train, y_train, X_test, y_test, regressor, regressor_params, cv=2
):
    """Grid-search one regressor behind the shared scaler and report test metrics.

    Parameters
    ----------
    X_train, y_train:
        data the grid search is fitted on.
    X_test, y_test:
        held-out data used for the printed metrics.
    regressor: tuple
        ``(name, estimator)`` pipeline step appended after ``basic_steps``.
    regressor_params: dict
        parameter grid for the pipeline (keys like ``"<name>__<param>"``).
    cv: int
        passed to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    pandas.DataFrame
        the ``cv_results_`` of the grid search.

    """
    search = GridSearchCV(
        Pipeline(steps=[*basic_steps, regressor]),
        regressor_params,
        cv=cv,
        scoring={"r2": "r2", "mae": "neg_mean_absolute_error"},
        # the final model is refitted with the parameters that maximise r2
        refit="r2",
        n_jobs=3,
    )
    search.fit(X_train, y_train)

    # metrics of the refitted model on the held-out data
    predictions = search.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = search.score(X_test, y_test)
    print(f"Best result: MAPE {mape} MAE {mae} MSE {mse} R^2 {r2}")

    return pd.DataFrame(search.cv_results_)

#### SVR

In [None]:
# RBF-kernel SVR; only C is searched (1, 2, 3, 4), gamma and epsilon are fixed.
svr = ("svr", SVR(kernel="rbf"))
svr_params = dict(
    svr__C=np.arange(1, 5, 1),
    svr__gamma=[0.1],
    svr__epsilon=[0.1],
)
svr_grid_search_df = rate_regression(X_train, y_train, X_test, y_test, svr, svr_params)

#### KNN

In [None]:
# k-nearest-neighbours regressor; n_neighbors is searched over 5, 10, ..., 95.
knn = ("knn", KNeighborsRegressor())
knn_params = dict(
    knn__n_neighbors=np.arange(5, 100, 5),
)
knn_grid_search_df = rate_regression(X_train, y_train, X_test, y_test, knn, knn_params)

#### GBR

In [None]:
# Gradient boosting regressor; n_estimators is searched over 1, 6, ..., 96.
gbr = ("gbr", GradientBoostingRegressor())
gbr_params = dict(
    gbr__n_estimators=np.arange(1, 100, 5),
)
gbr_grid_search_df = rate_regression(X_train, y_train, X_test, y_test, gbr, gbr_params)

Dummy regressor for baseline:

In [None]:
# Baseline: always predict the mean of the training targets.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)

y_pred = dummy_regr.predict(X_test)
# printed in this order: MAPE, MAE, MSE, R^2
print(mean_absolute_percentage_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
# Score the baseline itself. This line used to call `model.score`, which
# reported the previously trained SVR (on unscaled inputs), not the baseline.
print(dummy_regr.score(X_test, y_test))