# Rumination prediction

### Imports

In [None]:
%load_ext lab_black
import os
import pickle
from time import time
import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.decomposition import FastICA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA


from utils import *

### Loading data

Loading the EEG data and the data from the rumination questionnaire. By default, create_df_data loads only the rumination information, but one can request other columns by passing a list of the desired labels from the CSV file.

In [None]:
df_name = "go_nogo_df"
pickled_data_filename = "../data/" + df_name + ".pkl"
info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# Build the dataframe from the raw files only when no pickled copy exists yet;
# otherwise reuse the pickle written by an earlier run.
if not os.path.isfile(pickled_data_filename):
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(info_filename=info_filename)
    epochs_df.name = df_name
    # cache the freshly built dataframe so later runs can skip the loading step
    epochs_df.to_pickle("../data/" + epochs_df.name + ".pkl")
    print("Done. Pickle file created")
else:
    print("Pickled file found. Loading pickled data...")
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")

The data is now read into a dataframe in which each epoch is a single record.

In [None]:
# Show the loaded epochs dataframe for a quick visual check.
display(epochs_df)

Sorting participants by their number of errors, in descending order. This way the best participants (those with the most error epochs) come first.

In [None]:
# For every participant ("id"), count how many of their epochs carry each
# response marker and write that count onto each of the participant's rows.
grouped_df = epochs_df.groupby("id")
for count_column, marker_value in (("error_sum", ERROR), ("correct_sum", CORRECT)):
    epochs_df[count_column] = grouped_df[["marker"]].transform(
        lambda markers, wanted=marker_value: (markers.values == wanted).sum()
    )

# mergesort is stable, so rows with equal error_sum keep their current order
epochs_df = epochs_df.sort_values(by="error_sum", ascending=False, kind="mergesort")

In [None]:
# Show the dataframe again, now with the error_sum / correct_sum columns and
# ordered by error_sum (highest first).
display(epochs_df)

In [None]:
# Toy array of shape (2, 3, 3) for checking the concatenate / reshape / stack
# round trip used in vectorize_2. Every row holds one repeated value, which
# makes it easy to see where each row ends up.
a = np.array([[[1, 1, 1], [2, 2, 2], [3, 3, 3]], [[4, 4, 4], [5, 5, 5], [6, 6, 6]]])

In [None]:
# The (3, 3) slices along the first axis of `a` are joined side by side, giving
# shape (3, 6): row i holds row i of every slice, one after another.
b = np.concatenate(a, axis=1)
b

In [None]:
# Cut each long row of `b` back into a.shape[0] pieces of length a.shape[-1],
# giving shape (3, 2, 3): c[i] collects row i of every slice of `a`.
c = b.reshape(3, a.shape[0], a.shape[-1])
c

In [None]:
# Shape of one block of `c`; with the toy input this is (2, 3).
c[0].shape

In [None]:
# Stacking the three (2, 3) blocks of `c` along a new axis 1 gives shape
# (2, 3, 3) again; for this toy input `d` equals the original `a`.
d = np.stack(c, axis=1)
d

### Vectorization

- ICA reduces the channels from 64 to a given number of independent components
- The Continuous Wavelet Transform decomposes the signal of each channel in each epoch into a set of wavelet functions
- PCA reduces the dimensionality of the features (wavelets) to the best computed components

In [None]:
def vectorize_2(
    X,
    mwt="mexh",
    cwt_density=2,
    ica_n_components=3,
    wv_weighting="PCA",
    extracted_n_components=3,
    random_state=None,
):
    """Turn raw epochs into per-epoch feature vectors via ICA -> CWT -> PCA.

    Steps:

    1. FastICA, fitted on all epochs concatenated along time, replaces the
       channel axis with ``ica_n_components`` independent components.
    2. Every epoch of every component is passed through
       ``cwt(epoch, mwt, cwt_density)``.
    3. For each component separately, the flattened CWT output of all epochs
       is reduced with PCA to ``extracted_n_components`` values per epoch.

    Shapes of ``X`` and of the result are printed as a side effect.

    Parameters
    ----------
    X : array-like, shape (n_epochs, n_channels, n_timepoints)
        Epoched signal.
    mwt : str
        Forwarded as the second argument of ``cwt`` (presumably the mother
        wavelet name -- confirm against the ``cwt`` helper).
    cwt_density : int
        Forwarded as the third argument of ``cwt``.
    ica_n_components : int
        Number of FastICA components.
    wv_weighting : str
        Currently not used by this function; kept so existing calls that pass
        it keep working.
    extracted_n_components : int
        Number of PCA components kept per ICA component.
    random_state : int, RandomState instance or None
        Passed to both ``FastICA`` and ``PCA``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for reproducible features.

    Returns
    -------
    numpy.ndarray, shape (n_epochs, ica_n_components, extracted_n_components)

    Raises
    ------
    ValueError
        If ``X`` is not 3-dimensional.
    """
    X = np.asarray(X)
    if X.ndim != 3:
        raise ValueError(
            "X must be a 3-D array shaped (epochs, channels, timepoints); "
            "got shape {}".format(X.shape)
        )
    print("X shape: {}".format(X.shape))
    n_epochs, n_timepoints = X.shape[0], X.shape[-1]

    # ICA expects (n_samples, n_features): put the epochs one after another
    # along time -> (channels, epochs * timepoints), then transpose.
    timepoints_per_channel = np.concatenate(X, axis=1)
    ica = FastICA(n_components=ica_n_components, random_state=random_state)
    X_ica = ica.fit_transform(timepoints_per_channel.T)

    # Recover the (component, epoch, timepoint) structure from the
    # (epochs * timepoints, component) ICA output.
    data_per_channel = X_ica.T.reshape(ica_n_components, n_epochs, n_timepoints)

    vectorized_data = []
    for data in data_per_channel:
        data_cwt = np.array([cwt(epoch, mwt, cwt_density) for epoch in data])

        # PCA expects (n_samples, n_features): one flat row of CWT
        # coefficients per epoch.
        wavelets_per_epoch = data_cwt.reshape(data_cwt.shape[0], -1)

        pca = PCA(n_components=extracted_n_components, random_state=random_state)
        vectorized_data.append(pca.fit_transform(wavelets_per_epoch))

    # Per-component results are (epochs, pca_components); stacking on axis 1
    # puts the epoch axis first.
    vectorized_data = np.stack(vectorized_data, axis=1)
    print("Vectorized X shape: {}".format(vectorized_data.shape))

    return vectorized_data

In [None]:
# Keep only the epochs marked as error responses, then pull out the signals
# (X) and the rumination score (y) from that same selection.
error_epochs_df = epochs_df[epochs_df["marker"] == ERROR]
X = np.array(error_epochs_df["epoch"].to_list())
y = np.array(error_epochs_df["Rumination Full Scale"].to_list())

X_vectorized = vectorize_2(X, extracted_n_components=10)
# flatten everything after the first axis into one feature row per epoch
epochs_per_channel_feature = X_vectorized.reshape(len(X_vectorized), -1)
print(epochs_per_channel_feature.shape)

ica3_pca10_df = pd.DataFrame({"X": epochs_per_channel_feature.tolist(), "y": y})

display(ica3_pca10_df)

### Training

In [None]:
# Imported explicitly in this cell: train_test_split is not among the
# notebook's own imports, so without this line the cell only works if the
# utils star-import happens to provide the name.
from sklearn.model_selection import train_test_split

# Hold out 20% of the epochs for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is made over individual epochs while y looks like a
# per-participant questionnaire score, so epochs of one participant may land in
# both sets -- confirm whether a participant-wise split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    epochs_per_channel_feature, y, test_size=0.2, random_state=42
)

print(X_train.shape)
print(y_train.shape)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# The model fitted here is a random forest with 1000 trees; the name `neigh`
# apparently comes from the k-nearest-neighbours variant left commented out.
# NOTE(review): no random_state is passed to the forest, so scores may vary
# between runs -- confirm whether that is intended.
# neigh = KNeighborsRegressor()
neigh = RandomForestRegressor(n_estimators=1000)
neigh.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error

# Predict the held-out samples and report the mean absolute error against the
# true scores.
y_pred = neigh.predict(X_test)

mean_absolute_error(y_test, y_pred)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computed as ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values; may also be a scalar, which is broadcast.

    Returns
    -------
    numpy.float64
        MAPE in percent. Zeros in ``y_true`` give ``inf`` or ``nan`` (with a
        numpy RuntimeWarning), because the relative error is undefined there.
    """
    # Plain arrays: lists become usable, and pandas inputs are compared by
    # position instead of being aligned on their index.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # A 1-D vector mixed with an (n, 1) column would otherwise broadcast to an
    # (n, n) matrix and silently give a wrong mean, so flatten both first.
    if y_true.ndim != y_pred.ndim and y_true.size == y_pred.size:
        y_true, y_pred = y_true.ravel(), y_pred.ravel()

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
# MAPE (in percent) of the model's predictions on the held-out test split.
mean_absolute_percentage_error(y_test, y_pred)

In [None]:
# Mean of the test targets, as a scale reference for the errors reported above.
y_test.mean()

In [None]:
# Spread of the test targets (numpy's default population standard deviation).
y_test.std()

In [None]:
# Spot check on one held-out sample (index 1): print its feature vector and
# true score, then the model's prediction for it.
test_X, test_y = X_test[1], y_test[1]
print(test_X)
print(test_y)

# the single sample is wrapped in a list so that predict receives a batch of one
print(neigh.predict([test_X]))