# Rumination prediction

### Imports

In [None]:
%load_ext lab_black
import os
import pickle
from time import time
import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.decomposition import FastICA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from utils import *

### Loading data

Load the EEG data together with the data from the rumination questionnaire. By default, `create_df_data` loads only the rumination information, but you can request other columns by passing a list of the desired labels from the CSV file.

In [None]:
# Name of the cached DataFrame and the locations of the cache and the
# questionnaire CSV.
df_name = "go_nogo_df"
pickled_data_filename = "../data/" + df_name + ".pkl"
info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# Build the DataFrame from the raw data only when no cached pickle exists.
if not os.path.isfile(pickled_data_filename):
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(info_filename=info_filename)
    epochs_df.name = df_name
    # Cache the freshly built DataFrame so later runs can skip the loading
    # step; the path is rebuilt from the name that was just assigned.
    epochs_df.to_pickle("../data/" + epochs_df.name + ".pkl")
    print("Done. Pickle file created")
else:
    print("Pickled file found. Loading pickled data...")
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")

The data is now loaded into a DataFrame in which each epoch is a single record.

In [None]:
# Render the loaded DataFrame to inspect its rows and columns.
display(epochs_df)

Sort participants by their number of error responses, in descending order. This way the best participants (those with the most error epochs) come first.

In [None]:
# add new columns with info about error/correct responses amount
# Per-participant response counts, broadcast back onto every epoch row.
grouped_df = epochs_df.groupby("id")


def _count_marker_per_participant(marker_value):
    """Count, within each participant's group, the markers equal to marker_value.

    The count is returned through ``transform``, so it is repeated on every
    row that belongs to the participant.
    """
    return grouped_df[["marker"]].transform(
        lambda markers: (markers.values == marker_value).sum()
    )


epochs_df["error_sum"] = _count_marker_per_participant(ERROR)
epochs_df["correct_sum"] = _count_marker_per_participant(CORRECT)

# Highest error count first. mergesort is requested because it is a stable
# sort: rows with equal counts keep their previous relative order.
epochs_df = epochs_df.sort_values(by="error_sum", kind="mergesort", ascending=False)

In [None]:
# Show the DataFrame again, now with the count columns and the new row order.
display(epochs_df)

In [None]:
# Peek at a single stored epoch.
# NOTE(review): with an integer index, [0] on a Series is a label lookup, so
# after the sort above this is the row labelled 0, not necessarily the first
# row of the sorted frame — confirm which one was intended.
epochs_df["epoch"][0]

In [None]:
# Toy array for checking the reshaping done around ICA. Using the names
# introduced further down: 3 epochs x 4 channels x 3 timepoints, where every
# (epoch, channel) row carries its own constant value so it can be tracked.
a = [
    [[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
    [[5, 5, 5], [6, 6, 6], [7, 7, 7], [8, 8, 8]],
    [[9, 9, 9], [10, 10, 10], [11, 11, 11], [12, 12, 12]],
]
a = np.array(a)

In [None]:
a.shape

In [None]:
# Joining the three (4, 3) epochs along axis 1 places them side by side,
# giving one row per channel: shape (4, 9).
b = np.concatenate(a, axis=1)
b.shape

In [None]:
# Transposed to (9, 4): one row per timepoint sample, one column per channel.
b = b.T
b.shape

In [None]:
b

In [None]:
# Transpose back to (4, 9), i.e. channel-major again.
x = b.T

In [None]:
x

In [None]:
# Split the 9-long sample axis back into (epoch, timepoint): shape (4, 3, 3).
channel_num = 4
epoch_num = 3
times_len = 3
c = x.reshape(channel_num, epoch_num, times_len)

In [None]:
c.shape

In [None]:
# Stacking the per-channel (3, 3) arrays along axis 1 yields shape (3, 4, 3),
# which reproduces the original layout of `a`.
d = np.stack(c, axis=1)
d

### Vectorization

In [None]:
def vectorize_2(
    X,
    mwt="mexh",
    cwt_density=2,
    ica_n_components=3,
    wv_weighting="PCA",
    wv_weighting_n_components=3,
):
    """Fit ICA on the epochs and print the shape of every intermediate array.

    Exploratory helper: only ``X`` and ``ica_n_components`` are used. The
    other parameters are accepted but ignored, and nothing is returned.

    Parameters
    ----------
    X : ndarray
        Epoch data; the reshape below treats axis 0 as epochs and the last
        axis as timepoints, with channels in between.
    ica_n_components : int
        Number of independent components FastICA extracts.
    """
    print("X shape: {}".format(X.shape))

    # FastICA expects (n_samples, n_features): joining the epochs along the
    # time axis and transposing gives (epochs * timepoints, channels).
    samples_by_channel = np.concatenate(X, axis=1).T
    unmixer = FastICA(n_components=ica_n_components)
    sources = unmixer.fit_transform(samples_by_channel)
    print("X_ica transformed shape: {}".format(sources.shape))

    # Put the component axis first and split the sample axis back into
    # (epoch, timepoint), giving (component, epoch, timepoint).
    n_epochs, n_timepoints = X.shape[0], X.shape[-1]
    sources_per_component = sources.T.reshape(
        ica_n_components, n_epochs, n_timepoints
    )

    for component_epochs in sources_per_component:
        print("Data in one channel shape: {}".format(component_epochs.shape))


#         X_cwts = np.array([cwt(epoch, mwt, cwt_density) for epoch in data])

- ICA reduces the channels from 64 to the given number of independent components
- The Continuous Wavelet Transform decomposes each epoch's signal in a channel into a set of wavelet functions
- PCA reduces the dimension of the features (wavelets) to the best computed ones

Each epoch is vectorized as (ica_components*pca_components)

In [None]:
from sklearn.decomposition import PCA


def vectorize(
    X,
    mwt="mexh",
    cwt_density=2,
    ica_n_components=3,
    wv_weighting="PCA",
    wv_weighting_n_components=3,
    y=None,
):
    """Turn epochs into feature vectors via spatial ICA, CWT and wavelet weighting.

    For every spatial ICA component the epochs are projected onto that
    component, decomposed with ``cwt`` and then reduced to
    ``wv_weighting_n_components`` values per epoch by projecting the CWT
    coefficients onto weights learned with PCA, ICA or LDA.

    Parameters
    ----------
    X : ndarray
        Epoch data of shape EPOCH x CHANNEL x TIMEPOINT.
    mwt, cwt_density
        Forwarded unchanged to ``cwt`` (not defined in this cell; presumably
        provided by ``from utils import *`` — TODO confirm).
    ica_n_components : int
        Number of spatial ICA components (independent channels).
    wv_weighting : {"PCA", "ICA", "LDA"}
        Method used to learn the wavelet weights.
    wv_weighting_n_components : int
        Number of wavelet components kept per independent channel.
    y : array-like, optional
        Labels, one per epoch. Required only when ``wv_weighting == "LDA"``.

    Returns
    -------
    X_features : ndarray
        Shape EPOCH x ICA_COMP x WAVELET_COMP.
    params : list of tuple
        One ``(spatial_filter, wv_weights)`` pair per ICA component.

    Raises
    ------
    ValueError
        If ``wv_weighting`` is not a supported method, or if it is "LDA" and
        ``y`` was not supplied.
    """
    print(X.shape)

    # Validate up front so a bad option fails before the expensive ICA/CWT work.
    if wv_weighting not in ("PCA", "ICA", "LDA"):
        raise ValueError(
            "wrong wv_weighting argument: {!r}; expected 'PCA', 'ICA' or 'LDA'".format(
                wv_weighting
            )
        )
    if wv_weighting == "LDA" and y is None:
        raise ValueError("wv_weighting='LDA' requires the labels y")

    # Spatial ICA reduces the channels to ica_n_components signals.
    # FastICA expects (n_samples, n_features), hence the transpose of the
    # CHANNEL x (EPOCH * TIMEPOINT) array.
    timepoints_per_channel = np.concatenate(X, axis=1)
    spatial_ica = FastICA(n_components=ica_n_components)
    spatial_ica.fit(timepoints_per_channel.T)

    params = []
    X_features = []

    for spatial_filter in spatial_ica.components_:
        # Apply the spatial filter to build one independent channel of shape
        # EPOCH x TIMEPOINT.
        X_filtered = np.tensordot(X, spatial_filter, axes=([1], [0]))
        print("Filtered shape: {}".format(X_filtered.shape))

        # CWT decomposes each epoch into transformations of the mother
        # wavelet; the result has shape EPOCH x FREQUENCY x TIMEPOINT.
        X_cwts = np.array([cwt(epoch, mwt, cwt_density) for epoch in X_filtered])

        # The estimators need a 2-D (n_samples, n_features) array, so the
        # last two dimensions are flattened.
        X_flattened = X_cwts.reshape(X_cwts.shape[0], -1)

        # Learn which wavelets (factors) are the most significant.
        if wv_weighting == "PCA":
            pca = PCA(n_components=wv_weighting_n_components)
            pca.fit(X_flattened)
            wv_weights = pca.components_
        elif wv_weighting == "ICA":
            # Named separately so it does not shadow the spatial ICA model.
            wavelet_ica = FastICA(n_components=wv_weighting_n_components, tol=0.001)
            wavelet_ica.fit(X_flattened)
            wv_weights = wavelet_ica.components_
        else:  # "LDA", guaranteed by the validation above
            lda = LinearDiscriminantAnalysis(n_components=wv_weighting_n_components)
            lda.fit(X_flattened, y)
            # scikit-learn projects with ``X @ scalings_``, so each direction
            # is a column; transpose to one direction per row (like
            # components_) and keep the requested number of directions.
            wv_weights = lda.scalings_.T[:wv_weighting_n_components]

        # Unflatten the weights to WAVELET_COMPONENT x FREQUENCY x TIMEPOINT.
        cwt_shape = X_cwts.shape[1:]
        wv_weights = wv_weights.reshape(wv_weighting_n_components, *cwt_shape)

        # Contract the FREQUENCY and TIMEPOINT axes:
        # (EPOCH x F x T) . (WAVELET_COMPONENT x F x T) -> EPOCH x WAVELET_COMPONENT
        one_channel_X_features = np.tensordot(X_cwts, wv_weights, axes=([1, 2], [1, 2]))

        params.append((spatial_filter, wv_weights))
        X_features.append(one_channel_X_features)

    # (ICA_COMP x EPOCH x WAVELET_COMP) -> (EPOCH x ICA_COMP x WAVELET_COMP)
    X_features = np.array(X_features).transpose((1, 0, 2))

    return X_features, params

In [None]:
def train(
    X,
    y,
    mwt="mexh",
    cwt_density=2,
    ica_n_components=3,
    wv_weighting="PCA",
    wv_weighting_n_components=3,
):
    """Vectorize the epochs and return the vectorization parameters.

    Parameters
    ----------
    X : ndarray
        Epoch data of shape EPOCHS x CHANNELS x TIMEPOINTS.
    y : ndarray
        One label per epoch (shape EPOCHS). Accepted but not used yet.
    mwt, cwt_density, ica_n_components, wv_weighting, wv_weighting_n_components
        Hyperparameters handed straight to ``vectorize``.

    Returns
    -------
    params
        The parameter list produced by ``vectorize``.
    clf
        Always ``None`` for now: no model is created yet.
    """
    epoch_features, params = vectorize(
        X,
        mwt=mwt,
        cwt_density=cwt_density,
        ica_n_components=ica_n_components,
        wv_weighting=wv_weighting,
        wv_weighting_n_components=wv_weighting_n_components,
    )

    # One flat row per epoch: EPOCH x (ICA_COMP * WAVELET_COMP).
    epoch_features = epoch_features.reshape(len(epoch_features), -1)

    # Placeholder until a model is actually built on the features.
    clf = None

    return params, clf

### Prediction

In [None]:
# Placeholder: no prediction code has been written for this section yet.
pass

### Benchmark

In [None]:
def benchmark(epochs_df, **hyperparams):
    """Train on the error-response epochs of ``epochs_df``.

    The epochs whose marker equals ``ERROR`` are split 80/20 into train and
    test parts with a fixed seed, and ``train`` is run on the training part.
    The test part is produced but not evaluated yet, and nothing is returned.

    Parameters
    ----------
    epochs_df : pandas.DataFrame
        Must provide the "marker", "epoch" and "Rumination Full Scale" columns.
    **hyperparams
        Forwarded unchanged to ``train``.
    """
    # train_test_split is not among the notebook's visible top-level imports
    # (only StratifiedKFold is taken from sklearn.model_selection), so it is
    # imported here to keep the function from failing with a NameError.
    from sklearn.model_selection import train_test_split

    # Select the error-response rows once and reuse them for both X and y.
    error_epochs = epochs_df[epochs_df["marker"] == ERROR]
    X = np.array(error_epochs["epoch"].to_list())
    # Rumination levels matching the rows of X.
    y = np.array(error_epochs["Rumination Full Scale"].to_list())

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # train
    params, clf = train(X_train, y_train, **hyperparams)
    # Exploratory run that only prints intermediate shapes.
    vectorize_2(X_train)

In [None]:
# Run the benchmark, overriding only the mother-wavelet name; every other
# hyperparameter keeps the default declared by train().
benchmark(epochs_df, mwt="mexh")