### Imports

In [68]:
import os
import pywt
import mne
import numpy as np
import pandas as pd

from mne.time_frequency import psd_welch
from mne.decoding import Scaler
from mne.filter import construct_iir_filter

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.svm import SVC

## Preprocessing

### Loading edf

In [2]:
# Dataset locations. Built with os.path.join so that no backslash is written
# inside an ordinary string literal ("\d" and "\M" are invalid escape
# sequences) and so the paths use the separator of the running OS.
edfs_path = os.path.join("..", "dataverse_files")
manifest_path = os.path.join(edfs_path, "MANIFEST.txt")

In [3]:
# Read every .edf recording found in edfs_path fully into memory
# (preload=True), one mne Raw object per file.
raw_patients_data = []

# os.listdir() returns entries in arbitrary order; sorting keeps the patient
# order stable between runs, and with it the order of the epochs that are
# later shuffled with a fixed random_state.
edfs_file_names = sorted(f for f in os.listdir(edfs_path) if f.endswith('.edf'))

for file_name in edfs_file_names:
    # os.path.join instead of a hard-coded '\\' so the cell is not Windows-only.
    path = os.path.join(edfs_path, file_name)
    raw_data = mne.io.read_raw_edf(path, preload=True, verbose=False)
    raw_patients_data.append(raw_data)

# NOTE(review): not referenced anywhere else in the visible notebook (the list
# used above is `edfs_file_names`); kept so the notebook namespace is unchanged.
edf_file_names = []

### EEG signals filtration

In [4]:
# Filter specifications to compare. Each entry is turned into one IIR design
# by the loop in the next cell:
#   'order'          -> filter order handed to construct_iir_filter
#   'flow' + 'fhigh' -> band-pass edges (Hz)
#   'fcut'           -> low-pass cutoff (Hz); optional 'fstop' is passed as f_stop
iir_filter_dataset = [
    # Band-pass 0.5-50 Hz
    {'order': 2, 'flow': 0.5, 'fhigh': 50},
    {'order': 5, 'flow': 0.5, 'fhigh': 50},
    {'order': 6, 'flow': 0.5, 'fhigh': 50},
    # Band-pass 2-45 Hz
    {'order': 2, 'flow': 2, 'fhigh': 45},
    {'order': 5, 'flow': 2, 'fhigh': 45},
    {'order': 6, 'flow': 2, 'fhigh': 45},
    # Band-pass 0.5-45 Hz
    {'order': 5, 'flow': 0.5, 'fhigh': 45},
    {'order': 6, 'flow': 0.5, 'fhigh': 45},
    # Low-pass variants
    # NOTE(review): the BadCoefficients warning in the cell outputs is
    # presumably caused by this very high order - confirm.
    {'order': 81, 'fcut': 40, 'fstop': 45},
    {'order': 5, 'fcut': 50},
    {'order': 6, 'fcut': 50},
]

In [5]:
# Build one Butterworth (second-order-sections) IIR design per entry of
# iir_filter_dataset; `iir[k]` corresponds to `iir_filter_dataset[k]`.

# Sampling frequency handed to construct_iir_filter.
# NOTE(review): presumably the sampling rate of the recordings - confirm
# against raw.info['sfreq'].
SAMPLING_FREQ_HZ = 250

iir = []

for iir_data in iir_filter_dataset:
    if 'fcut' in iir_data:
        # A low-pass spec; as before, 'fcut' wins if band edges are also present.
        btype = 'lowpass'
        f_pass = iir_data['fcut']
        f_stop = iir_data.get('fstop')  # None when no stop frequency is given
        # NOTE(review): MNE appears to consult f_stop only when 'order' is
        # absent from iir_params - confirm whether 'fstop' has any effect here.
    elif 'flow' in iir_data and 'fhigh' in iir_data:
        btype = 'bandpass'
        f_pass = [iir_data['flow'], iir_data['fhigh']]
        f_stop = None
    else:
        # Previously such an entry silently reused f_pass from the previous
        # iteration (or raised NameError on the first one).
        raise ValueError(
            f"Filter spec needs 'fcut' or both 'flow' and 'fhigh': {iir_data}")

    iir_params = dict(order=iir_data['order'], ftype='butter', output='sos')
    iir.append(construct_iir_filter(iir_params, f_pass, f_stop, SAMPLING_FREQ_HZ, btype))


IIR filter parameters
---------------------
Butterworth bandpass zero-phase (two-pass forward and reverse) non-causal filter:
- Filter order 8 (effective, after forward-backward)
- Cutoffs at 0.50, 50.00 Hz: -6.02, -6.02 dB


IIR filter parameters
---------------------
Butterworth bandpass zero-phase (two-pass forward and reverse) non-causal filter:
- Filter order 20 (effective, after forward-backward)
- Cutoffs at 0.50, 50.00 Hz: -6.02, -6.02 dB


IIR filter parameters
---------------------
Butterworth bandpass zero-phase (two-pass forward and reverse) non-causal filter:
- Filter order 24 (effective, after forward-backward)
- Cutoffs at 0.50, 50.00 Hz: -6.02, -6.02 dB


IIR filter parameters
---------------------
Butterworth bandpass zero-phase (two-pass forward and reverse) non-causal filter:
- Filter order 8 (effective, after forward-backward)
- Cutoffs at 2.00, 45.00 Hz: -6.02, -6.02 dB


IIR filter parameters
---------------------
Butterworth bandpass zero-phase (two-pass forward

C:\Users\Bartosz Ziomek\anaconda3\envs\thesis\lib\site-packages\scipy\signal\filter_design.py:1631: BadCoefficients: Badly conditioned filter coefficients (numerator): the results may be meaningless


In [6]:
# filtered_patients_data = [raw_patient_data.copy()
#                           .filter(l_freq=None, h_freq=None, picks='eeg', method='iir', iir_params=iir[2], n_jobs=-1, verbose=False) 
#                           for raw_patient_data in raw_patients_data]

### Scaling EEG signals with Scaler from MNE

In [7]:
# def mne_std_scaler(edf):
#     scaler = Scaler(scalings='mean')
#     return [scaler.fit_transform(patient_data.get_data()) for patient_data in edf]

In [8]:
# def mne_robust_scaler(edf):
#     scaler = Scaler(scalings='median')
#     return [scaler.fit_transform(patient_data.get_data()) for patient_data in edf]

In [9]:
# mne_std_scaled_patient_data = mne_robust_scaler(filtered_patients_data)

In [10]:
# mne_std_scaled_patient_data[0]

### Filtered EEG signals segmentation

In [11]:
def get_label(edf):
    """Return the class label encoded in a recording's file name.

    Parameters
    ----------
    edf : object with a ``filenames`` sequence
        Only ``edf.filenames[0]`` is read; it may be a string or a path object.

    Returns
    -------
    int
        1 when the file name (case-insensitively) starts with ``'s'``
        (sick), otherwise 0 (healthy).
    """
    # ntpath.basename splits on both '\\' and '/', so the label no longer
    # depends on which separator was used to build the path.
    import ntpath

    patient_edf_file_name = ntpath.basename(str(edf.filenames[0]))
    return int(patient_edf_file_name.lower().startswith('s'))  # 1 - is sick, 0 is healthy

In [12]:
def print_info(epochs_num_per_patient, labels):
    """Print the epoch counts and the class balance of a label list.

    Parameters
    ----------
    epochs_num_per_patient : object
        Printed as-is.
    labels : sequence of int
        0/1 labels; the sum is taken as the number of positive (1) entries.

    Returns
    -------
    None
    """
    print('\nEpochs number per patient: ', epochs_num_per_patient)

    # The labels are 0/1, so their sum counts class 1 (the two counts were
    # previously assigned to the opposite names).
    class_1_num = sum(labels)
    class_0_num = len(labels) - class_1_num

    print('\nnegative: ', class_0_num)
    print('positive: ', class_1_num)

In [13]:
def transform_patients_data_into_X_y_sets(patients_data, info=True):
    """Cut every recording into fixed 25 s epochs and label each epoch.

    NOTE: a later cell defines a function with the same name; once that cell
    has run, it replaces this definition.

    Parameters
    ----------
    patients_data : iterable
        Recordings accepted by ``mne.make_fixed_length_epochs``; each one is
        also passed to ``get_label``.
    info : bool
        When true, ``print_info`` is called with the per-patient epoch counts
        and the labels.

    Returns
    -------
    (epochs, labels) : tuple
        The concatenated epochs of all recordings and a list holding one
        label per epoch, in the same order.
    """
    epochs_per_patient = []
    epochs_num_per_patient = []
    labels = []

    # Iterate over the argument: the loop used to read the notebook-level
    # `raw_patients_data`, so whatever the caller passed in was ignored.
    for edf in patients_data:
        epochs = mne.make_fixed_length_epochs(edf, duration=25, preload=True, verbose=False)

        epochs_per_patient.append(epochs)
        epochs_num_per_patient.append(len(epochs))

        # Every epoch of a recording shares that recording's label.
        labels.extend([get_label(edf)] * len(epochs))

    epochs = mne.concatenate_epochs(epochs_per_patient)

    if info:
        print_info(epochs_num_per_patient, labels)

    return (epochs, labels) # (X, y)

In [14]:
def transform_patients_data_into_X_y_sets(patients_data, info=False):
    """Cut every recording into fixed 25 s epochs and label each epoch.

    Parameters
    ----------
    patients_data : iterable
        Recordings accepted by ``mne.make_fixed_length_epochs``; each one is
        also passed to ``get_label``.
    info : bool
        When true, ``print_info`` is called with the per-patient epoch counts
        and the labels.

    Returns
    -------
    (epochs, labels) : tuple
        The concatenated epochs of all recordings and a list holding one
        label per epoch, in the same order.
    """
    epochs_per_patient = []
    epochs_num_per_patient = []
    labels = []

    for edf in patients_data:
        epochs = mne.make_fixed_length_epochs(edf, duration=25, preload=True, verbose=False)

        epochs_per_patient.append(epochs)
        epochs_num_per_patient.append(len(epochs))

        # Every epoch of a recording shares that recording's label.
        labels.extend([get_label(edf)] * len(epochs))

    epochs = mne.concatenate_epochs(epochs_per_patient)

    if info:
        # print_info's first parameter is the per-patient list of epoch
        # counts; the single total `len(epochs)` used to be passed instead.
        print_info(epochs_num_per_patient, labels)

    return (epochs, labels) # (X, y)

In [15]:
# X, y = transform_patients_data_into_X_y_sets(filtered_patients_data)

### Feature extraction

In [16]:
def eeg_power_band(epochs):
    """Relative band-power features in a scikit-learn friendly layout.

    The Welch PSD of the EEG channels is computed between 0.5 and 30 Hz,
    normalised so that every spectrum sums to one, and then averaged inside
    five frequency bands (delta, theta, alpha, sigma, beta).

    Parameters
    ----------
    epochs : Epochs
        The data.

    Returns
    -------
    X : numpy array of shape [n_epochs, 5 * n_channels]
        One block of per-channel values for each band, the blocks laid side
        by side in band order.
    """
    # NOTE(review): psd_welch is reportedly deprecated in newer MNE releases
    # in favour of Epochs.compute_psd - confirm against the installed version.
    band_edges = (
        (0.5, 4.5),    # delta
        (4.5, 8.5),    # theta
        (8.5, 11.5),   # alpha
        (11.5, 15.5),  # sigma
        (15.5, 30),    # beta
    )

    psds, freqs = psd_welch(epochs, picks='eeg', fmin=0.5, fmax=30.)

    # Turn absolute power into relative power along the frequency axis.
    psds /= psds.sum(axis=-1, keepdims=True)

    n_epochs = len(psds)
    band_features = [
        # Lower edge inclusive, upper edge exclusive.
        psds[:, :, (freqs >= low) & (freqs < high)].mean(axis=-1).reshape(n_epochs, -1)
        for low, high in band_edges
    ]

    return np.concatenate(band_features, axis=1)

In [17]:
# features = eeg_power_band(X)

In [18]:
# features.shape

In [19]:
# len(features[0])

### Classification

##### Without scaling

In [20]:
def class_test(features, y, clr, test_size=0.33, random_state=41):
    """Fit a classifier on a shuffled hold-out split and return its accuracy.

    Parameters
    ----------
    features : array-like
        One row of features per sample.
    y : array-like
        One label per row of ``features``.
    clr : estimator
        Classifier; it is wrapped in a single-step pipeline and fitted.
    test_size : float
        Share of the samples held out for testing (default 0.33, the value
        that used to be hard-coded).
    random_state : int
        Seed of the shuffle (default 41, the value that used to be hard-coded).

    Returns
    -------
    float
        Accuracy of the fitted classifier on the held-out samples.
    """
    # NOTE(review): rows are split individually, so when several rows come
    # from the same patient (as in the evaluation loop of this notebook) that
    # patient can appear in both the train and the test set - consider a
    # group-wise split.
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=test_size, shuffle=True, random_state=random_state)

    pipe = make_pipeline(
        clr
    )

    # Train
    pipe.fit(X_train, y_train)

    # Test and assess the results
    y_pred = pipe.predict(X_test)
    return accuracy_score(y_test, y_pred)

##### With MinMaxScaler

In [21]:
# X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.33, shuffle=True, random_state=41)

# pipe = make_pipeline(
#     MinMaxScaler(copy=False),
#     RandomForestClassifier(n_estimators=100, random_state=42),
# )

# # Train
# pipe.fit(X_train, y_train)

# # Test
# y_pred = pipe.predict(X_test)

# # Assess the results
# acc = accuracy_score(y_test, y_pred)

# print("Accuracy score: {}".format(acc))

##### With StandardScaler

In [22]:
# X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.33, shuffle=True, random_state=41)

# pipe = make_pipeline(
#     StandardScaler(copy=False),
#     RandomForestClassifier(n_estimators=100, random_state=42),
# )

# # Train
# pipe.fit(X_train, y_train)

# # Test
# y_pred = pipe.predict(X_test)

# # Assess the results
# acc = accuracy_score(y_test, y_pred)

# print("Accuracy score: {}".format(acc))

##### With RobustScaler

In [23]:
# X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.33, shuffle=True, random_state=41)

# pipe = make_pipeline(
#     RobustScaler(copy=False),
#     RandomForestClassifier(n_estimators=100, random_state=42),
# )

# # Train
# pipe.fit(X_train, y_train)

# # Test
# y_pred = pipe.predict(X_test)

# # Assess the results
# acc = accuracy_score(y_test, y_pred)

# print("Accuracy score: {}".format(acc))

### Cross-Validated Classification

In [24]:
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# scores = cross_val_score(clf, features, y, cv=3)

# print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

### Test

In [25]:
# Classifiers compared in the evaluation loop, in the order of the
# result<N>.txt files written there.
knn_clf = KNeighborsClassifier(n_neighbors=2)
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svc_clf = SVC(kernel='poly')

clr_arr = [knn_clf, forest_clf, svc_clf]

# One list of per-filter accuracies is appended here for every classifier.
res_acc_arr = []

In [26]:
# Filtering, epoching and feature extraction depend only on the filter, not
# on the classifier, so that expensive work is done once per filter here
# instead of once per (classifier, filter) pair.
features_per_filter = []

for iir_params in iir:
    filtered_patients_data = [
        raw_patient_data.copy().filter(l_freq=None, h_freq=None, picks='eeg', method='iir',
                                       iir_params=iir_params, n_jobs=-1, verbose=False)
        for raw_patient_data in raw_patients_data
    ]

    X, y = transform_patients_data_into_X_y_sets(filtered_patients_data)
    features_per_filter.append((eeg_power_band(X), y))

    # Drop the references to the filtered copies and epochs before the next
    # filter is applied.
    filtered_patients_data = []
    X = []

# One result<N>.txt per classifier: a line per filter, then the best filter.
for index1, clr in enumerate(clr_arr):
    with open(f'result{index1 + 1}.txt', 'w', encoding = 'utf-8') as f:
        acc_arr = []

        for index, (features, y) in enumerate(features_per_filter):
            acc = class_test(features, y, clr)
            acc_arr.append(acc)
            f.write(f"{iir_filter_dataset[index]} Accuracy score: {acc}\n")

        max_acc = max(acc_arr)
        f.write(f"{iir_filter_dataset[acc_arr.index(max_acc)]} -> {max_acc}\n")
    res_acc_arr.append(acc_arr)

Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline co

C:\Users\Bartosz Ziomek\anaconda3\envs\thesis\lib\site-packages\scipy\signal\filter_design.py:1631: BadCoefficients: Badly conditioned filter coefficients (numerator): the results may be meaningless


Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline co

C:\Users\Bartosz Ziomek\anaconda3\envs\thesis\lib\site-packages\scipy\signal\filter_design.py:1631: BadCoefficients: Badly conditioned filter coefficients (numerator): the results may be meaningless


Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline co

C:\Users\Bartosz Ziomek\anaconda3\envs\thesis\lib\site-packages\scipy\signal\filter_design.py:1631: BadCoefficients: Badly conditioned filter coefficients (numerator): the results may be meaningless


Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)
Not setting metadata
1142 matching events found
No baseline correction applied
0 bad epochs dropped
Effective window size : 1.024 (s)


In [27]:
# Mean accuracy of every filter across the classifiers, and the best mean.
avg_acc_per_filter = np.average(np.array(res_acc_arr).T, axis = 1)
avg_acc_per_filter, max(avg_acc_per_filter)

(array([0.9372237 , 0.94076039, 0.93633952, 0.93987622, 0.93899204,
        0.93810787, 0.93987622, 0.93899204, 0.94429708, 0.94341291,
        0.94341291]),
 0.9442970822281168)

In [69]:
# Same filter specifications as above, but every entry now has exactly three
# keys (low-pass rows get an explicit 'fstop': None) so that each spec maps
# onto the three parameter columns of the results table.
iir_filter_dataset = [
    {'order': 2, 'flow': 0.5, 'fhigh': 50},
    {'order': 5, 'flow': 0.5, 'fhigh': 50},
    {'order': 6, 'flow': 0.5, 'fhigh': 50},
    {'order': 2, 'flow': 2, 'fhigh': 45},
    {'order': 5, 'flow': 2, 'fhigh': 45},
    {'order': 6, 'flow': 2, 'fhigh': 45},
    {'order': 5, 'flow': 0.5, 'fhigh': 45},
    {'order': 6, 'flow': 0.5, 'fhigh': 45},
    {'order': 81, 'fcut': 40, 'fstop': 45},
    {'order': 5, 'fcut': 50, 'fstop': None},
    {'order': 6, 'fcut': 50, 'fstop': None},
]

# Mean accuracy of every filter across the classifiers, as a plain list.
acc_per_filter = np.array(res_acc_arr).T
conv = np.average(acc_per_filter, axis = 1).tolist()

# One table row per filter: the spec's values in key order.
test = [[*spec.values()] for spec in iir_filter_dataset]

In [70]:
# Attach each filter's mean accuracy as the last value of its table row.
# The row is rebuilt from its leading filter parameters (as many values as
# the filter spec has keys) instead of being appended to, so running this
# cell more than once does not add a second accuracy column.
for index, item in enumerate(conv):
    n_filter_params = len(iir_filter_dataset[index])
    test[index] = test[index][:n_filter_params] + [item]

In [76]:
# Results table: the three filter parameters plus the mean accuracy.
result_columns = ['Order', 'Flow/Fcut', 'Fhigh/Fstop', 'Avg. acc']
df = pd.DataFrame(data=test, columns=result_columns)

In [77]:
# Show the filters ordered from lowest to highest mean accuracy.
df.sort_values(by='Avg. acc', ascending=True)

Unnamed: 0,Order,Flow/Fcut,Fhigh/Fstop,Avg. acc
2,6,0.5,50.0,0.93634
0,2,0.5,50.0,0.937224
5,6,2.0,45.0,0.938108
4,5,2.0,45.0,0.938992
7,6,0.5,45.0,0.938992
3,2,2.0,45.0,0.939876
6,5,0.5,45.0,0.939876
1,5,0.5,50.0,0.94076
9,5,50.0,,0.943413
10,6,50.0,,0.943413
