### Librarby and module imports

In [69]:
import re
import os

import pyedflib
import numpy as np
import pandas as pd

from scipy.signal import resample
from utils_pipeline import Pipeline
from preprocessing_library import FFT, Slice, Magnitude, Log10

### Construction a dictionary with labels

In [72]:
class Preprocess:
    def __init__(self):
        self.result = {}

    def labeling(self, intervals):
        """
        :param intervals: list of intervals where intervals[i][0] - start intervals[i][1] - end intervals[i][2] - label
        :return: labels - dictionary, keys - labels, values - intervals
        """
        labels = {}

        for interval in intervals:
            if interval[-1] not in labels:
                labels[interval[-1]] = [interval[0]]
            else:
                labels[interval[-1]].append(interval[0])

        return labels

    def apply(self):
        """whole directory check"""
        for root, dirs, files in os.walk("/home/eshuranov/projects/eeg_stud/Dataset/TUEV/tuev/edf/train", topdown=True):
            for name in files:
                if name.endswith('.rec'):
                    file_path = os.path.join(root, name)

                    dct = {}
                    """
                    node[0] - chanel
                    node[1] - start
                    node[2] - end
                    node[3] - label
                    """
                    with open(file_path, 'r') as file:
                        for node in file.readlines():
                            node = node[:-1]

                            node = node.split(',')

                            if int(node[0]) not in dct:
                                dct[int(node[0])] = [[[float(node[1]), float(node[2])], int(node[3])]]
                            else:
                                dct[int(node[0])].append([[float(node[1]), float(node[2])], int(node[3])])

                    for key in dct:
                        dct[key] = self.labeling(dct[key])
                    self.result[name[:-4]] = dct
                    
                
        return self.result


In [73]:
labels = Preprocess().apply()

In [74]:
labels.keys()

dict_keys(['aaaaaehl_00000001', 'aaaaaaoy_00000001', 'aaaaafit_00000001', 'aaaaaclp_00000002', 'aaaaabyj_00000007', 'aaaaafcu_00000001', 'aaaaaayn_00000001', 'aaaaadxl_00000002', 'aaaaabjc_00000005', 'aaaaaaof_00000001', 'aaaaabfu_00000004', 'aaaaadrh_00000002', 'aaaaabsw_00000001', 'aaaaabqd_00000001', 'aaaaadsr_00000007', 'aaaaacxl_00000010', 'aaaaacxx_00000001', 'aaaaacjf_00000001', 'aaaaaamc_00000001', 'aaaaacyp_00000001', 'aaaaafkp_00000001', 'aaaaabtw_00000001', 'aaaaadxw_00000001', 'aaaaabsy_00000001', 'aaaaafgy_00000001', 'aaaaafgm_00000001', 'aaaaaewi_00000003', 'aaaaabxb_00000001', 'aaaaaeio_00000001', 'aaaaaecu_00000001', 'aaaaadgr_00000001', 'aaaaabop_00000004', 'aaaaabop_00000002', 'aaaaabop_00000003', 'aaaaafop_00000001', 'aaaaabhu_00000001', 'aaaaaamd_00000003', 'aaaaadsd_00000001', 'aaaaafmb_00000001', 'aaaaaeel_00000002', 'aaaaaeuh_00000001', 'aaaaaesj_00000001', 'aaaaabuu_00000001', 'aaaaaezm_00000001', 'aaaaabrr_00000001', 'aaaaacnb_00000001', 'aaaaabgg_00000001', 'a

In [75]:
labels

{'aaaaaehl_00000001': {8: {6: [[31.1, 32.1],
    [32.1, 33.1],
    [33.1, 34.1],
    [34.1, 35.1]]},
  15: {1: [[28.7, 29.7]]},
  1: {1: [[28.7, 29.7]]},
  4: {6: [[31.1, 32.1], [32.1, 33.1], [33.1, 34.1], [34.1, 35.1]]},
  5: {6: [[31.1, 32.1], [32.1, 33.1], [33.1, 34.1], [34.1, 35.1]]},
  6: {6: [[31.1, 32.1], [32.1, 33.1], [33.1, 34.1], [34.1, 35.1]]},
  11: {1: [[28.8, 29.8]],
   6: [[31.1, 32.1], [32.1, 33.1], [33.1, 34.1], [34.1, 35.1]]},
  19: {1: [[28.7, 29.7]]},
  9: {1: [[28.8, 29.8]],
   6: [[31.1, 32.1], [32.1, 33.1], [33.1, 34.1], [34.1, 35.1]]},
  18: {1: [[28.7, 29.7]]},
  7: {6: [[31.1, 32.1], [32.1, 33.1], [33.1, 34.1], [34.1, 35.1]]},
  17: {1: [[28.7, 29.7]]},
  12: {6: [[31.1, 32.1], [32.1, 33.1], [33.1, 34.1], [34.1, 35.1]]},
  2: {1: [[28.7, 29.7]]},
  10: {6: [[31.1, 32.1], [32.1, 33.1], [33.1, 34.1], [34.1, 35.1]]},
  16: {1: [[28.7, 29.7]]}},
 'aaaaaaoy_00000001': {14: {5: [[0.0, 1.0],
    [1.0, 2.0],
    [559.5, 560.5],
    [560.5, 561.5],
    [561.5, 562.5],


### Extracting features and applying FFT

In [76]:
parameters = pd.read_csv('Dataset/parameters.csv', index_col=['parameter'])

In [77]:
def extract_signal(f, signal_labels, electrode_name, start, stop):

    tuh_label = [s for s in signal_labels if 'EEG ' + electrode_name + '-' in s]
    if len(tuh_label) > 1:
        print(tuh_label)
        exit('Multiple electrodes found with the same string! Abort')

    channel = signal_labels.index(tuh_label[0])
    signal = np.array(f.readSignal(channel))

    start, stop = float(start), float(stop)
    original_sample_frequency = f.getSampleFrequency(channel)
    original_start_index = int(np.floor(start * float(original_sample_frequency)))
    original_stop_index = int(np.floor(stop * float(original_sample_frequency)))

    seizure_signal = signal[original_start_index:original_stop_index]

    new_sample_frequency = int(parameters.loc['sampling_frequency']['value'])
    new_num_time_points = int(np.floor((stop - start) * new_sample_frequency))
    seizure_signal_resampled = resample(seizure_signal, new_num_time_points)

    return seizure_signal_resampled

def read_edfs_and_extract(edf_path, edf_start, edf_stop):

    f = pyedflib.EdfReader(edf_path)

    montage = str(parameters.loc['montage']['value'])
    montage_list = re.split(';', montage)
    signal_labels = f.getSignalLabels()
    x_data = []

    for i in montage_list:
        electrode_list = re.split('-', i)
        electrode_1 = electrode_list[0]
        extracted_signal_from_electrode_1 = extract_signal(f, signal_labels, electrode_name=electrode_1, start=edf_start, stop=edf_stop)
        electrode_2 = electrode_list[1]
        extracted_signal_from_electrode_2 = extract_signal(f, signal_labels, electrode_name=electrode_2, start=edf_start, stop=edf_stop)
        this_differential_output = extracted_signal_from_electrode_1-extracted_signal_from_electrode_2
        x_data.append(this_differential_output)

    f._close()
    del f

    x_data = np.array(x_data)

    return x_data

In [78]:
def get_label(start, stop, channel, sampling_frequency, file_path):
    """
    params: start - start of the window, stop - end of the window, channel - channel nubmer, 
    sampling frequency - smapling frequency of edf file, file_path - file path to EDF file!!!
    return: labels which belongs to the window
    """
    global labels
    lst = file_path.split('/')
    
    file_name = lst[-1][:-4]
    result = set()

    if channel not in labels[file_name]:
        return [None]
        
    possible_lables = labels[file_name][channel]
    for label in possible_lables:
        intervals = possible_lables[label]
        
        for interval in intervals:
            if interval[0] <= stop - 0.5 and interval[1] >= start - 0.5:
                result.add(label)

    if not result:
        return np.array([None])
    return np.array(result)

In [79]:
def convert_to_fft_with_labels(window_length, window_step, fft_min_freq, fft_max_freq, sampling_frequency, file_path, file_length):
    """
    parmas:
    window_length - lenght of the interrval
    window_step - length of overlap
    sampling_frequency - frequency of the file
    file_path - path to the edf file
    file-length - length of the edf file
    return:
    pandas dataframe, with label, cahnnel number, FFT image and filename
    """
    time_series_data = read_edfs_and_extract(file_path, 0, file_length)
    
    pipeline = Pipeline([FFT(), Slice(fft_min_freq, fft_max_freq), Magnitude(), Log10()])
    
    label_start, label_step, label_stop = 0, window_step, window_length
    start, step = 0, int(np.floor(window_step * sampling_frequency))
    stop = start + int(np.floor(window_length * sampling_frequency))

    lst = file_path.split('/')
    file_name = lst[-1][:-4]
    fft_data = []

    while stop < time_series_data.shape[1]:
        signal_window = time_series_data[:, start:stop]
        fft_window = pipeline.apply(signal_window)

        for i in range(len(fft_window)):
            tmp = [get_label(label_start, label_stop, i, sampling_frequency, file_path), i, label_start, file_name]
            tmp.extend(np.array(fft_window[i]))
            
            fft_data.append(tmp)

        start, stop = start + step, stop + step
        label_start, label_stop = label_start + label_step, label_stop + label_step

    columns = ['label', 'channel_number', 'window_start', 'file_name']
    columns.extend([f'FFT_image_signal_{index}' for index in range(96)])
    
    return pd.DataFrame(fft_data, columns=columns)

In [80]:
def find_file_length(edf_path):
    f = pyedflib.EdfReader(edf_path)
    original_sample_frequency = f.getSampleFrequency(0)
    signal = np.array(f.readSignal(0))
    
    f._close()
    del f
    return len(signal) / original_sample_frequency

### Example useage for one fille

In [81]:
window_length = 5
window_step = window_length / 4
fft_min_freq = 1
fft_max_freq = 96
sampling_frequency = 250
file_path = '/home/eshuranov/projects/eeg_stud/Dataset/TUEV/tuev/edf/train/aaaaablw/aaaaablw_00000001.edf'
file_length = find_file_length(file_path)

In [82]:
result = convert_to_fft_with_labels(window_length, window_step, fft_min_freq, fft_max_freq,
                                    sampling_frequency, file_path, file_length)
result.head(25)

Unnamed: 0,label,channel_number,window_start,file_name,FFT_image_signal_0,FFT_image_signal_1,FFT_image_signal_2,FFT_image_signal_3,FFT_image_signal_4,FFT_image_signal_5,...,FFT_image_signal_86,FFT_image_signal_87,FFT_image_signal_88,FFT_image_signal_89,FFT_image_signal_90,FFT_image_signal_91,FFT_image_signal_92,FFT_image_signal_93,FFT_image_signal_94,FFT_image_signal_95
0,[None],0,0.0,aaaaablw_00000001,3.565146,4.054555,3.960806,3.88192,3.594154,3.126447,...,2.477615,2.748681,2.57014,2.63194,2.701838,2.952489,2.593133,2.653518,2.799739,2.506702
1,[None],1,0.0,aaaaablw_00000001,4.297498,3.899224,3.59797,3.370932,3.303626,3.106119,...,2.206094,2.700978,2.781754,2.921848,2.818152,2.56272,2.516969,2.624326,2.177338,2.648456
2,[None],2,0.0,aaaaablw_00000001,3.816182,2.208107,2.884775,2.590442,2.48199,2.738091,...,2.596185,2.295069,2.304221,2.657756,2.616782,2.5743,1.771159,2.351663,2.648617,2.305122
3,[None],3,0.0,aaaaablw_00000001,2.784622,2.853122,2.946922,3.012071,2.95726,2.520676,...,2.445553,2.420417,2.037269,2.477124,2.246792,1.702746,2.250642,1.442305,1.988573,1.170138
4,[None],4,0.0,aaaaablw_00000001,4.352777,3.735899,3.702898,2.993281,3.168895,3.482455,...,2.231987,2.701518,2.589631,2.608774,2.565191,2.460618,2.121642,2.512819,2.858559,2.525241
5,[None],5,0.0,aaaaablw_00000001,3.90562,3.969778,3.815147,3.675407,3.42412,2.981833,...,2.391464,1.756372,2.49791,2.148683,2.83545,1.62344,2.606036,1.92517,2.728326,2.512994
6,[None],6,0.0,aaaaablw_00000001,3.446784,3.288219,3.275853,3.192103,2.886509,2.874173,...,2.109781,2.110105,2.269128,2.377387,2.188522,1.655238,1.812866,2.254082,2.581845,2.321042
7,[None],7,0.0,aaaaablw_00000001,3.235278,3.148839,2.467313,2.724875,2.644038,2.554581,...,2.341149,1.222783,2.278466,1.784916,1.841394,1.87277,1.840369,2.350755,1.477456,2.204921
8,[None],8,0.0,aaaaablw_00000001,3.767681,3.021228,3.063435,2.621439,3.138826,3.147641,...,2.831851,2.427763,1.933134,2.82622,2.381336,2.549219,2.542198,2.540667,2.864603,2.586557
9,[None],9,0.0,aaaaablw_00000001,3.456587,3.565502,3.20227,3.282296,2.887466,2.203154,...,2.315735,1.43314,2.344999,0.868956,2.27533,2.377438,2.327409,2.23661,2.176939,2.192377


In [83]:
result.shape

(22100, 100)

## Consutrcting the dataframe for entire directory of edf files

In [84]:
import os

file_paths = []
for dirpath, dirnames, filenames in os.walk("/home/eshuranov/projects/eeg_stud/Dataset/TUEV/tuev/edf/train"): # the path to the train directory in TUEV dataset
    for filename in sorted(list(filenames)):
        if filename.endswith('.edf'):
            file_paths.append(os.path.join(dirpath, filename))

### Constructing a daraframe for the first file and stacking others upon.

In [85]:
# print(file_paths)

window_length = 5
window_step = window_length / 4
fft_min_freq = 1
fft_max_freq = 96
sampling_frequency = 250
file_path = file_paths[0]
file_length = find_file_length(file_path)

In [86]:
df = convert_to_fft_with_labels(window_length, window_step, fft_min_freq, fft_max_freq, sampling_frequency, file_path, file_length)
df.head()

Unnamed: 0,label,channel_number,window_start,file_name,FFT_image_signal_0,FFT_image_signal_1,FFT_image_signal_2,FFT_image_signal_3,FFT_image_signal_4,FFT_image_signal_5,...,FFT_image_signal_86,FFT_image_signal_87,FFT_image_signal_88,FFT_image_signal_89,FFT_image_signal_90,FFT_image_signal_91,FFT_image_signal_92,FFT_image_signal_93,FFT_image_signal_94,FFT_image_signal_95
0,[None],0,0.0,aaaaaehl_00000001,4.220508,3.83376,2.9833,3.810046,3.810976,3.702923,...,2.164343,2.497989,1.746972,2.125165,2.155093,2.450107,2.693052,2.659798,2.286942,1.320341
1,[None],1,0.0,aaaaaehl_00000001,3.482935,3.49333,3.314656,3.087393,3.323063,3.218921,...,2.169976,2.532957,2.43987,2.539972,2.062157,1.891956,1.329246,1.90065,2.439317,2.01821
2,[None],2,0.0,aaaaaehl_00000001,3.541936,3.610211,3.778294,3.513544,3.480528,3.379001,...,2.085382,2.335438,2.336094,2.364382,2.084399,1.754878,2.241414,1.903124,2.213962,2.0372
3,[None],3,0.0,aaaaaehl_00000001,2.884013,3.645488,3.592169,3.399077,3.399175,3.019315,...,2.186275,2.146117,2.022436,1.221566,1.743736,2.038606,2.340001,2.300907,2.420181,2.447888
4,[None],4,0.0,aaaaaehl_00000001,3.914896,3.57502,3.483098,3.675805,2.924761,3.266281,...,2.334342,2.749132,2.654275,1.888185,2.647796,2.736953,2.224371,2.543454,2.561461,2.220254


In [87]:
for i in range(1, len(file_paths)): # Remove [:10] if you want to apply this for the whole dataset
    file_path = file_paths[i]
    file_length = find_file_length(file_path)
    
    tmp_data_frame = convert_to_fft_with_labels(window_length, window_step, fft_min_freq, fft_max_freq, sampling_frequency, file_path, file_length)
    df = pd.concat([df, tmp_data_frame])

In [88]:
df = df.reset_index()

In [89]:
df['label'].value_counts()

label
[None]       5668213
{6}            56642
{5}            15907
{2}            14302
{3}             8620
{4}             3085
{1}             1802
{5, 6}           887
{1, 6}           259
{2, 5}           239
{4, 5}           192
{4, 6}           111
{1, 5}            50
{3, 5}            13
{2, 3}            11
{4, 5, 6}          8
{1, 5, 6}          6
{1, 3}             4
{3, 5, 6}          3
{3, 4}             2
{2, 3, 6}          2
{3, 6}             2
Name: count, dtype: int64

### Applying one-hot-encoding for the target

In [90]:
def my_get_dummies(array):
    arr = [1, 2, 3, 4, 5, 6, None]
    tmp_df = pd.DataFrame(np.zeros((len(array), 7), float), columns=list(map(str, arr)))

    for index in range(len(array)):
        tmp_df.iloc[index] = [1 if idx in array[index] else 0 for idx in arr]

    return tmp_df

In [91]:
tmp = my_get_dummies(df['label'])

In [92]:
df = df.drop(['label'], axis=1)
df = pd.concat([tmp, df], axis=1)

In [93]:
df.head()

Unnamed: 0,1,2,3,4,5,6,None,index,channel_number,window_start,...,FFT_image_signal_86,FFT_image_signal_87,FFT_image_signal_88,FFT_image_signal_89,FFT_image_signal_90,FFT_image_signal_91,FFT_image_signal_92,FFT_image_signal_93,FFT_image_signal_94,FFT_image_signal_95
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,...,2.164343,2.497989,1.746972,2.125165,2.155093,2.450107,2.693052,2.659798,2.286942,1.320341
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0.0,...,2.169976,2.532957,2.43987,2.539972,2.062157,1.891956,1.329246,1.90065,2.439317,2.01821
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,2,0.0,...,2.085382,2.335438,2.336094,2.364382,2.084399,1.754878,2.241414,1.903124,2.213962,2.0372
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,3,0.0,...,2.186275,2.146117,2.022436,1.221566,1.743736,2.038606,2.340001,2.300907,2.420181,2.447888
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4,4,0.0,...,2.334342,2.749132,2.654275,1.888185,2.647796,2.736953,2.224371,2.543454,2.561461,2.220254


### Train/validation split

In [94]:
from sklearn.model_selection import train_test_split

In [95]:
features_arr = ['index', 'channel_number', 'window_start']
features_arr.extend([f'FFT_image_signal_{index}' for index in range(96)])

X = df[features_arr]
y = df[['1', '2', '3', '4', '5', '6', 'None']]

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

In [97]:
X_train = X_train.reset_index()
X_test = X_test.reset_index()
y_train = y_train.reset_index()
y_test = y_test.reset_index()

In [98]:
train_data_set = pd.concat([X_train, y_train])
validation_data_set = pd.concat([X_test, y_test])

In [99]:
train_data_set.to_csv('EEG_train.csv')

In [100]:
validation_data_set.to_csv('EEG_validation.csv')