# Extracting signals

In [8]:
import re
import os
import numpy as np
import pandas as pd

from scipy.signal import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

# from mne_features.feature_extraction import FeatureExtractor

In [1]:
import pyedflib

In [9]:
parameters = pd.read_csv('Dataset/parameters.csv', index_col=['parameter'])

In [2]:
def extract_signal(f, signal_labels, electrode_name, start, stop):

    tuh_label = [s for s in signal_labels if 'EEG ' + electrode_name + '-' in s]
    if len(tuh_label) > 1:
        print(tuh_label)
        exit('Multiple electrodes found with the same string! Abort')

    channel = signal_labels.index(tuh_label[0])
    signal = np.array(f.readSignal(channel))

    start, stop = float(start), float(stop)
    original_sample_frequency = f.getSampleFrequency(channel)
    original_start_index = int(np.floor(start * float(original_sample_frequency)))
    original_stop_index = int(np.floor(stop * float(original_sample_frequency)))

    seizure_signal = signal[original_start_index:original_stop_index]

    new_sample_frequency = int(parameters.loc['sampling_frequency']['value'])
    new_num_time_points = int(np.floor((stop - start) * new_sample_frequency))
    seizure_signal_resampled = resample(seizure_signal, new_num_time_points)

    return seizure_signal_resampled

def read_edfs_and_extract(edf_path, edf_start, edf_stop):

    f = pyedflib.EdfReader(edf_path)

    montage = str(parameters.loc['montage']['value'])
    montage_list = re.split(';', montage)
    signal_labels = f.getSignalLabels()
    x_data = []

    for i in montage_list:
        electrode_list = re.split('-', i)
        electrode_1 = electrode_list[0]
        extracted_signal_from_electrode_1 = extract_signal(f, signal_labels, electrode_name=electrode_1, start=edf_start, stop=edf_stop)
        electrode_2 = electrode_list[1]
        extracted_signal_from_electrode_2 = extract_signal(f, signal_labels, electrode_name=electrode_2, start=edf_start, stop=edf_stop)
        this_differential_output = extracted_signal_from_electrode_1-extracted_signal_from_electrode_2
        x_data.append(this_differential_output)

    f._close()
    del f

    x_data = np.array(x_data)

    return x_data

In [6]:
edf_path = '/home/eshuranov/projects/eeg_stud/Dataset/TUEV/tuev/edf/train/aaaaafop/aaaaafop_00000001.edf' # test file

In [10]:
test_file = read_edfs_and_extract(edf_path, 0, 1)

In [11]:
test_file.shape

(20, 250)

In [120]:
print(test_file)

[[-121. -120. -121. ...  116.  118.  119.]
 [  -5.  -11.   -9. ...  -93. -106. -109.]
 [   7.   13.   14. ...  -55.  -46.  -46.]
 ...
 [ -11.  -13.  -12. ...   24.   25.   23.]
 [ -11.  -12.  -10. ...    8.    5.    6.]
 [  -8.  -13.   -9. ...    9.    8.   11.]]


## Applying FFT and post-processing

In [4]:
from utils_pipeline import Pipeline
from preprocessing_library import FFT, Slice, Magnitude, Log10

#### Applying preproccesing on labels

In [8]:
from label_preprocess import Preprocess

TypeError: 'type' object is not subscriptable

In [97]:
labels = Preprocess().apply()

In [369]:
def get_label(start, stop, channel, sampling_frequency, file_path):
    """
    params: start - start of the window, stop - end of the window, channel - channel nubmer, 
    sampling frequency - smapling frequency of edf file, file_path - file path to EDF file!!!
    return: labels which belongs to the window
    """
    global labels
    lst = file_path.split('/')
    
    file_name = lst[-1][:-4]
    result = []

    if channel not in labels[file_name]:
        return [None]
        
    possible_lables = labels[file_name][channel]
    for label in possible_lables:
        intervals = possible_lables[label]
        for interval in intervals:
            if interval[0] <= stop and interval[1] >= start or interval[1] >= start and interval[0] < stop:
                result.append(label)

    if not result:
        return [None]
    return result        

#### FFT

In [357]:
def convert_to_fft_with_labels(window_length, window_step, fft_min_freq, fft_max_freq, sampling_frequency, file_path, file_length):
    """
    parmas:
    window_length - lenght of the interrval
    window_step - length of overlap
    sampling_frequency - frequency of the file
    file_path - path to the edf file
    file-length - length of the edf file
    return:
    pandas dataframe, with label, cahnnel number, FFT image and filename
    """
    time_series_data = read_edfs_and_extract(file_path, 0, file_length)
    
    pipeline = Pipeline([FFT(), Slice(fft_min_freq, fft_max_freq), Magnitude(), Log10()])
    
    label_start, label_step, label_stop = 0, window_step, window_length
    start, step = 0, int(np.floor(window_step * sampling_frequency))
    stop = start + int(np.floor(window_length * sampling_frequency))

    lst = file_path.split('/')
    file_name = lst[-1][:-4]
    fft_data = []

    while stop < time_series_data.shape[1]:
        signal_window = time_series_data[:, start:stop]
        fft_window = pipeline.apply(signal_window)

        for i in range(len(fft_window)):
            fft_data.append([get_label(label_start, label_stop, i, sampling_frequency, file_path), i, fft_window[i], label_start, file_name])
        
        start, stop = start + step, stop + step
        label_start, label_stop = label_start + label_step, label_stop + label_step

    return pd.DataFrame(fft_data, columns=['label', 'channel-number', 'FFT-image', 'window_start', 'file-name'])

In [358]:
def find_file_length(edf_path):
    f = pyedflib.EdfReader(edf_path)
    original_sample_frequency = f.getSampleFrequency(0)
    signal = np.array(f.readSignal(0))
    
    f._close()
    del f
    return len(signal) / original_sample_frequency

In [359]:
window_length = 5
window_step = window_length / 4
fft_min_freq = 1
fft_max_freq = 96
sampling_frequency = 250
file_path = '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaaaar/aaaaaaar_00000001.edf'
file_length = find_file_length(file_path)

In [360]:
result = convert_to_fft_with_labels(window_length, window_step, fft_min_freq, fft_max_freq,
                                    sampling_frequency, file_path, file_length)
result.head(25)

Unnamed: 0,label,channel-number,FFT-image,window_start,file-name
0,[6],0,"[4.353648527358219, 4.637707718247491, 4.00085...",0.0,aaaaaaar_00000001
1,[6],1,"[4.501154579595272, 3.9811983194273175, 4.0574...",0.0,aaaaaaar_00000001
2,[6],2,"[4.053145405319158, 3.9614527147490635, 3.5529...",0.0,aaaaaaar_00000001
3,[6],3,"[3.6639207606825597, 3.6409645058286917, 3.190...",0.0,aaaaaaar_00000001
4,[6],4,"[4.617176123115018, 4.303431852456283, 4.29354...",0.0,aaaaaaar_00000001
5,[None],5,"[4.561007835827728, 4.340348081221418, 4.09265...",0.0,aaaaaaar_00000001
6,[5],6,"[3.9356719940922273, 3.568082104961621, 3.5255...",0.0,aaaaaaar_00000001
7,[None],7,"[3.5882412124654914, 3.2515358459941455, 3.492...",0.0,aaaaaaar_00000001
8,[5],8,"[3.946554486834907, 4.0906360961089625, 3.4192...",0.0,aaaaaaar_00000001
9,[5],9,"[4.193737524936112, 3.656007687184383, 3.52556...",0.0,aaaaaaar_00000001


In [361]:
result.shape

(25540, 5)

### Analysing a separate file

In [28]:
import dill as pickle
import warnings

In [48]:
test_path = 'aaaaaaar_00000001.edf'

In [65]:
import pyedflib
import numpy as np

f = pyedflib.EdfReader(test_path)
original_sample_frequency = f.getSampleFrequency(2)
signal = np.array(f.readSignal(2))
print(f'raw signal by channel 0: {signal}, length of the signal: {len(signal)}')
print(f'original sample freqauency: {original_sample_frequency}')
print(f'length of the signal in seconds: {len(signal) / original_sample_frequency}')
f._close()
del f

raw signal by channel 0: [-58. -62. -57. ...   0.   0.   0.], length of the signal: 399500
original sample freqauency: 250.0
length of the signal in seconds: 1598.0


# Train and Validation sets

### Getting file paths

In [326]:
import os

file_paths = []
for dirpath, dirnames, filenames in os.walk("/Users/konstantin/Desktop/TUEV_data/edf/train"):
    for filename in sorted(list(filenames)):
        if filename.endswith('.edf'):
            file_paths.append(os.path.join(dirpath, filename))

In [327]:
file_paths

['/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaablw/aaaaablw_00000001.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaaevo/aaaaaevo_00000001.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaaeyk/aaaaaeyk_00000001.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaafoe/aaaaafoe_00000001.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaadlg/aaaaadlg_00000001.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaabji/aaaaabji_00000001.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaadoo/aaaaadoo_00000001.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaaezj/aaaaaezj_00000001.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaacvq/aaaaacvq_00000001.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaaboq/aaaaaboq_00000003.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaaezm/aaaaaezm_00000001.edf',
 '/Users/konstantin/Desktop/TUEV_data/edf/train/aaaaadoa/aaaaadoa_00000006.edf',
 '/Users/konstantin/Desktop/

### Apllying preprocess transformation 

In [370]:
window_length = 5
window_step = window_length / 4
fft_min_freq = 1
fft_max_freq = 96
sampling_frequency = 250
file_path = file_paths[0]
file_length = find_file_length(file_path)

In [371]:
df = convert_to_fft_with_labels(window_length, window_step, fft_min_freq, fft_max_freq, sampling_frequency, file_path, file_length)
df.head()

Unnamed: 0,label,channel-number,FFT-image,window_start,file-name
0,[None],0,"[3.5651459841051847, 4.054554959153428, 3.9608...",0.0,aaaaablw_00000001
1,[None],1,"[4.297497666425236, 3.899223790290856, 3.59796...",0.0,aaaaablw_00000001
2,[None],2,"[3.8161823051675143, 2.208106576150998, 2.8847...",0.0,aaaaablw_00000001
3,[None],3,"[2.7846219695132572, 2.8531217438387517, 2.946...",0.0,aaaaablw_00000001
4,[None],4,"[4.352776568267279, 3.7358987357430027, 3.7028...",0.0,aaaaablw_00000001


In [372]:
for i in range(1, len(file_paths)):
    file_path = file_paths[i]
    file_length = find_file_length(file_path)
    
    tmp_data_frame = convert_to_fft_with_labels(window_length, window_step, fft_min_freq, fft_max_freq, sampling_frequency, file_path, file_length)
    df = pd.concat([df, tmp_data_frame])

In [373]:
df.shape

(5770360, 5)

In [374]:
df.to_csv('data.csv')

### Spliting data in train and validation sets

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
import pandas as pd

df = pd.read_csv('data.csv', sep=',')
df = df.reset_index()

In [6]:
X = df[['FFT-image', 'window_start', 'channel-number']]
X

Unnamed: 0,FFT-image,window_start,channel-number
0,[3.56514598 4.05455496 3.96080639 3.88191954 3...,0.00,0
1,[4.29749767 3.89922379 3.59796984 3.37093156 3...,0.00,1
2,[3.81618231 2.20810658 2.88477519 2.59044162 2...,0.00,2
3,[ 2.78462197 2.85312174 2.94692202 3.012070...,0.00,3
4,[4.35277657 3.73589874 3.70289783 2.99328087 3...,0.00,4
...,...,...,...
5770355,[2.98655675 2.63212274 2.51678878 2.69555511 2...,1361.25,15
5770356,[3.01657186 2.97345611 2.71332353 2.98604653 2...,1361.25,16
5770357,[3.01137721 2.99581212 2.7047019 2.05343629 1...,1361.25,17
5770358,[3.10698782 2.84377458 3.19318984 2.20565263 2...,1361.25,18


In [8]:
X = X.rename(columns={'FFT-image': 'FFT_image'})
X

Unnamed: 0,FFT_image,window_start,channel-number
0,[3.56514598 4.05455496 3.96080639 3.88191954 3...,0.00,0
1,[4.29749767 3.89922379 3.59796984 3.37093156 3...,0.00,1
2,[3.81618231 2.20810658 2.88477519 2.59044162 2...,0.00,2
3,[ 2.78462197 2.85312174 2.94692202 3.012070...,0.00,3
4,[4.35277657 3.73589874 3.70289783 2.99328087 3...,0.00,4
...,...,...,...
5770355,[2.98655675 2.63212274 2.51678878 2.69555511 2...,1361.25,15
5770356,[3.01657186 2.97345611 2.71332353 2.98604653 2...,1361.25,16
5770357,[3.01137721 2.99581212 2.7047019 2.05343629 1...,1361.25,17
5770358,[3.10698782 2.84377458 3.19318984 2.20565263 2...,1361.25,18


In [None]:
X['FFT_image'] = X['FFT_image'].str.replace('\n', '')

In [None]:
X['FFT_image'] = X.FFT_image.str[1:-1].replace('  ', ' ')

In [None]:
arr = []

In [None]:
for string in X.FFT_image:
    arr.append(list(map(float, string.split())))

In [None]:
len(arr)

In [None]:
tmp = pd.DataFrame(arr, columns = [f'FFT_image_signal_{index}' for index in range(96)])

In [None]:
# tmp = pd.DataFrame(X.FFT_image.str.split(), columns = [f'FFT_image_signal_{index}' for index in range(96)])

In [None]:
tmp.head()

In [380]:
Y = df[['label']]
Y.head()

Unnamed: 0,label
0,[None]
1,[None]
2,[None]
3,[None]
4,[None]


In [382]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True, random_state=42)

In [384]:
y_train.head()

Unnamed: 0,label
10995,[None]
7178,[None]
27063,[None]
3193,[None]
1098,[None]


In [385]:
X_train.head()

Unnamed: 0,FFT-image,window_start,channel-number
10995,"[3.943334931850202, 3.80215277060562, 3.697599...",686.25,15
7178,"[3.212959647964134, 3.672813773919944, 3.38470...",447.5,18
27063,"[3.152114161776472, 3.1408470899256153, 2.5271...",1691.25,3
3193,"[2.6590546898748784, 2.7028677099069056, 2.370...",198.75,13
1098,"[2.844020507837268, 3.238936133838194, 3.02474...",67.5,18


### Model training

In [387]:
from sklearn.ensemble import GradientBoostingClassifier

In [388]:
clf = GradientBoostingClassifier(n_estimators=100, loss='log_loss', learning_rate=1.0, random_state=42).fit(X_train, y_train)

ValueError: setting an array element with a sequence.

In [None]:
clf.score(X_test, y_test)