In [1]:
import re
import os

import pyedflib
import numpy as np
import pandas as pd


import csv

from scipy.signal import resample
from utils_pipeline import Pipeline
from preprocessing_library import FFT, Slice, Magnitude, Log10

In [2]:
! which python 

/home/eshuranov/miniconda3/envs/py38/bin/python


### Constructing the dictionary with labels

In [3]:
class Preprocess:
    @staticmethod
    def merge(segments):
        """
        :param segments: list of overlapping intervals with same label, segment[i][0] - start, segment[i][1] - end
        :return: list of intervals with no overlaps
        """
        merged = []
        segments.sort(key=lambda x: x[0])

        for segment in segments:
            if not merged or segment[0] > merged[-1][1]:
                merged.append(segment)
            else:
                merged[-1][1] = segment[1]

        return merged

    def labeling(self, intervals):
        """
        :param intervals: list of intervals where intervals[i][0] - start intervals[i][1] - end intervals[i][2] - label
        :return: labels - dictionary, keys - labels, values - intervals
        """
        labels = {}

        for interval in intervals:
            if interval[-1] not in labels:
                labels[interval[-1]] = [interval[0]]
            else:
                labels[interval[-1]].append(interval[0])

        # print(labels)
        for key in labels:
            labels[key] = self.merge(labels[key])

        return labels

    def apply_one_file(self, file_path):
        """
        :params: file_path - path to a single file
        :return: result - dictionary with the following structure: file_name -> channel -> lable -> intervals with no overlaps
        """
        result = {}
        dct = {}
        with open(file_path, 'r') as file:
            for node in file.readlines():
                node = node[:-1]

                node = node.split(',')

                if int(node[0]) not in dct:
                    dct[int(node[0])] = [[[float(node[1]), float(node[2])], int(node[3])]]
                else:
                    dct[int(node[0])].append([[float(node[1]), float(node[2])], int(node[3])])

            for key in dct:
                dct[key] = self.labeling(dct[key])
    
            result[file_path[:-4]] = dct
        
        return result
        
    def apply_dataset(self):
        """
        apply the labeling to the whole directory
        """
        result = {}
        
        for root, dirs, files in os.walk("/home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/eval", topdown=True):
            for name in files:
                if name.endswith('.rec'):
                    file_path = os.path.join(root, name)

                    dct = {}
                    """
                    node[0] - chanel
                    node[1] - start
                    node[2] - end
                    node[3] - label
                    """
                    with open(file_path, 'r') as file:
                        for node in file.readlines():
                            node = node[:-1]

                            node = node.split(',')

                            if int(node[0]) not in dct:
                                dct[int(node[0])] = [[[round(float(node[1]), 4), round(float(node[2]), 4)], int(node[3])]]
                            else:
                                dct[int(node[0])].append([[round(float(node[1]), 4), round(float(node[2]), 4)], int(node[3])])

                    for key in dct:
                        dct[key] = self.labeling(dct[key])

                    result[file_path[:-4]] = dct
        return result


### Analysing the dictionary with labels

In [4]:
label_dct = Preprocess().apply_dataset()

In [5]:
print("label_dct: ",label_dct)

label_dct:  {'/home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/eval/060/bckg_060_a_': {12: {6: [[20.5, 26.5]]}, 14: {6: [[20.5, 26.5]], 4: [[74.9, 75.9], [230.8, 231.8]], 5: [[27.6, 31.6]]}, 1: {5: [[20.6, 23.6], [28.1, 32.1]]}, 9: {6: [[20.5, 26.5]]}, 18: {4: [[75.0, 76.0], [230.8, 231.8]]}, 8: {6: [[20.5, 26.5]]}, 13: {6: [[20.5, 26.5]]}, 11: {6: [[20.5, 26.5]]}, 7: {6: [[20.5, 26.5]]}, 0: {5: [[20.5, 24.5], [28.1, 32.1]], 4: [[74.9, 75.9], [230.8, 231.8]]}, 4: {4: [[74.9, 75.9], [230.8, 231.8]]}, 10: {6: [[20.5, 26.5]]}, 15: {6: [[20.5, 26.5]]}, 16: {6: [[20.5, 26.5]]}}, '/home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/eval/044/bckg_044_a_1': {8: {6: [[0.7, 8.7]], 5: [[310.6, 319.6]]}, 15: {6: [[0.7, 8.7]], 5: [[310.6, 319.6]]}, 4: {6: [[0.7, 8.7]], 5: [[310.6, 319.6]]}, 14: {6: [[0.7, 8.7]], 5: [[310.6, 319.6]]}, 11: {6: [[0.7, 8.7]], 5: [[310.6, 319.6]]}, 17: {6: [[0.7, 8.7]], 5: [[310.6, 319.6]]}, 18: {6: [[0.7, 8

In [6]:
cnt_int = 0
cnt_float = 0

for file_name in label_dct.keys():
    for channel in sorted(label_dct[file_name].keys()):
        for label in sorted(label_dct[file_name][channel].keys()):
            for interval in label_dct[file_name][channel][label]:
                if (interval[1] - interval[0]) % 1 == 0:
                    cnt_int += 1
                else:
                    cnt_float += 1

print(cnt_int)
print(cnt_float)

6803
887


### Constructing numpy files for a single edf file

In [7]:
test_file_path_rec = '/home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/train/aaaaablw/aaaaablw_00000001.rec'
single_file_dct = Preprocess().apply_one_file(test_file_path_rec)
single_file_dct

{'/home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/train/aaaaablw/aaaaablw_00000001': {6: {6: [[29.1,
     38.1],
    [57.7, 65.7]]},
  9: {6: [[29.1, 38.1], [57.7, 65.7]]},
  20: {6: [[29.1, 38.1], [57.7, 65.7]]},
  15: {6: [[29.1, 38.1], [57.7, 65.7]]},
  1: {6: [[29.1, 38.1]]},
  11: {6: [[29.1, 38.1], [57.7, 65.7]]},
  2: {6: [[29.1, 38.1], [57.7, 65.7]]},
  21: {6: [[29.1, 38.1], [57.7, 65.7]]},
  14: {6: [[29.1, 38.1], [57.7, 65.7]]},
  10: {6: [[29.1, 38.1], [57.7, 65.7]]},
  12: {6: [[29.1, 38.1], [57.7, 65.7]]},
  19: {6: [[29.1, 38.1], [57.7, 65.7]]},
  8: {6: [[29.1, 38.1], [57.7, 65.7]]},
  0: {6: [[29.1, 38.1]]},
  13: {6: [[29.1, 38.1], [57.7, 65.7]]},
  5: {6: [[29.1, 38.1], [57.7, 65.7]]},
  16: {6: [[29.1, 38.1], [57.7, 65.7]]},
  18: {6: [[29.1, 38.1], [57.7, 65.7]]},
  3: {6: [[29.1, 38.1], [57.7, 65.7]]},
  17: {6: [[29.1, 38.1], [57.7, 65.7]]},
  7: {6: [[29.1, 38.1], [57.7, 65.7]]},
  4: {6: [[29.1, 38.1], [57.7, 65.7]]}}}

### Extracting signals

In [8]:
parameters = pd.read_csv('/home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/parameters.csv', index_col=['parameter'])

In [9]:
def extract_signal(f, signal_labels, electrode_name, start, stop):
    """
    f - opened edf file.
    signal_labels - list of signals.
    electrode_name - the name of electrode.
    start - start of the window in seconds.
    stop - end of the window in seconds.
    """
    tuh_label = [s for s in signal_labels if 'EEG ' + electrode_name + '-' in s]
    if len(tuh_label) > 1:
        print(tuh_label)
        exit('Multiple electrodes found with the same string! Abort')
        
    channel = signal_labels.index(tuh_label[0])
    signal = np.array(f.readSignal(channel))

    return signal[start:stop]

### Constructing FFT images

In [10]:
def convert_to_fft(window_start, window_end, window_step, channel,
                               fft_min_freq, fft_max_freq, sampling_frequency, file_path):
    """
    Split an interval into 1 second intervals with 0.5 second overlap, applying FFT.
    
    parmas:
    window_start - the beginning of interval in seconds.
    window_end - the end of interval in seconds.
    window_step - the overlap in seconds.
    channel - 
    fft_min_freq - the min frequency.
    fft_max_freq - the max ferquency.
    sampling_ferquency - the frequency of the edf file.
    file_path - the path to the edf file.
    
    return:
    np.array - interval splitted into 1 secons segments with 0.5 second overlap, with FFT applied.
    """
    pipeline = Pipeline([FFT(), Slice(fft_min_freq, fft_max_freq), Magnitude(), Log10()])
    
    start, step = int(np.floor(window_start * sampling_frequency)), int(np.floor(window_step * sampling_frequency))
    stop = start + step

    lst = file_path.split('/')
    file_name = lst[-1][:-4]
    fft_data = []

    montage = str(parameters.loc['montage']['value'])
    montage_list = re.split(';', montage)
    # print("montage_list: ", montage_list)
    # print("montage_list[channel]: ", montage_list[channel])
    electrode_list = re.split('-', montage_list[channel])

    f = pyedflib.EdfReader(file_path)
    signal_labels = f.getSignalLabels()
    
    while stop <= window_end * sampling_frequency:
        extracted_signal_from_electrode_1 = extract_signal(f, signal_labels, electrode_list[0], start, stop)
        extracted_signal_from_electrode_2 = extract_signal(f, signal_labels, electrode_list[1], start, stop)
        
        signal_window = np.array(extracted_signal_from_electrode_1-extracted_signal_from_electrode_2)
        fft_window = pipeline.apply(signal_window)
        
        fft_data.append(fft_window)
        start, stop = start + step, stop + step
        # print(fft_data)
    
    f._close()
    del f
    
    return np.array(fft_data)

### A single file usage example

In [11]:
test_file_path_edf = '/home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/train/aaaaablw/aaaaablw_00000001.edf'
arr = convert_to_fft(29.1, 38.1, 0.5, 0, 1, 96, 250, test_file_path_edf)
print(f'arr: {arr},\nlen(arr): {len(arr)},\nlen(arr[0]): {len(arr[0])}')

arr: [[1.71065759 1.63976    0.7138609  ... 1.18646985 1.2437725  0.78738767]
 [2.1504314  1.75372405 1.93814755 ... 1.26149422 1.15614055 1.03361897]
 [2.42635049 1.64645036 2.12363506 ... 1.15438603 1.41452648 1.26861089]
 ...
 [2.17775443 2.04387071 1.9346262  ... 1.09226038 1.28243374 1.14905439]
 [2.24846638 2.50624815 1.5401735  ... 1.49119303 0.95360965 1.16301862]
 [2.20585812 1.91174881 1.93668347 ... 1.04818828 0.94269498 0.78888465]],
len(arr): 18,
len(arr[0]): 62


### Multiple-files usage example

In [12]:
unique_id = 0
out_path = "/home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/out_tuev_eval/"
list_of_rows = []
step = 0
maxstep = len(list(label_dct.keys()))
for file_name in list(label_dct.keys()): #[:3]: # constructing the numpy files for the first 3 files
    # print(file_name)
    wkspFldr = (file_name.split('/')[0:-1])
    # print("wkspFldr: ",wkspFldr)
    
    fname = os.path.basename(file_name)
    # out_name = out_path + "/" + wkspFldr[-1]+"//" + fname
    out_name = out_path + fname
    # print("out_name: ",out_name)
    # continue
    for channel in range(22): 
        try: # sometimes there is no data about a channel
            for label in label_dct[file_name][channel]:
                for index, interval in enumerate(label_dct[file_name][channel][label]):
                    result_file_name = out_name + '_channel_' + str(channel) + '_label_' + str(label) + '_interval_id_' + str(index) + '_unique_id_' + str(unique_id)
                    unique_id += 1
                    
                    if len(interval) > 1:
                        arr = convert_to_fft(interval[0], interval[1], 0.5, channel, 1, 96, 250, file_name + '.edf')
                        np.save(result_file_name, arr)
                        d_tmp={}
                        d_tmp["fname"]=fname
                        d_tmp["channel"]=channel
                        d_tmp["label"]=label
                        d_tmp["interval"]=interval
                        d_tmp["unique_id"] = unique_id
                        d_tmp["path"] = result_file_name+".npy"
                        list_of_rows.append(d_tmp)
                    # break
                # break      
        
        except KeyError as exx:
            print(exx)
            pass
        # break
    
    print("step: ",step, maxstep,file_name)
    step += 1
    
keys = list_of_rows[0].keys()
with open(out_path+'row_list.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(list_of_rows)
        
            

2
3
5
6
17
19
20
21
step:  0 159 /home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/eval/060/bckg_060_a_
step:  1 159 /home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/eval/044/bckg_044_a_1
step:  2 159 /home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/eval/014/pled_014_a_2
7
9
10
11
12
13
14
step:  3 159 /home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/eval/014/bckg_014_a_
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
21
step:  4 159 /home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/eval/043/gped_043_a_1
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
step:  5 159 /home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/eval/043/gped_043_a_
step:  6 159 /home/eshuranov/projects/eeg_epileptiform_detection/EEG2Rep/Dataset/TUEV/tuev/edf/eval/043/bckg_043_a_1
step:  7 159 /home/eshuranov/projects/eeg_epileptiform

In [13]:
print(os. getcwd() )

/home/eshuranov/projects/eeg_epileptiform_detection/ML_solution
