In [37]:
import pandas as pd
import os
import json

In [15]:
def get_bckg_seiz_list(label_lists):
    if label_lists[0][3] == 'bckg':
        return [0], [0]
    else:
        onsets = []
        offsets = []
        for idx, label_list in enumerate(label_lists):
            onsets.append(float(label_list[1]))
            offsets.append(float(label_list[2]))
    return onsets, offsets

In [39]:
# specify the directory you want to start from
root_dir = './input/TUSZv2_labels'  # replace this with your directory

df = pd.DataFrame(columns=['filename', 'filepath', 'onsets', 'offsets', 'length'])
# walk through directory structure
for dir_name, subdir_list, file_list in os.walk(root_dir):
    for fname in file_list:
        # check if the file has '.csv_bi' extension
        if fname.endswith('.csv_bi'):
            # construct full file path
            file_path = os.path.join(dir_name, fname)

            # open and read the file
            with open(file_path, 'r') as file:
                labels_lines = file.readlines()
                # print(fname, labels_lines)
                duration = labels_lines[2].split('secs')[0].split('= ')[1]
                label_lists = [line.strip().split(',') for line in labels_lines[6:]]
                onset, offset = get_bckg_seiz_list(label_lists)
                new_record = {"filename": fname.split('.csv_bi')[0],
                           "filepath": "/".join(dir_name.split("/")[3:]),
                           "length": float(duration),
                           "onsets": [onset],
                           "offsets": [offset]}
                df = pd.concat([df, pd.DataFrame(new_record)])

df.sample(n=5)

Unnamed: 0,filename,filepath,onsets,offsets,length
0,aaaaasdq_s005_t001,TUSZv2/edf/train/aaaaasdq/s005_2014_11_26/03_t...,[0],[0],601.0
0,aaaaapqh_s002_t000,TUSZv2/edf/train/aaaaapqh/s002_2013_07_17/01_t...,[0],[0],300.0
0,aaaaajqo_s010_t003,TUSZv2/edf/train/aaaaajqo/s010_2010_05_04/03_t...,[0],[0],3600.0
0,aaaaaron_s001_t003,TUSZv2/edf/train/aaaaaron/s001_2014_08_12/01_t...,[0],[0],113.0
0,aaaaarei_s007_t002,TUSZv2/edf/eval/aaaaarei/s007_2015_01_16/01_tc...,[0],[0],302.0


In [41]:
# specify the directory you want to start from
root_dir = './input/TUSZv2_labels'

fs_df = pd.DataFrame(columns=['filename', 'fs', 'channels'])
# walk through directory structure
for dir_name, subdir_list, file_list in os.walk(root_dir):
    for fname in file_list:
        # check if the file has '.csv_bi' extension
        if fname.endswith('.json'):
            # construct full file path
            file_path = os.path.join(dir_name, fname)

            # open and read the file
            with open(file_path, 'r') as file:
                headers = json.load(file)
                fs_list = []
                channel_list = []
                for ch_header in headers:
                    channel_list.append(ch_header['label'])
                    fs_list.append(ch_header['sample_rate'])

                new_record = {"filename": fname.split('_header.json')[0],
                              "channels": [channel_list],
                              "fs": [fs_list]}
                fs_df = pd.concat([fs_df, pd.DataFrame(new_record)])

fs_df.sample(n=5)

Unnamed: 0,filename,fs,channels
0,aaaaamhj_s004_t000,"[250, 250, 250, 250, 250, 250, 250, 250, 250, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-..."
0,aaaaappt_s006_t008,"[256, 256, 256, 256, 256, 256, 256, 256, 256, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-..."
0,aaaaanrb_s005_t003,"[250, 250, 250, 250, 250, 250, 250, 250, 250, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-..."
0,aaaaaict_s006_t010,"[400, 400, 400, 400, 400, 400, 400, 400, 400, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-..."
0,aaaaapks_s004_t005,"[256, 256, 256, 256, 256, 256, 256, 256, 256, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-..."


In [48]:
# There are duplicated files in eval/aaaaaqvx/s003_2015_08_24 and eval/aaaaaqvx/s010_2015_08_27
fs_df = fs_df.drop_duplicates(subset='filename', keep='first')
df = df.drop_duplicates(subset='filename', keep='first')

In [56]:
fs_df['channels'].drop_duplicates()

0    [EEG FP1-LE, EEG FP2-LE, EEG F3-LE, EEG F4-LE,...
0    [EEG FP1-LE, EEG FP2-LE, EEG F3-LE, EEG F4-LE,...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-LE, EEG FP2-LE, EEG F3-LE, EEG F4-LE,...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...
0    [EEG 

In [61]:
# Explode the lists in the 'channels' column
df_exploded = fs_df.explode('channels')

# Get the unique values from the 'channels' column
unique_channels = df_exploded['channels'].value_counts()
unique_channels

EEG FP2-REF    6814
EEG FP1-REF    6814
EEG F4-REF     6814
EEG C3-REF     6814
EEG C4-REF     6814
               ... 
ECG EKG-REF      15
EEG OZ-REF       15
PULSE RATE       15
EEG 23-LE        14
EEG 24-LE        14
Name: channels, Length: 202, dtype: int64

In [72]:
bipolar_montage = [('FP1', 'F7'), ('F7', 'T3'), ('T3', 'T5'), ('T5', 'O1'),
                   ('FP1', 'F3'), ('F3', 'C3'), ('C3', 'P3'), ('P3', 'O1'),
                   ('FP2', 'F8'), ('F8', 'T4'), ('T4', 'T6'), ('T6', 'O2'),
                   ('FP2', 'F4'), ('F4', 'C4'), ('C4', 'P4'), ('P4', 'O2'),
                   ('FZ', 'CZ'), ('PZ', 'CZ'), ('C3', 'CZ'), ('C4', 'CZ'),]

def find_index(channels, word):
    for i, channel in enumerate(channels):
        if channel.startswith('EEG {}'.format(word)):
            return i
    return -1

def get_bipolar_montage_index(channels):
    bipolar_montage_index = []
    for x, y in bipolar_montage:
        bipolar_montage_index.append((find_index(channels, x), find_index(channels, y)))

    return bipolar_montage_index

In [74]:
fs_df['bipolar_montage'] = fs_df['channels'].apply(get_bipolar_montage_index)

In [81]:
fs_df.sample(3)

Unnamed: 0,filename,fs,channels,bipolar_montage
0,aaaaaplb_s002_t010,"[256, 256, 256, 256, 256, 256, 256, 256, 256, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...","[(0, 10), (10, 12), (12, 14), (14, 8), (0, 2),..."
0,aaaaapsl_s005_t000,"[256, 256, 256, 256, 256, 256, 256, 256, 256, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...","[(0, 10), (10, 12), (12, 14), (14, 8), (0, 2),..."
0,aaaaaijh_s005_t002,"[256, 256, 256, 256, 256, 256, 256, 256, 256, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...","[(0, 10), (10, 12), (12, 14), (14, 8), (0, 2),..."


In [90]:
def check_fs_values(row):
    # Get the 'bipolar_montage' and 'fs' columns
    bipolar_montage_list = row['bipolar_montage']
    fs = row['fs']
    # Extract the indices from the 'bipolar_montage' list
    indices = [index for tup in bipolar_montage_list for index in tup if index != -1]

    # Get the unique values of 'fs' where the index is present in 'bipolar_montage'
    unique_fs_values = set([fs[index] for index in indices])

    # Return True if all the unique values are the same, False otherwise
    return len(unique_fs_values) == 1

In [96]:
fs_df['same_elements'] = fs_df.apply(lambda row: check_fs_values(row), axis=1)
fs_df[fs_df['same_elements']!=True]

Unnamed: 0,filename,fs,channels,bipolar_montage,same_elements


In [97]:
fs_df['sampling_frequency'] = fs_df['fs'].apply(lambda x: x[0])

In [98]:
# Remove the 'same_elements' column
fs_df = fs_df.drop('same_elements', axis=1)

# Rename the 'fs' column to 'fs_list'
fs_df = fs_df.rename(columns={'fs': 'fs_list'})
fs_df.sample(n=3)

Unnamed: 0,filename,fs_list,channels,bipolar_montage,sampling_frequency
0,aaaaaraf_s006_t000,"[256, 256, 256, 256, 256, 256, 256, 256, 256, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...","[(0, 10), (10, 12), (12, 14), (14, 8), (0, 2),...",256
0,aaaaaskf_s004_t002,"[256, 256, 256, 256, 256, 256, 256, 256, 256, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...","[(0, 10), (10, 12), (12, 14), (14, 8), (0, 2),...",256
0,aaaaaoxa_s008_t004,"[256, 256, 256, 256, 256, 256, 256, 256, 256, ...","[EEG FP1-REF, EEG FP2-REF, EEG F3-REF, EEG F4-...","[(0, 10), (10, 12), (12, 14), (14, 8), (0, 2),...",256
