In [2]:
import os
from os.path import join, dirname
from dotenv import load_dotenv

# '__file__' is a quoted string here, so dirname() yields '' and the resulting
# path is simply '.env', resolved against the current working directory.
dotenv_path = os.path.join(os.path.dirname('__file__'), '.env')

# Populate os.environ from the .env file before reading the settings below.
load_dotenv(dotenv_path)

# Each setting is None when the variable is not defined in the environment.
DATALAKE_PATH = os.getenv("DATALAKE_PATH")
ROOT_PATH = os.getenv("ROOT_PATH")
SOURCE_PATH = os.getenv("SOURCE_PATH")

In [3]:
import pandas as pd
import numpy as np
from mne.io import read_raw_edf

In [4]:
# Open the seizure annotation workbook. The notebook-level name `file` is read
# later by table_info(), so it must exist before that function is called.
# NOTE: the recorded run of this cell failed with
# "ImportError: Missing optional dependency 'openpyxl'" - install openpyxl in
# the kernel environment before running.
file = pd.ExcelFile('../docs/seizures_v36s.xlsx')
# Display the names of the sheets contained in the workbook.
file.sheet_names

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [5]:
def table_info(sheet: str, workbook=None):
    """Load one sheet of the seizure workbook, keeping only files with seizures.

    Parameters
    ----------
    sheet : str
        Name of the sheet to read. The same value is written to the
        ``dataset`` column of the result.
    workbook : object exposing ``parse(sheet)``, optional
        Workbook to read from. When omitted, the notebook-level ``file``
        object is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        The first 15 columns of the sheet without its first row, restricted to
        rows whose ``'No. Seizures/ File'`` value is greater than zero,
        re-indexed from 0, with an extra ``dataset`` column holding ``sheet``.
    """
    source = file if workbook is None else workbook
    df = source.parse(sheet)

    # Positional slice: skip the first row and keep the first 15 columns.
    # (The first row is presumably a sub-header line - confirm against the
    # workbook.) Unlike drop([0]) this does not raise on an empty sheet.
    df = df.iloc[1:, :15].reset_index(drop=True)

    df = df[df['No. Seizures/ File'] > 0].reset_index(drop=True)

    # A scalar is broadcast to every row; no need to build a repeated array.
    df['dataset'] = sheet
    return df


In [6]:
# One filtered table per workbook sheet; the sheet name doubles as the split tag.
dataset_train = table_info(sheet='train')
dataset_dev = table_info(sheet='dev')
dataset_eval = table_info(sheet='eval')


In [28]:
def get_data_info_edf(file, edf_root='/media/davi/6A81-05CF/tuh_eeg_seizure/edf'):
    """Open the EDF recording that corresponds to a ``.tse`` file name.

    Parameters
    ----------
    file : str
        Name of a ``.tse`` file as listed in the workbook. It is presumably a
        relative path starting with ``'./'`` - confirm against the data.
    edf_root : str, optional
        Directory that contains the EDF tree. Defaults to the location that
        was previously hard-coded, so existing calls behave as before; pass a
        different root to run on another machine.

    Returns
    -------
    The object returned by ``mne.io.read_raw_edf(path, verbose=False)``.
    """
    # Swap only the trailing extension. A blanket str.replace('tse', 'edf')
    # would also rewrite "tse" wherever it appears inside a directory or
    # recording name and silently point at a non-existent file.
    if file.endswith('.tse'):
        file = file[:-len('tse')] + 'edf'

    # Strip the leading '.' of a './...' name, but never eat a real character
    # when the name does not start with a dot.
    if file.startswith('.'):
        file = file[1:]

    path = f'{edf_root}/{file}'

    return read_raw_edf(path, verbose=False)


def get_others_signals(raw, chs, name, label, dataset):
    """Write the selected channels of a recording to a parquet file.

    The file is written to
    ``{DATALAKE_PATH}/tuh_seizures/raw/{label}/{dataset}/{name}``.

    Parameters
    ----------
    raw :
        Recording object providing ``to_data_frame`` (as returned by
        ``read_raw_edf``).
    chs : list of str
        Names of the channels to export; they become the parquet columns.
    name : str
        Output file name.
    label : str
        Sub-directory for the signal kind (the notebook uses 'ecg' / 'emg').
    dataset : str
        Sub-directory for the data split (the notebook uses 'train' / 'dev' /
        'eval').

    Returns
    -------
    None
    """
    out_dir = f"{DATALAKE_PATH}/tuh_seizures/raw/{label}/{dataset}"
    # to_parquet does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    # Convert only the requested channels instead of materialising every
    # channel of the recording and discarding most of them afterwards.
    df = raw.to_data_frame(picks=chs)
    # Keep exactly the requested columns (drops the 'time' column that
    # to_data_frame adds) in the requested order.
    df = df[chs]
    df.to_parquet(f"{out_dir}/{name}")
    
def check_others_signals(chs, label):
    """Return the entries of ``chs`` that contain ``label`` as a substring.

    The match is case-sensitive and the original order of ``chs`` is kept.
    """
    matches = []
    for channel in chs:
        if label in channel:
            matches.append(channel)
    return matches
    



In [29]:
def pipeline_datasets(dataset, dataset_tag):
    """Export ECG/EMG channels for every file of a split and summarise them.

    For each unique value of ``dataset['Filename']`` the recording is opened
    with ``get_data_info_edf``, its sampling frequency is recorded, and any
    channels whose name contains 'EKG' or 'EMG' are written to parquet through
    ``get_others_signals``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with a ``'Filename'`` column.
    dataset_tag : str
        Split name forwarded to ``get_others_signals`` as the output
        sub-directory.

    Returns
    -------
    pandas.DataFrame
        One row per unique file with columns ``file``, ``freq`` (int), ``ecg``
        and ``emg`` (bool: whether matching channels were found).
    """
    files = dataset['Filename'].unique()

    freqs = []
    ecg_ch = []
    emg_ch = []

    for file in files:
        raw = get_data_info_edf(file)
        freqs.append(int(raw.info['sfreq']))

        chs = raw.info['ch_names']

        # Rename only the trailing extension; replacing every "tse" would
        # also mangle a base name that happens to contain those letters.
        name = file.split('/')[-1]
        if name.endswith('.tse'):
            name = name[:-len('tse')] + 'parquet'

        # (output label, channel-name pattern, per-file flag list)
        for label, pattern, flags in (('ecg', 'EKG', ecg_ch), ('emg', 'EMG', emg_ch)):
            matches = check_others_signals(chs=chs, label=pattern)
            flags.append(len(matches) > 0)
            if matches:
                get_others_signals(raw, matches, name, label, dataset_tag)

    # Column-wise construction keeps real dtypes (int / bool); building the
    # frame row-wise and transposing it turns every column into object dtype.
    return pd.DataFrame({'file': files, 'freq': freqs, 'ecg': ecg_ch, 'emg': emg_ch})


In [30]:
# Run the export/summary pipeline once per split (eval first, then dev, train).
dataset_eval_details = pipeline_datasets(dataset=dataset_eval, dataset_tag='eval')
dataset_dev_details = pipeline_datasets(dataset=dataset_dev, dataset_tag='dev')
dataset_train_details = pipeline_datasets(dataset=dataset_train, dataset_tag='train')

  return read_raw_edf(path, verbose=False)


In [34]:
# Persist the per-file summaries (file, freq, ecg, emg) without the row index.
dataset_train_details.to_csv(path_or_buf='../features/train_details.csv', index=False)
dataset_dev_details.to_csv(path_or_buf='../features/dev_details.csv', index=False)
dataset_eval_details.to_csv(path_or_buf='../features/eval_details.csv', index=False)


In [36]:
# Persist the filtered workbook tables for each split without the row index.
dataset_train.to_csv(path_or_buf='../features/train.csv', index=False)
dataset_dev.to_csv(path_or_buf='../features/dev.csv', index=False)
dataset_eval.to_csv(path_or_buf='../features/eval.csv', index=False)