In [1]:
import os
import wfdb
import numpy as np
import pandas as pd
import pyedflib

In [2]:
# DOWNLOAD RECORD INFORMATION FILES
wfdb.dl_files('chbmit',dl_dir='record_info',
              files=['RECORDS','RECORDS-WITH-SEIZURES'])

Downloading files...
Finished downloading files


In [3]:
# Get channels, start-end, and seizure start-end for all files in subject
def get_sub_info(file):
    subject = [i[:-1] for i in open(file, 'r')]
    file_chans = {}
    start_end = {}
    seiz_start_end = {}
    collect_seiz_time = False
    for i in subject:
        if 'Channel 1:' in i:
            collect_chan = True
            channel_group = {}
        if 'Channel' not in i:
            collect_chan = False
        if collect_chan:
            channel_group[i.split(':')[0]] = i.split(':')[1].strip()
        if 'File Name:' in i:
            f_name = i.split(':')[1].strip()
            file_chans[f_name] = channel_group
            start_end[f_name] = []
        if i.split(':')[0]=='Number of Seizures in File' \
        and float(i.split(':')[1].strip())>0:
            s_e = []
            seiz_start_end[f_name] = []
        if ('Seizure' in i and 'Start' in i):
            collect_seiz_time = True
        if i=='':
            collect_seiz_time = False
        if collect_seiz_time:
            s_e.append(float(i.split(':')[1].strip().split(' ')[0]))
            seiz_start_end[f_name] = s_e
        if 'File Start Time' in i:
            s_time = i.split(': ')[1]
        if 'File End Time' in i:
            start_end[f_name] = [s_time,i.split(': ')[1]]
    return file_chans, start_end, seiz_start_end

In [4]:
# Get and combine file_chans, start_end, and seiz_start_end for all subjects
sub_files = ['record_info/' + i for i in \
             os.listdir('record_info') if 'summary' in i]
file_chans = {}
file_s_e = {}
file_seize_s_e = {}

for i in sub_files:
    fc, se, sse = get_sub_info(i)
    file_chans.update(fc)
    file_s_e.update(se)
    file_seize_s_e.update(sse)

In [5]:
# Get list of seizure files, only all electrodes
seizure_files = list(file_seize_s_e.keys())
seizure_electrodes = [file_chans[i].values() for i in seizure_files]
matched_electrodes = [i for i in range(len(seizure_electrodes)) \
                      if len(set(seizure_electrodes[0]).intersection\
                             (seizure_electrodes[i]))==22]
seizures = np.array(seizure_files)[matched_electrodes]

In [6]:
# Get list of no seizure files, only all electrodes, subset same len
no_seizure_files = [i for i in file_chans.keys() if i not in seizure_files]
no_seizure_electrodes = [file_chans[i].values() for i in no_seizure_files]
matched_electrodes = [i for i in range(len(no_seizure_electrodes)) \
                      if len(set(no_seizure_electrodes[0]).intersection\
                             (no_seizure_electrodes[i]))==22]
no_seizures = np.array(no_seizure_files)[matched_electrodes]
np.random.seed(1234)
no_seizures = np.random.choice(no_seizures,len(seizures),replace=False)

In [7]:
# Get seizures (just once, takes some time)
# s_getfiles = [i[:5]+'/'+i for i in seizures]
# wfdb.dl_files('chbmit',
#               dl_dir='seizure_edf',
#               keep_subdirs=False,
#               files=s_getfiles)

In [8]:
# Get seizures (just once, takes some time)
# ns_getfiles = [i[:5]+'/'+i for i in no_seizures]
# wfdb.dl_files('chbmit',
#               dl_dir='no_seizure_edf',
#               keep_subdirs=False,
#               files=ns_getfiles)

In [9]:
chan_order = [i for i in seizure_electrodes[0]]
print(chan_order)

['FP1-F7', 'F7-T7', 'T7-P7', 'P7-O1', 'FP1-F3', 'F3-C3', 'C3-P3', 'P3-O1', 'FP2-F4', 'F4-C4', 'C4-P4', 'P4-O2', 'FP2-F8', 'F8-T8', 'T8-P8', 'P8-O2', 'FZ-CZ', 'CZ-PZ', 'P7-T7', 'T7-FT9', 'FT9-FT10', 'FT10-T8', 'T8-P8']


In [10]:
# Drop 2nd 'T8-P8' label to keep unique keys
chan_order = chan_order[:-1]
chan_order_dict = dict(zip(chan_order,range(len(chan_order))))

In [11]:
# Get indices of rows to keep and order (keep_ind, order)
s_keep_inds = []
for file in seizures:
    keep_ind = []
    for i in range(len(file_chans[file].values())):
        e = list(file_chans[file].values())[i]
        if e in chan_order:
            keep_ind.append((i,chan_order_dict[e]))
    s_keep_inds.append(keep_ind)
            
ns_keep_inds = []
for file in no_seizures:
    keep_ind = []
    for i in range(len(file_chans[file].values())):
        e = list(file_chans[file].values())[i]
        if e in chan_order:
            keep_ind.append((i,chan_order_dict[e]))
    ns_keep_inds.append(keep_ind)        

In [12]:
# Do all keep_inds end in duplicate (14) (yes, both equal 0 here)?
print(sum([np.array(i)[len(i)-1,1]!=14 for i in s_keep_inds]),
     sum([np.array(i)[len(i)-1,1]!=14 for i in ns_keep_inds]))

0 0


In [13]:
# Drop final row of keep_inds to drop duplicate
s_keep_inds = [i[:-1] for i in s_keep_inds]
ns_keep_inds = [i[:-1] for i in ns_keep_inds]

In [14]:
def seiz_samp_range(file,window=5,padding=5):
    '''Returns sample numbers that captures window leading up to 
    the start of the first seizure on record for each file. In order
    to avoid modeling the seizure itself, padding (in sec) before onset.
    Samples at 256 Hz'''
    end = file_seize_s_e[file][0]*256 - padding*256
    start = end-(window*60*256)
    return (start,end)

In [15]:
# Get all seizure sequences
s_array = []
for i in range(len(seizures)):
    file = seizures[i]
    print('start ', file)
    f = pyedflib.EdfReader('seizure_edf/'+file)
    edf_import = [f.readSignal(
        chn = e,
        start = seiz_samp_range(file)[0], 
        n = 5*60*256) for e in np.array(s_keep_inds[i])[:,0] ]
    edf_import = np.array(edf_import)[np.array(s_keep_inds[i])[:,1]]
    s_array.append(edf_import)
    f._close()
    print('completed ', file)

start  chb03_01.edf
completed  chb03_01.edf
start  chb03_02.edf
completed  chb03_02.edf
start  chb03_03.edf
completed  chb03_03.edf
start  chb03_04.edf
completed  chb03_04.edf
start  chb03_34.edf
completed  chb03_34.edf
start  chb03_35.edf
completed  chb03_35.edf
start  chb03_36.edf
completed  chb03_36.edf
start  chb16_10.edf
completed  chb16_10.edf
start  chb16_11.edf
completed  chb16_11.edf
start  chb16_14.edf
completed  chb16_14.edf
start  chb16_16.edf
completed  chb16_16.edf
start  chb16_17.edf
completed  chb16_17.edf
start  chb24_01.edf
completed  chb24_01.edf
start  chb24_03.edf
completed  chb24_03.edf
start  chb24_04.edf
completed  chb24_04.edf
start  chb24_06.edf
completed  chb24_06.edf
start  chb24_07.edf
completed  chb24_07.edf
start  chb24_09.edf
completed  chb24_09.edf
start  chb24_11.edf
completed  chb24_11.edf
start  chb24_13.edf
completed  chb24_13.edf
start  chb24_14.edf
completed  chb24_14.edf
start  chb24_15.edf
completed  chb24_15.edf
start  chb24_17.edf
completed  c

In [16]:
pre_5 = [i for i in range(len(s_array)) if s_array[i].shape[1]==0]
print(f'These seizures happened before 5 minutes: {pre_5}')
X_S = np.array([i for i in s_array if i.shape[1]>0])
X_S_file = seizures[ [i for i in range(len(s_array)) \
                      if s_array[i].shape[1]>0]]

These seizures happened before 5 minutes: [11, 13, 16, 47, 63, 75, 80, 91, 99, 121]


In [17]:
print(X_S.shape,len(X_S_file))

(126, 22, 76800) 126


In [18]:
# Get no_seizure controls, 5-minute intervals, randomly starting from
# 10-30 min

# Get all seizure sequences
np.random.seed(1234)
ns_array = []
for i in range(len(no_seizures)):
    file = no_seizures[i]
    print('start ', file)
    f = pyedflib.EdfReader('no_seizure_edf/'+file)
    edf_import = [f.readSignal(
        chn = e,
        start = int(np.random.uniform(10*60*256, 30*60*256)), 
        n = 5*60*256) for e in np.array(ns_keep_inds[i])[:,0] ]
    edf_import = np.array(edf_import)[np.array(ns_keep_inds[i])[:,1]]
    ns_array.append(edf_import)
    f._close()
    print('completed ', file)

start  chb21_18.edf
completed  chb21_18.edf
start  chb05_18.edf
completed  chb05_18.edf
start  chb08_24.edf
completed  chb08_24.edf
start  chb04_22.edf
completed  chb04_22.edf
start  chb18_17.edf
completed  chb18_17.edf
start  chb17b_59.edf
completed  chb17b_59.edf
start  chb05_31.edf
completed  chb05_31.edf
start  chb04_14.edf
completed  chb04_14.edf
start  chb21_24.edf
completed  chb21_24.edf
start  chb10_17.edf
completed  chb10_17.edf
start  chb04_10.edf
completed  chb04_10.edf
start  chb21_29.edf
completed  chb21_29.edf
start  chb09_15.edf
completed  chb09_15.edf
start  chb19_14.edf
completed  chb19_14.edf
start  chb07_16.edf
completed  chb07_16.edf
start  chb11_02.edf
completed  chb11_02.edf
start  chb08_19.edf
completed  chb08_19.edf
start  chb09_17.edf
read 0, less than 76800 requested!!!
read 22379, less than 76800 requested!!!
read 0, less than 76800 requested!!!
read 0, less than 76800 requested!!!
read 0, less than 76800 requested!!!
read 0, less than 76800 requested!!!
read

In [19]:
np.random.seed(1234)
X_NS_I = np.random.choice(np.concatenate([np.arange(17),
                                          np.arange(18,
                                                    len(no_seizures))]),
                          len(X_S_file))

In [20]:
X_NS = np.array(ns_array)[X_NS_I]
X_NS_file = no_seizures[X_NS_I]
print(X_S.shape,len(X_S_file),
      X_NS.shape,len(X_NS_file))

(126, 22, 76800) 126 (126, 22, 76800) 126


In [21]:
print(X_S.shape,len(X_S_file),'\n',
      X_NS.shape,len(X_NS_file))

(126, 22, 76800) 126 
 (126, 22, 76800) 126


In [22]:
import pickle
pickle.dump(X_NS,open('X_NS.pickle','wb'))
pickle.dump(X_NS_file,open('X_NS_file.pickle','wb'))
pickle.dump(X_S,open('X_S.pickle','wb'))
pickle.dump(X_S_file,open('X_S_file.pickle','wb'))
pickle.dump(chan_order_dict,open('chan_order_dict.pickle','wb'))

In [23]:
chan_order_dict

{'FP1-F7': 0,
 'F7-T7': 1,
 'T7-P7': 2,
 'P7-O1': 3,
 'FP1-F3': 4,
 'F3-C3': 5,
 'C3-P3': 6,
 'P3-O1': 7,
 'FP2-F4': 8,
 'F4-C4': 9,
 'C4-P4': 10,
 'P4-O2': 11,
 'FP2-F8': 12,
 'F8-T8': 13,
 'T8-P8': 14,
 'P8-O2': 15,
 'FZ-CZ': 16,
 'CZ-PZ': 17,
 'P7-T7': 18,
 'T7-FT9': 19,
 'FT9-FT10': 20,
 'FT10-T8': 21}