This notebook is a tutorial to show how to manage the preprocessed data for sleep stage classification

In [13]:
import numpy as np
import gzip as gz
from tqdm.notebook import tqdm
import torch as th
import pickle

In [14]:
datad = '/Users/martinblot/Desktop/ESPCI/adl-24-sleep-stage/sleep-edf-prepared/5-telemetry'


In [15]:
fp = gz.open(datad+'/SC4671G0.npz.gz','rb')
data = np.load(fp,allow_pickle=True)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/martinblot/Desktop/ESPCI/adl-24-sleep-stage/sleep-edf-prepared/5-telemetry/SC4671G0.npz.gz'

In [8]:
# To see what it contains
data.files

['x', 'y', 'fs', 'ch_label', 'header_raw', 'header_annotation']

In [9]:
# The data are stored in 'x' and 'y'
x = data['x']
y = data['y']

In [10]:
print(x.shape, y.shape)

(1968, 600, 4) (1968,)


In [11]:
# The header is the copy of the original one
data["header_raw"]
data['header_annotation']

array({'local_subject_id': 'X F X Female_87yr', 'local_recording_id': 'Startdate 07-AUG-1991 X X X', 'date_time': '2091-08-07 16:00:00', 'EDF+': True, 'contiguous': True, 'n_records': 1, 'record_length': 0.0, 'n_channels': 1, 'label': ['EDF Annotations'], 'transducer_type': [''], 'units': [''], 'physical_min': array([0.]), 'physical_max': array([1.]), 'digital_min': array([-32768.]), 'digital_max': array([32767.]), 'prefiltering': [''], 'n_samples_per_record': [3062]},
      dtype=object)

In [12]:
# The four channels in x are 'EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', 'EMG submental'
# You can take more if you modify the preparation script and rerun it.
# To get a list all the files:
import os
import glob
fnames = glob.glob(os.path.join(datad, "*npz.gz"))
print(fnames[:10]) # print the first 10

['/Users/constouille/Documents/GitHub/adl24-sleep-stage/data/5-cassette/SC4381F0.npz.gz', '/Users/constouille/Documents/GitHub/adl24-sleep-stage/data/5-cassette/SC4532E0.npz.gz', '/Users/constouille/Documents/GitHub/adl24-sleep-stage/data/5-cassette/SC4561F0.npz.gz', '/Users/constouille/Documents/GitHub/adl24-sleep-stage/data/5-cassette/SC4201E0.npz.gz', '/Users/constouille/Documents/GitHub/adl24-sleep-stage/data/5-cassette/SC4722E0.npz.gz', '/Users/constouille/Documents/GitHub/adl24-sleep-stage/data/5-cassette/SC4011E0.npz.gz', '/Users/constouille/Documents/GitHub/adl24-sleep-stage/data/5-cassette/SC4082E0.npz.gz', '/Users/constouille/Documents/GitHub/adl24-sleep-stage/data/5-cassette/SC4491G0.npz.gz', '/Users/constouille/Documents/GitHub/adl24-sleep-stage/data/5-cassette/SC4801G0.npz.gz', '/Users/constouille/Documents/GitHub/adl24-sleep-stage/data/5-cassette/SC4151E0.npz.gz']


In [39]:
devpart = 10
xtrain , xvalid = None , None
ytrain , yvalid = None , None
# If you take all the data you dhould end with
#
for fn in tqdm(fnames):
    fp = gz.open(fn,'rb')
    data = np.load(fp,allow_pickle=False) # for now, don't care about headers
    x = data['x'][:,:,:2] # Take only the EOG
    y = data['y'] # Take the labels
    idx = np.arange(x.shape[0])
    np.random.shuffle(idx)
    devlim = x.shape[0]//devpart
    devpart = 10
    idx = np.arange(x.shape[0])
    np.random.shuffle(idx)
    devlim = x.shape[0]//devpart
    if xtrain is None:
        xtrain = np.zeros((1,x.shape[1], 2))
        xvalid = np.zeros((1,x.shape[1], 2))
        ytrain , yvalid = np.zeros(1) , np.zeros(1)
    xvalid = np.concatenate((xvalid,x[idx[:devlim]]), axis=0)
    yvalid = np.concatenate((yvalid,y[idx[:devlim]]), axis=0)
    xtrain = np.concatenate((xtrain,x[idx[devlim:]]), axis=0)
    ytrain = np.concatenate((ytrain,y[idx[devlim:]]), axis=0)
    del x,y

  0%|          | 0/153 [00:00<?, ?it/s]

In [40]:
print(xtrain.shape, xvalid.shape)
print(ytrain.shape, yvalid.shape)


(175996, 600, 2) (19485, 600, 2)
(175996,) (19485,)


In [41]:
data['x'][:, :, :2].shape

(1132, 600, 2)

In [42]:
# clean the first dummy example
xtrain , xvalid = xtrain[1:] , xvalid[1:]
ytrain , yvalid = ytrain[1:] , yvalid[1:]
print(xtrain.shape, xvalid.shape)
print(ytrain.shape, yvalid.shape)

(175995, 600, 2) (19484, 600, 2)
(175995,) (19484,)


In [43]:
# In Torch version
xtrain, xvalid = th.FloatTensor(xtrain), th.FloatTensor(xvalid)
ytrain, yvalid = th.IntTensor(ytrain), th.IntTensor(yvalid)

In [46]:
outf="data/cassette-th-data-all.pck"
fp = open(outf,"wb")
pickle.dump((xtrain , xvalid , ytrain , yvalid), fp)

In [47]:
!ls -lh ./data/cassette-th-data-all.pck

-rw-r--r--  1 constouille  staff   896M Apr  3 08:43 ./data/cassette-th-data-all.pck
