In [1]:
import os
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from mne.io import read_raw_edf
import yasa
from preprocessing import crop_hypno, extract_features

  return warn(


In [2]:
# Input locations of the SHHS2 recordings and their Profusion annotations
# (these could be moved to a configuration file).
eeg_dir = "data/edfs/shhs2/"
hypno_dir = "data/annotations-events-profusion/shhs2/"

# Destination folder for the extracted feature tables.
out_dir = "./output/features/"

# Directory that contains the current working directory.
parent_dir = os.path.split(os.getcwd())[0]

In [4]:
# Read the subject split table, keep only the rows flagged as training,
# and index the result by subject identifier.
df_subj = (
    pd.read_csv("./output/data/shhs_split.csv")
    .query("set == 'training'")
    .set_index("subj")
)

In [5]:
# Report how many subjects are left after the filter, then preview ten rows.
n_subjects = len(df_subj)
print(n_subjects, 'subjects remaining')
df_subj.head(10)

600 subjects remaining


Unnamed: 0_level_0,dataset,age,male,bmi,ahi,ethnicity,set,hypertension
subj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
200083,SHHS,54.0,1,,,caucasian,training,0.0
200086,SHHS,68.0,1,23.225432,22.258065,caucasian,training,0.0
200091,SHHS,77.0,0,22.89282,7.407407,caucasian,training,1.0
200093,SHHS,47.0,0,21.664973,14.415584,caucasian,training,1.0
200100,SHHS,72.0,0,,,caucasian,training,0.0
200108,SHHS,60.0,0,27.029866,2.4,caucasian,training,0.0
200111,SHHS,65.0,0,22.74204,13.16129,caucasian,training,0.0
200116,SHHS,83.0,0,24.594907,42.075949,other,training,1.0
200121,SHHS,51.0,0,31.023997,8.197674,caucasian,training,0.0
200123,SHHS,49.0,0,23.711845,1.706783,caucasian,training,0.0


In [None]:
# Build the per-epoch feature table, one subject at a time. Subjects that
# cannot be processed are skipped with a warning naming the subject/file, so
# every skip is visible instead of being silently swallowed.
include = ['EEG', 'EOG(L)', 'EMG']  # channels every recording must provide
sf = 100  # target sampling frequency (Hz) passed to raw.resample
epoch_sec = 30  # epoch length (s) used to compare hypnogram and data sizes
stage_labels = {0: 'W', 1: 'N1', 2: 'N2', 3: 'N3', 4: 'R'}

# One feature table per successfully processed subject; concatenated into
# `df` after the loop (a separate name avoids reusing `df` for a list).
features_per_subject = []

# Iterate over the training subjects. The previous hard-coded list of three
# string IDs was a debugging leftover (the intended iterable was kept in a
# trailing comment).
for sub in tqdm(df_subj.index):
    eeg_file = os.path.join(eeg_dir, 'shhs2-' + str(sub) + '.edf')
    hypno_file = os.path.join(hypno_dir, 'shhs2-' + str(sub) + '-profusion.xml')

    # Both the recording and its annotation file must exist
    if not os.path.isfile(eeg_file):
        warnings.warn("File not found %s" % eeg_file)
        continue
    if not os.path.isfile(hypno_file):
        warnings.warn("File not found %s" % hypno_file)
        continue

    # LOAD EEG DATA
    try:
        raw = read_raw_edf(eeg_file, preload=False, verbose=0)
        # Skip subjects for which a required channel was not found
        missing = [ch for ch in include if ch not in raw.ch_names]
        if missing:
            warnings.warn(
                "Skipping %s: missing channel(s) %s" % (sub, missing))
            continue
        # Keep only the required channels, then read the signals into memory
        raw.drop_channels([ch for ch in raw.ch_names if ch not in include])
        raw.load_data()
    except Exception as exc:
        # Catch Exception (not a bare except) so KeyboardInterrupt still
        # stops the loop, and report why the subject was dropped.
        warnings.warn(
            "Skipping %s: could not load %s (%r)" % (sub, eeg_file, exc))
        continue

    # Resample to the target sampling frequency (no filtering is applied here)
    raw.resample(sf, npad="auto")

    # LOAD HYPNOGRAM
    hypno, sf_hyp = yasa.load_profusion_hypno(hypno_file)
    # (Optional) We keep up to 15 minutes before / after sleep
    # hypno, tmin, tmax = crop_hypno(hypno)
    # raw.crop(tmin, tmax)
    # Check that hypno and data have the same number of epochs
    n_epochs = hypno.shape[0]
    n_epochs_data = raw.n_times // (sf * epoch_sec)
    if n_epochs != n_epochs_data:
        warnings.warn(
            "Skipping %s: hypno (%d epochs) and data (%d epochs) size do "
            "not match." % (sub, n_epochs, n_epochs_data))
        continue

    # Convert hypnogram to str (values absent from the mapping are kept as-is)
    df_hypno = pd.Series(hypno).replace(stage_labels)

    # INCLUSION CRITERIA (DISABLED)
    # Hypnogram must include all stages
    # if np.unique(hypno).tolist() != [0, 1, 2, 3, 4]:
    #     print("- Not all stages are present.")
    #     continue
    # If the duration is not between 4 to 12 hours, skip subject
    # if not(4 < n_epochs / 120 < 12):
    #     print("- Recording too short/long.")
    #     continue

    # EXTRACT FEATURES
    features = extract_features(df_subj, sub, raw, include)
    # Add hypnogram
    features['stage'] = df_hypno.to_numpy()
    features_per_subject.append(features)

# Fail with an explicit message when every subject was skipped, instead of
# the generic error pd.concat raises on an empty list.
if not features_per_subject:
    raise ValueError(
        "No subject could be processed; check the input paths and the "
        "warnings emitted above.")

df = pd.concat(features_per_subject)

In [None]:
# Store the sleep-stage labels using the pandas categorical dtype.
stage_as_category = df['stage'].astype('category')
df['stage'] = stage_as_category

In [None]:
# Export the feature table to parquet. The output directory is created first
# so that a missing folder does not abort the save after the (long) feature
# extraction has already completed.
os.makedirs(out_dir, exist_ok=True)
df.to_parquet(os.path.join(out_dir, "features_nsrr_shhs2.parquet"))

## Stats

In [None]:
# Fraction of epochs assigned to each sleep stage, most frequent first.
stage_share = df['stage'].value_counts(sort=True, normalize=True)
stage_share

In [None]:
# Median of the `eeg_iqr` feature within each sleep stage.
# `observed` is passed explicitly: `stage` is cast to a categorical in an
# earlier cell, and recent pandas versions emit a FutureWarning when grouping
# on a categorical column without it (the default is changing). Only the
# stages actually present in the data are reported.
df.groupby('stage', observed=True)['eeg_iqr'].median()

In [None]:
# Display the concatenated feature table.
df