# Prepare EEG data for training of machine-learning models
+ Import data.
+ Apply filters (bandpass).
+ Detect potential bad channels and replace them by interpolation.
+ Detect potential bad epochs and remove them.

## Import packages & links

In [1]:
# Import packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import mne
#%matplotlib inline

from mayavi import mlab

In [2]:
ROOT = "C:\\OneDrive - Netherlands eScience Center\\Project_ePodium\\"
PATH_CODE = ROOT + "EEG_dyslexia_prediction\\"
PATH_DATA = ROOT + "Data\\EEGdata_Karin_Jan2020\\"
PATH_OUTPUT = PATH_DATA + "processed_data\\"
PATH_METADATA = PATH_DATA + "metadata\\"

import sys
sys.path.insert(0, PATH_CODE)

In [40]:
filename_labels = PATH_METADATA + "Screening_children5a_summary_new.txt" 
metadata = pd.read_csv(filename_labels, sep='\t')
metadata.head()

Unnamed: 0,id_child,groupDDP,atRiskOrNotDDP,dyslexicAtMidGroup3DDP,assignment1,assignment2,assignment3,assignment4,childInfoPresent,relativeInfoPresent,mmr_2mth,mmr_5mth,mmr_11mth,mmr_17mth,mmr_23mth,mmr_29mth,mmr_35mth,mmr_41mth,mmr_47mth
0,1,4,unclear,1,notEnoughInfo,notEnoughInfo,notEnoughInfo,notEnoughInfo,1,1,1,0,1,1,1,1,1,1,1
1,2,missing,missing,missing,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,0,1,0,0,0,0,1,1,1,0,0
2,3,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,1,0,1,0,1,1,0
3,4,missing,missing,missing,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,1,1,0,1,1,0,1,0
4,5,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,1,1,0,1,1,1,0


In [64]:
metadata.shape

(336, 19)

## Search all *.cnt files and check for how many we have a label

In [11]:
PATH_DATA_01 = PATH_DATA + "17mnd mmn\\"

import fnmatch
import warnings
warnings.filterwarnings('ignore')

import helper_functions

dirs = os.listdir(PATH_DATA_01)
cnt_files = fnmatch.filter(dirs, "*.cnt")

In [21]:
found_ids = [x[:3] for x in cnt_files]
idx = np.where(np.array(found_ids) == '036')[0]
[cnt_files[x] for x in idx]

['036_17_mc_mmn25_wk.cnt', '036_17_mc_mmn36.cnt']

In [41]:
metadata[metadata['id_child'] == '036']['groupDDP'].values[0]

'3Ctrl'

In [46]:
labels = []

found_ids = [x[:3] for x in cnt_files]
for ID in list(set(found_ids)): 
    idx = np.where(np.array(found_ids) == ID)[0]
    filenames = [cnt_files[x] for x in idx]
    label = metadata[metadata['id_child'] == ID]['groupDDP'].values[0]
    label_risk = metadata[metadata['id_child'] == ID]['assignment4'].values[0]
    if label == '1FRdys':
        label = 1
    elif label == '2FRndys':
        label = 0
    elif label == '3Ctrl': #TODO: check if this is correct!
        label = 0
    labels.append([ID, label, label_risk, filenames])

In [55]:
labels[:10]

[['128', 1, 'notEnoughInfo', ['128_17_jr_mmn.cnt']],
 ['613',
  0,
  'notAtRisk_rest',
  ['613-176-17m-mc-mmn25.cnt', '613-176-17m-mc-mmn36.cnt']],
 ['408', 0, 'notAtRisk_rest', ['408_17_md_mmn.cnt']],
 ['163',
  1,
  'atRisk',
  ['163_17_jr_mmn25_slp.cnt',
   '163_17_jr_mmn36.cnt',
   '163_17_jr_mmn47_slp.cnt',
   '163_17_jr_mmn58_wk.cnt']],
 ['602', 0, 'notAtRisk_rest', ['602-115-17m-mc-mmn.cnt']],
 ['313',
  0,
  'notAtRisk_rest',
  ['313_17_mc_mmn25_wk.cnt', '313_17_mc_mmn36_wk.cnt']],
 ['451',
  'missing',
  'notAtRisk_rest',
  ['451_17_md_mmn25_wk.cnt', '451_17_md_mmn36_wk.cnt']],
 ['017',
  0,
  'notAtRisk_highestScores',
  ['017_17_jc_mmn25_wk.cnt', '017_17_jc_mmn36.cnt']],
 ['338',
  0,
  'notAtRisk_rest',
  ['338_17_jc_mmn25_wk.cnt', '338_17_jc_mmn36_wk.cnt']],
 ['314',
  0,
  'notAtRisk_highestScores',
  ['314_17_mc_mmn25_wk.cnt', '314_17_mc_mmn36_wk.cnt']]]

In [66]:
len(labels), len(list(set(found_ids)))

(247, 247)

### Count number (and type) of labels found:

In [53]:
labels_known = 0
labels_unknown = 0
labels_type = []

for x in labels:
    if x[1] == 1: #'dyslexic'
        labels_known += 1
        labels_type.append(1)
    elif x[1] == 0: #'non-dyslexic'
        labels_known += 1
        labels_type.append(0)
    else: # missing or unclear
        labels_unknown += 1  
        labels_type.append('missing')
        
print("Data with proper labels:", labels_known, "||| Data without proper label:", labels_unknown)     

Data with proper labels: 209 ||| Data without proper label: 38


In [54]:
print("Data for 'dyslexic':", labels_type.count(1))
print("Data for 'non-dyslexic':", labels_type.count(0))

Data for 'dyslexic': 49
Data for 'non-dyslexic': 160


In [58]:
# Check types of risk group labels found
labels_risktype = [x[2] for x in labels]
list(set(labels_risktype))

['atRisk',
 'notEnoughInfo',
 'notAtRisk_rest',
 'notAtRisk_highestScores',
 'missing']

In [82]:
metadata['atRiskOrNotDDP'][:10]

0      unclear
1      missing
2    notAtRisk
3      missing
4    notAtRisk
5    notAtRisk
6    notAtRisk
7    notAtRisk
8    notAtRisk
9      unclear
Name: atRiskOrNotDDP, dtype: object

In [99]:
group_notrisk = np.array(1*((metadata['atRiskOrNotDDP'] == 'notAtRisk')
                   | (metadata['assignment4'].isin(['notAtRisk_rest', 'notAtRisk_highestScores']))))

group_risk = np.array(1*((metadata['atRiskOrNotDDP'] == 'atRisk')
                   | (metadata['assignment4'] == 'at risk')))

In [100]:
np.sum(group_risk) + np.sum(group_notrisk)

307

In [111]:
label_risk = group_notrisk + 2*group_risk 
label_risk[label_risk == 3] = 2
label_risk = label_risk -1

In [112]:
label_risk

array([-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0,
        0,  0, -1, -1,  1, -1,  1,  0, -1,  0,  1,  1,  1, -1, -1,  0,  1,
        1,  0,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,
        1,  1,  0,  1,  1,  1,  1,  1,  0, -1,  1,  1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0, -1,
        0,  0,  0,  0,  0,  0, -1, -1,  1,  1, -1,  0, -1,  1,  1,  1,  1,
        1,  1,  0,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  0,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  0, -1, -1,  1,  1,  1,  0,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [113]:
group_notdys = np.array(1*(metadata['groupDDP'].isin(['1FRdys', '3Ctrl'])))

group_dys = np.array(1*(metadata['groupDDP'] == '2FRndys'))

In [102]:
np.sum(group_notdys) + np.sum(group_dys)

252

In [116]:
label_dys = group_notdys + 2*group_dys 
label_dys[label_dys == 3] = 2
label_dys = label_dys -1

In [117]:
label_dys

array([-1, -1,  0, -1,  0,  0,  0,  0,  0, -1, -1, -1, -1,  0,  0,  0,  0,
        0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0, -1, -1,  0,  0,  0,
        0,  0, -1, -1,  1, -1,  1, -1, -1, -1,  1,  1,  0, -1, -1, -1,  0,
        0, -1,  1,  1,  0,  0,  1,  1, -1,  0,  0,  1,  1, -1, -1,  1, -1,
        0,  1, -1,  1,  1,  0,  1,  0, -1, -1,  0,  1,  1,  0,  1, -1,  1,
        1,  1,  0,  1,  0,  1,  0,  1, -1,  1,  0,  1, -1,  0,  1,  0,  0,
        1,  0, -1,  0,  0, -1,  0,  1,  1,  1,  1,  1,  1,  0,  0, -1, -1,
        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,
        0,  0,  0, -1,  0,  0,  0, -1, -1, -1,  0,  0,  0,  0,  0,  0, -1,
       -1,  0,  0,  0,  0,  0, -1, -1,  1,  0, -1,  0, -1,  0,  0,  0,  0,
        0,  0, -1,  1,  1,  1,  1, -1, -1, -1,  1,  0,  1, -1,  1,  0,  0,
        1,  1,  0,  1, -1,  1,  0, -1, -1, -1,  1,  0,  1,  0,  1,  0,  1,
        1,  1,  1,  0,  0,  1,  0,  1,  1,  1,  0,  1, -1,  1,  1,  1,  0,
        1,  1,  0,  1,  1

In [131]:
labels_final = pd.DataFrame(data=metadata['id_child'].values, columns=['id_child'])
labels_final['label_dys'] = label_dys
labels_final['label_risk'] = label_risk
labels_final.head()

Unnamed: 0,id_child,label_dys,label_risk
0,1,-1,-1
1,2,-1,0
2,3,0,0
3,4,-1,0
4,5,0,0


In [59]:
print("Data for 'at risk':", labels_risktype.count('atRisk'))
print("Data for 'notAtRisk_rest':", labels_risktype.count('notAtRisk_rest'))
print("Data for 'notAtRisk_highestScores':", labels_risktype.count('notAtRisk_highestScores'))

Data for 'at risk': 91
Data for 'notAtRisk_rest': 56
Data for 'notAtRisk_highestScores': 49


In [70]:
labels_risktype = [x[2] for x in labels if x[1] in [1,0]]
print("Data for 'at risk':", labels_risktype.count('atRisk'))
print("Data for 'notAtRisk_rest':", labels_risktype.count('notAtRisk_rest'))
print("Data for 'notAtRisk_highestScores':", labels_risktype.count('notAtRisk_highestScores'))

Data for 'at risk': 79
Data for 'notAtRisk_rest': 43
Data for 'notAtRisk_highestScores': 45


In [71]:
metadata.loc[(metadata['groupDDP'].isin(['1FRdys', '2FRndys', '3Ctrl']) 
              & metadata['assignment4'].isin(['at risk', 'notAtRisk_rest', 'notAtRisk_highestScores', ]))]

Unnamed: 0,id_child,groupDDP,atRiskOrNotDDP,dyslexicAtMidGroup3DDP,assignment1,assignment2,assignment3,assignment4,childInfoPresent,relativeInfoPresent,mmr_2mth,mmr_5mth,mmr_11mth,mmr_17mth,mmr_23mth,mmr_29mth,mmr_35mth,mmr_41mth,mmr_47mth
2,003,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,1,0,1,0,1,1,0
4,005,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,1,1,0,1,1,1,0
5,006,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,0,1,1,1,0,1,0
8,009,3Ctrl,notAtRisk,1,notEnoughInfo,notEnoughInfo,notEnoughInfo,notAtRisk_rest,1,1,1,0,1,1,1,1,1,1,0
14,015,3Ctrl,notAtRisk,0,notEnoughInfo,notEnoughInfo,notEnoughInfo,notEnoughInfo,1,1,1,0,1,1,1,1,1,1,0
15,016,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,1,1,0,0,0,0,0,0
16,017,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,1,1,1,0,0,1,0
17,018,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,1,1,1,1,1,1,0
19,021,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,1,1,1,1,1,1,0
21,023,3Ctrl,notAtRisk,0,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,notAtRisk_highestScores,1,1,1,0,1,0,1,1,1,1,0


In [6]:
files_present = [x[:-4] for x in cnt_files]
files_labels_known = list(metadata["file"])

In [66]:
labels_known = 0
labels_unknown = 0
labels_type = []
for file in files_present:
    if file in files_labels_known:
        labels_known += 1
        labels_type.append(int(metadata[metadata["file"].str.match(file)]['group']))
    else:
        labels_unknown += 1  
        labels_type.append(0)
print("Files with proper labels:", labels_known, "||| Files without proper label:", labels_unknown)

Files with proper labels: 57 ||| Files without proper label: 135


In [72]:
labels_type.count(1), labels_type.count(2)

(24, 33)

## Custom cnt-file import function:

In [132]:
def read_cnt_file(file,
                  label_group,
                  event_idx = [3, 13, 66],
                  channel_set = "30",
                  tmin = -0.2,
                  tmax = 0.8,
                  lpass = 0.5, 
                  hpass = 40, 
                  threshold = 5, 
                  max_bad_fraction = 0.2):
    """ Function to read cnt file. Run bandpass filter. 
    Then detect and correct/remove bad channels and bad epochs.
    Store resulting epochs as arrays.
    
    Args:
    --------
    file: str
        Name of file to import.
    label_group: int
        Unique ID of specific group (must be >0).
    channel_set: str
        Select among pre-defined channel sets. Here: "30" or "62"
    """
    
    if channel_set == "30":
        channel_set = ['O2', 'O1', 'OZ', 'PZ', 'P4', 'CP4', 'P8', 'C4', 'TP8', 'T8', 'P7', 
                       'P3', 'CP3', 'CPZ', 'CZ', 'FC4', 'FT8', 'TP7', 'C3', 'FCZ', 'FZ', 
                       'F4', 'F8', 'T7', 'FT7', 'FC3', 'F3', 'FP2', 'F7', 'FP1']
    elif channel_set == "62":
        channel_set = ['O2', 'O1', 'OZ', 'PZ', 'P4', 'CP4', 'P8', 'C4', 'TP8', 'T8', 'P7', 
                       'P3', 'CP3', 'CPZ', 'CZ', 'FC4', 'FT8', 'TP7', 'C3', 'FCZ', 'FZ', 
                       'F4', 'F8', 'T7', 'FT7', 'FC3', 'F3', 'FP2', 'F7', 'FP1', 'AFZ', 'PO3', 
                       'P1', 'POZ', 'P2', 'PO4', 'CP2', 'P6', 'M1', 'CP6', 'C6', 'PO8', 'PO7', 
                       'P5', 'CP5', 'CP1', 'C1', 'C2', 'FC2', 'FC6', 'C5', 'FC1', 'F2', 'F6', 
                       'FC5', 'F1', 'AF4', 'AF8', 'F5', 'AF7', 'AF3', 'FPZ']
    else:
        print("Predefined channel set given by 'channel_set' not known...")
        
    
    # Initialize array
    signal_collection = np.zeros((0,len(channel_set),501))
    label_collection = np.zeros((0))
    
    # Import file 
    data_raw = mne.io.read_raw_cnt(file, montage=None, eog='auto', preload=True)
    
    # Band-pass filter (between 0.5 and 40 Hz. was 0.5 to 30Hz in Stober 2016)
    data_raw.filter(0.5, 40, fir_design='firwin')

    events = mne.find_events(data_raw, shortest_event=0, stim_channel='STI 014', verbose=False)
    
    # Set baseline:
    baseline = (None, 0)  # means from the first instant to t = 0

    # Select channels to exclude (if any)
    channels_exclude = [x for x in data_raw.ch_names if x not in channel_set]
    channels_exclude = [x for x in channels_exclude if x not in ['HEOG', 'VEOG', 'STI 014']]
    
    for event_id in event_idx:
    
        # Pick EEG channels 
        picks = mne.pick_types(data_raw.info, meg=False, eeg=True, stim=False, eog=False,
                           #exclude=data_exclude)#'bads'])
                               include=channel_set, exclude=channels_exclude)#'bads'])

        epochs = mne.Epochs(data_raw, events, event_id, tmin, tmax, proj=True, picks=picks,
                        baseline=baseline, preload=True, verbose=False)

        # Detect potential bad channels and epochs
        bad_channels, bad_epochs = helper_functions.select_bad_epochs(epochs, 
                                                                      event_id, 
                                                                      threshold = threshold, 
                                                                      max_bad_fraction = max_bad_fraction)

        # Interpolate bad channels
        if len(bad_channels) > 0: 
            # Mark bad channels:
            data_raw.info['bads'] = bad_channels
            # Pick EEG channels:
            picks = mne.pick_types(data_raw.info, meg=False, eeg=True, stim=False, eog=False,
                               #exclude=data_exclude)#'bads'])
                               include=channel_set, exclude=channels_exclude)#'bads'])
            epochs = mne.Epochs(data_raw, events, event_id, tmin, tmax, proj=True, picks=picks,
                            baseline=baseline, preload=True, verbose=False)
            # Interpolate bad channels using functionality of 'mne'
            epochs.interpolate_bads()

        # Get signals as array and add to total collection
        signals_cleaned = epochs[str(event_id)].drop(bad_epochs).get_data()
        signal_collection = np.concatenate((signal_collection, signals_cleaned), axis=0)
        #label_collection = np.concatenate((label_collection, event_id*label_group*np.ones((signals_cleaned.shape[0]))), axis=0)
        label_collection = [str(event_id) + label_group] * signals_cleaned.shape[0]

    return signal_collection, label_collection#.astype(int)

In [73]:
# channel names for 30 EEG channel case: 
print(epochs.ch_names)

NameError: name 'epochs' is not defined

In [126]:
# channel names for 62 EEG channel case: 
print(epochs.ch_names)

['O2', 'O1', 'OZ', 'PZ', 'P4', 'CP4', 'P8', 'C4', 'TP8', 'T8', 'P7', 'P3', 'CP3', 'CPZ', 'CZ', 'FC4', 'FT8', 'TP7', 'C3', 'FCZ', 'FZ', 'F4', 'F8', 'T7', 'FT7', 'FC3', 'F3', 'FP2', 'F7', 'FP1', 'AFZ', 'PO3', 'P1', 'POZ', 'P2', 'PO4', 'CP2', 'P6', 'M1', 'CP6', 'C6', 'PO8', 'PO7', 'P5', 'CP5', 'CP1', 'C1', 'C2', 'FC2', 'FC6', 'C5', 'FC1', 'F2', 'F6', 'FC5', 'F1', 'AF4', 'AF8', 'F5', 'AF7', 'AF3', 'FPZ']


## Check how many EEG channels the cnt-files feature... 

In [95]:
format_collection = []
for i, filename in enumerate(cnt_files):
    # Import file 
    file = PATH_DATA + filename
    data_raw = mne.io.read_raw_cnt(file, montage=None, eog='auto', preload=True)
    format_collection.append((i, len(data_raw.ch_names)))
    print(i, len(data_raw.ch_names))

Reading 0 ... 370279  =      0.000 ...   740.558 secs...
0 65
Reading 0 ... 373379  =      0.000 ...   746.758 secs...
1 65
Reading 0 ... 743799  =      0.000 ...  1487.598 secs...
2 65
Reading 0 ... 778159  =      0.000 ...  1556.318 secs...
3 65
Reading 0 ... 1497319  =      0.000 ...  2994.638 secs...
4 65
Reading 0 ... 372959  =      0.000 ...   745.918 secs...
5 65
Reading 0 ... 758319  =      0.000 ...  1516.638 secs...
6 65
Reading 0 ... 751639  =      0.000 ...  1503.278 secs...
7 65
Reading 0 ... 458319  =      0.000 ...   916.638 secs...
8 65
Reading 0 ... 376679  =      0.000 ...   753.358 secs...
9 65
Reading 0 ... 368719  =      0.000 ...   737.438 secs...
10 65
Reading 0 ... 774119  =      0.000 ...  1548.238 secs...
11 65
Reading 0 ... 373459  =      0.000 ...   746.918 secs...
12 33
Reading 0 ... 758559  =      0.000 ...  1517.118 secs...
13 65
Reading 0 ... 743039  =      0.000 ...  1486.078 secs...
14 65
Reading 0 ... 743199  =      0.000 ...  1486.398 secs...
15 65
R

129 65
Reading 0 ... 372199  =      0.000 ...   744.398 secs...
130 65
Reading 0 ... 744299  =      0.000 ...  1488.598 secs...
131 65
Reading 0 ... 742379  =      0.000 ...  1484.758 secs...
132 65
Reading 0 ... 740279  =      0.000 ...  1480.558 secs...
133 65
Reading 0 ... 153799  =      0.000 ...   307.598 secs...
134 65
Reading 0 ... 736119  =      0.000 ...  1472.238 secs...
135 65
Reading 0 ... 760639  =      0.000 ...  1521.278 secs...
136 65
Reading 0 ... 739679  =      0.000 ...  1479.358 secs...
137 65
Reading 0 ... 586559  =      0.000 ...  1173.118 secs...
138 65
Reading 0 ... 741919  =      0.000 ...  1483.838 secs...
139 65
Reading 0 ... 997039  =      0.000 ...  1994.078 secs...
140 33
Reading 0 ... 997039  =      0.000 ...  1994.078 secs...
141 33
Reading 0 ... 370459  =      0.000 ...   740.918 secs...
142 33
Reading 0 ... 741979  =      0.000 ...  1483.958 secs...
143 33
Reading 0 ... 381219  =      0.000 ...   762.438 secs...
144 33
Reading 0 ... 378859  =      0.00

In [111]:
a,b = zip(*format_collection)
len(np.where((np.array(b) == 65))[0]), len(np.where((np.array(b) == 33))[0]), len(a)

(128, 64, 192)

In [40]:
# number of cnt files which have both 65 channels (62 EEG) AND a label.
len(np.where((np.array(b) == 65) & (np.array(labels_type) == 1))[0])

NameError: name 'b' is not defined

So far we 'only' have about 60 cnt-files of which we have a label ("risk group" vs "no risc group").
And only 42 of them feature 62 EEG channels. I hence switched to 30 EEG channels and picked the ones that are present in all patient datasets.

# Workflow data processing
1. Load cnt files.
2. Select same number of channels (here: 30 same channels which exist for both 30 and 62 channel data)
3. Preprocess raw data (bandpass + detect outliers and 'bad' epochs).
4. Store epoch data and event type as array

## LABELS:
+ After Karin's search we have proper labels for much more files!  


In [134]:
PATH_CNTS = PATH_DATA + "17mnd mmn\\"

# Initialize array
signal_collection = np.zeros((0,30,501)) #62
label_collection = np.zeros((0))
metadata_collection = []

for i, filename in enumerate(cnt_files[:5]):
    
    # First check if we have proper label for that file
    # -----------------------------------------------------------
    
    ID = filename[:3]
    label = labels_final[labels_final['id_child'] == ID]['label_dys'].values[0]
    label_risk = labels_final[labels_final['id_child'] == ID]['label_risk'].values[0]
    #label = metadata[metadata['id_child'] == ID]['groupDDP'].values[0]
    #label_risk = metadata[metadata['id_child'] == ID]['assignment4'].values[0]
    
    if (label < 0) or (label_risk < 0):
        print("No proper label found for file: ", filename)
    else:
        #label_group = int(metadata[metadata["file"].str.match(filename[:-4])]['group'])
        label_group = 'dys' + str(label) + '_risk' + str(label_risk)
        
        print(40*"=")
        print("Importing file: ",filename)
        print("Data belongs into group: ", label_group)

        # Import data and events
        file = PATH_CNTS + filename

        signal_collect, label_collect = read_cnt_file(file, 
                                                      label_group,
                                                      event_idx = [3, 13, 66],
                                                      channel_set = "30",
                                                      tmin = -0.2,
                                                      tmax = 0.8,
                                                      lpass = 0.5, 
                                                      hpass = 40, 
                                                      threshold = 5, 
                                                      max_bad_fraction = 0.2)

        # Get signals as array and add to total collection
        signal_collection = np.concatenate((signal_collection, signal_collect), axis=0)
        label_collection = np.concatenate((label_collection, label_collect), axis=0)
        metadata_collection.append((i, filename, signal_collection.shape[0]))


No proper label found for file:  001_17_jc_mmn36_slp_mmn25_slp_mmn47_slp_mmn58_slp.cnt
Importing file:  005_17_jc_mmn36_slp_mmn25_slp_mmn47_mixed.cnt
Data belongs into group:  dys0_risk0
Reading 0 ... 1120199  =      0.000 ...  2240.398 secs...
Setting up band-pass filter from 0.5 - 40 Hz
l_trans_bandwidth chosen to be 0.5 Hz
h_trans_bandwidth chosen to be 10.0 Hz
Filter length of 3301 samples (6.602 sec) selected
No outliers found with given threshold.
No outliers found with given threshold.
No outliers found with given threshold.
Importing file:  006_17_mc_mmn36_slp.cnt
Data belongs into group:  dys0_risk0
Reading 0 ... 1481999  =      0.000 ...  2963.998 secs...
Setting up band-pass filter from 0.5 - 40 Hz
l_trans_bandwidth chosen to be 0.5 Hz
h_trans_bandwidth chosen to be 10.0 Hz
Filter length of 3301 samples (6.602 sec) selected
Found 6 bad epochs in a total of 4  channels.
Marked 6 bad epochs in a total of 400  epochs.
Found 1 bad epochs in a total of 1  channels.
Marked 1 bad e

ValueError: not enough values to unpack (expected 2, got 0)

In [53]:
signal_collection.shape, label_collection.shape

((39083, 30, 501), (39083,))

In [74]:
metadata_collection[:10]

[(1, '034_17_mc_mmn36_wk.cnt', 443),
 (2, '036_17_mc_mmn36_wk.cnt', 1384),
 (18, '175_17_jd_mmn_wk.cnt', 1858),
 (26, '305_17_jc_mmn36_wk.cnt', 2349),
 (27, '306_17_mc_mmn36_wk.cnt', 2813),
 (28, '307_17_jc_mmn36_wakker.cnt', 3447),
 (29, '308_17_jc_mmn36_wk.cnt', 3938),
 (30, '309_17_jc_mmn.cnt', 4425),
 (33, '314_17_mc_mmn36_wk.cnt', 4917),
 (38, '337_17_jc_mmn36_wk.cnt', 5389)]

We hence get a dataset of 39083 datapoints with known label.  
Each datapoint consits of a 1-second EEG signal of 30 channels with a 500Hz sampling rate. Thus arrays with a size of 30 x 501. 

## Labels 
Here we have 6 labels. 
1. Group 1, stimuli 3 --> "3"
2. Group 1, stimuli 13 --> "13"
3. Group 1, stimuli 66 --> "66"
4. Group 2, stimuli 3 --> "6"
5. Group 2, stimuli 13 --> "26"
6. Group 2, stimuli 66 --> "132"

In [41]:
label_collection[1500:2000].astype(int)

array([  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
         6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   

# Save entire processed dataset:

In [75]:
filename = PATH_OUTPUT + "EEG_data_30channels_1s_corrected.npy"
np.save(filename, signal_collection)

filename = PATH_OUTPUT + "EEG_data_30channels_1s_corrected_labels.npy"
np.save(filename, label_collection)

import csv
filename = PATH_OUTPUT + "EEG_data_30channels_1s_corrected_metadata.csv"

with open(filename, 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(metadata_collection)
csvFile.close()