# Preprocess data

This notebook will create 5 preprocessed datasets (with only handcrafted features):

1. ecg+gsr where ecg does not contain nans and it is not all 0
2. ecg+gsr where gsr is not all 0
3. ecg+gsr where ecg does not contain nans and it is not all 0, and gsr is not all 0
4. ecg where ecg does not contain nans and it is not all 0
5. gsr where gsr is not all 0


# Load modules

In [1]:
import numpy as np
import pandas as pd

import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib notebook

# Load original data

In [2]:
data = np.load(os.path.join('data','original', 'dataset_smile_challenge.npy'), allow_pickle = True).item()

#training dataset 
train = data['train']
deep_features_train = train['deep_features'] # for deep features {'ECG_features_C', 'ECG_features_T'}.
handcrafted_features_train = train['hand_crafted_features'] # for hand-crafted features {'ECG_features', 'GSR_features'}.

#test dataset
test = data['test']
deep_features_test = test['deep_features'] # for deep features {'ECG_features_C', 'ECG_features_T'}.
handcrafted_features_test = test['hand_crafted_features'] # for hand-crafted features {'ECG_features', 'GSR_features'}.

# extracting labels and converting labels >= 1 to just 1.
y_train_orig = train['labels']
th = 1
y_train_orig[y_train_orig<th] = 0
y_train_orig[y_train_orig>0] = 1

# Get original shapes

In [3]:
#Evaluate shapes
print('Shape of deep_features[ECG_features_C]: ' + str(deep_features_train['ECG_features_C'].shape))
print('Shape of deep_features[ECG_features_T]: ' + str(deep_features_train['ECG_features_T'].shape))

print('Shape of handcrafted_features[ECG_features]: ' + str(handcrafted_features_train['ECG_features'].shape))
print('Shape of handcrafted_features[GSR_features]: ' + str(handcrafted_features_train['GSR_features'].shape))

Shape of deep_features[ECG_features_C]: (2070, 60, 256)
Shape of deep_features[ECG_features_T]: (2070, 60, 64)
Shape of handcrafted_features[ECG_features]: (2070, 60, 8)
Shape of handcrafted_features[GSR_features]: (2070, 60, 12)


# Dataset 1 creation

# 1a: ECG, find rows with nans

In [4]:
#Idxs of nans
idx_not_train = np.unique(np.argwhere(np.isnan(handcrafted_features_train['ECG_features']))[:,0]).tolist()

#Number of rows with nans
print('Number of ECG rows with nans, TRAIN: ' + str(len(idx_not_train)) + ' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')

#Idxs of nans
idx_not_test = np.unique(np.argwhere(np.isnan(handcrafted_features_test['ECG_features']))[:,0]).tolist()

#Number of rows with nans
print('Number of ECG rows with nans, TEST: ' + str(len(idx_not_test)) + ' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')


Number of ECG rows with nans, TRAIN: 29 out of 2070(1.4009661835748792%)
Number of ECG rows with nans, TEST: 73 out of 986(7.403651115618661%)


# 1b: ECG, find rows with all 0 

In [5]:
i = 0
for hft in handcrafted_features_train['ECG_features']:
    if np.nansum(hft) == 0:
        idx_not_train.append(i)
    i = i+1
idx_not_train = np.unique(idx_not_train).tolist()
print('Number of all-0 or nan ECG rows, TRAIN: ' + str(len(idx_not_train)) +' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')

i = 0
for hft in handcrafted_features_test['ECG_features']:
    if np.nansum(hft) == 0:
        idx_not_test.append(i)
    i = i+1
idx_not_test = np.unique(idx_not_test).tolist()
print('Number of all-0 or nan ECG rows, TEST: ' + str(len(idx_not_test)) +' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')



Number of all-0 or nan ECG rows, TRAIN: 256 out of 2070(12.367149758454106%)
Number of all-0 or nan ECG rows, TEST: 114 out of 986(11.561866125760648%)


# 1c: Drop and save

In [6]:
#Drop train data
x_train = dict()
x_train['ECG_features'] = np.delete(handcrafted_features_train['ECG_features'],idx_not_train,axis=0)
x_train['GSR_features'] = np.delete(handcrafted_features_train['GSR_features'],idx_not_train,axis=0)
y_train = np.delete(y_train_orig,idx_not_train,axis=0)
idx_train = [c for c in range(handcrafted_features_train['GSR_features'].shape[0]) if c not in idx_not_train]

#Drop test data
x_test = dict()
x_test['ECG_features'] = np.delete(handcrafted_features_test['ECG_features'],idx_not_test,axis=0)
x_test['GSR_features'] = np.delete(handcrafted_features_test['GSR_features'],idx_not_test,axis=0)
idx_test = [c for c in range(handcrafted_features_test['GSR_features'].shape[0]) if c not in idx_not_test]

#Print stats
print('Final number of rows, TRAIN: ' + str(len(idx_train)) +' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + ' (' + str(100*len(idx_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')
print('Final number of rows, TEST: ' + str(len(idx_test)) +' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + ' (' + str(100*len(idx_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')

print('Prevalence of CLASS = 0, TRAIN: ' + str(100*np.sum(y_train)/y_train.shape[0]) + '%')
print('Prevalence of CLASS = 1, TRAIN: ' + str(100*(y_train.shape[0]-np.sum(y_train))/y_train.shape[0]) + '%')

#Save
np.savez(os.path.join('data', 'preprocessed', 'preprocessed_dataset_1'), 
         x_train =x_train,
         y_train = y_train,
         idx_train = idx_train,
         x_test = x_test,
         idx_test = idx_test)

print('Saved dataset 1')


Final number of rows, TRAIN: 1814 out of 2070 (87.6328502415459%)
Final number of rows, TEST: 872 out of 986 (88.43813387423936%)
Prevalence of CLASS = 0, TRAIN: 54.07938257993385%
Prevalence of CLASS = 1, TRAIN: 45.92061742006615%
Saved dataset 1


# Dataset 2 creation

# 2a: GSR, find rows with all 0 

In [7]:
idx_not_train = []
i = 0
for hft in handcrafted_features_train['GSR_features']:
    if np.nansum(hft) == 0:
        idx_not_train.append(i)
    i = i+1
idx_not_train = np.unique(idx_not_train).tolist()
print('Number of all-0 GSR rows, TRAIN: ' + str(len(idx_not_train)) +' out of ' + str(handcrafted_features_train['GSR_features'].shape[0]) + '(' + str(100*len(idx_not_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')

idx_not_test = []
i = 0
for hft in handcrafted_features_test['GSR_features']:
    if np.nansum(hft) == 0:
        idx_not_test.append(i)
    i = i+1
idx_not_test = np.unique(idx_not_test).tolist()
print('Number of all-0 GSR rows, TEST: ' + str(len(idx_not_test)) +' out of ' + str(handcrafted_features_test['GSR_features'].shape[0]) + '(' + str(100*len(idx_not_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')



Number of all-0 GSR rows, TRAIN: 253 out of 2070(12.222222222222221%)
Number of all-0 GSR rows, TEST: 64 out of 986(6.490872210953347%)


# 2b: Drop and save

In [9]:
#Drop train data
x_train = dict()
x_train['ECG_features'] = np.delete(handcrafted_features_train['ECG_features'],idx_not_train,axis=0)
x_train['GSR_features'] = np.delete(handcrafted_features_train['GSR_features'],idx_not_train,axis=0)
y_train = np.delete(y_train_orig,idx_not_train,axis=0)
idx_train = [c for c in range(handcrafted_features_train['GSR_features'].shape[0]) if c not in idx_not_train]

#Drop test data
x_test = dict()
x_test['ECG_features'] = np.delete(handcrafted_features_test['ECG_features'],idx_not_test,axis=0)
x_test['GSR_features'] = np.delete(handcrafted_features_test['GSR_features'],idx_not_test,axis=0)
idx_test = [c for c in range(handcrafted_features_test['GSR_features'].shape[0]) if c not in idx_not_test]

#Print stats
print('Final number of rows, TRAIN: ' + str(len(idx_train)) +' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + ' (' + str(100*len(idx_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')
print('Final number of rows, TEST: ' + str(len(idx_test)) +' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + ' (' + str(100*len(idx_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')

print('Prevalence of CLASS = 0, TRAIN: ' + str(100*np.sum(y_train)/y_train.shape[0]) + '%')
print('Prevalence of CLASS = 1, TRAIN: ' + str(100*(y_train.shape[0]-np.sum(y_train))/y_train.shape[0]) + '%')

#Save
np.savez(os.path.join('data', 'preprocessed', 'preprocessed_dataset_2'), 
         x_train =x_train,
         y_train = y_train,
         idx_train = idx_train,
         x_test = x_test,
         idx_test = idx_test)

print('Saved dataset 2')

Final number of rows, TRAIN: 1817 out of 2070 (87.77777777777777%)
Final number of rows, TEST: 922 out of 986 (93.50912778904666%)
Prevalence of CLASS = 0, TRAIN: 54.870665932856355%
Prevalence of CLASS = 1, TRAIN: 45.129334067143645%
Saved dataset 2


# Dataset 3 creation

# 3a: ECG, find rows with nans

In [10]:
#Idxs of nans
idx_not_train = np.unique(np.argwhere(np.isnan(handcrafted_features_train['ECG_features']))[:,0]).tolist()

#Number of rows with nans
print('Number of ECG rows with nans, TRAIN: ' + str(len(idx_not_train)) + ' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')

#Idxs of nans
idx_not_test = np.unique(np.argwhere(np.isnan(handcrafted_features_test['ECG_features']))[:,0]).tolist()

#Number of rows with nans
print('Number of ECG rows with nans, TEST: ' + str(len(idx_not_test)) + ' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')


Number of ECG rows with nans, TRAIN: 29 out of 2070(1.4009661835748792%)
Number of ECG rows with nans, TEST: 73 out of 986(7.403651115618661%)


# 3b: ECG, find rows with all 0 

In [11]:
i = 0
for hft in handcrafted_features_train['ECG_features']:
    if np.nansum(hft) == 0:
        idx_not_train.append(i)
    i = i+1
idx_not_train = np.unique(idx_not_train).tolist()
print('Number of all-0 or nan ECG rows, TRAIN: ' + str(len(idx_not_train)) +' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')

i = 0
for hft in handcrafted_features_test['ECG_features']:
    if np.nansum(hft) == 0:
        idx_not_test.append(i)
    i = i+1
idx_not_test = np.unique(idx_not_test).tolist()
print('Number of all-0 or nan ECG rows, TEST: ' + str(len(idx_not_test)) +' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')


Number of all-0 or nan ECG rows, TRAIN: 256 out of 2070(12.367149758454106%)
Number of all-0 or nan ECG rows, TEST: 114 out of 986(11.561866125760648%)


# 3c: GSR, find rows with all 0 

In [12]:
i = 0
for hft in handcrafted_features_train['GSR_features']:
    if np.nansum(hft) == 0:
        idx_not_train.append(i)
    i = i+1
idx_not_train = np.unique(idx_not_train).tolist()
print('Number of all-0 or nan ECG or all-0 GSR rows, TRAIN: ' + str(len(idx_not_train)) +' out of ' + str(handcrafted_features_train['GSR_features'].shape[0]) + '(' + str(100*len(idx_not_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')

i = 0
for hft in handcrafted_features_test['GSR_features']:
    if np.nansum(hft) == 0:
        idx_not_test.append(i)
    i = i+1
idx_not_test = np.unique(idx_not_test).tolist()
print('Number of all-0 or nan ECG or all-0 GSR rows, TEST: ' + str(len(idx_not_test)) +' out of ' + str(handcrafted_features_test['GSR_features'].shape[0]) + '(' + str(100*len(idx_not_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')


Number of all-0 or nan ECG or all-0 GSR rows, TRAIN: 403 out of 2070(19.468599033816425%)
Number of all-0 or nan ECG or all-0 GSR rows, TEST: 173 out of 986(17.545638945233264%)


# 3d: Drop and save

In [13]:
#Drop train data
x_train = dict()
x_train['ECG_features'] = np.delete(handcrafted_features_train['ECG_features'],idx_not_train,axis=0)
x_train['GSR_features'] = np.delete(handcrafted_features_train['GSR_features'],idx_not_train,axis=0)
y_train = np.delete(y_train_orig,idx_not_train,axis=0)
idx_train = [c for c in range(handcrafted_features_train['GSR_features'].shape[0]) if c not in idx_not_train]

#Drop test data
x_test = dict()
x_test['ECG_features'] = np.delete(handcrafted_features_test['ECG_features'],idx_not_test,axis=0)
x_test['GSR_features'] = np.delete(handcrafted_features_test['GSR_features'],idx_not_test,axis=0)
idx_test = [c for c in range(handcrafted_features_test['GSR_features'].shape[0]) if c not in idx_not_test]

#Print stats
print('Final number of rows, TRAIN: ' + str(len(idx_train)) +' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + ' (' + str(100*len(idx_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')
print('Final number of rows, TEST: ' + str(len(idx_test)) +' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + ' (' + str(100*len(idx_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')

print('Prevalence of CLASS = 0, TRAIN: ' + str(100*np.sum(y_train)/y_train.shape[0]) + '%')
print('Prevalence of CLASS = 1, TRAIN: ' + str(100*(y_train.shape[0]-np.sum(y_train))/y_train.shape[0]) + '%')

#Save
np.savez(os.path.join('data', 'preprocessed', 'preprocessed_dataset_3'), 
         x_train =x_train,
         y_train = y_train,
         idx_train = idx_train,
         x_test = x_test,
         idx_test = idx_test)

print('Saved dataset 3')

Final number of rows, TRAIN: 1667 out of 2070 (80.53140096618357%)
Final number of rows, TEST: 813 out of 986 (82.45436105476674%)
Prevalence of CLASS = 0, TRAIN: 55.48890221955609%
Prevalence of CLASS = 1, TRAIN: 44.51109778044391%
Saved dataset 3


# Dataset 4 creation

# 4a: ECG, find rows with nans

In [14]:
#Idxs of nans
idx_not_train = np.unique(np.argwhere(np.isnan(handcrafted_features_train['ECG_features']))[:,0]).tolist()

#Number of rows with nans
print('Number of ECG rows with nans, TRAIN: ' + str(len(idx_not_train)) + ' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')

#Idxs of nans
idx_not_test = np.unique(np.argwhere(np.isnan(handcrafted_features_test['ECG_features']))[:,0]).tolist()

#Number of rows with nans
print('Number of ECG rows with nans, TEST: ' + str(len(idx_not_test)) + ' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')


Number of ECG rows with nans, TRAIN: 29 out of 2070(1.4009661835748792%)
Number of ECG rows with nans, TEST: 73 out of 986(7.403651115618661%)


# 4b: ECG, find rows with all 0 

In [15]:
i = 0
for hft in handcrafted_features_train['ECG_features']:
    if np.nansum(hft) == 0:
        idx_not_train.append(i)
    i = i+1
idx_not_train = np.unique(idx_not_train).tolist()
print('Number of all-0 or nan ECG rows, TRAIN: ' + str(len(idx_not_train)) +' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')

i = 0
for hft in handcrafted_features_test['ECG_features']:
    if np.nansum(hft) == 0:
        idx_not_test.append(i)
    i = i+1
idx_not_test = np.unique(idx_not_test).tolist()
print('Number of all-0 or nan ECG rows, TEST: ' + str(len(idx_not_test)) +' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + '(' + str(100*len(idx_not_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')



Number of all-0 or nan ECG rows, TRAIN: 256 out of 2070(12.367149758454106%)
Number of all-0 or nan ECG rows, TEST: 114 out of 986(11.561866125760648%)


# 4c: Drop and save

In [16]:
#Drop train data
x_train = dict()
x_train['ECG_features'] = np.delete(handcrafted_features_train['ECG_features'],idx_not_train,axis=0)
y_train = np.delete(y_train_orig,idx_not_train,axis=0)
idx_train = [c for c in range(handcrafted_features_train['GSR_features'].shape[0]) if c not in idx_not_train]

#Drop test data
x_test = dict()
x_test['ECG_features'] = np.delete(handcrafted_features_test['ECG_features'],idx_not_test,axis=0)
idx_test = [c for c in range(handcrafted_features_test['GSR_features'].shape[0]) if c not in idx_not_test]

#Print stats
print('Final number of rows, TRAIN: ' + str(len(idx_train)) +' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + ' (' + str(100*len(idx_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')
print('Final number of rows, TEST: ' + str(len(idx_test)) +' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + ' (' + str(100*len(idx_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')

print('Prevalence of CLASS = 0, TRAIN: ' + str(100*np.sum(y_train)/y_train.shape[0]) + '%')
print('Prevalence of CLASS = 1, TRAIN: ' + str(100*(y_train.shape[0]-np.sum(y_train))/y_train.shape[0]) + '%')

#Save
np.savez(os.path.join('data', 'preprocessed', 'preprocessed_dataset_4'), 
         x_train =x_train,
         y_train = y_train,
         idx_train = idx_train,
         x_test = x_test,
         idx_test = idx_test)

print('Saved dataset 4')

Final number of rows, TRAIN: 1814 out of 2070 (87.6328502415459%)
Final number of rows, TEST: 872 out of 986 (88.43813387423936%)
Prevalence of CLASS = 0, TRAIN: 54.07938257993385%
Prevalence of CLASS = 1, TRAIN: 45.92061742006615%
Saved dataset 4


# Dataset 4 creation


# 5a: GSR, find rows with all 0

In [17]:
idx_not_train = []
i = 0
for hft in handcrafted_features_train['GSR_features']:
    if np.nansum(hft) == 0:
        idx_not_train.append(i)
    i = i+1
idx_not_train = np.unique(idx_not_train).tolist()
print('Number of all-0 GSR rows, TRAIN: ' + str(len(idx_not_train)) +' out of ' + str(handcrafted_features_train['GSR_features'].shape[0]) + '(' + str(100*len(idx_not_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')

idx_not_test = []
i = 0
for hft in handcrafted_features_test['GSR_features']:
    if np.nansum(hft) == 0:
        idx_not_test.append(i)
    i = i+1
idx_not_test = np.unique(idx_not_test).tolist()
print('Number of all-0 GSR rows, TEST: ' + str(len(idx_not_test)) +' out of ' + str(handcrafted_features_test['GSR_features'].shape[0]) + '(' + str(100*len(idx_not_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')


Number of all-0 GSR rows, TRAIN: 253 out of 2070(12.222222222222221%)
Number of all-0 GSR rows, TEST: 64 out of 986(6.490872210953347%)


# 5b: Drop and save

In [18]:
#Drop train data
x_train = dict()
x_train['GSR_features'] = np.delete(handcrafted_features_train['GSR_features'],idx_not_train,axis=0)
y_train = np.delete(y_train_orig,idx_not_train,axis=0)
idx_train = [c for c in range(handcrafted_features_train['GSR_features'].shape[0]) if c not in idx_not_train]

#Drop test data
x_test = dict()
x_test['GSR_features'] = np.delete(handcrafted_features_test['GSR_features'],idx_not_test,axis=0)
idx_test = [c for c in range(handcrafted_features_test['GSR_features'].shape[0]) if c not in idx_not_test]

#Print stats
print('Final number of rows, TRAIN: ' + str(len(idx_train)) +' out of ' + str(handcrafted_features_train['ECG_features'].shape[0]) + ' (' + str(100*len(idx_train)/handcrafted_features_train['ECG_features'].shape[0]) + '%)')
print('Final number of rows, TEST: ' + str(len(idx_test)) +' out of ' + str(handcrafted_features_test['ECG_features'].shape[0]) + ' (' + str(100*len(idx_test)/handcrafted_features_test['ECG_features'].shape[0]) + '%)')

print('Prevalence of CLASS = 0, TRAIN: ' + str(100*np.sum(y_train)/y_train.shape[0]) + '%')
print('Prevalence of CLASS = 1, TRAIN: ' + str(100*(y_train.shape[0]-np.sum(y_train))/y_train.shape[0]) + '%')

#Save
np.savez(os.path.join('data', 'preprocessed', 'preprocessed_dataset_5'), 
         x_train =x_train,
         y_train = y_train,
         idx_train = idx_train,
         x_test = x_test,
         idx_test = idx_test)

print('Saved dataset 5')

Final number of rows, TRAIN: 1817 out of 2070 (87.77777777777777%)
Final number of rows, TEST: 922 out of 986 (93.50912778904666%)
Prevalence of CLASS = 0, TRAIN: 54.870665932856355%
Prevalence of CLASS = 1, TRAIN: 45.129334067143645%
Saved dataset 5
