In [35]:
import numpy as np
import pandas as pd
import pickle
from scipy import signal
import os
import zipfile
import urllib.request
import shutil
import cvxopt
import pyarrow
import matplotlib.pyplot as plt

# 1. Download WESAD dataset

The WESAD dataset has to be downloaded and extracted. The website is:
https://uni-siegen.sciebo.de/s/HGdUkoNlW1Ub0Gx/download

In [36]:
%%time

# Fetch the WESAD archive only when no local copy exists, so re-running the
# notebook does not repeat the download.
filename = 'WESAD.zip'
archive_present = os.path.isfile(filename)
if not archive_present:
    print('downloading')
    urllib.request.urlretrieve(
        'https://uni-siegen.sciebo.de/s/HGdUkoNlW1Ub0Gx/download',
        filename,
    )

CPU times: total: 0 ns
Wall time: 0 ns


In [37]:
%%time

# Extract the archive into the target folder, but only if that folder is not
# present yet (an existing folder is taken to mean "already extracted").
toFolder = 'WESAD'
already_extracted = os.path.isdir(toFolder)
if not already_extracted:
    print('extracting')
    with zipfile.ZipFile(filename) as archive:
        archive.extractall(toFolder)

CPU times: total: 0 ns
Wall time: 0 ns


In [38]:
# The helper scripts expect the dataset below WESAD/data, so on the first run
# create that folder and move the extracted WESAD/WESAD directory into it.
data_root = 'WESAD/data'
if not os.path.isdir(data_root):
    os.makedirs(data_root)
    shutil.move('WESAD/WESAD', 'WESAD/data/WESAD')

# 2. Create demographic data within WESAD folder
For this, we need to run a tiny script within the WESAD folder. We write the tiny script from within this notebook and execute it. This triggers the creation of the m14_merged.csv file, which contains demographic data about the users that we can use as features in the ML pipeline.

In [39]:
# Helper scripts that are fetched into the WESAD folder (skipped when a copy
# is already there).
script_file_1 = 'readme_parser.py'
script_file_2 = 'data_wrangling.py'
script_file_3 = 'cvxEDA.py'

# (local file name, progress message, download URL)
script_sources = (
    (script_file_1, 'downloading script 1',
     'https://raw.githubusercontent.com/WJMatthew/WESAD/master/readme_parser.py'),
    (script_file_2, 'downloading script file 2',
     'https://raw.githubusercontent.com/WJMatthew/WESAD/master/data_wrangling.py'),
    (script_file_3, 'downloading script file 3',
     'https://raw.githubusercontent.com/lciti/cvxEDA/master/src/cvxEDA.py'),
)

for script_name, notice, script_url in script_sources:
    local_path = 'WESAD/' + script_name
    if os.path.isfile(local_path):
        continue
    print(notice)
    urllib.request.urlretrieve(script_url, local_path)

In [40]:
# Patch one line of script 2 (data_wrangling.py) in place.
# NOTE(review): the target is addressed by a fixed position (list index 208);
# presumably that is the feat_names.append(...) line of the upstream script -
# verify if the upstream file ever changes.
corrected_line = '                    feat_names.append(\'_\'.join(str([row, col])))\n'

# read the script as a list of lines (line endings are kept)
with open('WESAD/data_wrangling.py', 'r') as file:
    data = list(file)

data[208] = corrected_line

# write the patched content back to the same file
with open('WESAD/data_wrangling.py', 'w') as file:
    file.write(''.join(data))

In [41]:
%%time

# execute script 2 - if file it creates does not already exist
os.chdir('WESAD')
if not os.path.isfile('data/may14_feats4.csv'):
    %run data_wrangling.py
os.chdir('..')

CPU times: total: 0 ns
Wall time: 0 ns


In [42]:
%%writefile WESAD/parse_readmes.py
# Content of WESAD/parse_readmes.py: import the parser class from
# readme_parser.py (script 1) and create one instance of it.
from readme_parser import rparser
rp = rparser()

Overwriting WESAD/parse_readmes.py


In [43]:
# parse readmes (this uses script 1)
# The script is run with WESAD/ as the working directory; afterwards the
# working directory is set back to the parent so that the relative paths used
# in the following cells ('WESAD/...') resolve again.
os.chdir('WESAD')
%run parse_readmes.py
os.chdir('..')

Parsing Readme files


In [44]:
# Read the merged CSV and reduce it to the demographic columns:
#   1. discard every column whose name starts with '[',
#   2. drop the remaining sensor-feature, label and redundant one-hot columns,
#   3. collapse rows that are exact duplicates of each other.
demo_raw = pd.read_csv('WESAD/data/m14_merged.csv', index_col=0)
bracket_columns = demo_raw.columns.str.startswith('[')
df_demo = (
    demo_raw
    .loc[:, ~bracket_columns]
    .drop(columns=['BVP_peak_freq', 'TEMP_slope', 'label', 'smoker_NO', 'gender_ female'])
    .drop_duplicates()
)

In [45]:
df_demo

Unnamed: 0,subject,age,height,weight,gender_ male,coffee_today_YES,sport_today_YES,smoker_YES,feel_ill_today_YES
0,2,27,175,80,1,0,0,0,0
76,3,27,173,69,1,0,0,0,0
153,4,25,175,90,1,0,0,0,0
229,5,35,189,80,1,1,0,0,0
308,6,27,170,66,1,1,0,1,0
386,7,28,184,74,1,0,1,0,0
464,8,27,172,64,0,1,1,0,0
543,9,26,181,75,1,0,0,0,1
621,10,28,178,76,1,0,0,0,0
702,11,26,171,54,0,1,0,0,0


# 3. Extract all data from WESAD dataset
The dataset contains lots of data from several sources. In this project, we focus on accelerometer data from a wristband sensor - such sensors are ubiquitous, and a prediction made from their data has the most business impact.

In [46]:
# get all participant folders
# os.listdir() returns the entries in arbitrary order, so the names are sorted
# to make the processing order - and with it the row order of the resulting
# frames - reproducible between runs and machines.
wesad_root = 'WESAD/data/WESAD/'
participants = sorted(
    name for name in os.listdir(wesad_root)
    if os.path.isdir(wesad_root + name)
)

# per-participant result frames, filled by the extraction loop
result_dfs_acc = []
result_dfs_bvp = []

In [47]:
%%time
# Loop over all participant folders and extract the wrist accelerometer (ACC)
# and wrist blood volume pulse (BVP) signals together with their labels.


def _downsample_labels(labels, n_target):
    """Return ``n_target`` integer labels picked from ``labels`` at evenly spaced positions.

    The labels are categorical, so target sample ``i`` simply takes the
    original label at position ``floor(i * len(labels) / n_target)``.
    scipy.signal.resample is deliberately not used here: it resamples with a
    Fourier method, which rings around every label change and, after rounding,
    yields label values that never occurred in the recording (e.g. a 1 while
    stepping from 0 to 4).
    """
    labels = np.asarray(labels).ravel()
    # int64 on purpose: i * len(labels) exceeds the 32-bit range for long
    # recordings on platforms where the default integer is 32 bit
    positions = np.arange(n_target, dtype=np.int64) * labels.size // n_target
    return labels[positions].astype(int)


def _labeled_frame(values, columns, labels, subject_id):
    """Build a frame with the signal ``columns``, a 'subject' and a 'label' column.

    Only rows labelled 1 (baseline) or 2 (stress) are returned; the row index
    still refers to the sample position within the full recording.
    """
    frame = pd.DataFrame(data=values, columns=columns)
    frame['subject'] = subject_id
    # the labels are recorded at a higher rate than the wrist signals
    # (700 Hz vs. 32 Hz for ACC and 64 Hz for BVP), so bring them to signal length
    frame['label'] = _downsample_labels(labels, len(frame))

    # label definitions (see WESAD/wesad_readme.pdf)
    # 0: not defined / transient; 1: baseline; 2: stress; 3: amusement;
    # 4: meditation; 5/6/7: should be ignored
    # we drop everything except 1 (baseline) and 2 (stress)
    return frame[frame['label'].isin([1, 2])]


# start from empty lists so that re-running this cell does not append every
# participant a second time
result_dfs_acc = []
result_dfs_bvp = []

for cnt, p in enumerate(participants, start=1):
    print('Processing data: ',cnt,'/',len(participants))

    # NOTE: pickle.load can execute arbitrary code - only load the .pkl files
    # that come from the WESAD archive.
    # The with-block closes the file handle again after loading.
    with open('WESAD/data/WESAD/'+p+'/'+p+'.pkl', 'rb') as file:
        s = pickle.load(file, encoding = 'latin1')

    subject_id = int(p[1:])   # folder name minus its first character
    labels = s['label']
    wrist = s['signal']['wrist']

    # merging with the demographic data (df_demo) is skipped on purpose
    # because it is not available in the other dataset
    result_dfs_acc.append(_labeled_frame(wrist['ACC'], ['x', 'y', 'z'], labels, subject_id))
    result_dfs_bvp.append(_labeled_frame(wrist['BVP'], ['BVP'], labels, subject_id))

Processing data:  1 / 15
Processing data:  2 / 15
Processing data:  3 / 15
Processing data:  4 / 15
Processing data:  5 / 15
Processing data:  6 / 15
Processing data:  7 / 15
Processing data:  8 / 15
Processing data:  9 / 15
Processing data:  10 / 15
Processing data:  11 / 15
Processing data:  12 / 15
Processing data:  13 / 15
Processing data:  14 / 15
Processing data:  15 / 15
CPU times: total: 44.3 s
Wall time: 1min 5s


In [48]:
# Stack the per-participant frames row-wise into one frame per signal type.
# The row index of each participant frame is kept as it is.
df_acc = pd.concat(result_dfs_acc)
df_bvp = pd.concat(result_dfs_bvp)

In [49]:
# Recode the labels: 1 becomes 0 (no stress), 2 becomes 1 (stress).
# NOTE: the column is overwritten, so running this cell a second time would
# turn the freshly written 1s into 0s as well.
binary_label = {1: 0, 2: 1}
df_acc['label'] = df_acc['label'].replace(binary_label)
df_bvp['label'] = df_bvp['label'].replace(binary_label)

In [50]:
# rename columns for easier readability
#df = df.rename(columns={'coffee_today_YES': 'coffee_today',
#                   'sport_today_YES': 'sport_today',
#                   'smoker_YES':'smoker',
#                   'feel_ill_today_YES':'feel_ill_today'})

In [51]:
# drop survey columns because the other dataset doesn't have them
#df = df[['x', 'y', 'z', 'subject', 'label']]

In [52]:
# Turn the numeric subject ids into strings with an 'S' prefix so that the
# names are unique across all datasets.
# NOTE: running this cell a second time prepends another 'S'.
df_acc = df_acc.assign(subject='S' + df_acc['subject'].astype(str))
df_bvp = df_bvp.assign(subject='S' + df_bvp['subject'].astype(str))

In [53]:
def _drop_short_label_runs(frame, max_short_run):
    """Remove every run of identical labels that has ``max_short_run`` rows or fewer.

    A new run starts at the first row of each subject and wherever the label
    differs from the previous row of the same subject.
    NOTE(review): runs are counted on the rows as they are in ``frame``; rows
    that were separated by samples removed earlier still count as consecutive.
    """
    run_start = frame.groupby(['subject'])['label'].diff(1) != 0
    tagged = frame.assign(consec_labels=run_start.astype('int').cumsum())
    kept = tagged.groupby('consec_labels').filter(lambda run: len(run) > max_short_run)
    return kept.drop(columns=['consec_labels'])


# clean-up: remove entries with 5 or fewer consecutive labels in acc
df_acc = _drop_short_label_runs(df_acc, 5)

# clean-up: remove entries with 10 or fewer consecutive labels in bvp
df_bvp = _drop_short_label_runs(df_bvp, 10)

In [54]:
# look into created dfs
df_acc

Unnamed: 0,x,y,z,subject,label
2972,23.0,-1.0,59.0,S10,0
2973,24.0,0.0,59.0,S10,0
2974,25.0,0.0,58.0,S10,0
2975,23.0,-1.0,57.0,S10,0
2976,24.0,0.0,59.0,S10,0
...,...,...,...,...,...
80547,59.0,-11.0,17.0,S9,1
80548,61.0,-13.0,16.0,S9,1
80549,59.0,-13.0,16.0,S9,1
80550,59.0,-11.0,16.0,S9,1


In [55]:
df_bvp

Unnamed: 0,BVP,subject,label
5943,-158.68,S10,0
5944,-135.23,S10,0
5945,-119.96,S10,0
5946,-110.27,S10,0
5947,-103.25,S10,0
...,...,...,...
161099,56.31,S9,1
161100,57.67,S9,1
161101,56.09,S9,1
161102,51.16,S9,1


# 4. Store to Parquet file

In [56]:
# Write both frames as parquet files into data-input/ (the folder is created
# when it does not exist yet).
os.makedirs('data-input', exist_ok=True)

parquet_targets = (
    (df_acc, 'data-input/dataset_wesad_acc.parquet'),
    (df_bvp, 'data-input/dataset_wesad_bvp.parquet'),
)
for frame, target_path in parquet_targets:
    frame.to_parquet(target_path)