In [2]:
import numpy as np
import pandas as pd
import pickle
from scipy import signal
import os
import zipfile
import urllib.request
import shutil
import cvxopt
import pyarrow
import matplotlib.pyplot as plt

# 1. Download WESAD dataset

The WESAD dataset has to be downloaded and extracted. The website is:
https://uni-siegen.sciebo.de/s/HGdUkoNlW1Ub0Gx/download

In [3]:
%%time

# download if not already available
filename = 'WESAD.zip'
if not os.path.isfile(filename):
    print('downloading')
    urllib.request.urlretrieve('https://uni-siegen.sciebo.de/s/HGdUkoNlW1Ub0Gx/download', filename)

CPU times: total: 0 ns
Wall time: 0 ns


In [4]:
%%time

# exctract if folder is not present
toFolder = 'WESAD'
if not os.path.isdir(toFolder):
    print('extracting')
    with zipfile.ZipFile(filename, 'r') as zfile:
        zfile.extractall(path=toFolder)

CPU times: total: 0 ns
Wall time: 0 ns


In [5]:
# we need to move everything inside a data folder, so all scripts work
original = 'WESAD/WESAD'
target = 'WESAD/data/WESAD'

# Guard on the moved folder itself, not on 'WESAD/data': that parent is created
# before the move, so a move that failed once would otherwise be skipped forever.
if not os.path.isdir(target):
    os.makedirs('WESAD/data', exist_ok=True)
    shutil.move(original, target)

# 2. Extract all data from WESAD dataset
The dataset contains lots of data from several sources. In this project, we focus on accelerometer data from a wrist band sensor (the wrist BVP, EDA and temperature signals are extracted as well) - such sensors are ubiquitous and a prediction made from their data has the most business impact.

In [6]:
# get all participant folders
# os.listdir() returns entries in arbitrary order; sorting the names keeps the
# processing order (and thus the row order of the stored files) reproducible.
wesad_root = 'WESAD/data/WESAD/'
participants = sorted(name for name in os.listdir(wesad_root) if os.path.isdir(wesad_root + name))

# per-sensor collectors, filled with one frame per participant
result_dfs_acc = []
result_dfs_bvp = []
result_dfs_eda = []
result_dfs_temp = []

In [7]:
def get_data(data, location, sensor, columnslist, subject=None, min_run=10):
    """Build a labelled baseline-vs-stress frame for one sensor of one participant.

    Parameters
    ----------
    data : dict
        Unpickled participant record. Read as ``data['signal'][location][sensor]``
        (the samples) and ``data['label']`` (the labels).
    location : str
        Key below ``data['signal']``, e.g. ``'wrist'``.
    sensor : str
        Key below ``data['signal'][location]``, e.g. ``'ACC'``.
    columnslist : list of str
        Column names for the sensor channels.
    subject : int or str, optional
        Participant id, either as a number or as a folder-style name such as
        ``'S10'`` (the first character is dropped). When omitted, the
        module-level loop variable ``p`` is used, which is what older callers
        rely on.
    min_run : int, default 10
        Runs of identical consecutive labels with ``min_run`` rows or fewer
        are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``columnslist + ['subject', 'label', 'session']``. ``label`` is
        0 for baseline and 1 for stress; ``session`` numbers the consecutive
        label runs starting at 1. The index keeps the original sample positions.
    """
    if subject is None:
        # legacy fallback: read the participant name from the calling loop's global
        subject = p
    if isinstance(subject, str):
        subject = int(subject[1:])

    # we get the relevant data - sensor samples and labels
    sensor_data = data['signal'][location][sensor]
    labels = np.asarray(data['label']).ravel()

    # create df data and add subject column
    df = pd.DataFrame(data=sensor_data, columns=columnslist)
    df['subject'] = int(subject)

    # Labels are recorded at a higher rate than the wrist sensors (700 Hz according
    # to the original comment). They are categorical, so an FFT based resampling
    # rings at every label change and invents intermediate values. Instead, pick for
    # every sensor sample the label recorded at the same relative position.
    # int64 is requested explicitly so the index product cannot overflow where
    # the platform default integer is 32 bit.
    n_out = len(sensor_data)
    src_idx = (np.arange(n_out, dtype=np.int64) * len(labels)) // n_out
    labels_resampled = labels[src_idx].astype(int)
    df_labels = pd.DataFrame(data=labels_resampled, columns=['label'])

    # concat df and label df
    df_res = pd.concat([df, df_labels], axis=1)

    # label definitions (see WESAD/wesad_readme.pdf)
    # 0: not defined / transient; 1: baseline; 2: stress; 3: amusement; 4: meditation; 5/6/7: should be ignored
    # we drop everything except 1 (baseline) and 2 (stress);
    # .copy() so the assignments below write to an independent frame, not a slice
    df_res = df_res[df_res['label'].isin([1, 2])].copy()

    # set the labels to 0 (no stress) and 1 (stress)
    df_res['label'] = (df_res['label'] == 2).astype(int)

    # remove rows with too few consecutive labels
    run_id = (df_res['label'].diff() != 0).cumsum()
    run_length = run_id.map(run_id.value_counts())
    df_res = df_res[run_length > min_run].copy()

    # split into sessions of specific labels
    df_res['session'] = (df_res['label'].diff() != 0).cumsum()

    return df_res

In [8]:
%%time
# look over all participant data and extract wrist acc and label data;
# combine with demographic data

cnt = 0

for p in participants:
    
    cnt += 1
    print('Processing data: ',cnt,'/',len(participants))
    
    file = open('WESAD/data/WESAD/'+p+'/'+p+'.pkl', 'rb')
    s = pickle.load(file, encoding = 'latin1')
    
    
    df_acc = get_data(s, 'wrist', 'ACC', ['x', 'y', 'z'])
    df_bvp = get_data(s, 'wrist', 'BVP', ['BVP'])
    df_eda = get_data(s, 'wrist', 'EDA', ['EDA'])
    df_temp = get_data(s, 'wrist', 'TEMP', ['TEMP'])
    
    # assert that all features have the same number of sessions
    acc_sessions = df_acc['session'].nunique()
    assert(df_bvp['session'].nunique() == acc_sessions)
    assert(df_eda['session'].nunique() == acc_sessions)
    assert(df_temp['session'].nunique() == acc_sessions)
    
    # store results
    result_dfs_acc.append(df_acc)
    result_dfs_bvp.append(df_bvp)
    result_dfs_eda.append(df_eda)
    result_dfs_temp.append(df_temp)

Processing data:  1 / 15
Processing data:  2 / 15
Processing data:  3 / 15
Processing data:  4 / 15
Processing data:  5 / 15
Processing data:  6 / 15
Processing data:  7 / 15
Processing data:  8 / 15
Processing data:  9 / 15
Processing data:  10 / 15
Processing data:  11 / 15
Processing data:  12 / 15
Processing data:  13 / 15
Processing data:  14 / 15
Processing data:  15 / 15
CPU times: total: 13.5 s
Wall time: 46.3 s


In [10]:
# stack the per-participant frames of every sensor into one frame per sensor
# (row-wise; each participant's own index values are kept)
df_acc, df_bvp, df_eda, df_temp = (
    pd.concat(participant_frames, axis=0)
    for participant_frames in (result_dfs_acc, result_dfs_bvp, result_dfs_eda, result_dfs_temp)
)

In [11]:
# look into created dfs
# merged wrist accelerometer frame: columns x, y, z, subject, label, session.
# The index is each participant's own sample position (the concat does not
# renumber it), so index values repeat across subjects.
df_acc

Unnamed: 0,x,y,z,subject,label,session
2972,23.0,-1.0,59.0,10,0,1
2973,24.0,0.0,59.0,10,0,1
2974,25.0,0.0,58.0,10,0,1
2975,23.0,-1.0,57.0,10,0,1
2976,24.0,0.0,59.0,10,0,1
...,...,...,...,...,...,...
80547,59.0,-11.0,17.0,9,1,2
80548,61.0,-13.0,16.0,9,1,2
80549,59.0,-13.0,16.0,9,1,2
80550,59.0,-11.0,16.0,9,1,2


In [12]:
# merged wrist BVP frame: columns BVP, subject, label, session
df_bvp

Unnamed: 0,BVP,subject,label,session
5943,-158.68,10,0,1
5944,-135.23,10,0,1
5945,-119.96,10,0,1
5946,-110.27,10,0,1
5947,-103.25,10,0,1
...,...,...,...,...
161099,56.31,9,1,2
161100,57.67,9,1,2
161101,56.09,9,1,2
161102,51.16,9,1,2


In [13]:
# merged wrist EDA frame: columns EDA, subject, label, session
df_eda

Unnamed: 0,EDA,subject,label,session
372,0.373519,10,0,1
373,0.374798,10,0,1
374,0.373519,10,0,1
375,0.376077,10,0,1
376,0.372240,10,0,1
...,...,...,...,...
10064,1.131126,9,1,2
10065,1.140080,9,1,2
10066,1.147755,9,1,2
10067,1.146476,9,1,2


In [14]:
# merged wrist temperature frame: columns TEMP, subject, label, session
df_temp

Unnamed: 0,TEMP,subject,label,session
372,33.25,10,0,1
373,33.23,10,0,1
374,33.23,10,0,1
375,33.23,10,0,1
376,33.23,10,0,1
...,...,...,...,...
10064,32.83,9,1,2
10065,32.83,9,1,2
10066,32.84,9,1,2
10067,32.84,9,1,2


# 3. Store to Parquet file

In [15]:
# write one parquet file per wrist sensor into the 'data-input' folder

# create the output folder when it is missing
os.makedirs('data-input', exist_ok=True)

parquet_targets = {
    'data-input/dataset_wesad_wrist_acc.parquet': df_acc,
    'data-input/dataset_wesad_wrist_bvp.parquet': df_bvp,
    'data-input/dataset_wesad_wrist_eda.parquet': df_eda,
    'data-input/dataset_wesad_wrist_temp.parquet': df_temp,
}
for parquet_path, sensor_frame in parquet_targets.items():
    sensor_frame.to_parquet(parquet_path)