# WESAD dataset preprocessing and exploratory data analysis 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Root folder of the raw WESAD recordings (one sub-folder per subject).
DATA_PATH = 'data/WESAD/'

# Column layout of the chest-only frame and of the combined chest + wrist frame.
chest_columns=['sid', 'acc1', 'acc2', 'acc3', 'ecg', 'emg', 'eda', 'temp', 'resp', 'label']
all_columns =['sid', 'c_acc_x', 'c_acc_y', 'c_acc_z', 'ecg', 'emg', 'c_eda', 'c_temp', 'resp', 'w_acc_x' , 'w_acc_y', 'w_acc_z', 'bvp', 'w_eda', 'w_temp', 'label']

# Subject ids used throughout the notebook (1 and 12 are not in the list).
ids = [2,3,4,5,6,7,8,9,10,11,13,14,15,16,17]


# Per-signal constants; presumably sampling frequencies in Hz -- confirm against the dataset readme.
sf_BVP = 64
sf_EDA = 4
sf_TEMP = 4
sf_ACC = 32
sf_chest = 700 


In [3]:
# Print the path of each subject's pickle file under DATA_PATH.
for subject_id in ids:
    print(f"{DATA_PATH}S{subject_id}/S{subject_id}.pkl")

data/WESAD/S2/S2.pkl
data/WESAD/S3/S3.pkl
data/WESAD/S4/S4.pkl
data/WESAD/S5/S5.pkl
data/WESAD/S6/S6.pkl
data/WESAD/S7/S7.pkl
data/WESAD/S8/S8.pkl
data/WESAD/S9/S9.pkl
data/WESAD/S10/S10.pkl
data/WESAD/S11/S11.pkl
data/WESAD/S13/S13.pkl
data/WESAD/S14/S14.pkl
data/WESAD/S15/S15.pkl
data/WESAD/S16/S16.pkl
data/WESAD/S17/S17.pkl


## Load data for all subjects

Once the data has been loaded, it is also saved back to disk as a DataFrame, so that next time it can be read from the DataFrame-formatted file instead of the pickled dictionary file.

In [4]:
# Load the merged chest DataFrame; the file is expected in the working directory.
df = pd.read_pickle("merged_chest.pkl")

FileNotFoundError: [Errno 2] No such file or directory: 'merged_chest.pkl'

In [None]:
# Preview the first rows of the merged frame.
df.head()

In [None]:
# Number of samples recorded for each label value.
df["label"].value_counts()

In [None]:
# Column dtypes, non-null counts and memory usage of the merged frame.
df.info()

In [None]:
# Summary statistics of the merged frame's columns.
df.describe()

In [None]:
# Keep only the rows whose label is 1, 2 or 3.
df_fltr = df.loc[df["label"].isin([1, 2, 3])]

In [None]:
# Non-null value counts of every column for each (sid, label) pair.
df_fltr.groupby(['sid', 'label']).count()

In [None]:
# Samples per class; label coding: 1 = baseline, 2 = stress, 3 = amusement
perc_data = df_fltr["label"].value_counts()

In [None]:
# Display the per-class sample counts.
perc_data

In [None]:
# Wrap the counts in a DataFrame; the pie chart below is drawn through DataFrame.plot.
perc_data = pd.DataFrame (perc_data)

In [None]:
# Display the same counts, now held in a DataFrame.
perc_data

In [None]:
# Index of the counts frame, i.e. the label values that were counted.
perc_data.index

In [None]:
# Pie chart of the class balance.
# The value column is selected by position rather than by the literal name
# "label": value_counts() names its result after the source column on older
# pandas but "count" from pandas 2.0 on, so a hard-coded name raises KeyError there.
plot = perc_data.plot.pie(
    y=perc_data.columns[0],
    title="Baseline vs Stress vs Amusement",
    legend=False,
    autopct='%1.1f%%',
    shadow=True,
    startangle=0,
)

In [None]:
# Column dtypes, non-null counts and memory usage of the filtered frame.
df_fltr.info()

## Data Visualization

In [None]:
# check correlation
plt.figure(figsize=(10,8))
sns.heatmap(df_fltr.corr(),cmap='Blues',annot=False) 

In [None]:
# Label correlation matrix
k = 15 #number of variables for heatmap
cols = df_fltr.corr().nlargest(k, 'label')['label'].index
cm = df_fltr[cols].corr()
plt.figure(figsize=(16,12))
sns.heatmap(cm, annot=True, cmap = 'viridis')

## TSNE analysis based on individual subject data

In [None]:
# Load the pickled frame of subject S2 from the working directory.
df2 = pd.read_pickle("S2.pkl")

In [None]:
# Column names available in the S2 frame.
df2.columns

In [None]:
# Seeded random sample of N rows from S2, plus its feature matrix for t-SNE.
N = 10000
feat_cols = [
    'acc1', 'acc2', 'acc3',
    'ecg', 'emg', 'eda', 'temp', 'resp',
]
df_subset = df2.sample(n=N, random_state=42)
data_subset = df_subset.loc[:, feat_cols].values

In [None]:
import time
from sklearn.manifold import TSNE

In [None]:
# Embed the sampled feature matrix in 2-D with t-SNE and report the wall-clock time.
time_start = time.time()
# random_state pins the stochastic initialisation so the embedding is the same
# on every run, matching the seeded sampling that produced data_subset.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=5000, random_state=42)
tsne_results = tsne.fit_transform(data_subset)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Attach the two embedding coordinates to the sampled rows and colour the scatter by label.
df_subset['tsne-2d-one'] = tsne_results[:,0]
df_subset['tsne-2d-two'] = tsne_results[:,1]
plt.figure(figsize=(10,8))
# The subject frame is sampled without the label filter, so the number of
# distinct labels is not known to be 3; size the palette from the data.
n_labels = df_subset["label"].nunique()
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="label",
    palette=sns.color_palette("hls", n_labels),
    data=df_subset,
    legend="full",
    alpha=0.3
)

## TSNE analysis based on all subjects data

In [None]:
# Seeded random sample of N rows from the filtered all-subject frame, plus its
# feature matrix for t-SNE.
N = 10000
feat_cols = [
    'acc1', 'acc2', 'acc3',
    'ecg', 'emg', 'eda', 'temp', 'resp',
]
df_subset = df_fltr.sample(n=N, random_state=42)
data_subset = df_subset.loc[:, feat_cols].values

In [None]:
# Embed the sampled all-subject feature matrix in 2-D with t-SNE and report the wall-clock time.
time_start = time.time()
# random_state pins the stochastic initialisation so the embedding is the same
# on every run, matching the seeded sampling that produced data_subset.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=5000, random_state=42)
tsne_results = tsne.fit_transform(data_subset)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Store both embedding coordinates on the sampled rows, then scatter them coloured by label.
embedding_x, embedding_y = tsne_results[:, 0], tsne_results[:, 1]
df_subset['tsne-2d-one'] = embedding_x
df_subset['tsne-2d-two'] = embedding_y
plt.figure(figsize=(10, 8))
three_colours = sns.color_palette("hls", 3)
sns.scatterplot(
    data=df_subset,
    x="tsne-2d-one",
    y="tsne-2d-two",
    hue="label",
    palette=three_colours,
    legend="full",
    alpha=0.3,
)

## Outliers

The box shows the quartiles of the dataset while the whiskers extend to show the rest of the distribution.


In [None]:
# Column names of the filtered frame as an array.
df_fltr.columns.values

In [None]:
# to check outliers
l = df_fltr.columns.values[1:]
number_of_columns=14
number_of_rows = len(l)-1/number_of_columns
plt.figure(figsize=(20,70))
for i in range(0,len(l)-1):
    plt.subplot(number_of_rows + 1,number_of_columns,i+1)
    sns.set_style('whitegrid')
    sns.boxplot(df_fltr[l[i]],color='green',orient='v')
    plt.tight_layout()

## Check distribution of variables

In [None]:
# Histogram + KDE of every column plotted in the outlier section (reuses `l`
# and `number_of_columns` from that cell).
n_plots = len(l) - 1
# Compute the grid height here as a whole number (ceiling division, at least
# one row) instead of relying on `number_of_rows`, which may be a float and is
# then rejected by plt.subplot.
grid_rows = max(1, -(-n_plots // number_of_columns))
plt.figure(figsize=(2*number_of_columns, 5*grid_rows))
for i in range(n_plots):
    plt.subplot(grid_rows, number_of_columns, i+1)
    sns.distplot(df_fltr[l[i]], kde=True)

## Analysis on individual subjects

In [None]:
# For every subject: print the label balance and draw a correlation heatmap.
for sid in ids:
    file = 'S' + str(sid) + '.pkl'
    print("")
    print(file)
    # Use a dedicated name so the merged `df` loaded at the top of the notebook
    # is not overwritten by the last subject's frame.
    subject_df = pd.read_pickle(file).drop(['sid'], axis=1)
    print("df.label.value_counts: ")
    print(subject_df.label.value_counts())
    plt.figure(figsize=(10,8))
    plt.title('S' + str(sid))
    sns.heatmap(subject_df.corr(),cmap='Blues',annot=False)
    

In [None]:
# For every subject: one figure with a histogram + KDE per feature column.
l = ['acc1', 'acc2', 'acc3', 'ecg', 'emg', 'eda', 'temp', 'resp']
number_of_columns = 8
# Whole number of grid rows (ceiling division). The previous expression
# `len(l)-1/number_of_columns` divided before subtracting and produced a
# float, which plt.subplot does not accept.
number_of_rows = -(-len(l) // number_of_columns)
for sid in ids:
    file = 'S' + str(sid) + '.pkl'
    # Dedicated name so the merged `df` loaded at the top is not overwritten.
    subject_df = pd.read_pickle(file).drop(['sid'], axis=1)
    plt.figure(figsize=(2*number_of_columns, 5*number_of_rows))
    # Figure-level title: an axes title set before the subplots are created
    # would belong to an axes that the grid then replaces.
    plt.suptitle('S' + str(sid))
    # Iterate over every feature (the list holds no trailing label column to
    # skip) with a separate position counter instead of reusing the outer index.
    for position, column in enumerate(l, start=1):
        plt.subplot(number_of_rows, number_of_columns, position)
        sns.distplot(subject_df[column], kde=True)

In [None]:
# For every subject: plot each remaining column as its own subplot.
for sid in ids:
    file = 'S' + str(sid) + '.pkl'
    # Dedicated name so the merged `df` loaded at the top is not overwritten.
    subject_df = pd.read_pickle(file).drop(['sid'], axis=1)
    subject_df.plot(subplots=True, figsize=(15,10), title='S' + str(sid))

## Investigate negative or invalid values

In [None]:
# Load the pickled frame of subject S3 from the working directory.
df3 = pd.read_pickle("S3.pkl")

In [None]:
# Rows of S3 whose `temp` value is negative.
df3[df3["temp"]<0]

In [None]:
# Keep only the rows whose `temp` value is strictly positive.
df_fltr = df_fltr.loc[df_fltr["temp"] > 0]

In [None]:
 # Smallest `temp` value in the filtered frame.
 df_fltr["temp"].min()

In [None]:
 # Largest `temp` value in the filtered frame.
 df_fltr["temp"].max()

In [None]:
 # Plot the `temp` signal of S3. The title names S3 explicitly: `sid` is only a
 # loop variable left over from an earlier cell and holds the last subject id,
 # not the id of the frame plotted here.
 df3["temp"].plot(subplots=True, figsize=(15,10), title='S3')