In [139]:
import pandas as pd
# import sweetviz as sv
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
#from pandas_profiling import ProfileReport

In [140]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# 1. Description
The data we have is from the nurse study and contains labled data (stress/no stress). The data contains data from differente sensors: ACC (accelerometer), BVP (blood volume pulse), EDA (electrodermal activity), HR (hear rate), TEMP (temperature). In the following, we look at the features created with the FLIRT (https://flirt.readthedocs.io/en/latest/) library (see other script).

Note: there might be other potential features to be calculated on the raw data, for example via tsfresh (https://tsfresh.readthedocs.io/en/latest/index.html) or TSFEL (https://tsfel.readthedocs.io/en/latest/). However, FLIRT was specifically developed with the wrist sensor used in the two dataset used here, so we can reasonably expect it to produce meaningful features based on the available data.

# 2. Data Source

In [146]:
# load data - features calculated with Flirt with
# window_length = 60 and
# window_step_size = 10
df = pd.read_parquet('data-input/flirt-nurse-acc-bvp-eda-hr-temp-60-10.parquet')

In [175]:
df.shape

(26654, 266)

In [147]:
df.head(3)

Unnamed: 0_level_0,bvp_BVP_mean,bvp_BVP_std,bvp_BVP_min,bvp_BVP_max,bvp_BVP_ptp,bvp_BVP_sum,bvp_BVP_energy,bvp_BVP_skewness,bvp_BVP_kurtosis,bvp_BVP_peaks,...,hr_l2_n_sign_changes,hr_l2_iqr,hr_l2_iqr_5_95,hr_l2_pct_5,hr_l2_pct_95,hr_l2_entropy,hr_l2_perm_entropy,hr_l2_svd_entropy,subject,label
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-08 08:31:00,0.376078,207.18698,-1336.28,1333.44,2669.72,1444.14,164838100.0,-0.210934,8.208982,174,...,0.0,16.19,20.185,78.669,98.854,4.090539,0.537376,0.040885,15,1
2020-07-08 08:31:10,0.813401,205.085789,-1336.28,1470.89,2807.17,3123.46,161513600.0,0.453604,10.256058,172,...,0.0,11.9525,18.416,78.669,97.085,4.091445,0.879136,0.054576,15,1
2020-07-08 08:31:20,-1.458391,183.18793,-823.18,1470.89,2294.07,-5600.22,128870200.0,1.219684,12.75353,178,...,0.0,6.735,14.3385,78.669,93.0075,4.092899,0.998001,0.068318,15,1


In [148]:
# there are no missing values in the dataset
df.isnull().sum().value_counts()

0    266
dtype: int64

In [149]:
df['subject'].value_counts()

83    4289
E4    3516
DF    2610
7A    2566
CE    1918
BG    1901
EG    1716
6B    1612
6D    1508
5C    1432
F5    1196
94     830
7E     690
8B     586
15     284
Name: subject, dtype: int64

In [150]:
df['label'].value_counts(normalize=True)

1    0.839274
0    0.160726
Name: label, dtype: float64

# 3. Train-test split
We perform the train-test split before we conduct EDA on the train set. Thus, we avoid data leakage from the test set.

In [151]:
X = df.drop(columns=['subject', 'label'])
y = df[['label']]

In [152]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [153]:
df_train= pd.concat([X_train, y_train], axis=1)

# 4. EDA

## 4.1 Looking into data

In [154]:
# we do not have categorical features, only int (count) and float
df_train.dtypes.value_counts()

float64    224
int32       30
int64       11
dtype: int64

In [156]:
df_train.describe()

  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)


Unnamed: 0,bvp_BVP_mean,bvp_BVP_std,bvp_BVP_min,bvp_BVP_max,bvp_BVP_ptp,bvp_BVP_sum,bvp_BVP_energy,bvp_BVP_skewness,bvp_BVP_kurtosis,bvp_BVP_peaks,...,hr_l2_n_below_mean,hr_l2_n_sign_changes,hr_l2_iqr,hr_l2_iqr_5_95,hr_l2_pct_5,hr_l2_pct_95,hr_l2_entropy,hr_l2_perm_entropy,hr_l2_svd_entropy,label
count,21323.0,21323.0,21323.0,21323.0,21323.0,21323.0,21323.0,21323.0,21323.0,21323.0,...,21323.0,21323.0,21323.0,21323.0,21323.0,21323.0,21323.0,21323.0,21323.0,21323.0
mean,-0.000777,81.896289,-480.656962,441.288654,921.945616,-2.289911,38882190.0,-0.275215,9.714851,147.879942,...,29.690475,0.0,5.03422,8.901721,79.926363,88.828083,4.069301,0.693065,0.028639,0.838859
std,0.689915,60.261046,370.4849,347.764399,695.564727,2201.859181,55445800.0,0.982297,12.333167,38.255196,...,6.448868,0.0,4.774026,7.483727,12.146316,14.762834,0.164423,0.330956,0.014352,0.367669
min,-19.12429,2.391237,-3096.16,7.68,16.71,-21276.77,4267.821,-10.078009,-1.214924,11.0,...,3.0,0.0,0.06,0.1,51.6365,53.937,1.945893,-0.0,0.001518,0.0
25%,-0.154538,35.961048,-667.85,192.485,406.155,-581.305,4857504.0,-0.645942,3.209126,123.0,...,26.0,0.0,1.6575,3.539,72.98,79.324,4.092998,0.537376,0.018573,1.0
50%,-0.000911,63.184157,-381.27,353.58,749.17,-3.49,15088620.0,-0.239709,6.138227,146.0,...,30.0,0.0,3.49,6.676,78.522,86.47,4.093934,0.831474,0.026046,1.0
75%,0.156691,114.524212,-205.97,588.04,1265.59,590.59,49438610.0,0.152924,11.628347,170.0,...,34.0,0.0,6.85,11.9995,85.7975,95.8225,4.09423,0.962146,0.035681,1.0
max,20.174836,468.793463,-8.27,2506.46,5525.44,22740.86,520889000.0,10.436767,172.777141,479.0,...,51.0,0.0,50.9125,80.0085,165.049,179.8715,4.094344,1.0,0.265936,1.0


In [157]:
# remove rows with only one value for each row
#overall_length = len(df_train)
columns = df_train.columns.tolist()
constant_columns = []

for c in columns:
    unique_in_column = len(df_train[c].unique())
    
    #if unique_in_column/overall_length < 0.1 and c != 'label' and c != 'subject':
    if unique_in_column == 1:
        constant_columns.append(c)

In [158]:
constant_columns

['bvp_BVP_entropy',
 'acc_l2_n_sign_changes',
 'temp_TEMP_n_sign_changes',
 'temp_l2_n_sign_changes',
 'hr_HR_n_sign_changes',
 'hr_l2_n_sign_changes']

In [159]:
# remove rows where we cannot calculate sandard deviation of the column

df_desc = df_train.describe()
columns_describe = df_train.describe().columns.tolist()
no_std_columns = []

for c in columns:
    std = df_desc[c]['std']

    #if unique_in_column/overall_length < 0.1 and c != 'label' and c != 'subject':
    if np.isnan(std):
        no_std_columns.append(c)

  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)


In [160]:
no_std_columns

['bvp_BVP_entropy', 'acc_x_entropy', 'acc_y_entropy', 'acc_z_entropy']

In [170]:
columns_to_drop = list((set(constant_columns).union(set(no_std_columns))))

In [171]:
columns_to_drop

['acc_y_entropy',
 'bvp_BVP_entropy',
 'acc_z_entropy',
 'hr_HR_n_sign_changes',
 'hr_l2_n_sign_changes',
 'temp_l2_n_sign_changes',
 'acc_l2_n_sign_changes',
 'acc_x_entropy',
 'temp_TEMP_n_sign_changes']

In [172]:
df_train = df_train.drop(columns=columns_to_drop)

## 4.2 Correlations

In [None]:
plt.figure(figsize=(35, 35))
corr = df_train.corr(method='spearman')
heatmap = sns.heatmap(corr.sort_values(by='label', ascending=False),
                      vmin=-1, vmax=1, annot=True, fmt='.1g', cmap='BrBG')
heatmap.set_title('Features correlating with stress label', fontdict={'fontsize':15}, pad=16);

The image is really large, so we load a screenshot here
![alt text](eda-nurse-acc-bvp-eda-hr-temp.jpg)

# 5. Documenting data lineage

The dataset contains
* BVP (blood volume pulse) sensor data in 64hz
* ACC: accelerometer data (x, y, z values) in 32hz
* EDA (electrodermal activity) in 4hz
* HR (hear rate) in 1hz
* TEMP (temperature) in 4hz

The dataset is labeled (stress/no stress).

Script ```01-extract-data-from-nurse-dataset``` downloads the nurse study dataset. It unzips all included files. From the raw data from the Empatica E4 wrist sensor, we use the devicely library (https://hpi-dhc.github.io/devicely/) for extracing the needed data and setting it in the correct timezone given in the publication about the data (https://www.nature.com/articles/s41597-022-01361-y). We look up the ground-truth labels and use the stress/no stress labels to label the sensor data. The results are stored as a parquet file.

In script ```02-calculate-features```, we calculate features with the FLIRT library (https://flirt.readthedocs.io/en/latest/). In this notebook you're currently reading, we perform EDA. In the following steps, we might want to go back to feature calculation and calculate other/more features.

# 6. Observations from EDA

### Looking into the dataset
* Given the windows size and step size, we have 26654 rows and 264 features.
* We could also calculate feature via tsfresh and/or TSFEL; and we could try different parameters for window_length and window_step_size when using FLIRT.
* There are no missing values.
* We have 84% of positive cases (stress) and 16% of negative cases in our data - we have to account for this when building and evaluating the model, e.g., by using appropriate evaluation metrics for imbalanced data.
* We do not have categorical variables, only numerical (count and float).

* There are a some columns that we drop, because they either have have the same value for each row, or it is impossible to calculate the standard deviation on them:
  * acc_y_entropy
  * bvp_BVP_entropy
  * acc_z_entropy
  * hr_HR_n_sign_changes
  * hr_l2_n_sign_changes
  * temp_l2_n_sign_changes
  * acc_l2_n_sign_changes
  * acc_x_entropy
  * temp_TEMP_n_sign_changes
* The ranges of the values are quite far from each other - we should normalize/standardize.

### Correlations
* There are several correlated features. Because of the amount of features, we should apply an automated method for deciding which features to keep.