In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler, NeighbourhoodCleaningRule
import seaborn as sns
import pickle
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Name of the label column; the training loop reads it from the feature frame
# as the target for both the train and the test split.
variable_to_predict = 'stress'

## Configuration

In [33]:
# Per-dataset experiment configuration, keyed by dataset name.
#   'path'        : CSV file loaded with pd.read_csv.
#   'ws'          : window size handed to create_features (number of per-user lags).
#   'train_weeks' : [first, last] ISO week numbers, both inclusive, used for fitting.
#   'test_weeks'  : [first, last] ISO week numbers, both inclusive, used for evaluation.
#   'clf'         : estimator instance; the training loop fits this very object in place.
# Dict order matters: the datasets are processed in insertion order, so the
# variables left behind by the training loop belong to the last entry.
datasets = {
    'StudentLife' : {
        'path' : 'data/processed/sl_processed_data.csv',
        'ws' : 2,
        'train_weeks' : [13, 18],
        'test_weeks' : [19, 19],
        # No random_state here: the training loop calls np.random.seed(...) before
        # each fit, and scikit-learn falls back to NumPy's global RNG in that case.
        'clf' : RandomForestClassifier(max_depth=2, n_estimators=180, criterion='gini')
    },
    'FBK' : {
        'path' : 'data/processed/fbk_processed_data.csv',
        'ws' : 2,
        'train_weeks' : [45, 50],
        'test_weeks' : [51, 51],
        # Alternative estimator, currently disabled:
        # 'clf' : LogisticRegression(max_iter=1500000, random_state=24091993, C=0.046415888336127795)
        'clf' : RandomForestClassifier(max_depth=7 , n_estimators=40, random_state=24091993, criterion='gini')
    },
}

In [34]:
## create additional features
def create_features(data, window_size=2):
    """Add calendar, previous-stress and windowed workload/sleep columns to ``data``.

    The frame is modified in place and also returned. It must provide the
    columns ``user``, ``date``, ``workload``, ``sleep`` and ``stress``.
    Lags are taken per user in the frame's current row order (rows are not
    re-sorted here).

    Columns written:
      * ``date``        -- cast to ``datetime64[ns]``
      * ``weekday``     -- ``date.dt.weekday``
      * ``prev_stress`` -- ``stress`` of the previous row of the same user
      * only when ``window_size > 1``, for each of ``wl`` (workload) and
        ``sp`` (sleep): ``<prefix>_avg_<w>``, ``<prefix>_std_<w>``,
        ``<prefix>_max_<w>``, ``<prefix>_min_<w>`` and ``<prefix>_delta_<w>``,
        computed over the current row and the ``window_size - 1`` rows before
        it. Rows without a full window end up as NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; mutated.
    window_size : int, default 2
        Number of consecutive per-user rows (lags 0 .. window_size - 1) that
        feed the window statistics.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    def _running_difference(lagged_series):
        # lag0 - lag1 - lag2 - ...; the empty float Series only marks
        # "nothing accumulated yet".
        difference = pd.Series(dtype='float64')
        for series in lagged_series:
            difference = series if difference.empty else difference - series
        return difference

    data['date'] = data['date'].astype('datetime64[ns]')
    data['weekday'] = data['date'].dt.weekday

    per_user = data.groupby('user')
    lags = range(0, window_size)
    lagged = {
        'wl': [per_user['workload'].shift(lag) for lag in lags],
        'sp': [per_user['sleep'].shift(lag) for lag in lags],
    }

    data['prev_stress'] = per_user['stress'].shift(1)

    # A window of a single row carries no extra information.
    if not window_size > 1:
        return data

    # (column infix, reducer over the list of lagged Series); the order here
    # fixes the order in which the new columns are appended.
    aggregates = (
        ('avg', lambda series_list: sum(series_list) / window_size),
        ('std', lambda series_list: np.std(series_list, axis=0)),
        ('max', lambda series_list: np.max(series_list, axis=0)),
        ('min', lambda series_list: np.min(series_list, axis=0)),
        ('delta', _running_difference),
    )
    for stat_name, reducer in aggregates:
        for prefix in ('wl', 'sp'):
            column = '{}_{}_{}'.format(prefix, stat_name, window_size)
            data[column] = reducer(lagged[prefix])

    return data

## Classifier

In [45]:

# Train and evaluate one classifier per configured dataset.
# The names bound in the loop body (clf, X_train, y_train, X_test, y_test, ...)
# stay defined at notebook level after the loop and hold the values of the
# last dataset processed; the report cells further down rely on that.
for dataset_name, dataset in datasets.items():
    # Re-seed NumPy's global RNG for every dataset so that estimators created
    # without an explicit random_state still give repeatable results.
    np.random.seed(24091993)
    data = pd.read_csv(dataset['path'])
    ws = dataset['ws']

    scaler = StandardScaler()

    _data = create_features(data.copy(), window_size=ws)
    # Rows without a complete window (or with any other missing value) are dropped.
    _data = _data.dropna(how='any')

    # Keep only rows whose previous entry for the same user had stress == 1.
    # The explicit copy makes the filtered frame independent, so the column
    # assignments below are not writes into a slice of the unfiltered frame.
    # Note: after this filter 'prev_stress' is constant, which is why its
    # importance prints as 0; it stays in the feature list to keep the model
    # inputs unchanged.
    _data = _data[_data['prev_stress'] == 1].copy()

    dates = _data['date'].astype('datetime64[ns]')
    # ISO week number only (no year): the configured week ranges assume the
    # data does not span the same week number in two different years.
    _data['week'] = dates.dt.isocalendar().week
    _data['weekday'] = dates.dt.weekday

    window_stats = ('avg', 'std', 'max', 'min', 'delta')
    features = ['sleep', 'workload', 'prev_stress', 'weekday', 'duration']
    features += ['wl_{}_{}'.format(stat, ws) for stat in window_stats]
    features += ['sp_{}_{}'.format(stat, ws) for stat in window_stats]

    # Week ranges are inclusive on both ends.
    train_first, train_last = dataset['train_weeks']
    test_first, test_last = dataset['test_weeks']
    in_train = (_data['week'] >= train_first) & (_data['week'] <= train_last)
    in_test = (_data['week'] >= test_first) & (_data['week'] <= test_last)

    # Set to True to fit on every available row instead of the training weeks only.
    train_all = False
    if train_all:
        X_train = _data[features]
        y_train = _data[variable_to_predict]
    else:
        X_train = _data.loc[in_train, features]
        y_train = _data.loc[in_train, variable_to_predict]

    # Experiment toggle: undersample the majority class of the StudentLife
    # training split. Disabled; enabling it changes the reported results.
    use_undersampling = False
    if use_undersampling and dataset_name == 'StudentLife':
        undersample = RandomUnderSampler(sampling_strategy='not minority')
        X_train, y_train = undersample.fit_resample(X_train, y_train)

    X_test = _data.loc[in_test, features]
    y_test = _data.loc[in_test, variable_to_predict]

    # Hyper-parameters noted from an earlier experiment (estimator not recorded):
    # {'min_samples_split': 8, 'min_samples_leaf': 1, 'max_depth': 3}

    # Standardisation is off; when enabled the scaler is fitted on the
    # training split only and then applied to the test split.
    scale = False
    if scale:
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    clf = dataset['clf']
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Print whichever per-feature weights the estimator exposes: tree
    # ensembles provide feature_importances_, linear models provide coef_.
    print('Coefficients for {} dataset'.format(dataset_name))
    if hasattr(clf, 'feature_importances_'):
        print(clf.feature_importances_)
    else:
        print(clf.coef_)
    print('')

    # Per-class precision / recall / F1 on the held-out weeks.
    cm = classification_report(y_test, y_pred)
    print('Classification report for {} dataset'.format(dataset_name))
    print(cm)

    # Persist the fitted model; the context manager guarantees the file is
    # flushed and closed. Only unpickle these files from a trusted location.
    with open('{}_model_trained.pickle'.format(dataset_name), "wb") as model_file:
        pickle.dump(clf, model_file)

Coefficients for StudentLife dataset
[0.06304225 0.09257126 0.         0.09698217 0.09264077 0.10156944
 0.07480018 0.10139883 0.00140741 0.0617757  0.07759905 0.08022283
 0.0601129  0.01438156 0.08149565]

Classification report for StudentLife dataset
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.50      0.67         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4

Coefficients for FBK dataset
[0.07281016 0.05812827 0.         0.15962994 0.15089401 0.08621256
 0.05214806 0.04686988 0.05614315 0.08245412 0.05797999 0.02582965
 0.04702557 0.06477078 0.03910385]

Classification report for FBK dataset
              precision    recall  f1-score   support

           0       0.89      0.85      0.87        20
           1       0.25      0.33      0.29         3

    accuracy       

In [46]:
# Training-split report. clf, X_train and y_train are the variables left
# behind by the final iteration of the training loop (the last entry of
# `datasets`), so this cell must be run after that loop.
print(classification_report(y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94       124
           1       1.00      0.56      0.72        34

    accuracy                           0.91       158
   macro avg       0.95      0.78      0.83       158
weighted avg       0.92      0.91      0.89       158



In [47]:
# Test-split report for the same leftover model: clf, X_test and y_test come
# from the final iteration of the training loop (the last entry of `datasets`).
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87        20
           1       0.25      0.33      0.29         3

    accuracy                           0.78        23
   macro avg       0.57      0.59      0.58        23
weighted avg       0.81      0.78      0.80        23

