In [7]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import seaborn as sns

## Configuration

In [2]:
datasets = {
    'StudentLife' : {
        'path' : 'data/processed/sl_processed_data.csv',
        'features' : ['sleep','workload','weekday','prev_stress'],
        'ws' : 1,
        'train_weeks' : [13, 18],
        'test_weeks' : [19, 19],
        'clf' : LogisticRegression(max_iter=1500000, random_state=24091993, C= 2.1544346900318843)
    },
    'FBK' : {
        'path' : 'data/processed/fbk_processed_data.csv',
        'features' : ['sleep','workload','weekday','prev_stress'],
        'ws' : 1,
        'train_weeks' : [45, 50],
        'test_weeks' : [51, 51],
        'clf' : DecisionTreeClassifier(max_depth=3, min_samples_leaf=1, min_samples_split=8)
    },
}

In [3]:
## create additional features
def create_features(data, window_size=2):
    data['date'] = data['date'].astype('datetime64[ns]')
    data['weekday'] = data['date'].dt.weekday
    
    # add a new column with the average value of the last two rows of other column
    wl_sum = 0
    sp_sum = 0
    wl_values = []
    sp_values = []
    wl_delta = pd.Series(dtype='float64')
    sp_delta = pd.Series(dtype='float64')
    for i in range(0, window_size):
        workload = data.groupby('user')['workload'].shift(i)
        sleep = data.groupby('user')['sleep'].shift(i)
        wl_sum += workload
        wl_values.append(workload)
        sp_sum += sleep
        sp_values.append(sleep)
        wl_delta = workload if wl_delta.empty else wl_delta - workload
        sp_delta = sleep if sp_delta.empty else sp_delta - sleep

    data['prev_stress'] = data.groupby('user')['stress'].shift(1)

    if window_size > 1:
        data['wl_avg_{}'.format(window_size)] = wl_sum / window_size
        data['sp_avg_{}'.format(window_size)] = sp_sum / window_size
        data['wl_std_{}'.format(window_size)] = np.std(wl_values, axis=0)
        data['sp_std_{}'.format(window_size)] = np.std(sp_values, axis=0)
        data['wl_max_{}'.format(window_size)] = np.max(wl_values, axis=0)
        data['sp_max_{}'.format(window_size)] = np.max(sp_values, axis=0)
        data['wl_min_{}'.format(window_size)] = np.min(wl_values, axis=0)
        data['sp_min_{}'.format(window_size)] = np.min(sp_values, axis=0)
        data['wl_delta_{}'.format(window_size)] = wl_delta
        data['sp_delta_{}'.format(window_size)] = sp_delta
    
    return data

## Classifier

In [11]:

for dataset_name, dataset in datasets.items():
    data = pd.read_csv(dataset['path'])
    ws = dataset['ws']
    features = dataset['features']
    _data = data.copy()
    _data = create_features(_data, window_size=ws)
    _data.dropna(how='any', inplace=True)
    
    _data['week'] = _data['date'].astype('datetime64[ns]').dt.isocalendar().week
    scores = []
    for user in _data['user'].unique().tolist():
        __data = _data.loc[_data['user'] == user].copy()
        X_train = __data[features][(__data['week'] >= dataset['train_weeks'][0]) & (__data['week'] <= dataset['train_weeks'][1])]
        y_train = __data['stress'][(__data['week'] >= dataset['train_weeks'][0]) & (__data['week'] <= dataset['train_weeks'][1])]
        X_test = __data[features][(__data['week'] >= dataset['test_weeks'][0]) & (__data['week'] <= dataset['test_weeks'][1])]
        y_test = __data['stress'][(__data['week'] >= dataset['test_weeks'][0]) & (__data['week'] <= dataset['test_weeks'][1])]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
        # {'min_samples_split': 8, 'min_samples_leaf': 1, 'max_depth': 3}

        clf = dataset['clf']
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # fscore
        fscore = f1_score(y_test, y_pred, average='weighted')

        # confusion matrix
        cm = classification_report(y_test, y_pred)
        scores.append(fscore)
    
    print('Average F1 score for {} dataset: {}'.format(dataset_name, np.mean(scores)))

Average F1 score for StudentLife dataset: 0.6130952380952381
Average F1 score for FBK dataset: 0.5978937728937729


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
print(classification_report(y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

           1       0.82      0.96      0.89        28
           2       0.88      0.54      0.67        13

    accuracy                           0.83        41
   macro avg       0.85      0.75      0.78        41
weighted avg       0.84      0.83      0.82        41

