In [39]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler, NeighbourhoodCleaningRule
import seaborn as sns
import pickle
from sklearn.ensemble import RandomForestClassifier

In [40]:
variable_to_predict = 'stress'

## Configuration

In [49]:
datasets = {
    'StudentLife' : {
        'path' : 'data/processed/sl_processed_data.csv',
        'ws' : 2,
        'train_weeks' : [13, 18],
        'test_weeks' : [19, 19],
        'clf' : RandomForestClassifier(max_depth=10, n_estimators=80, random_state=24091993, criterion='entropy') # no tocar
    },
    'FBK' : {
        'path' : 'data/processed/fbk_processed_data.csv',
        'ws' : 2,
        'train_weeks' : [45, 50],
        'test_weeks' : [51, 51],
        # 'clf' : LogisticRegression(max_iter=1500000, random_state=24091993, C=0.046415888336127795)
        'clf' : RandomForestClassifier(max_depth=9, n_estimators=140, random_state=24091993, criterion='gini')
    },
}

In [50]:
## create additional features
def create_features(data, window_size=2):
    data['date'] = data['date'].astype('datetime64[ns]')
    data['weekday'] = data['date'].dt.weekday
    
    # add a new column with the average value of the last two rows of other column
    wl_sum = 0
    sp_sum = 0
    wl_values = []
    sp_values = []
    wl_delta = pd.Series(dtype='float64')
    sp_delta = pd.Series(dtype='float64')
    for i in range(0, window_size):
        workload = data.groupby('user')['workload'].shift(i)
        sleep = data.groupby('user')['sleep'].shift(i)
        wl_sum += workload
        wl_values.append(workload)
        sp_sum += sleep
        sp_values.append(sleep)
        wl_delta = workload if wl_delta.empty else wl_delta - workload
        sp_delta = sleep if sp_delta.empty else sp_delta - sleep

    data['prev_stress'] = data.groupby('user')['stress'].shift(1)

    if window_size > 1:
        data['wl_avg_{}'.format(window_size)] = wl_sum / window_size
        data['sp_avg_{}'.format(window_size)] = sp_sum / window_size
        data['wl_std_{}'.format(window_size)] = np.std(wl_values, axis=0)
        data['sp_std_{}'.format(window_size)] = np.std(sp_values, axis=0)
        data['wl_max_{}'.format(window_size)] = np.max(wl_values, axis=0)
        data['sp_max_{}'.format(window_size)] = np.max(sp_values, axis=0)
        data['wl_min_{}'.format(window_size)] = np.min(wl_values, axis=0)
        data['sp_min_{}'.format(window_size)] = np.min(sp_values, axis=0)
        data['wl_delta_{}'.format(window_size)] = wl_delta
        data['sp_delta_{}'.format(window_size)] = sp_delta
    
    return data

## Classifier

In [51]:

for dataset_name, dataset in datasets.items():
    np.random.seed(24091993)
    data = pd.read_csv(dataset['path'])
    ws = dataset['ws']
    _data = data.copy()
    _data = create_features(_data, window_size=ws)
    _data.dropna(how='any', inplace=True)
    
    _data['week'] = _data['date'].astype('datetime64[ns]').dt.isocalendar().week
    _data['weekday'] = _data['date'].astype('datetime64[ns]').dt.weekday

    features = ['sleep','workload','prev_stress', 'weekday', 'duration', 'wl_avg_{}'.format(ws), 'wl_std_{}'.format(ws), 'wl_max_{}'.format(ws), 'wl_min_{}'.format(ws), 'wl_delta_{}'.format(ws), 'sp_avg_{}'.format(ws), 'sp_std_{}'.format(ws), 'sp_max_{}'.format(ws), 'sp_min_{}'.format(ws), 'sp_delta_{}'.format(ws)]

    train_all = False
    if train_all:
        X_train = _data[features]
        y_train = _data[variable_to_predict]
    else:
        X_train = _data[features][(_data['week'] >= dataset['train_weeks'][0]) & (_data['week'] <= dataset['train_weeks'][1])]
        y_train = _data[variable_to_predict][(_data['week'] >= dataset['train_weeks'][0]) & (_data['week'] <= dataset['train_weeks'][1])]
    
    # X_train = _data[['sleep','workload','prev_stress', 'weekday']][(_data['week'] >= dataset['train_weeks'][0]) & (_data['week'] <= dataset['train_weeks'][1])]
    # y_train = _data['stress'][(_data['week'] >= dataset['train_weeks'][0]) & (_data['week'] <= dataset['train_weeks'][1])]
    

    if False and dataset_name == 'StudentLife':
        # define undersample strategy
        undersample = NeighbourhoodCleaningRule(sampling_strategy='not minority')
        
        # fit and apply the transform
        X_train, y_train = undersample.fit_resample(X_train, y_train)

    X_test = _data[features][(_data['week'] >= dataset['test_weeks'][0]) & (_data['week'] <= dataset['test_weeks'][1])]
    # X_test = _data[['sleep','workload','prev_stress', 'weekday']][(_data['week'] >= dataset['test_weeks'][0]) & (_data['week'] <= dataset['test_weeks'][1])]
    y_test = _data[variable_to_predict][(_data['week'] >= dataset['test_weeks'][0]) & (_data['week'] <= dataset['test_weeks'][1])]
    
    # {'min_samples_split': 8, 'min_samples_leaf': 1, 'max_depth': 3}

    clf = dataset['clf']
    clf.fit(X_train.values, y_train.values)
    y_pred = clf.predict(X_test.values)

    # print coefficients
    if True or dataset_name == 'FBK':
        print('Coefficients for {} dataset'.format(dataset_name))
        print(clf.feature_importances_)
        print('')
    else:
        print('Coefficients for {} dataset'.format(dataset_name))
        print(clf.coef_)
        print('')

    # confusion matrix
    cm = classification_report(y_test.values, y_pred)
    print('Classification report for {} dataset'.format(dataset_name))
    print(cm)

    # save model
    pickle.dump(clf, open('{}_model_trained_2.pickle'.format(dataset_name), "wb"))

Coefficients for StudentLife dataset
[0.02749085 0.02266432 0.49493701 0.11398498 0.09214629 0.02962568
 0.01710214 0.02276477 0.0216628  0.0240162  0.03283002 0.02359133
 0.02191695 0.02420806 0.03105859]

Classification report for StudentLife dataset
              precision    recall  f1-score   support

           1       0.88      0.76      0.82        50
           2       0.88      0.92      0.90       148
           3       0.87      0.87      0.87        68

    accuracy                           0.88       266
   macro avg       0.88      0.85      0.86       266
weighted avg       0.88      0.88      0.87       266

Coefficients for FBK dataset
[0.0502371  0.05975045 0.23954748 0.15791923 0.11633755 0.0536789
 0.02564742 0.03383004 0.03076849 0.05472889 0.05015318 0.03001484
 0.02891562 0.03456028 0.03391052]

Classification report for FBK dataset
              precision    recall  f1-score   support

           1       0.76      0.81      0.78        31
           2       0.

In [52]:
print(classification_report(y_train, clf.predict(X_train.values)))

              precision    recall  f1-score   support

           1       0.93      1.00      0.96       184
           2       1.00      0.80      0.89        69
           3       1.00      1.00      1.00        34

    accuracy                           0.95       287
   macro avg       0.98      0.93      0.95       287
weighted avg       0.95      0.95      0.95       287



In [53]:
print(classification_report(y_test, clf.predict(X_test.values)))

              precision    recall  f1-score   support

           1       0.76      0.81      0.78        31
           2       0.33      0.33      0.33        12
           3       0.50      0.33      0.40         6

    accuracy                           0.63        49
   macro avg       0.53      0.49      0.50        49
weighted avg       0.62      0.63      0.62        49



In [8]:
_data[['change']].value_counts()

KeyError: "None of [Index(['change'], dtype='object')] are in the [columns]"