# Fine-tuning of the approach

## Imports

In [20]:
import pandas as pd
import numpy as np
import pickle

from tqdm.notebook import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.feature_selection import RFECV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler, NeighbourhoodCleaningRule
from sklearn.preprocessing import StandardScaler

In [39]:
def select_classifier(clf_name):
    """Return a fresh, unfitted classifier for the given short name.

    Supported names:
        'LogR' -- logistic regression
        'DT'   -- decision tree
        'RF'   -- random forest
        'ADA'  -- AdaBoost with a logistic-regression base estimator

    Every estimator is created with the same fixed ``random_state``.

    Raises:
        ValueError: if ``clf_name`` is not one of the supported names.
    """
    if clf_name == 'LogR':
        return LogisticRegression(max_iter=1500000, random_state=24091993)
    if clf_name == 'DT':
        return DecisionTreeClassifier(random_state=24091993)
    if clf_name == 'RF':
        return RandomForestClassifier(random_state=24091993)
    if clf_name == 'ADA':
        return AdaBoostClassifier(
            random_state=24091993,
            estimator=LogisticRegression(max_iter=1500000, random_state=24091993),
        )
    # Fail loudly instead of implicitly returning None: a None estimator only
    # blows up later inside the hyper-parameter search with a confusing error.
    # Note that select_params knows 'LinSVM', but no estimator exists for it.
    raise ValueError(
        "Unknown classifier name {!r}; expected one of 'LogR', 'DT', 'RF', 'ADA'".format(clf_name)
    )
    

def select_params(clf_name):
    """Return the hyper-parameter search space for a classifier short name.

    The result is a dict mapping estimator parameter names to the candidate
    values to try. 'LogR' and 'LinSVM' share the same log-spaced grid for
    ``C``. Names without a defined search space yield ``None``.
    """
    if clf_name in ('LogR', 'LinSVM'):
        # Ten values of C, log-spaced between 1e-3 and 1e2.
        return {'C': np.logspace(-3, 2, num=10)}

    one_to_ten = list(range(1, 11))
    # 10..100 in steps of 10, then 120..200 in steps of 20.
    ensemble_sizes = list(range(10, 101, 10)) + list(range(120, 201, 20))

    if clf_name == 'DT':
        return {
            'max_depth': one_to_ten,
            'min_samples_split': list(range(2, 11)),
            'min_samples_leaf': list(range(1, 11)),
        }
    if clf_name == 'RF':
        return {
            'n_estimators': ensemble_sizes,
            'max_depth': one_to_ten,
            'criterion': ['gini', 'entropy'],
        }
    if clf_name == 'ADA':
        return {
            'n_estimators': ensemble_sizes,
            # 0.1, 0.2, ..., 1.0
            'learning_rate': [step / 10 for step in range(1, 11)],
        }
    return None

def create_features(data, window_size=2):
    """Add calendar and per-user window features to ``data`` and return it.

    The frame is modified in place (the same object is returned):

    * ``date`` is converted to ``datetime64[ns]`` and ``weekday`` is derived
      from it.
    * ``prev_stress`` is the ``stress`` value of the same user's previous row.
    * If ``window_size > 1``: for workload (``wl``), sleep (``sp``) and voice,
      the current row plus the ``window_size - 1`` preceding rows of the same
      user are summarised into ``<prefix>_{avg,std,max,min,delta}_<window_size>``.
    * If ``window_size > 2``: the same statistics are added for the
      ``window_size - 1`` preceding ``stress`` values (current row excluded).

    ``delta`` is the first value of the window minus every later one. Rows
    whose window reaches before a user's first row contain missing values.

    NOTE(review): shifting by row assumes each user's rows are already in
    chronological order -- confirm against the data preparation step.
    """
    data['date'] = data['date'].astype('datetime64[ns]')
    data['weekday'] = data['date'].dt.weekday

    def lags_of(column, offsets):
        # One copy of `column` per offset, shifted within each user.
        return [data.groupby('user')[column].shift(offset) for offset in offsets]

    def total_and_delta(lagged):
        # Running sum of the lags, and "first lag minus all later lags".
        total = 0
        delta = pd.Series(dtype='float64')
        for series in lagged:
            total += series
            delta = series if delta.empty else delta - series
        return total, delta

    def summarise(lagged, total, delta, divisor):
        # Insertion order here fixes the order of the generated columns.
        return {
            'avg': total / divisor,
            'std': np.std(lagged, axis=0),
            'max': np.max(lagged, axis=0),
            'min': np.min(lagged, axis=0),
            'delta': delta,
        }

    # Current row (offset 0) plus the preceding window_size - 1 rows.
    current_and_past = range(0, window_size)
    wl_lags = lags_of('workload', current_and_past)
    sp_lags = lags_of('sleep', current_and_past)
    voice_lags = lags_of('voice', current_and_past)
    wl_total, wl_delta = total_and_delta(wl_lags)
    sp_total, sp_delta = total_and_delta(sp_lags)
    voice_total, voice_delta = total_and_delta(voice_lags)

    if window_size > 2:
        # Stress is the target, so only strictly earlier rows are used.
        stress_lags = lags_of('stress', range(1, window_size))
        stress_total, stress_delta = total_and_delta(stress_lags)

    data['prev_stress'] = data.groupby('user')['stress'].shift(1)

    if window_size > 1:
        wl_stats = summarise(wl_lags, wl_total, wl_delta, window_size)
        sp_stats = summarise(sp_lags, sp_total, sp_delta, window_size)
        # Workload and sleep columns are interleaved statistic by statistic.
        for stat in wl_stats:
            data['{}_{}_{}'.format('wl', stat, window_size)] = wl_stats[stat]
            data['{}_{}_{}'.format('sp', stat, window_size)] = sp_stats[stat]

        voice_stats = summarise(voice_lags, voice_total, voice_delta, window_size)
        for stat, values in voice_stats.items():
            data['{}_{}_{}'.format('voice', stat, window_size)] = values

        if window_size > 2:
            stress_stats = summarise(stress_lags, stress_total, stress_delta, window_size - 1)
            for stat, values in stress_stats.items():
                data['{}_{}_{}'.format('stress', stat, window_size)] = values

    return data


In [22]:
# Week-selection strategies evaluated by this notebook.
data_strategies = ['balanced']

# Per-dataset configuration, keyed by dataset short name.
# 'cut_strategies' and 'max_missing_percentages' are combined with the data
# strategy to build the processed CSV file name; 'weeks' records, for each
# strategy, a start week, a total length and the candidate training lengths.
datasets = dict(
    sl=dict(
        cut_strategies=[''],
        weeks=dict(
            balanced=dict(start=13, length=7, train_lengths=[5, 6]),
            reliable=dict(start=13, length=4, train_lengths=[2, 3]),
        ),
        max_missing_percentages=[75],
    ),
    fbk=dict(
        cut_strategies=['_notcut'],
        weeks=dict(
            balanced=dict(start=45, length=7, train_lengths=[5, 6]),
            reliable=dict(start=46, length=5, train_lengths=[3, 4]),
        ),
        max_missing_percentages=[60],
    ),
)

In [70]:
def evaluate(dataset_name, cut_strategy, max_missing_percentage, data_strategy, ws, dataset):
    """Tune each candidate classifier on ``dataset`` and collect the results.

    A randomized hyper-parameter search (weighted F1, 5-fold shuffled
    stratified cross-validation) is run with ``stress`` as the target and
    every column except ``stress``, ``date``, ``duration`` and ``user`` as a
    feature. ``dataset`` itself is not modified; a copy is used.

    Returns a list with one row per classifier:
    ``[dataset_name, clf_name, best_params, best_score,
    max_missing_percentage, data_strategy, ws, cut_strategy, feature_columns]``.
    """
    results = []
    for clf_name in ('RF',):
        # Re-seed the global NumPy RNG before every search.
        np.random.seed(24091993)
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=24091993)
        estimator = select_classifier(clf_name)
        search_space = select_params(clf_name)

        frame = dataset.copy()
        features = frame.drop(columns=['stress', 'date', 'duration', 'user'])
        feature_names = features.columns
        target = frame['stress']

        # Undersampling (NeighbourhoodCleaningRule, 'not minority') is
        # currently disabled; the search runs on the data as given.
        search = RandomizedSearchCV(
            estimator,
            search_space,
            cv=folds,
            scoring='f1_weighted',
            n_jobs=-1,
        )
        search.fit(features, target)

        results.append([
            dataset_name,
            clf_name,
            search.best_params_,
            search.best_score_,
            max_missing_percentage,
            data_strategy,
            ws,
            cut_strategy,
            feature_names,
        ])
    return results

In [71]:
# Run the hyper-parameter search for every configured dataset variant and
# gather one results frame per variant.
score_columns = ['dataset', 'classifier', 'params', 'score', 'max_missing_percentage',
                 'data_strategy', 'ws', 'cut_strategy', 'features']
scores_array = []
for dataset_name in ['sl', 'fbk']:
    dataset_config = datasets[dataset_name]
    for cut_strategy in dataset_config['cut_strategies']:
        for max_missing_percentage in dataset_config['max_missing_percentages']:
            for data_strategy in data_strategies:
                csv_path = '../data/processed/{}_processed_data_{}_{}{}.csv'.format(
                    dataset_name, max_missing_percentage, data_strategy, cut_strategy)
                dataframe = pd.read_csv(csv_path)
                # Build the features on a copy so the raw frame stays untouched.
                data = create_features(dataframe.copy(), window_size=2)
                # Rows without enough history contain missing values.
                data.dropna(inplace=True)
                scores = pd.DataFrame(
                    evaluate(dataset_name, cut_strategy, max_missing_percentage, data_strategy, 2, data),
                    columns=score_columns,
                )
                scores_array.append(scores)
# One combined table over all dataset variants.
scores = pd.concat(scores_array)

In [72]:
# Show the best-scoring configurations first (at most 15 rows).
ranked_scores = scores.sort_values(by=['score'], ascending=False)
ranked_scores.head(15)

Unnamed: 0,dataset,classifier,params,score,max_missing_percentage,data_strategy,ws,cut_strategy,features
0,fbk,RF,"{'n_estimators': 120, 'max_depth': 6, 'criteri...",0.759557,60,balanced,2,_notcut,"Index(['workload', 'sleep', 'voice', 'weekday'..."
0,sl,RF,"{'n_estimators': 80, 'max_depth': 10, 'criteri...",0.752359,75,balanced,2,,"Index(['workload', 'sleep', 'voice', 'weekday'..."


## Train model with the fine-tuned parameters

In [73]:
# Rebuild the best 'sl' configuration found by the search: same classifier
# type, configured with the winning hyper-parameters. The row is looked up
# once and reused (the index of `scores` is not unique after the concat, so
# the query/iloc form is kept rather than an index-based lookup).
best_sl_row = scores[scores['dataset'] == 'sl'].query('score == score.max()').iloc[0]
clf = select_classifier(best_sl_row['classifier'])
clf.set_params(**best_sl_row['params'])

data = pd.read_csv('../data/processed/sl_processed_data_75_balanced.csv')
data = create_features(data, window_size=2)
data.dropna(inplace=True)
data['prev_stress'] = data['prev_stress'].astype(int)
data['week'] = data['date'].astype('datetime64[ns]').dt.isocalendar().week

# Temporal split on ISO week number: weeks 13-17 train, weeks 18-19 test.
training_data = data[(data['week'] >= 13) & (data['week'] <= 17)]
test_data = data[(data['week'] >= 18) & (data['week'] < 20)]

# Columns that are the target, identifiers or split helpers -- not features.
non_feature_columns = ['stress', 'date', 'duration', 'user', 'week']
X_train = training_data.drop(non_feature_columns, axis=1).values
y_train = training_data['stress']

# Optional undersampling, currently disabled:
# undersample = NeighbourhoodCleaningRule(sampling_strategy='not minority')
# X_train, y_train = undersample.fit_resample(X_train, y_train)

clf.fit(X_train, y_train)
y_pred = clf.predict(test_data.drop(non_feature_columns, axis=1).values)
y_true = test_data['stress']
print(clf.feature_importances_)
print(classification_report(y_true, y_pred))

# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open('../trained_models/sl_data_75_balanced_model_trained.pickle', "wb") as model_file:
    pickle.dump(clf, model_file)

[0.02517346 0.02842446 0.02897885 0.09050737 0.38226862 0.0323143
 0.03561193 0.01497267 0.02499668 0.02530832 0.02482898 0.02303304
 0.02487759 0.02583235 0.0347456  0.05091746 0.03168468 0.03220943
 0.0292124  0.03410179]
              precision    recall  f1-score   support

           1       0.70      0.74      0.72        88
           2       0.82      0.83      0.82       265
           3       0.84      0.82      0.83       179

    accuracy                           0.81       532
   macro avg       0.79      0.79      0.79       532
weighted avg       0.81      0.81      0.81       532



In [59]:
# Score the fitted model on its own training rows, for comparison with the
# test-set report.
train_features = training_data.drop(['stress', 'date', 'duration', 'user', 'week'], axis=1).values
train_predictions = clf.predict(train_features)
print(classification_report(training_data['stress'], train_predictions))

              precision    recall  f1-score   support

           1       0.81      0.68      0.74       343
           2       0.82      0.81      0.81       608
           3       0.76      0.91      0.83       341

    accuracy                           0.80      1292
   macro avg       0.80      0.80      0.80      1292
weighted avg       0.80      0.80      0.80      1292



In [12]:
# Alternative candidates; neither is fitted or saved in this cell.
clf1 = RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=30, random_state=24091993)
clf2 = LogisticRegression(max_iter=1500000, random_state=24091993, C=27.825594022071257)
# NOTE(review): these hyper-parameters match the 'sl' row of the search
# results shown earlier in this notebook (n_estimators=80, max_depth=10), not
# the 'fbk' row (n_estimators=120, max_depth=6) -- confirm this is intended.
clf = RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=80,
                             random_state=24091993)

data = pd.read_csv('../data/processed/fbk_processed_data_60_balanced_notcut.csv')
data = create_features(data, window_size=2)
data.dropna(inplace=True)
data['week'] = data['date'].astype('datetime64[ns]').dt.isocalendar().week

# Temporal split on ISO week number: six training weeks starting at week 45,
# followed by one test week (seven weeks in total).
first_week = 45
train_weeks = 6
total_weeks = 7
training_data = data[(data['week'] >= first_week) & (data['week'] <= first_week + train_weeks - 1)]
test_data = data[(data['week'] >= first_week + train_weeks) & (data['week'] < first_week + total_weeks)]

# Columns that are the target, identifiers or split helpers -- not features.
non_feature_columns = ['stress', 'date', 'duration', 'user', 'week']
clf.fit(training_data.drop(non_feature_columns, axis=1).values, training_data['stress'])
y_pred = clf.predict(test_data.drop(non_feature_columns, axis=1).values)
y_true = test_data['stress']
print(classification_report(y_true, y_pred))

# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open('../trained_models/fbk_data_60_balanced_notcut_model_trained.pickle', "wb") as model_file:
    pickle.dump(clf, model_file)

              precision    recall  f1-score   support

           1       0.87      0.90      0.89       100
           2       0.76      0.80      0.78        59
           3       0.83      0.67      0.74        30

    accuracy                           0.83       189
   macro avg       0.82      0.79      0.80       189
weighted avg       0.83      0.83      0.83       189



In [13]:
# Preview the feature matrix used for training: everything except the target,
# identifier and split-helper columns.
training_data.drop(columns=['stress', 'date', 'duration', 'user', 'week'])

Unnamed: 0,workload,sleep,voice,weekday,prev_stress,wl_avg_2,sp_avg_2,wl_std_2,sp_std_2,wl_max_2,sp_max_2,wl_min_2,sp_min_2,wl_delta_2,sp_delta_2,voice_avg_2,voice_std_2,voice_max_2,voice_min_2,voice_delta_2
1,3,3,4,1,2.0,3.0,3.0,0.0,0.0,3.0,3.0,3.0,3.0,0.0,0.0,4.0,0.0,4.0,4.0,0.0
2,3,3,4,2,2.0,3.0,3.0,0.0,0.0,3.0,3.0,3.0,3.0,0.0,0.0,4.0,0.0,4.0,4.0,0.0
3,3,3,4,3,2.0,3.0,3.0,0.0,0.0,3.0,3.0,3.0,3.0,0.0,0.0,4.0,0.0,4.0,4.0,0.0
4,3,3,4,4,2.0,3.0,3.0,0.0,0.0,3.0,3.0,3.0,3.0,0.0,0.0,4.0,0.0,4.0,4.0,0.0
5,3,3,4,5,2.0,3.0,3.0,0.0,0.0,3.0,3.0,3.0,3.0,0.0,0.0,4.0,0.0,4.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1311,5,2,5,2,1.0,4.5,3.5,0.5,1.5,5.0,5.0,4.0,2.0,1.0,-3.0,4.5,0.5,5.0,4.0,1.0
1312,5,2,5,3,2.0,5.0,2.0,0.0,0.0,5.0,2.0,5.0,2.0,0.0,0.0,5.0,0.0,5.0,5.0,0.0
1313,5,4,5,4,2.0,5.0,3.0,0.0,1.0,5.0,4.0,5.0,2.0,0.0,2.0,5.0,0.0,5.0,5.0,0.0
1314,5,4,1,5,1.0,5.0,4.0,0.0,0.0,5.0,4.0,5.0,4.0,0.0,0.0,3.0,2.0,5.0,1.0,-4.0


In [14]:
# Score the fitted model on its own training rows, for comparison with the
# test-set report.
train_features = training_data.drop(['stress', 'date', 'duration', 'user', 'week'], axis=1).values
train_predictions = clf.predict(train_features)
print(classification_report(training_data['stress'], train_predictions))

              precision    recall  f1-score   support

           1       0.95      0.98      0.97       592
           2       0.94      0.92      0.93       322
           3       0.98      0.93      0.96       193

    accuracy                           0.95      1107
   macro avg       0.96      0.94      0.95      1107
weighted avg       0.95      0.95      0.95      1107

