# Fine-tuning of the approach

## Imports

In [2]:
import pandas as pd
import numpy as np
import pickle

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report



In [4]:
def select_classifier(clf_name):
    """Build an untrained classifier for the given short name.

    Supported names are ``'LogR'`` (logistic regression), ``'DT'``
    (decision tree) and ``'RF'`` (random forest). Every estimator is
    created with the same fixed ``random_state``.

    Returns ``None`` when ``clf_name`` is not one of the supported names.
    """
    seed = 24091993
    # Constructors are wrapped in lambdas so only the requested estimator
    # is actually instantiated.
    factories = {
        'LogR': lambda: LogisticRegression(max_iter=1500000, random_state=seed),
        'DT': lambda: DecisionTreeClassifier(random_state=seed),
        'RF': lambda: RandomForestClassifier(random_state=seed),
    }
    factory = factories.get(clf_name)
    return factory() if factory is not None else None
    

def select_params(clf_name):
    """Return the hyper-parameter grid to search for a classifier name.

    ``'LogR'`` and ``'LinSVM'`` share a log-spaced grid over ``C``;
    ``'DT'`` and ``'RF'`` get tree-specific grids. A fresh dict is built on
    every call. Returns ``None`` for any other name.
    """
    if clf_name in ('LogR', 'LinSVM'):
        # Ten values of C, log-spaced between 1e-3 and 1e2.
        return {'C': np.logspace(-3, 2, num=10)}

    one_to_ten = list(range(1, 11))

    if clf_name == 'DT':
        return {
            'max_depth': one_to_ten,
            'min_samples_split': list(range(2, 11)),
            'min_samples_leaf': list(range(1, 11)),
        }

    if clf_name == 'RF':
        # 10..100 in steps of 10, then 120..200 in steps of 20.
        tree_counts = list(range(10, 101, 10)) + list(range(120, 201, 20))
        return {
            'n_estimators': tree_counts,
            'max_depth': one_to_ten,
            'criterion': ['gini', 'entropy'],
        }

    return None


def create_features(data, window_size=2):
    """Add calendar and per-user sliding-window features to ``data``.

    The frame is modified in place and also returned. It must provide the
    columns ``date``, ``user``, ``workload``, ``sleep``, ``voice`` and
    ``stress``. "Previous" rows are the preceding rows of the same user in
    the frame's current order (presumably chronological - confirm against
    the preprocessing step).

    Always added:
      * ``date`` converted to ``datetime64[ns]`` and ``weekday`` derived
        from it;
      * ``prev_stress``: the ``stress`` value of the user's previous row.

    Added only when ``window_size > 1``, for workload (``wl``), sleep
    (``sp``) and voice (``voice``), over the current row and the
    ``window_size - 1`` previous rows of the same user:
    ``<prefix>_avg_<n>``, ``<prefix>_std_<n>``, ``<prefix>_max_<n>``,
    ``<prefix>_min_<n>`` and ``<prefix>_delta_<n>``, where ``delta`` is the
    current value minus every earlier value in the window. Rows lacking a
    full window of history get NaN in these columns.
    """
    data['date'] = data['date'].astype('datetime64[ns]')
    data['weekday'] = data['date'].dt.weekday

    per_user = data.groupby('user')

    # For each signal: [current value, value 1 row back, 2 rows back, ...].
    signals = (('wl', 'workload'), ('sp', 'sleep'), ('voice', 'voice'))
    lags = {
        prefix: [per_user[column].shift(step) for step in range(window_size)]
        for prefix, column in signals
    }

    data['prev_stress'] = per_user['stress'].shift(1)

    if window_size <= 1:
        # A window of a single row carries no aggregate information.
        return data

    def window_delta(series_list):
        # current - previous - the one before that - ...
        delta = series_list[0]
        for older in series_list[1:]:
            delta = delta - older
        return delta

    statistics = {
        'avg': lambda series_list: sum(series_list) / window_size,
        'std': lambda series_list: np.std(series_list, axis=0),
        'max': lambda series_list: np.max(series_list, axis=0),
        'min': lambda series_list: np.min(series_list, axis=0),
        'delta': window_delta,
    }

    # Column insertion order is part of the contract: downstream code feeds
    # the frame positionally (``.values``) to the classifiers. Workload and
    # sleep columns are interleaved per statistic, voice columns come last.
    for stat_name, compute in statistics.items():
        for prefix in ('wl', 'sp'):
            data[f'{prefix}_{stat_name}_{window_size}'] = compute(lags[prefix])
    for stat_name, compute in statistics.items():
        data[f'voice_{stat_name}_{window_size}'] = compute(lags['voice'])

    return data


In [3]:
# Experiment grid. Each value below is substituted into the name of the
# pre-processed CSV file that the search loop loads.
# NOTE(review): presumably the maximum percentage of missing data tolerated
# by the preprocessing step - confirm against the preprocessing notebook.
max_missing_percentages = [30, 45, 60, 75]
# Names of the data-selection strategies; also the keys of each dataset's
# 'weeks' entry below.
data_strategies = ['balanced', 'reliable']

# Per-dataset configuration, keyed by the dataset prefix used in file names.
#   'cut_strategies': file-name suffixes to iterate over ('' means no suffix).
#   'weeks': per data strategy,
#       'start'         - first week number of the period,
#       'length'        - total number of weeks in the period,
#       'train_lengths' - candidate numbers of training weeks.
# NOTE(review): only 'cut_strategies' and 'train_lengths' are read by the
# search loop; 'start' and 'length' are referenced solely by the disabled
# week-based split in evaluate().
datasets = {
    'sl' : {
        'cut_strategies' : [''],
        'weeks' : {
            'balanced' : {
                'start' : 13,
                'length' : 7,
                'train_lengths': [5, 6] 
            },
            'reliable' : {
                'start' : 13,
                'length' : 4,
                'train_lengths': [2, 3] 
            },
        },
    },
    'fbk' : {
        'cut_strategies' : ['_cut', '_notcut'],
        'weeks' : {
            'balanced' : {
                'start' : 45,
                'length' : 7,
                'train_lengths': [5, 6] 
            },
            'reliable' : {
                'start' : 46,
                'length' : 5,
                'train_lengths': [3, 4] 
            },
        },
    }
}

In [4]:
def evaluate(dataset_name, cut_strategy, max_missing_percentage, data_strategy, train_length, dataset):
    """Grid-search LogR, DT and RF on ``dataset`` and report the best of each.

    For every classifier an exhaustive ``GridSearchCV`` (5-fold,
    ``f1_weighted`` scoring, all cores) is fitted on all rows of
    ``dataset``, using every column except ``stress`` (the target),
    ``date``, ``duration`` and ``user`` as features.

    Returns a list with one row per classifier:
    ``[dataset_name, clf_name, str(best_params), best_score,
    max_missing_percentage, data_strategy, train_length, cut_strategy]``.

    NOTE(review): ``dataset_name``, ``cut_strategy``,
    ``max_missing_percentage``, ``data_strategy`` and ``train_length`` are
    only copied into the result rows. In particular the week-based
    train/test split that would have used ``train_length`` is disabled, so
    different ``train_length`` values yield identical scores.
    """
    excluded_columns = ['stress', 'date', 'duration', 'user']
    results = []
    for clf_name in ('LogR', 'DT', 'RF'):
        # Re-seed NumPy's global RNG before each search.
        np.random.seed(24091993)
        estimator = select_classifier(clf_name)
        param_grid = select_params(clf_name)

        # Work on a private copy so the caller's frame is never touched.
        frame = dataset.copy()
        features = frame.drop(excluded_columns, axis=1)
        target = frame['stress']

        search = GridSearchCV(estimator, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
        search.fit(features, target)

        results.append([
            dataset_name,
            clf_name,
            str(search.best_params_),
            search.best_score_,
            max_missing_percentage,
            data_strategy,
            train_length,
            cut_strategy,
        ])
    return results


In [5]:
# Run the grid search for every combination of dataset, cut strategy,
# missing-data threshold, data strategy and training length, collecting one
# small result frame per combination and concatenating them into `scores`.
scores_array = []
for dataset_name in datasets:
    for cut_strategy in datasets[dataset_name]['cut_strategies']:
        for max_missing_percentage in max_missing_percentages:
            for data_strategy in data_strategies:
                dataframe = pd.read_csv('../data/processed/{}_processed_data_{}_{}{}.csv'.format(dataset_name, max_missing_percentage, data_strategy, cut_strategy))
                # Feature engineering does not depend on train_length, so it
                # is done once per file rather than once per training length.
                # evaluate() copies the frame it receives, so sharing `data`
                # across calls is safe.
                data = create_features(dataframe.copy(), window_size=2)
                # Rows without a full window of history contain NaN features.
                data.dropna(inplace=True)
                for train_length in datasets[dataset_name]['weeks'][data_strategy]['train_lengths']:
                    scores = evaluate(dataset_name, cut_strategy, max_missing_percentage, data_strategy, train_length, data)
                    scores = pd.DataFrame(scores, columns=['dataset', 'classifier', 'params', 'score', 'max_missing_percentage', 'data_strategy', 'train_length', 'cut_strategy'])
                    scores_array.append(scores)
# The per-run indices (0, 1, 2) are kept, so index labels repeat in `scores`.
scores = pd.concat(scores_array)

In [6]:
# Display the combined grid-search results for every configuration.
scores

Unnamed: 0,dataset,classifier,params,score,max_missing_percentage,data_strategy,train_length,cut_strategy
0,sl,LogR,{'C': 7.742636826811278},0.681652,30,balanced,5,
1,sl,DT,"{'max_depth': 3, 'min_samples_leaf': 4, 'min_s...",0.661957,30,balanced,5,
2,sl,RF,"{'criterion': 'entropy', 'max_depth': 4, 'n_es...",0.634313,30,balanced,5,
0,sl,LogR,{'C': 7.742636826811278},0.681652,30,balanced,6,
1,sl,DT,"{'max_depth': 3, 'min_samples_leaf': 4, 'min_s...",0.661957,30,balanced,6,
...,...,...,...,...,...,...,...,...
1,fbk,DT,"{'max_depth': 2, 'min_samples_leaf': 1, 'min_s...",0.736233,75,reliable,3,_notcut
2,fbk,RF,"{'criterion': 'gini', 'max_depth': 6, 'n_estim...",0.723822,75,reliable,3,_notcut
0,fbk,LogR,{'C': 7.742636826811278},0.730231,75,reliable,4,_notcut
1,fbk,DT,"{'max_depth': 2, 'min_samples_leaf': 1, 'min_s...",0.736233,75,reliable,4,_notcut


In [10]:
# Parameter string of the best-scoring fbk configuration (first row if tied).
scores[scores['dataset'] == 'fbk'].query('score == score.max()').iloc[0]['params']

"{'max_depth': 4, 'min_samples_leaf': 10, 'min_samples_split': 2}"

In [11]:
# Every fbk row that reaches the maximum score among fbk results.
scores[scores['dataset'] == 'fbk'].query('score == score.max()')

Unnamed: 0,dataset,classifier,params,score,max_missing_percentage,data_strategy,train_length,cut_strategy
1,fbk,DT,"{'max_depth': 4, 'min_samples_leaf': 10, 'min_...",0.77398,75,balanced,5,_cut
1,fbk,DT,"{'max_depth': 4, 'min_samples_leaf': 10, 'min_...",0.77398,75,balanced,6,_cut


In [9]:
# Parameter string of the best-scoring sl configuration (first row if tied).
scores[scores['dataset'] == 'sl'].query('score == score.max()').iloc[0]['params']

"{'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}"

In [12]:
# Every sl row that reaches the maximum score among sl results.
scores[scores['dataset'] == 'sl'].query('score == score.max()')

Unnamed: 0,dataset,classifier,params,score,max_missing_percentage,data_strategy,train_length,cut_strategy
1,sl,DT,"{'max_depth': 2, 'min_samples_leaf': 1, 'min_s...",0.782199,75,balanced,5,
1,sl,DT,"{'max_depth': 2, 'min_samples_leaf': 1, 'min_s...",0.782199,75,balanced,6,


## Train model with the fine-tuned parameters

In [5]:
# Train the final decision tree for the `sl` dataset (75 % / balanced file)
# with the hyper-parameters selected by the grid search, evaluate it on the
# held-out week and persist the fitted model.
clf = DecisionTreeClassifier(random_state=24091993,  min_samples_split=2, min_samples_leaf=1, max_depth=2)
data = pd.read_csv('../data/processed/sl_processed_data_75_balanced.csv')
data = create_features(data, window_size=2)
# Rows without a full window of history contain NaN features.
data.dropna(inplace=True)
data['week'] = data['date'].astype('datetime64[ns]').dt.isocalendar().week
# Weeks 13-18 (six weeks) are used for training, week 19 for testing.
training_data = data[(data['week'] >= 13) & (data['week'] <= 13 + 6 - 1)]
test_data = data[(data['week'] >= 13 + 6) & (data['week'] < 13 + 7)]
# `.values` passes plain arrays, so the model is fitted without feature names.
clf.fit(training_data.drop(['stress', 'date', 'duration', 'user', 'week'], axis=1).values, training_data['stress'])
y_pred = clf.predict(test_data.drop(['stress', 'date', 'duration', 'user', 'week'], axis=1).values)
y_true = test_data['stress']
print(classification_report(y_true, y_pred))
# Use a context manager so the file handle is flushed and closed even if
# pickling fails.
with open('../trained_models/sl_data_75_balanced_model_trained.pickle', "wb") as model_file:
    pickle.dump(clf, model_file)

              precision    recall  f1-score   support

           1       0.88      0.86      0.87        50
           2       0.92      0.91      0.92       148
           3       0.87      0.91      0.89        68

    accuracy                           0.90       266
   macro avg       0.89      0.89      0.89       266
weighted avg       0.90      0.90      0.90       266



In [6]:
# Classification report on the training rows, using the `clf` and
# `training_data` currently in the notebook namespace.
print(classification_report(training_data['stress'], clf.predict(training_data.drop(['stress', 'date', 'duration', 'user', 'week'], axis=1).values)))

              precision    recall  f1-score   support

           1       0.72      0.73      0.73       381
           2       0.77      0.77      0.77       725
           3       0.79      0.77      0.78       452

    accuracy                           0.76      1558
   macro avg       0.76      0.76      0.76      1558
weighted avg       0.76      0.76      0.76      1558



In [1]:
# Train the final decision tree for the `fbk` dataset (75 % / balanced / cut
# file) with the hyper-parameters selected by the grid search, evaluate it on
# the held-out week and persist the fitted model.
# NOTE: this cell needs the import cell to have been run first.
clf = DecisionTreeClassifier(random_state=24091993,  min_samples_split=2, min_samples_leaf=10, max_depth=4)
data = pd.read_csv('../data/processed/fbk_processed_data_75_balanced_cut.csv')
data = create_features(data, window_size=2)
# Rows without a full window of history contain NaN features.
data.dropna(inplace=True)
data['week'] = data['date'].astype('datetime64[ns]').dt.isocalendar().week
# Weeks 45-50 (six weeks) are used for training, week 51 for testing.
training_data = data[(data['week'] >= 45) & (data['week'] <= 45 + 6 - 1)]
test_data = data[(data['week'] >= 45 + 6) & (data['week'] < 45 + 7)]
# `.values` passes plain arrays, so the model is fitted without feature names.
clf.fit(training_data.drop(['stress', 'date', 'duration', 'user', 'week'], axis=1).values, training_data['stress'])
y_pred = clf.predict(test_data.drop(['stress', 'date', 'duration', 'user', 'week'], axis=1).values)
y_true = test_data['stress']
print(classification_report(y_true, y_pred))
# Use a context manager so the file handle is flushed and closed even if
# pickling fails.
with open('../trained_models/fbk_data_75_balanced_cut_model_trained.pickle', "wb") as model_file:
    pickle.dump(clf, model_file)

NameError: name 'DecisionTreeClassifier' is not defined

In [None]:
# Classification report on the training rows, using the `clf` and
# `training_data` currently in the notebook namespace.
print(classification_report(training_data['stress'], clf.predict(training_data.drop(['stress', 'date', 'duration', 'user', 'week'], axis=1).values)))

              precision    recall  f1-score   support

           1       0.84      0.88      0.86       649
           2       0.68      0.68      0.68       356
           3       0.77      0.67      0.72       225

    accuracy                           0.78      1230
   macro avg       0.76      0.74      0.75      1230
weighted avg       0.78      0.78      0.78      1230

