# ML Analysis to predict stress based on sleep and workload

## Imports

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from imblearn.under_sampling import RandomUnderSampler, NeighbourhoodCleaningRule
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer

In [2]:
# Name of the target column; the analysis cell uses it to select `y` from the
# feature-engineered data frame.
variable_to_predict = 'stress'

## Aux functions

In [3]:
def highlight_max(s):
    '''
    Build one CSS style string per element of a Series, marking the maximum.

    Every element equal to ``s.max()`` gets ``'background-color: darkorange'``;
    all other elements get an empty string. Ties are all highlighted.
    '''
    peak = s.max()
    return ['background-color: darkorange' if value == peak else '' for value in s]


def select_classifier(clf_name):
    '''
    Return a fresh, unfitted classifier for the given short name.

    Supported names: 'LogR' (logistic regression), 'LinSVM' (linear SVC),
    'DT' (decision tree) and 'RF' (random forest). Every estimator is created
    with the same fixed random_state so repeated runs give the same model,
    including when the search is parallelised across worker processes.

    Raises:
        ValueError: if `clf_name` is not one of the supported names.
    '''
    seed = 24091993
    if clf_name == 'LogR':
        return LogisticRegression(max_iter=1500000, random_state=seed)
    elif clf_name == 'LinSVM':
        return LinearSVC(max_iter=1500000, random_state=seed)
    elif clf_name == 'DT':
        return DecisionTreeClassifier(random_state=seed)
    elif clf_name == 'RF':
        # Previously unseeded: the forest was the only estimator whose
        # results changed from run to run.
        return RandomForestClassifier(random_state=seed)
    # Fail loudly instead of returning None, which would only surface later
    # as an obscure error inside the grid search.
    raise ValueError('Unknown classifier name: {!r}'.format(clf_name))
    

def select_params(clf_name):
    '''
    Return the hyper-parameter grid to search for the given classifier name.

    'LogR' and 'LinSVM' share a grid of 10 log-spaced values of C between
    1e-3 and 1e2. 'DT' and 'RF' get integer grids over their tree parameters.
    Any other name yields None.
    '''
    if clf_name in ('LogR', 'LinSVM'):
        return {'C': np.logspace(-3, 2, num=10)}
    if clf_name == 'DT':
        return {
            'max_depth': list(range(1, 11)),
            'min_samples_split': list(range(2, 11)),
            'min_samples_leaf': list(range(1, 11)),
        }
    if clf_name == 'RF':
        return {
            # 10..100 in steps of 10, then 120..200 in steps of 20.
            'n_estimators': list(range(10, 101, 10)) + list(range(120, 201, 20)),
            'max_depth': list(range(1, 11)),
            'criterion': ['gini', 'entropy'],
        }
    return None


def evaluate_easy(X, y, dataset_name, ws):
    '''
    Grid-search a random forest over every undersampling/scaling combination,
    without any feature selection.

    Parameters:
        X: feature matrix.
        y: target labels.
        dataset_name: label copied into every result row.
        ws: window size used to build the features; copied into every result row.

    Returns:
        A list of rows, one per (undersampling, scaler, classifier) combination:
        ['None', undersample_strategy, seed, dataset_name, ws, scaler_name,
         clf_name, best_params, best_score, 'All'],
        where best_score is the mean cross-validated weighted F1 of the best grid point.

    NOTE(review): resampling and scaling are fitted on the full data before
    cross-validation, so the validation folds have already been seen by the
    preprocessing steps. Consider moving both into a pipeline.
    '''
    scores = []
    for seed in [24091993]:
        np.random.seed(seed)
        # Same splitter as `evaluate`; the original read an undefined `cv`.
        cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=seed)
        for undersample_strategy in ['undersampling', 'none']:
            if undersample_strategy == 'undersampling':
                # Clean every class except the minority one.
                undersample = NeighbourhoodCleaningRule(sampling_strategy='not minority')
                X___, y___ = undersample.fit_resample(X, y)
            else:
                X___ = X
                y___ = y

            for scaler_name in ['Standard', 'None']:
                if scaler_name == 'Standard':
                    scaler = StandardScaler()
                elif scaler_name == 'MinMax':
                    scaler = MinMaxScaler()
                elif scaler_name == 'Robust':
                    scaler = RobustScaler()
                else:
                    scaler = None
                if scaler is not None:
                    X_ = scaler.fit_transform(X___)
                else:
                    X_ = X___

                for clf_name in ['RF']:
                    clf = select_classifier(clf_name)

                    params = select_params(clf_name)
                    grid_search = GridSearchCV(clf, params, cv=cv, scoring='f1_weighted', n_jobs=-1)
                    # Fit on the scaled matrix; the original used an unassigned `X__`.
                    grid_search.fit(X_, y___)

                    scores.append(['None', undersample_strategy, seed, dataset_name, ws, scaler_name, clf_name, grid_search.best_params_, grid_search.best_score_,  'All'])
    return scores


def evaluate(X, y, dataset_name, ws):
    '''
    Grid-search classifiers over every undersampling / scaling / feature-selection
    combination and collect the best cross-validated score of each.

    Parameters:
        X: feature matrix.
        y: target labels.
        dataset_name: label copied into every result row.
        ws: window size used to build the features; it selects which feature
            names are reported and is copied into every result row.

    Returns:
        A list of rows:
        [rfestrategy, undersample_strategy, seed, dataset_name, ws, scaler_name,
         clf_name, best_params, best_score, selected_features].
    '''
    # Feature names in the column order the caller uses to build X.
    base_names = ['sleep', 'workload', 'weekday', 'prev_stress', 'duration']
    window_names = [
        '{}_{}_{}'.format(signal, stat, ws)
        for stat in ('avg', 'std', 'max', 'min', 'delta')
        for signal in ('wl', 'sp')
    ]
    feature_names = base_names + window_names if ws > 1 else base_names

    scaler_factories = {
        'Standard': StandardScaler,
        'MinMax': MinMaxScaler,
        'Robust': RobustScaler,
    }

    results = []
    for seed in [24091993]:
        np.random.seed(seed)
        cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=seed)

        for undersample_strategy in ['undersampling', 'none']:
            if undersample_strategy == 'undersampling':
                # Clean every class except the minority one.
                cleaner = NeighbourhoodCleaningRule(sampling_strategy='not minority')
                X_sampled, y_sampled = cleaner.fit_resample(X, y)
            else:
                X_sampled, y_sampled = X, y

            for scaler_name in ['Standard', 'None']:
                make_scaler = scaler_factories.get(scaler_name)
                if make_scaler is None:
                    X_scaled = X_sampled
                else:
                    X_scaled = make_scaler().fit_transform(X_sampled)

                for clf_name in ['RF']:
                    clf = select_classifier(clf_name)

                    for rfestrategy in ['None']:
                        if rfestrategy == 'RFECV':
                            # Recursive feature elimination; keep the surviving names.
                            selector = RFECV(clf, step=1, cv=cv, scoring='f1_weighted', n_jobs=-1)
                            selector = selector.fit(X_scaled, y_sampled)
                            X_final = selector.transform(X_scaled)
                            selected_features = selector.get_feature_names_out(list(feature_names))
                        else:
                            # No selection: every feature is used.
                            selected_features = list(feature_names)
                            X_final = X_scaled

                        grid_search = GridSearchCV(
                            clf,
                            select_params(clf_name),
                            cv=cv,
                            scoring='f1_weighted',
                            n_jobs=-1,
                        )
                        grid_search.fit(X_final, y_sampled)

                        results.append([
                            rfestrategy,
                            undersample_strategy,
                            seed,
                            dataset_name,
                            ws,
                            scaler_name,
                            clf_name,
                            grid_search.best_params_,
                            grid_search.best_score_,
                            selected_features,
                        ])
    return results

## Configuration

In [16]:
# Disabled dataset entry, kept as an unassigned string literal so it is not
# part of `datasets`. Move it back into the dict below to re-enable it.
"""     'StudentLife' : {
        'path' : 'data/processed/sl_processed_data.csv',
        'train_weeks' : [13, 18],
        'test_weeks' : [19, 19],
    }, """

# Datasets to analyse, keyed by display name. The analysis cell only reads 'path'.
# NOTE(review): 'train_weeks' / 'test_weeks' are not read anywhere in the code
# shown here; presumably inclusive [first, last] week numbers — confirm.
datasets = {

    'FBK' : {
        'path' : 'data/processed/fbk_processed_data.csv',
        'train_weeks' : [45, 50],
        'test_weeks' : [51, 51],
    },
}

In [17]:
## create additional features
def create_features(data, window_size=2):
    '''
    Add calendar, previous-stress and rolling-window features to `data` in place.

    The frame must contain 'user', 'date', 'workload', 'sleep' and 'stress'
    columns. Lags are taken per user with `groupby('user').shift`, i.e. in the
    existing row order of the frame.

    Always added: 'date' converted to datetime64[ns], 'weekday' and
    'prev_stress' (the user's stress on the previous row).
    Added only when window_size > 1, for workload ('wl') and sleep ('sp'):
    '<prefix>_{avg,std,max,min,delta}_<window_size>' computed over the current
    row and the window_size - 1 preceding rows of the same user.

    Rows without enough history get NaN in the affected columns.

    Returns:
        The same (mutated) data frame.
    '''
    def chained_difference(lags):
        # lag0 - lag1 - lag2 - ...; for window_size == 2 this is simply
        # current minus previous.
        # NOTE(review): for window_size > 2 this is not "first minus last" —
        # confirm that the chained subtraction is intended.
        running = pd.Series(dtype='float64')
        for lagged_values in lags:
            running = lagged_values if running.empty else running - lagged_values
        return running

    data['date'] = data['date'].astype('datetime64[ns]')
    data['weekday'] = data['date'].dt.weekday

    # One shifted copy per lag (lag 0 is the current row).
    lagged = {
        'wl': [data.groupby('user')['workload'].shift(lag) for lag in range(window_size)],
        'sp': [data.groupby('user')['sleep'].shift(lag) for lag in range(window_size)],
    }

    data['prev_stress'] = data.groupby('user')['stress'].shift(1)

    if window_size > 1:
        aggregations = (
            ('avg', lambda lags: sum(lags) / window_size),
            ('std', lambda lags: np.std(lags, axis=0)),
            ('max', lambda lags: np.max(lags, axis=0)),
            ('min', lambda lags: np.min(lags, axis=0)),
            ('delta', chained_difference),
        )
        # Statistic-major order keeps the columns as wl_avg, sp_avg, wl_std, ...
        for stat, aggregate in aggregations:
            for prefix in ('wl', 'sp'):
                column = '{}_{}_{}'.format(prefix, stat, window_size)
                data[column] = aggregate(lagged[prefix])

    return data

## Analysis

In [18]:
# Run the full evaluation for every configured dataset and window size, and
# collect one result frame per (dataset, window size) pair.
scores_array = []
for dataset_name, dataset in datasets.items():
    data = pd.read_csv(dataset['path'])
    for ws in [2]:
        # Work on a copy: create_features mutates its argument.
        _data = data.copy()
        _data = create_features(_data, window_size=ws)
        # Rows without enough history have NaN lag features and are dropped.
        _data.dropna(how='any', inplace=True)

        # Build the column list once. 'duration' is always included so that X
        # matches the feature names `evaluate` reports for ws <= 1 as well
        # (the original omitted it in that case).
        feature_columns = ['sleep', 'workload', 'weekday', 'prev_stress', 'duration']
        if ws > 1:
            feature_columns += [
                'wl_avg_{}'.format(ws), 'sp_avg_{}'.format(ws),
                'wl_std_{}'.format(ws), 'sp_std_{}'.format(ws),
                'wl_max_{}'.format(ws), 'sp_max_{}'.format(ws),
                'wl_min_{}'.format(ws), 'sp_min_{}'.format(ws),
                'wl_delta_{}'.format(ws), 'sp_delta_{}'.format(ws),
            ]
        X = _data[feature_columns]
        y = _data[variable_to_predict]
        scores = evaluate(X, y, dataset_name, ws)

        scores = pd.DataFrame(scores, columns=['rfe', 'undersampling', 'seed', 'dataset', 'window_size', 'scaler', 'classifier', 'gs_params', 'gs_score',  'features'])
        scores_array.append(scores)

# Each per-run frame keeps its own 0-based index, so labels repeat across runs.
scores = pd.concat(scores_array)

In [19]:
# Show the FBK configuration(s) with the highest cross-validated weighted F1.
scores[scores['dataset'] == 'FBK'].query('gs_score == gs_score.max()')

Unnamed: 0,rfe,undersampling,seed,dataset,window_size,scaler,classifier,gs_params,gs_score,features
1,,undersampling,24091993,FBK,2,,RF,"{'criterion': 'gini', 'max_depth': 9, 'n_estim...",0.886972,"[sleep, workload, weekday, prev_stress, durati..."


In [8]:
# Same selection for StudentLife. That dataset is disabled in the `datasets`
# configuration, so re-running this cell yields an empty frame; the output
# below comes from an earlier execution.
scores[scores['dataset'] == 'StudentLife'].query('gs_score == gs_score.max()')

Unnamed: 0,rfe,undersampling,seed,dataset,window_size,scaler,classifier,gs_params,gs_score,features
1,,undersampling,24091993,StudentLife,2,,RF,"{'criterion': 'gini', 'max_depth': 9, 'n_estim...",0.864528,"[sleep, workload, weekday, prev_stress, durati..."


In [20]:
# Best hyper-parameters for FBK. `.loc[1]` is a label lookup: it assumes the
# best row carries index label 1, as in the output shown for the FBK query.
scores[scores['dataset'] == 'FBK'].query('gs_score == gs_score.max()').loc[1]['gs_params'] 

{'criterion': 'gini', 'max_depth': 9, 'n_estimators': 140}

In [10]:
# Best hyper-parameters for StudentLife, using the same label-1 assumption.
# NOTE(review): with StudentLife disabled in `datasets` the selection is empty,
# so this lookup presumably fails on a fresh run — confirm before re-executing.
scores[scores['dataset'] == 'StudentLife'].query('gs_score == gs_score.max()').loc[1]['gs_params']

{'criterion': 'gini', 'max_depth': 9, 'n_estimators': 160}

In [11]:
# NOTE(review): neither operand matches a gs_score printed in this notebook;
# presumably two scores copied from another run — confirm their source.
0.916497 - 0.884767

0.031730000000000036