# **1. Packaging**

In [None]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used further down; consider narrowing the filter.
warnings.filterwarnings("ignore")
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)

In [None]:
from analysis_modules import *

# **2. Working on Data**

### 2.1 Data Gathering

In [5]:
# File name of the input CSV; it is handed to leeDatos in the next cell.
data_file = "synthetic_coffee_health_10000.csv"

In [6]:
# Read the input file into `datos` and print an overview of it.
# NOTE(review): leeDatos and describeData are not defined in this notebook
# (presumably they come from analysis_modules); judging by the captured output,
# `datos` is a table indexed by "ID" - confirm against the helper module.
datos = leeDatos(data_file)
describeData(datos)

    Age  Gender  Country  Coffee_Intake  Caffeine_mg  Sleep_Hours  \
ID                                                                  
1    40    Male  Germany            3.5        328.1          7.5   
2    33    Male  Germany            1.0         94.1          6.2   
3    42    Male   Brazil            5.3        503.7          5.9   
4    53    Male  Germany            2.6        249.2          7.3   
5    32  Female    Spain            3.1        298.0          5.3   

   Sleep_Quality   BMI  Heart_Rate Stress_Level  Physical_Activity_Hours  \
ID                                                                         
1           Good  24.9          78          Low                     14.5   
2           Good  20.0          67          Low                     11.0   
3           Fair  22.7          59       Medium                     11.2   
4           Good  24.7          71          Low                      6.6   
5           Fair  24.1          76       Medium             

### 2.2 Preprocessing

In [8]:
# Step 1: reduce the raw table to the columns used for modelling and preview it.
# NOTE(review): selectData is a project helper; the captured output suggests it
# drops the text columns and adds the SQ_* indicator columns - confirm.
dataSelected = selectData(datos)
print(dataSelected.head(), end="\n\n")

# Step 2: run the project preprocessing helper on the selection and preview it.
dataPreprocessed = preprocess(dataSelected)
print(dataPreprocessed.head())

    Age  Coffee_Intake  Caffeine_mg  Sleep_Hours   BMI  Heart_Rate  \
ID                                                                   
1    40            3.5        328.1          7.5  24.9          78   
2    33            1.0         94.1          6.2  20.0          67   
3    42            5.3        503.7          5.9  22.7          59   
4    53            2.6        249.2          7.3  24.7          71   
5    32            3.1        298.0          5.3  24.1          76   

    Physical_Activity_Hours  Smoking  Alcohol_Consumption  SQ_Low  SQ_Fair  \
ID                                                                           
1                      14.5        0                    0       0        0   
2                      11.0        0                    0       0        0   
3                      11.2        0                    0       0        1   
4                       6.6        0                    0       0        0   
5                       8.5        0     

# **3. ML Models**

### 3.1 Split Data

In [None]:
# Fraction of the rows passed to splitDataSet as `test_size`.
ts_size = .25
# splitDataSet is a project helper; its two results are kept as the train and test sets.
[trainSet, testSet] = splitDataSet(dataPreprocessed, test_size=ts_size, randSplit=True, stratify=None)
print("Train set")
print(trainSet.head())
# Fixed: this was a bare `print` (no parentheses), which only evaluates the
# function object and therefore never emitted the intended blank line.
print()
print("Test set")
print(testSet.head())
print()

Train set
           Age  Coffee_Intake  Caffeine_mg  Sleep_Hours       BMI  Heart_Rate  \
ID                                                                              
2968  0.161290       0.353659     0.347943     0.371429  0.172414    0.067797   
701   0.161290       0.451220     0.446623     0.485714  0.370690    0.508475   
3482  0.000000       0.048780     0.049981     0.314286  0.512931    0.406780   
1622  0.000000       0.439024     0.435089     0.442857  0.737069    0.677966   
801   0.451613       0.609756     0.608356     0.400000  0.271552    0.084746   

      Physical_Activity_Hours  Smoking  Alcohol_Consumption  SQ_Low  SQ_Fair  \
ID                                                                             
2968                 0.046667      0.0                  1.0     0.0      1.0   
701                  0.366667      0.0                  1.0     0.0      0.0   
3482                 0.093333      0.0                  1.0     0.0      1.0   
1622                 0

### 3.2 Implement Models

In [55]:
import time as tm

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Seed NumPy's legacy global random generator so repeated runs draw the same numbers.
np.random.seed(0)

In [14]:
# Show the last four column labels, then every label before them.
# metodosML (defined below) uses the last four columns as targets and the
# remaining ones as features, so this is a quick check of that layout.
print(dataPreprocessed.columns[-4:])
print(dataPreprocessed.columns[:-4])

Index(['SQ_Low', 'SQ_Fair', 'SQ_Good', 'SQ_Excellent'], dtype='object')
Index(['Age', 'Coffee_Intake', 'Caffeine_mg', 'Sleep_Hours', 'BMI',
       'Heart_Rate', 'Physical_Activity_Hours', 'Smoking',
       'Alcohol_Consumption'],
      dtype='object')


In [64]:
# --------------------
def metodosML(dataSet=0, dataSet2=0):
    """
    Tune, fit and score several classifiers, then print a comparison report.

    The last four columns of ``dataSet`` are used as the (indicator) target
    columns and every other column as a feature. Each candidate model is tuned
    with ``GridSearchCV`` (5-fold, default scoring) on ``dataSet``; the best
    estimator is then scored on both ``dataSet`` and ``dataSet2`` with
    accuracy and macro-averaged precision, recall and F1.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Training data; features first, the four target columns last.
    dataSet2 : pandas.DataFrame
        Test data. Its columns are selected by the names found in ``dataSet``,
        so it must contain the same column labels.

    Returns
    -------
    None
        Fit times and metric tables are printed, not returned.

    Raises
    ------
    TypeError
        If either argument is not a DataFrame (including the placeholder
        default ``0``).
    ValueError
        If ``dataSet`` has no feature columns left after reserving the targets.
    """
    # Fail early with a clear message instead of an AttributeError on `0.columns`.
    if not (isinstance(dataSet, pd.DataFrame) and isinstance(dataSet2, pd.DataFrame)):
        raise TypeError("metodosML expects two pandas DataFrames: (train set, test set)")

    n_targets = 4  # number of trailing columns that hold the target indicators
    if len(dataSet.columns) <= n_targets:
        raise ValueError(
            "dataSet needs at least one feature column besides the %d target columns" % n_targets
        )

    metrics = ['accuracy', 'precision', 'recall', 'f1']
    featureCols = list(dataSet.columns[:-n_targets])
    yVar = list(dataSet.columns[-n_targets:])

    # Select by label in both frames so the test features are guaranteed to line
    # up with the training features even if the column order differs.
    X_train = dataSet[featureCols].to_numpy()
    y_train = dataSet[yVar].to_numpy()
    X_test = dataSet2[featureCols].to_numpy()
    y_test = dataSet2[yVar].to_numpy()

    # (label, estimator, parameter grid) for every model that is actually run.
    # SVC and GradientBoostingClassifier are left out on purpose: they do not
    # support multi-label targets directly. To add them, wrap the estimator in
    # sklearn.multioutput.MultiOutputClassifier and prefix grid keys with
    # "estimator__".
    candidates = [
        ('DT', DecisionTreeClassifier(random_state=0), {
            'criterion': ["gini", "entropy"],
            'max_depth': [5, 10, 15, 20, 25, 30, None],
            'max_leaf_nodes': [5, 10, 15, 20, 35, None],
        }),
        ('RF', RandomForestClassifier(random_state=0), {
            'criterion': ["gini", "entropy"],
            'n_estimators': [50, 100],
            'max_depth': [5, 10, None],
            'max_leaf_nodes': [5, 10, None],
        }),
        ('KNN', KNeighborsClassifier(), {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        }),
    ]

    methodsUsed = [name for name, _, _ in candidates]
    trainModelMetrics = pd.DataFrame(index=methodsUsed, columns=metrics)
    testModelMetrics = pd.DataFrame(index=methodsUsed, columns=metrics)
    compTime = pd.DataFrame(index=methodsUsed, columns=['Fit Time (sec)'])
    compTime.index.name = "Computation Time"

    def _storeScores(table, model_name, y_true, y_pred):
        """Write the four metrics for one model into its row of `table`."""
        table.loc[model_name, 'accuracy'] = accuracy_score(y_true, y_pred)
        table.loc[model_name, 'precision'] = precision_score(y_true, y_pred, average='macro', zero_division=0)
        table.loc[model_name, 'recall'] = recall_score(y_true, y_pred, average='macro', zero_division=0)
        table.loc[model_name, 'f1'] = f1_score(y_true, y_pred, average='macro', zero_division=0)

    for model_name, estimator, param_grid in candidates:
        print('Classification with ' + model_name)

        grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, verbose=1)
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments during a long search.
        start = tm.perf_counter()
        grid_search.fit(X_train, y_train)
        compTime.loc[model_name, 'Fit Time (sec)'] = tm.perf_counter() - start
        best_model = grid_search.best_estimator_

        print("Best parameters set found on development set:")
        print(grid_search.best_params_)
        print()
        print()

        # Rows are addressed by model label, not by a hand-maintained position.
        _storeScores(trainModelMetrics, model_name, y_train, best_model.predict(X_train))
        _storeScores(testModelMetrics, model_name, y_test, best_model.predict(X_test))

    # Previously the function printed "Classification with SVC/GB" although
    # neither model was fitted; say explicitly that they are skipped.
    print('Skipping SVC and GB: no direct multi-label support')

    print("=======================================================================")
    print("                      FIT TIME COMPARISON")
    print("-----------------------------------------------------------------------")
    print("                      Training")
    print("-----------------------------------------------------------------------")
    print(compTime)
    print("=======================================================================")
    print("                      MODEL PERFORMANCE COMPARISON")
    print("-----------------------------------------------------------------------")
    print("                      Training")
    print("-----------------------------------------------------------------------")
    print(trainModelMetrics)
    print("-----------------------------------------------------------------------")
    print("                      Testing")
    print("-----------------------------------------------------------------------")
    print(testModelMetrics)
    print("\n")

In [65]:
# Run the model comparison: grid-search each classifier on the training split
# and print fit times plus train/test metrics. Nothing is returned.
metodosML(trainSet, testSet)

Classification with DT
Fitting 5 folds for each of 84 candidates, totalling 420 fits
Best parameters set found on development set:
{'criterion': 'entropy', 'max_depth': 15, 'max_leaf_nodes': None}


Classification with RF
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters set found on development set:
{'criterion': 'gini', 'max_depth': 10, 'max_leaf_nodes': None, 'n_estimators': 50}


Classification with SVC
Classification with GB
Classification with KNN
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters set found on development set:
{'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'distance'}


                      FIT TIME COMPARISON
-----------------------------------------------------------------------
                      Training
-----------------------------------------------------------------------
                 Fit Time (sec)
Computation Time               
DT                     6.964955
RF                   125.408