# Model Selection

## Objective

The purpose of this notebook is to test different models for the project.  

## Import libraries

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Load dataset

In [2]:
#- Define data file
file ='../dataset/ObesityDataSet_raw_and_data_sinthetic.csv'

In [3]:
#- Load dataset to a pandas dataframe for analysis
ds = pd.read_csv(file)

## Preprocessing

In [4]:
# Transformation of binary data
ds["Gender"] = ds.Gender.apply(lambda s: 1 if s == "Female" else 0)
ds["family_history_with_overweight"] = ds.family_history_with_overweight.apply(lambda s: 1 if s == "yes" else 0)
ds["FAVC"] = ds.FAVC.apply(lambda s: 1 if s == "yes" else 0)
ds["SMOKE"] = ds.SMOKE.apply(lambda s: 1 if s == "yes" else 0)
ds["SCC"] = ds.SCC.apply(lambda s: 1 if s == "yes" else 0)

In [5]:
# One hot encoding for categorical data
CAEC_list = pd.get_dummies(ds.CAEC, prefix="CAEC")
ds.drop("CAEC", inplace=True, axis=1)
ds = ds.join(CAEC_list)

CALC_list = pd.get_dummies(ds.CALC, prefix="CALC")
ds.drop("CALC", inplace=True, axis=1)
ds = ds.join(CALC_list)

MTRANS_list = pd.get_dummies(ds.MTRANS, prefix="MTRANS")
ds.drop("MTRANS", inplace=True, axis=1)
ds = ds.join(MTRANS_list)

In [6]:
# Transformation of target feature through a dictionary
obesity = {"Insufficient_Weight":1, "Normal_Weight":2, "Overweight_Level_I":3, "Overweight_Level_II":4, "Obesity_Type_I":5, "Obesity_Type_II":6, "Obesity_Type_III":7}
ds["NObeyesdad"] = ds.NObeyesdad.map(obesity)

## Obtain Train and Test datasets

In [7]:
# Obtain train and test datasets
X_train, X_test, y_train, y_test = train_test_split(ds.drop('NObeyesdad',axis=1), 
                                                    ds['NObeyesdad'],
                                                    test_size=0.30, 
                                                    random_state=0)

In [8]:
# Standard scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Logistic Regression

In [9]:
# Train model
log_model = LogisticRegression(max_iter=10000)
log_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Test model
y_pred = log_model.predict(X_test)
log_accuracy = accuracy_score (y_test, y_pred)
log_accuracy

0.8911671924290221

In [11]:
# Evaluate model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.90      0.96      0.92        90
           2       0.85      0.78      0.81        87
           3       0.82      0.77      0.79        81
           4       0.81      0.80      0.81        82
           5       0.93      0.90      0.92       103
           6       0.95      1.00      0.97        90
           7       0.94      0.99      0.97       101

    accuracy                           0.89       634
   macro avg       0.89      0.89      0.88       634
weighted avg       0.89      0.89      0.89       634



In [12]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 86,   4,   0,   0,   0,   0,   0],
       [ 10,  68,   7,   2,   0,   0,   0],
       [  0,   8,  62,  11,   0,   0,   0],
       [  0,   0,   7,  66,   7,   1,   1],
       [  0,   0,   0,   2,  93,   3,   5],
       [  0,   0,   0,   0,   0,  90,   0],
       [  0,   0,   0,   0,   0,   1, 100]], dtype=int64)

## SVM

In [13]:
# Train model
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [14]:
# Test model
y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score (y_test, y_pred)
svm_accuracy

0.8564668769716088

In [15]:
# Evaluate model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.90      0.92      0.91        90
           2       0.63      0.71      0.67        87
           3       0.81      0.77      0.78        81
           4       0.76      0.73      0.75        82
           5       0.92      0.84      0.88       103
           6       0.97      0.99      0.98        90
           7       1.00      0.99      1.00       101

    accuracy                           0.86       634
   macro avg       0.85      0.85      0.85       634
weighted avg       0.86      0.86      0.86       634



In [16]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 83,   7,   0,   0,   0,   0,   0],
       [  8,  62,   8,   9,   0,   0,   0],
       [  0,  13,  62,   4,   2,   0,   0],
       [  1,   9,   7,  60,   5,   0,   0],
       [  0,   7,   0,   6,  87,   3,   0],
       [  0,   0,   0,   0,   1,  89,   0],
       [  0,   1,   0,   0,   0,   0, 100]], dtype=int64)

## KNN

In [17]:
# Train model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [18]:
# Test model
y_pred = knn_model.predict(X_test)
knn_accuracy = accuracy_score (y_test, y_pred)
knn_accuracy

0.8123028391167192

In [19]:
# Evaluate model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.79      0.93      0.86        90
           2       0.63      0.49      0.55        87
           3       0.77      0.72      0.74        81
           4       0.70      0.76      0.73        82
           5       0.84      0.79      0.81       103
           6       0.89      0.97      0.93        90
           7       0.98      0.99      0.99       101

    accuracy                           0.81       634
   macro avg       0.80      0.81      0.80       634
weighted avg       0.81      0.81      0.81       634



In [20]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 84,   4,   1,   1,   0,   0,   0],
       [ 17,  43,  10,  11,   4,   2,   0],
       [  1,   9,  58,   4,   7,   2,   0],
       [  1,   7,   4,  62,   4,   4,   0],
       [  3,   5,   1,   9,  81,   2,   2],
       [  0,   0,   1,   1,   1,  87,   0],
       [  0,   0,   0,   0,   0,   1, 100]], dtype=int64)

## Gaussian Naive Bayes

In [21]:
# Train model
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
# Test model
y_pred = gnb_model.predict(X_test)
gnb_accuracy = accuracy_score (y_test, y_pred)
gnb_accuracy

0.5473186119873817

In [23]:
# Evaluate model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.31      0.97      0.47        90
           2       0.68      0.22      0.33        87
           3       0.58      0.14      0.22        81
           4       0.53      0.10      0.16        82
           5       0.61      0.45      0.51       103
           6       0.68      0.84      0.75        90
           7       0.99      0.99      0.99       101

    accuracy                           0.55       634
   macro avg       0.62      0.53      0.49       634
weighted avg       0.63      0.55      0.51       634



In [24]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 87,   3,   0,   0,   0,   0,   0],
       [ 61,  19,   6,   1,   0,   0,   0],
       [ 61,   3,  11,   3,   3,   0,   0],
       [ 44,   2,   2,   8,  13,  13,   0],
       [ 30,   1,   0,   2,  46,  23,   1],
       [  0,   0,   0,   1,  13,  76,   0],
       [  0,   0,   0,   0,   1,   0, 100]], dtype=int64)

## Decision Trees

In [25]:
# Train model
tree_model = tree.DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [26]:
# Test model
y_pred = tree_model.predict(X_test)
tree_accuracy = accuracy_score (y_test, y_pred)
tree_accuracy

0.9321766561514195

In [27]:
# Evaluate model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.92      0.96      0.94        90
           2       0.89      0.82      0.85        87
           3       0.87      0.85      0.86        81
           4       0.89      0.93      0.91        82
           5       0.95      0.97      0.96       103
           6       0.99      0.99      0.99        90
           7       0.98      0.99      0.99       101

    accuracy                           0.93       634
   macro avg       0.93      0.93      0.93       634
weighted avg       0.93      0.93      0.93       634



In [28]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 86,   4,   0,   0,   0,   0,   0],
       [  7,  71,   8,   1,   0,   0,   0],
       [  0,   5,  69,   7,   0,   0,   0],
       [  0,   0,   2,  76,   4,   0,   0],
       [  0,   0,   0,   1, 100,   0,   2],
       [  0,   0,   0,   0,   1,  89,   0],
       [  0,   0,   0,   0,   0,   1, 100]], dtype=int64)

## Model Accuracy Comparison

In [29]:
print ("Logistic Regression:  ", log_accuracy)
print ("SVM:                  ", svm_accuracy)
print ("KNN:                  ", knn_accuracy)
print ("Gaussian Naive Bayes: ", gnb_accuracy)
print ("Decision Trees:       ", tree_accuracy)

Logistic Regression:   0.8911671924290221
SVM:                   0.8564668769716088
KNN:                   0.8123028391167192
Gaussian Naive Bayes:  0.5473186119873817
Decision Trees:        0.9321766561514195


The best model is Decision Trees