# Preprocessing

## Objective

The purpose of this notebook is to perform preprocessing on the dataset selected for the project.  

## Import libraries

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Load dataset

In [2]:
#- Relative path to the raw obesity CSV used throughout this notebook
file = "../dataset/ObesityDataSet_raw_and_data_sinthetic.csv"

In [3]:
#- Read the CSV at `file` into a pandas dataframe for analysis
ds = pd.read_csv(filepath_or_buffer=file)

## Analyze dataset

In [4]:
#- Preview the first five rows of the raw data
ds.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [5]:
#- Summarize the dataframe: index range, column names, non-null counts and dtypes
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [6]:
#- Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
ds.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [7]:
#- Check whether any cell in the dataframe is missing (True if at least one null exists)
ds.isna().to_numpy().any()

False

## Preprocessing

In [8]:
# Encode two-valued columns as 0/1 integers.
# Gender: "Female" -> 1, anything else -> 0.
ds["Gender"] = ds["Gender"].eq("Female").astype("int64")

# Remaining binary columns: "yes" -> 1, anything else -> 0.
for yes_no_column in ("family_history_with_overweight", "FAVC", "SMOKE", "SCC"):
    ds[yes_no_column] = ds[yes_no_column].eq("yes").astype("int64")

In [9]:
# One-hot encoding for the multi-category columns.
# A single `get_dummies` call with `columns=` removes each source column and
# appends its indicator columns (named "<column>_<value>") at the end of the
# frame, in the order the columns are listed here.
# The dtype is set explicitly so the indicators are always 0/1 integers:
# the pandas default is uint8 before version 2.0 and bool from 2.0 onwards,
# which would make the result depend on the installed pandas version.
ds = pd.get_dummies(ds, columns=["CAEC", "CALC", "MTRANS"], dtype="uint8")

In [10]:
# Encode the target classes as integers 1-7 via a lookup dictionary
obesity = {
    "Insufficient_Weight": 1,
    "Normal_Weight": 2,
    "Overweight_Level_I": 3,
    "Overweight_Level_II": 4,
    "Obesity_Type_I": 5,
    "Obesity_Type_II": 6,
    "Obesity_Type_III": 7,
}
ds["NObeyesdad"] = ds["NObeyesdad"].map(obesity)

In [11]:
# Preview the first rows again to inspect the encoded columns
ds.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,SMOKE,CH2O,...,CAEC_no,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,1,21.0,1.62,64.0,1,0,2.0,3.0,0,2.0,...,0,0,0,0,1,0,0,0,1,0
1,1,21.0,1.52,56.0,1,0,3.0,3.0,1,3.0,...,0,0,0,1,0,0,0,0,1,0
2,0,23.0,1.8,77.0,1,0,2.0,3.0,0,2.0,...,0,0,1,0,0,0,0,0,1,0
3,0,27.0,1.8,87.0,0,0,3.0,3.0,0,2.0,...,0,0,1,0,0,0,0,0,0,1
4,0,22.0,1.78,89.8,0,0,2.0,1.0,0,2.0,...,0,0,0,1,0,0,0,0,1,0


In [12]:
# Re-check column names, non-null counts and dtypes after the transformations
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   int64  
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   int64  
 5   FAVC                            2111 non-null   int64  
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   SMOKE                           2111 non-null   int64  
 9   CH2O                            2111 non-null   float64
 10  SCC                             2111 non-null   int64  
 11  FAF                             2111 non-null   float64
 12  TUE                             21

## Preliminary Test
Train and test a Logistic Regression model, first on the raw features and then on standardized features, to determine whether scaling helps to obtain better results.

In [13]:
# Split into train and test sets: every column except the target is a feature,
# 30% of the rows are held out, and the seed is fixed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    ds.drop(columns="NObeyesdad"),
    ds["NObeyesdad"],
    test_size=0.3,
    random_state=0,
)

In [14]:
# Fit a logistic regression classifier on the unscaled training features
logmodel = LogisticRegression(max_iter=10000).fit(X_train, y_train)
logmodel

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Predict the class of each test row with the model trained on unscaled features
predictions = logmodel.predict(X=X_test)

In [16]:
# Per-class precision, recall and F1 for the unscaled model
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           1       0.84      0.90      0.87        90
           2       0.76      0.69      0.72        87
           3       0.75      0.78      0.76        81
           4       0.72      0.63      0.68        82
           5       0.79      0.84      0.82       103
           6       0.96      0.97      0.96        90
           7       1.00      1.00      1.00       101

    accuracy                           0.84       634
   macro avg       0.83      0.83      0.83       634
weighted avg       0.84      0.84      0.84       634



In [17]:
# Confusion matrix for the unscaled model (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm

array([[ 81,   9,   0,   0,   0,   0,   0],
       [ 16,  60,   9,   2,   0,   0,   0],
       [  0,   7,  63,   9,   2,   0,   0],
       [  0,   3,   8,  52,  18,   1,   0],
       [  0,   0,   4,   9,  87,   3,   0],
       [  0,   0,   0,   0,   3,  87,   0],
       [  0,   0,   0,   0,   0,   0, 101]], dtype=int64)

In [18]:
# Standard scaling: fit the scaler on the training split only,
# then apply the same transformation to both splits
sc_X = StandardScaler().fit(X_train)
X_train_n = sc_X.transform(X_train)
X_test_n = sc_X.transform(X_test)

In [19]:
# Fit a second logistic regression classifier, this time on the scaled training features
logmodel_n = LogisticRegression(max_iter=10000).fit(X_train_n, y_train)
logmodel_n

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# Predict the class of each scaled test row with the model trained on scaled features
predictions_n = logmodel_n.predict(X=X_test_n)

In [21]:
# Per-class precision, recall and F1 for the model trained on scaled features
print(classification_report(y_true=y_test, y_pred=predictions_n))

              precision    recall  f1-score   support

           1       0.90      0.96      0.92        90
           2       0.85      0.78      0.81        87
           3       0.82      0.77      0.79        81
           4       0.81      0.80      0.81        82
           5       0.93      0.90      0.92       103
           6       0.95      1.00      0.97        90
           7       0.94      0.99      0.97       101

    accuracy                           0.89       634
   macro avg       0.89      0.89      0.88       634
weighted avg       0.89      0.89      0.89       634



In [22]:
# Confusion matrix for the scaled model (rows: true class, columns: predicted class)
cm_n = confusion_matrix(y_true=y_test, y_pred=predictions_n)
cm_n

array([[ 86,   4,   0,   0,   0,   0,   0],
       [ 10,  68,   7,   2,   0,   0,   0],
       [  0,   8,  62,  11,   0,   0,   0],
       [  0,   0,   7,  66,   7,   1,   1],
       [  0,   0,   0,   2,  93,   3,   5],
       [  0,   0,   0,   0,   0,  90,   0],
       [  0,   0,   0,   0,   0,   1, 100]], dtype=int64)

Standardizing the features improves the overall performance of the model: test accuracy rises from 0.84 to 0.89.