In [23]:
# Import standard libraries

import pandas as pd 
import numpy as np 
import random
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors 
import seaborn as sns 

In [24]:
# Read data and convert to a dataframe
df_raw = pd.read_csv(r'data/ObesityDataSet_raw_and_data_sinthetic.csv')
df_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [25]:
# Rename columns in dataframe

df = df_raw.rename(columns = {'family_history_with_overweight': 'Family_History', 
                                          'FAVC' : 'High_Cal_Foods_Frequently', 
                                          'FCVC': 'Freq_Veg', 'NCP': 'Num_Meals', 
                                          'CAEC': 'Snacking',
                                          'SMOKE': 'Smoke',
                                          'CH2O': 'Water_Intake', 
                                          'SCC': 'Calorie_Monitoring' , 
                                          'FAF': 'Phys_Activity', 
                                          'TUE': 'Tech_Use', 'CALC':
                                          "Freq_Alcohol", 
                                          'MTRANS': 'Transportation', 
                                          'NObeyesdad': 'Obesity_Level'})

# Add 'BMI' column to dataframe defined as weight divided by height
df['BMI'] = df_raw['Weight'] / (df_raw['Height'] ** 2)

# Print first rows of revised dataframe
df.head()

Unnamed: 0,Gender,Age,Height,Weight,Family_History,High_Cal_Foods_Frequently,Freq_Veg,Num_Meals,Snacking,Smoke,Water_Intake,Calorie_Monitoring,Phys_Activity,Tech_Use,Freq_Alcohol,Transportation,Obesity_Level,BMI
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight,24.386526
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight,24.238227
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight,23.765432
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I,26.851852
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II,28.342381


***Features***

There are 17 features in the dataframe defined above, and 1 output variable, "Obesity_Level" as follows:

**Gender**: Sex ('Male', 'Female')

**Age**: Age in years (float)

**Height**:	Height in meters (float)

**Weight**:	Weight in kilograms (float)

**Family_History**:	Family History of Obesity ('yes', 'no')

**High_Cal_Foods_Frequently**: Frequently consumes high-calorie foods ('yes', 'no')

**Freqency_Veg**:	Number of meals per day in which vegetables are usually consumed (integer)

**Num_Meals**:	Number of main meals per day (float)

**Snacking**:	Eat food between meals ('no', 'Sometimes', 'Frequently', 'Always')

**Smoke**:	Smoker ('yes', 'no')

**Water_Intake**:	Liters of water consumed per day (float)

**Calorie_Monitoring**:	Calories being monitored ('no', 'Sometimes', 'Frequently', 'Always')

**Physical_Activity**:	Number of days of physical activity per week (float)

**Tech_Use**:	Amount of time spent using technological devices per day (float)

**Frequency_Alcohol**:	Frequency of alcohol intake ('no', 'Sometimes', 'Frequently', 'Always')

**Transportaion**:	Means of transportation most used ('Public_Transportation' ,'Walking', 'Automobile', 'Motorbike', 'Bike')

**Obesity_Level**:	Categories based on body mass index ('Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II' and 'Obesity_Type_III')

**BMI** : Weight/Height (float)

In [4]:
descriptive_stats= data_describtion= df.describe()
descriptive_stats

Unnamed: 0,Age,Height,Weight,Freq_Veg,Num_Meals,Water_Intake,Phys_Activity,Tech_Use,BMI
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866,29.700159
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927,8.011337
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0,12.998685
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0,24.325802
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535,28.719089
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0,36.016501
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0,50.811753


In [26]:
num_cols = ['Age', 'Height', 'Weight', 'Freq_Veg', 'Num_Meals', 'Water_Intake','Phys_Activity', 'Tech_Use']

categorical_cols = ['Gender', 'Family_History',	'High_Cal_Foods_Frequently', 'Snacking', 'Smoke', 'Calorie_Monitoring' 'Freq_Alcohol','Transportation']

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression

In [None]:


pipe_num_simple = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('standardizer', StandardScaler())
])

ctransform_simple= ColumnTransformer([
    ('numeric_simple', pipe_num_simple, num_cols),
], remainder='passthrough')

pipe_simple = Pipeline([
    ('preprocess', ctransform_simple),
    ('model', Ridge())
])
pipe_simple

In [51]:
X = df.drop(columns = ['BMI', 'Weight', 'Gender', 'Family_History','High_Cal_Foods_Frequently', 'Snacking', 
                        'Smoke', 'Calorie_Monitoring', 'Freq_Alcohol','Transportation'])
Y = df['Obesity_Level']

scoring = ['neg_log_loss', 'roc_auc', 'f1', 'accuracy', 'precision', 'recall']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [None]:
# res_simple_dict = cross_validate(pipe_simple, X_train,Y_train, cv = 5, scoring = scoring)
# res_simple = pd.DataFrame(res_simple_dict).assign(experiment = 1)
# res_simple


In [55]:
preproc1 = ColumnTransformer([('standard', StandardScaler(), X)])
preproc1

In [59]:
pipe1 = Pipeline([('preprocess1', preproc1),
                   ('ridge', Ridge(random_state=42))])
pipe1

In [60]:
pipe1.fit(X_train, Y_train)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed