In [100]:
# Import standard libraries

import pandas as pd 
import numpy as np 
import random
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors 
import seaborn as sns 

In [101]:
# Load the raw obesity survey CSV into a DataFrame and preview its first rows
raw_csv_path = r'data/ObesityDataSet_raw_and_data_sinthetic.csv'
obesity_df = pd.read_csv(raw_csv_path)
obesity_df.head()



Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [102]:
# Replace the terse survey codes with readable column names, then derive BMI

readable_names = {
    'family_history_with_overweight': 'Family_History',
    'FAVC': 'High_Cal_Foods_Frequently',
    'FCVC': 'Freq_Veg',
    'NCP': 'Num_Meals',
    'CAEC': 'Snacking',
    'SMOKE': 'Smoke',
    'CH2O': 'Water_Intake',
    'SCC': 'Calorie_Monitoring',
    'FAF': 'Phys_Activity',
    'TUE': 'Tech_Use',
    'CALC': "Freq_Alcohol",
    'MTRANS': 'Transportation',
    'NObeyesdad': 'Obesity_Level',
}

obesity_df = (
    obesity_df
    .rename(columns=readable_names)
    # 'BMI' is weight divided by the SQUARE of height (appended as the last column)
    .assign(BMI=lambda frame: frame['Weight'] / (frame['Height'] ** 2))
)

# Preview the first rows of the renamed, BMI-augmented dataframe
obesity_df.head()

Unnamed: 0,Gender,Age,Height,Weight,Family_History,High_Cal_Foods_Frequently,Freq_Veg,Num_Meals,Snacking,Smoke,Water_Intake,Calorie_Monitoring,Phys_Activity,Tech_Use,Freq_Alcohol,Transportation,Obesity_Level,BMI
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight,24.386526
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight,24.238227
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight,23.765432
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I,26.851852
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II,28.342381


**Gender**: Sex ('Male', 'Female')

**Age**: Age in years (float)

**Height**:	Height in meters (float)

**Weight**:	Weight in kilograms (float)

**Family_History**:	Family History of Obesity ('yes', 'no')

**High_Cal_Foods_Frequently**: Frequently consumes high-calorie foods ('yes', 'no')

**Freq_Veg**: Number of meals per day in which vegetables are usually consumed (float; the data contains non-integer values)

**Num_Meals**: Number of main meals per day (float; the data contains non-integer values)

**Snacking**: Eats food between meals ('no', 'Sometimes', 'Frequently', 'Always')

**Smoke**:	Smoker ('yes', 'no')

**Water_Intake**:	Liters of water consumed per day (float)

**Calorie_Monitoring**: Calories being monitored ('yes', 'no')

**Phys_Activity**:	Number of days of physical activity per week (float)

**Tech_Use**:	Amount of time spent using technological devices per day (float)

**Freq_Alcohol**:	Frequency of alcohol intake ('no', 'Sometimes', 'Frequently', 'Always')

**Transportation**: Means of transportation most used ('Public_Transportation', 'Walking', 'Automobile', 'Motorbike', 'Bike')

**Obesity_Level**:	Categories based on body mass index ('Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II' and 'Obesity_Type_III')

**BMI**: Body mass index, Weight / Height² in kg/m² (float)

In [103]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
descriptive_stats = obesity_df.describe()
# Second name bound to the very same frame; original spelling kept in case
# other cells refer to it
data_describtion = descriptive_stats
descriptive_stats

Unnamed: 0,Age,Height,Weight,Freq_Veg,Num_Meals,Water_Intake,Phys_Activity,Tech_Use,BMI
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866,29.700159
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927,8.011337
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0,12.998685
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0,24.325802
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535,28.719089
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0,36.016501
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0,50.811753
