In [1]:
# Import dependencies
import pandas as pd

# Define file path, relative to the notebook's working directory.
# A forward slash is used as the separator: the previous "Resources\O..." form
# relied on "\O" being an unrecognised escape sequence (a SyntaxWarning on
# recent Python versions) and only resolved to a sub-directory on Windows.
file_path = "Resources/ObesityDataSet_raw_and_data_sinthetic.csv"

# Read file into a DataFrame
df = pd.read_csv(file_path)

In [2]:
# Preview the first five rows of the freshly loaded DataFrame
df.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
# Count missing values per column; every count should be zero
df.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [4]:
# Confirm the data types for each column
# (shows the dtype pandas assigned to every column when the CSV was read)
df.dtypes

Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
NObeyesdad                         object
dtype: object

In [5]:
# Rename columns in the DataFrame to improve understanding and match standard
# SQL naming conventions (lowercase snake_case).
# 'family_history_with_overweight' is not listed, so it keeps its name.
COLUMN_RENAMES = {
    'Gender': 'gender',
    'Age': 'age',
    'Height': 'height_m',
    'Weight': 'weight_kg',
    'FAVC': 'high_calorie_intake',
    'FCVC': 'vegetable_consumption',
    'NCP': 'daily_meal_count',
    'CAEC': 'food_between_meals',
    'SMOKE': 'smoking_habit',
    'CH2O': 'water_consumption',
    'SCC': 'tracks_daily_calories',
    'FAF': 'exercise_frequency',
    'TUE': 'tech_usage_time',
    'CALC': 'alcohol_intake',
    'MTRANS': 'transportation_used',
    'NObeyesdad': 'obesity_level',
}

# Rebind `df` to the renamed copy rather than mutating it with inplace=True.
# Keys that are no longer present are ignored by rename(), so re-running this
# cell after the columns have already been renamed is a harmless no-op.
df = df.rename(columns=COLUMN_RENAMES)

# Verify the updated column names
print(df.columns)


Index(['gender', 'age', 'height_m', 'weight_kg',
       'family_history_with_overweight', 'high_calorie_intake',
       'vegetable_consumption', 'daily_meal_count', 'food_between_meals',
       'smoking_habit', 'water_consumption', 'tracks_daily_calories',
       'exercise_frequency', 'tech_usage_time', 'alcohol_intake',
       'transportation_used', 'obesity_level'],
      dtype='object')


In [6]:
# Preview the first five rows again, now with the new column names
df.head(n=5)

Unnamed: 0,gender,age,height_m,weight_kg,family_history_with_overweight,high_calorie_intake,vegetable_consumption,daily_meal_count,food_between_meals,smoking_habit,water_consumption,tracks_daily_calories,exercise_frequency,tech_usage_time,alcohol_intake,transportation_used,obesity_level
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [7]:
# Tally every distinct 'gender' label to check the categories are consistent
df.loc[:, 'gender'].value_counts()

gender
Male      1068
Female    1043
Name: count, dtype: int64

In [8]:
# Tally every distinct 'family_history_with_overweight' label to check the categories are consistent
df.loc[:, 'family_history_with_overweight'].value_counts()

family_history_with_overweight
yes    1726
no      385
Name: count, dtype: int64

In [9]:
# Tally every distinct 'high_calorie_intake' label to check the categories are consistent
df.loc[:, 'high_calorie_intake'].value_counts()

high_calorie_intake
yes    1866
no      245
Name: count, dtype: int64

In [10]:
# Tally every distinct 'food_between_meals' label to check the categories are consistent
df.loc[:, 'food_between_meals'].value_counts()

food_between_meals
Sometimes     1765
Frequently     242
Always          53
no              51
Name: count, dtype: int64

In [11]:
# Tally every distinct 'smoking_habit' label to check the categories are consistent
df.loc[:, 'smoking_habit'].value_counts()

smoking_habit
no     2067
yes      44
Name: count, dtype: int64

In [12]:
# Tally every distinct 'tracks_daily_calories' label to check the categories are consistent
df.loc[:, 'tracks_daily_calories'].value_counts()

tracks_daily_calories
no     2015
yes      96
Name: count, dtype: int64

In [13]:
# Tally every distinct 'alcohol_intake' label to check the categories are consistent
df.loc[:, 'alcohol_intake'].value_counts()

alcohol_intake
Sometimes     1401
no             639
Frequently      70
Always           1
Name: count, dtype: int64

In [14]:
# Tally every distinct 'transportation_used' label to check the categories are consistent
df.loc[:, 'transportation_used'].value_counts()

transportation_used
Public_Transportation    1580
Automobile                457
Walking                    56
Motorbike                  11
Bike                        7
Name: count, dtype: int64

In [15]:
# Tally every distinct 'obesity_level' label to check the categories are consistent
df.loc[:, 'obesity_level'].value_counts()

obesity_level
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

In [16]:
# Write DataFrame to csv without the row index.
# A forward slash is used as the separator: the previous 'Resources\o...' form
# relied on "\o" being an unrecognised escape sequence (a SyntaxWarning on
# recent Python versions) and only resolved to a sub-directory on Windows.
df.to_csv('Resources/obesity_data_raw_cleaned.csv', index=False)