In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Step 1: Read the raw CSV into a DataFrame and preview its first rows.
# The path is relative, so the file must sit in the notebook's working directory.
data_path = "ObesityDataSet_raw_and_data_sinthetic.csv"
data = pd.read_csv(filepath_or_buffer=data_path)
data.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
# Step 2: Print a quick structural overview of the raw frame:
# shape, column index, a preview, per-column dtypes and null counts.
for label, value in (
    ("Dataset Shape:", data.shape),
    ("Column Names:", data.columns),
    ("First Few Rows:\n", data.head()),
    ("Data Types:\n", data.dtypes),
    ("Missing Values:\n", data.isnull().sum()),
):
    # print() joins the label and the value with a single space.
    print(label, value)

Dataset Shape: (2111, 17)
Column Names: Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')
First Few Rows:
    Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.0   
3    Male  27.0    1.80    87.0                             no   no   3.0   
4    Male  22.0    1.78    89.8                             no   no   2.0   

   NCP       CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC  \
0  3.0  Sometimes    no   2.0   no  0.0  1.0          no   
1  3.0  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes   
2  3.0  Sometimes    no   2.0   no  2.0  1.0  Frequently   
3  3.0  Sometimes    no 

In [4]:
# Step 3: Derive BMI = Weight / Height**2 and remove the two source columns.
# NOTE(review): the formula presumes Weight in kg and Height in metres — confirm
# against the dataset description.
# This mutates `data` in place, so re-running the cell on the already-processed
# frame raises KeyError (Height/Weight are gone); re-run from the load cell instead.
data["BMI"] = data["Weight"].div(data["Height"].pow(2))
del data["Height"], data["Weight"]

In [5]:
# Step 4: Integer-encode every object-dtype column, keeping one fitted
# LabelEncoder per column so the codes can be mapped back to labels later.
# NOTE(review): the codes follow the order of `classes_`, which in the recorded
# output is alphabetical (e.g. CAEC: Always=0, Frequently=1, Sometimes=2, no=3),
# so the integers do not reflect any natural ordering of the categories.
categorical_columns = list(data.select_dtypes(include=["object"]).columns)
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

# Print each column's label -> code mapping for reference
for name, encoder in label_encoders.items():
    codes = encoder.transform(encoder.classes_)
    mapping = dict(zip(encoder.classes_, codes))
    print(f"Encoding for {name}: {mapping}")

Encoding for Gender: {'Female': 0, 'Male': 1}
Encoding for family_history_with_overweight: {'no': 0, 'yes': 1}
Encoding for FAVC: {'no': 0, 'yes': 1}
Encoding for CAEC: {'Always': 0, 'Frequently': 1, 'Sometimes': 2, 'no': 3}
Encoding for SMOKE: {'no': 0, 'yes': 1}
Encoding for SCC: {'no': 0, 'yes': 1}
Encoding for CALC: {'Always': 0, 'Frequently': 1, 'Sometimes': 2, 'no': 3}
Encoding for MTRANS: {'Automobile': 0, 'Bike': 1, 'Motorbike': 2, 'Public_Transportation': 3, 'Walking': 4}
Encoding for NObeyesdad: {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Obesity_Type_I': 2, 'Obesity_Type_II': 3, 'Obesity_Type_III': 4, 'Overweight_Level_I': 5, 'Overweight_Level_II': 6}


In [6]:
# Step 5: Standardize the continuous features (zero mean, unit variance).
#
# Label-encoded columns (including the target NObeyesdad) are excluded by name
# rather than by dtype. Selecting on "int64"/"float64" alone is fragile: the
# recorded output shows the encoded columns left as raw integer codes, which
# only happens when the encoder's output dtype is not int64. Where the codes
# are int64, a dtype-only filter would silently standardize the categorical
# codes and the class labels as well.
#
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split
# happens downstream, fit it on the training portion only to avoid leakage.
numeric_candidates = data.select_dtypes(include="number").columns
numerical_columns = numeric_candidates[
    ~numeric_candidates.isin(list(label_encoders))
]
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [7]:
# Step 6: Sanity-check the processed frame (dimensions plus a short preview).
processed_summary = {
    "Processed Dataset Shape:": data.shape,
    "First Few Processed Rows:\n": data.head(),
}
for heading, value in processed_summary.items():
    print(heading, value)

Processed Dataset Shape: (2111, 16)
First Few Processed Rows:
    Gender       Age  family_history_with_overweight  FAVC      FCVC       NCP  \
0       0 -0.522124                               1     0 -0.785019  0.404153   
1       0 -0.522124                               1     0  1.088342  0.404153   
2       1 -0.206889                               1     0 -0.785019  0.404153   
3       1  0.423582                               0     0  1.088342  0.404153   
4       1 -0.364507                               0     0 -0.785019 -2.167023   

   CAEC  SMOKE      CH2O  SCC       FAF       TUE  CALC  MTRANS  NObeyesdad  \
0     2      0 -0.013073    0 -1.188039  0.561997     3       3           1   
1     2      1  1.618759    1  2.339750 -1.080625     2       3           1   
2     2      0 -0.013073    0  1.163820  0.561997     1       3           1   
3     2      0 -0.013073    0  1.163820 -1.080625     1       4           5   
4     2      0 -0.013073    0 -1.188039 -1.080625     2

In [8]:
# Step 7 (optional): Write the processed frame to CSV without the row index,
# then confirm where it was written.
output_path = "processed_dataset.csv"
data.to_csv(output_path, index=False)
print(f"Processed data saved to '{output_path}'")


Processed data saved to 'processed_dataset.csv'
