In [5]:
import pandas as pd
import numpy as np

# Read the raw obesity dataset from its CSV file into a DataFrame
raw_csv_path = '/content/drive/MyDrive/GitHub/obesity-risk-prediction/data/Obesity.csv'
df = pd.read_csv(raw_csv_path)

def clean_obesity_data(df):
    """
    Return a cleaned copy of the obesity dataset with discrete features as integers.

    Several columns that the data dictionary describes as discrete contain
    decimal noise. Each of them is rounded to the nearest integer and cast to
    an integer dtype. The input DataFrame is not modified.

    Columns processed (expected ranges as noted in the data dictionary;
    the ranges are NOT enforced here):
        FCVC: Frequency of consumption of vegetables (1-3)
        NCP:  Number of main meals (1-4)
        CH2O: Daily water consumption (1-3)
        FAF:  Physical activity frequency (0-3)
        TUE:  Time using electronic devices (0-2)
        Age:  Rounded to whole years

    Notes:
        * Rounding uses ``Series.round()``, which rounds exact halves to the
          nearest even integer (e.g. 2.5 -> 2).
        * A column with missing values is cast to pandas' nullable ``Int64``
          dtype so the missing entries are preserved as ``<NA>`` instead of
          raising; columns without missing values keep the plain ``int`` dtype.
        * Column names are left unchanged.

    Args:
        df: DataFrame containing at least the columns listed above.

    Returns:
        A new DataFrame with the listed columns rounded and cast to integers.

    Raises:
        KeyError: If any of the listed columns is absent from ``df``.
    """
    df_clean = df.copy()

    # Age is handled last, matching the order of the other discrete features first.
    cols_to_round = ['FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'Age']

    for col in cols_to_round:
        rounded = df_clean[col].round()
        if rounded.isna().any():
            # A NumPy integer dtype cannot hold NaN; the nullable integer
            # dtype keeps missing values as <NA> rather than crashing.
            df_clean[col] = rounded.astype('Int64')
        else:
            df_clean[col] = rounded.astype(int)

    return df_clean

# Run the cleaning step on the raw DataFrame
df_prepared = clean_obesity_data(df)

# Preview the first rows of the cleaned data
print("First 5 rows after cleaning:", df_prepared.head(), sep="\n")

# Write the cleaned data to disk (without the index) for the modeling step
processed_csv_path = '/content/drive/MyDrive/GitHub/obesity-risk-prediction/data/processed_obesity.csv'
df_prepared.to_csv(processed_csv_path, index=False)

First 5 rows after cleaning:
   Gender  Age  Height  Weight family_history FAVC  FCVC  NCP       CAEC  \
0  Female   21    1.62    64.0            yes   no     2    3  Sometimes   
1  Female   21    1.52    56.0            yes   no     3    3  Sometimes   
2    Male   23    1.80    77.0            yes   no     2    3  Sometimes   
3    Male   27    1.80    87.0             no   no     3    3  Sometimes   
4    Male   22    1.78    89.8             no   no     2    1  Sometimes   

  SMOKE  CH2O  SCC  FAF  TUE        CALC                 MTRANS  \
0    no     2   no    0    1          no  Public_Transportation   
1   yes     3  yes    3    0   Sometimes  Public_Transportation   
2    no     2   no    2    1  Frequently  Public_Transportation   
3    no     2   no    2    0  Frequently                Walking   
4    no     2   no    0    0   Sometimes  Public_Transportation   

               Obesity  
0        Normal_Weight  
1        Normal_Weight  
2        Normal_Weight  
3   Overwei