**Import Libraries and Load Data:**

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [6]:
# Read the raw diabetes health-indicators CSV into a DataFrame
raw_csv_path = '../../data/raw/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(raw_csv_path)

**Handle Missing Values and Convert Categorical Variables:**

In [7]:
# Discard every row that contains at least one missing value
df.dropna(inplace=True)

# GenHlth, Education and Income are categorical variables with more than two
# categories; replace each with the integer codes pandas assigns to its categories
for multi_level_col in ('GenHlth', 'Education', 'Income'):
    df[multi_level_col] = df[multi_level_col].astype('category').cat.codes


**Normalization/Standardization and Categorical Encoding:**

In [8]:
# Identify categorical and numerical features.
# 'Sex' is one-hot encoded; every other column except the target is standardized.
categorical_features = ['Sex']
numeric_features = df.columns.difference(['Diabetes_012', 'Sex'])

# Create a column transformer for preprocessing.
# Output columns follow the transformer order: all 'num' columns, then 'cat' columns.
# NOTE(review): the scaler is fit on the full dataset; if a train/test split is
# made downstream, consider fitting on the training portion only.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)])

# Separate features and target
X = df.drop('Diabetes_012', axis=1)
y = df['Diabetes_012']

# Apply transformations
X_cleaned = preprocessor.fit_transform(X)

# Label the one-hot columns from the categories the encoder actually learned.
# OneHotEncoder emits one column per category in sorted order, so for a 0/1-coded
# 'Sex' the first column flags Sex == 0 and the second flags Sex == 1. The labels
# were previously hard-coded as ['Sex_Male', 'Sex_Female'], which attached each
# name to the opposite column.
# NOTE(review): the mapping assumes the dataset codes 0 = female, 1 = male —
# confirm against the dataset codebook.
sex_labels = {0: 'Sex_Female', 1: 'Sex_Male'}
sex_categories = preprocessor.named_transformers_['cat'].categories_[0]
sex_columns = [sex_labels.get(category, 'Sex_' + str(category))
               for category in sex_categories]

# Create a new DataFrame with cleaned data
df_cleaned = pd.DataFrame(X_cleaned, columns=numeric_features.tolist() + sex_columns)
# Use the raw values so the target is attached by position, not by index label
# (df's index may have gaps after dropna, while df_cleaned has a fresh index).
df_cleaned['Diabetes_012'] = y.values

# Save the cleaned dataset
df_cleaned.to_csv('../../data/cleaned_diabetes_dataset.csv', index=False)
