# Imports

In [11]:
# ============================================================================
# 02_preprocessing.ipynb
# Preprocessing pipeline for Diabetes Binary Classification
# Dataset: BRFSS 2015 - Diabetes Health Indicators
# ============================================================================
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
import warnings
# NOTE(review): with no `category`/`module` argument this installs a filter
# that ignores every warning for the rest of the kernel session, including
# deprecation and convergence warnings raised by the libraries imported above.
# Consider narrowing it to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')


# Data Loading

In [12]:
# ============================================================================
# 2. DATA LOADING
# ============================================================================
# Reads the raw CSV from the local copy of the Kaggle dataset into `df` and
# prints its shape plus a preview of the first rows.
print("="*80)
print("DATA LOADING")
print("="*80)

dataset_link = "alexteboul/diabetes-health-indicators-dataset"  # correct Kaggle dataset handle
destination = "../data/raw"
dataset_name = dataset_link.split("/")[-1]
dataset_dir = f"{destination}/{dataset_name}"

# os.listdir() returns entries in arbitrary, filesystem-dependent order, so
# picking a file by position (previously files[2]) could silently load a
# different CSV on another machine or after an extra file (e.g. a checkpoint
# directory) appears in the folder. Select the file by name instead.
# NOTE(review): the filename below is the full binary-target file of this
# Kaggle dataset as far as I know — confirm it matches the local download.
csv_name = "diabetes_binary_health_indicators_BRFSS2015.csv"

# Sorted so that `files` itself is deterministic if any later cell reads it.
files = sorted(os.listdir(dataset_dir))
if csv_name not in files:
    # Fail loudly with the directory contents rather than loading a wrong file.
    raise FileNotFoundError(
        f"Expected '{csv_name}' in '{dataset_dir}', found: {files}"
    )

df = pd.read_csv(f"{dataset_dir}/{csv_name}", delimiter=",")

print(f"\nDataset shape: {df.shape}")
print(f"Number of records: {df.shape[0]:,}")
print(f"Number of features: {df.shape[1]}")

print("\nFirst few rows:")
print(df.head())

DATA LOADING

Dataset shape: (253680, 22)
Number of records: 253,680
Number of features: 22

First few rows:
   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3              0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4              0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  Ph

# Data Cleaning

In [13]:
# ============================================================================
# 3. DATA CLEANING - DUPLICATE REMOVAL
# ============================================================================
# Drops rows that are exact duplicates across every column (the first
# occurrence is kept) and reports how many rows were removed. `df` is rebound
# to the de-duplicated frame.

print(f"\n{'=' * 80}")
print("DATA CLEANING - DUPLICATE REMOVAL")
print("=" * 80)

# Row count before cleaning, used only to compute the number of rows dropped.
initial_len = df.shape[0]
df = df.drop_duplicates(keep="first")
duplicates_removed = initial_len - df.shape[0]

print(f"\nDuplicates removed: {duplicates_removed:,}")
print(f"New dataset size: {df.shape[0]:,}")


DATA CLEANING - DUPLICATE REMOVAL

Duplicates removed: 24,206
New dataset size: 229,474


# Feature and Target Separation

In [14]:
# ============================================================================
# 4. FEATURE AND TARGET SEPARATION
# ============================================================================
# Splits `df` into the feature matrix `X` (every column except the target)
# and the label vector `y` (the target column alone).
print(f"\n{'=' * 80}")
print("SECTION 4: FEATURE AND TARGET SEPARATION")
print("=" * 80)

target = "Diabetes_binary"
X = df.drop(columns=target)  # all remaining columns are treated as features
y = df[target]

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")



SECTION 4: FEATURE AND TARGET SEPARATION

Features shape: (229474, 21)
Target shape: (229474,)


# Feature Grouping

In [15]:
# ============================================================================
# 5. FEATURE GROUPING
# ============================================================================
# Declares named groups of feature columns and prints each group with its size.
print(f"\n{'=' * 80}")
print("FEATURE GROUPING")
print("=" * 80)

binary_health = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk',
]
ordinal = ['GenHlth', 'Age', 'Education', 'Income']
continuous = ['BMI', 'MentHlth', 'PhysHlth']
# NOTE(review): 'Age', 'Education' and 'Income' are listed in both `ordinal`
# and `demographic`, so these groups are not a partition of the columns —
# confirm downstream code does not process the shared columns twice.
demographic = ['Sex', 'Age', 'Education', 'Income']

# The leading newline belongs to the first summary line only.
for _label, _cols in (
    ("\nBinary health", binary_health),
    ("Ordinal", ordinal),
    ("Continuous", continuous),
    ("Demographic", demographic),
):
    print(f"{_label} features ({len(_cols)}): {_cols}")


FEATURE GROUPING

Binary health features (13): ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk']
Ordinal features (4): ['GenHlth', 'Age', 'Education', 'Income']
Continuous features (3): ['BMI', 'MentHlth', 'PhysHlth']
Demographic features (4): ['Sex', 'Age', 'Education', 'Income']
