# Imports

In [3]:
# ============================================================================
# 02_preprocessing.ipynb
# Data Preprocessing for Multi-Class Diabetes Classification
# Dataset: BRFSS 2015 - Diabetes Health Indicators (3 Classes)
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Add src directory to path
sys.path.append('../src/core')

# Import custom modules
from data_loader import load_raw_data
from preprocessing import (
    check_data_quality,
    remove_duplicates,
    create_train_test_split,
    scale_features,
    apply_smote,
    save_processed_data
)

# Plot appearance: seaborn grid theme, colorblind-safe palette, and the
# default figure size / font size applied to every matplotlib figure.
sns.set_style("whitegrid")
sns.set_palette("colorblind")
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 10})

# Destination folders (relative to the notebook's working directory) for
# the figures and the processed data written by this notebook.
output_dir = "../outputs/figures/preprocessing"
processed_data_dir = "../data/processed"

# exist_ok=True keeps the cell re-runnable when the folders already exist.
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory created: {output_dir}")
os.makedirs(processed_data_dir, exist_ok=True)
print(f"Processed data directory created: {processed_data_dir}")


Output directory created: ../outputs/figures/preprocessing
Processed data directory created: ../data/processed


# Loading Data

In [5]:
# =============================================================================
# 1. DATA LOADING
# =============================================================================
# Load the raw CSV through the project's `load_raw_data` helper and report
# the dimensions of the resulting frame.
print("\n" + "="*80)
print("STEP 1: DATA LOADING")
print("="*80)

# Forward slashes only. The path previously contained a backslash before the
# file name, which is an invalid "\d" string escape (deprecated; a
# SyntaxWarning on Python 3.12+) and is a path separator on Windows only, so
# the file could not be found on Linux/macOS. Forward slashes resolve on all
# platforms, including Windows.
raw_data_path = "../data/raw/diabetes-health-indicators-dataset/diabetes_012_health_indicators_BRFSS2015.csv"
df = load_raw_data(raw_data_path)

print(f"\nInitial Dataset Shape: {df.shape}")
print(f"Total Samples: {df.shape[0]:,}")
# One column is subtracted from the count -- presumably the target label
# column; confirm against the dataset schema.
print(f"Total Features: {df.shape[1] - 1}")



STEP 1: DATA LOADING
Data loaded: 253680 rows, 22 columns

Initial Dataset Shape: (253680, 22)
Total Samples: 253,680
Total Features: 21
