In [1]:
# Feature engineering (model-ready prep) 

import json
import pandas as pd 
import numpy as np

from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler 


# File locations: the cleaned input read below, plus the two artefacts
# (engineered CSV and JSON feature spec) written at the end of the notebook.
# Kept as plain strings because they are later serialized into the JSON spec.
INPUT_PATH = "../data/processed/diabetes_cleaned.csv"
OUTPUT_FE_PATH = "../data/processed/diabetes_fe.csv"
FEATURE_SPEC_PATH = "../data/processed/feature_spec.json"

# Load the cleaned dataset produced by the previous stage.
df = pd.read_csv(INPUT_PATH)

# Fail fast if the target column is missing: everything downstream needs it.
assert "Outcome" in df.columns, "Outcome column not found"

print(f"Input shape: {df.shape}")
print(f"Columns: {list(df.columns)}")



Input shape: (768, 9)
Columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']


In [2]:

# Work on a copy so the loaded frame `df` stays untouched and this cell can be
# re-run safely.
df_fe = df.copy()

# Age buckets, right-closed: (0, 29], (29, 39], (39, 49], (49, 59], (59, 120].
age_groups = pd.cut(
    df_fe["Age"],
    bins=[0, 29, 39, 49, 59, 120],
    labels=["<30", "30-39", "40-49", "50-59", "60+"],
    right=True,
)

# BMI categories (WHO-style rough buckets), left-closed:
# [0, 18.5), [18.5, 25), [25, 30), [30, 100).
bmi_categories = pd.cut(
    df_fe["BMI"],
    bins=[0, 18.5, 25, 30, 100],
    labels=["underweight", "normal", "overweight", "obese"],
    right=False,
)

# pd.cut yields NaN for any value outside the bin edges. Casting to str can
# turn that NaN into the literal string "nan", which the isna() check below
# would no longer detect, so validate the binning BEFORE the string cast.
unbinned_age = int(age_groups.isna().sum())
unbinned_bmi = int(bmi_categories.isna().sum())
assert unbinned_age == 0, f"Age values outside AgeGroup bins: {unbinned_age}"
assert unbinned_bmi == 0, f"BMI values outside BMICategory bins: {unbinned_bmi}"

df_fe["AgeGroup"] = age_groups.astype(str)
df_fe["BMICategory"] = bmi_categories.astype(str)

# Simple interaction-like features (still interpretable)
df_fe["Glucose_BMI"] = df_fe["Glucose"] * df_fe["BMI"]
# The +1.0 keeps the denominator strictly positive for non-negative ages.
df_fe["Pregnancies_Age"] = df_fe["Pregnancies"] / (df_fe["Age"] + 1.0)

# Final guard: no missing values anywhere in the engineered frame.
na_counts = df_fe.isna().sum().sum()
assert na_counts == 0, f"NaNs found after feature engineering: {na_counts}"

print("After FE shape:", df_fe.shape)


After FE shape: (768, 13)


In [3]:
# Column roles: the label, the bucketed (string) features, and everything else.
target_col = "Outcome"
categorical_features = ["AgeGroup", "BMICategory"]

# Every remaining column is treated as numeric; the frame's column order is kept.
_non_numeric = {target_col, *categorical_features}
numeric_features = [col for col in df_fe.columns if col not in _non_numeric]

print(f"Numeric feature count: {len(numeric_features)}")
print(f"Categorical feature count: {len(categorical_features)}")

Numeric feature count: 10
Categorical feature count: 2


In [4]:

# Preprocessor object, meant to be reused together with the model.
#   - numeric columns   -> StandardScaler
#   - categorical cols  -> passed through unchanged for now
# Nothing is fitted in this cell; it only declares the transformation.

numeric_transformer = StandardScaler()

column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", "passthrough", categorical_features),
]

preprocessor = ColumnTransformer(
    transformers=column_steps,
    remainder="drop",  # columns not listed above (e.g. the target) are discarded
    verbose_feature_names_out=False,  # keep original column names, no "num__" prefix
)

print(preprocessor)


ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['Pregnancies', 'Glucose', 'BloodPressure',
                                  'SkinThickness', 'Insulin', 'BMI',
                                  'DiabetesPedigreeFunction', 'Age',
                                  'Glucose_BMI', 'Pregnancies_Age']),
                                ('cat', 'passthrough',
                                 ['AgeGroup', 'BMICategory'])],
                  verbose_feature_names_out=False)


In [5]:
# Persist the engineered frame (without the index column).
df_fe.to_csv(OUTPUT_FE_PATH, index=False)
print(f"Saved: {OUTPUT_FE_PATH}")

# Machine-readable description of the feature set: where the data came from,
# which column is the target, and how each feature group is meant to be handled.
feature_spec = dict(
    input_path=INPUT_PATH,
    output_path=OUTPUT_FE_PATH,
    target=target_col,
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    engineered_features=["AgeGroup", "BMICategory", "Glucose_BMI", "Pregnancies_Age"],
    notes=[
        "AgeGroup and BMICategory are categorical and will be one-hot encoded later in the ML pipeline.",
        "Numeric features will be standardized using StandardScaler (fit on train only)."
    ],
)

with open(FEATURE_SPEC_PATH, "w", encoding="utf-8") as spec_file:
    json.dump(feature_spec, spec_file, indent=2)

print(f"Saved: {FEATURE_SPEC_PATH}")


Saved: ../data/processed/diabetes_fe.csv
Saved: ../data/processed/feature_spec.json
