# Diabetes Risk Prediction

This notebook builds a machine learning pipeline that predicts whether a patient is at risk of diabetes, using the Pima Indians Diabetes dataset.

Goals:
- Perform EDA and handle missing/invalid values
- Build a strong preprocessing pipeline with feature engineering
- Compare multiple models (logistic regression, random forest, XGBoost, and a neural network)
- Visualize model performance (ROC, AUC, feature importance)
- Explain predictions with SHAP
- Deploy a live web app with Gradio

In [None]:
!pip install scikit-learn xgboost shap matplotlib pandas seaborn joblib --quiet

## Exploratory Data Analysis

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# The CSV has no header row, so column names are supplied explicitly.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]
df = pd.read_csv(url, header=None, names=columns)

# How many rows fall into each Outcome class.
class_ax = sns.countplot(x="Outcome", data=df)
class_ax.set_title("Class Distribution")
plt.show()

# Pairwise correlations between every column, including the target.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=corr_ax)
corr_ax.set_title("Correlation Matrix")
plt.show()

## Feature Engineering and Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib

# Work on a copy so the raw frame `df` stays untouched and this cell can be
# re-run without compounding its own changes.
df_cleaned = df.copy()

# A 0 in these columns is treated as a missing measurement: it is converted to
# NaN and then imputed with the median of the remaining (non-zero) values.
# NOTE(review): the medians are computed on the full dataset before the
# train/test split below, so test rows influence the imputed values. Consider
# fitting the imputation on the training split only.
zero_features = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for col in zero_features:
    observed = df_cleaned[col].replace(0, np.nan)
    # Assign the filled column back explicitly. Calling
    # `df_cleaned[col].fillna(..., inplace=True)` operates on an intermediate
    # object: pandas 2.x warns about it and, with Copy-on-Write enabled, the
    # frame is not updated at all.
    df_cleaned[col] = observed.fillna(observed.median())

# Engineered features: an interaction term, a ratio (the +1 avoids division by
# zero for patients with no pregnancies), and log1p transforms.
df_cleaned["BMI_Age"] = df_cleaned["BMI"] * df_cleaned["Age"]
df_cleaned["GlucosePerPreg"] = df_cleaned["Glucose"] / (df_cleaned["Pregnancies"] + 1)
df_cleaned["LogInsulin"] = np.log1p(df_cleaned["Insulin"])
df_cleaned["LogBMI"] = np.log1p(df_cleaned["BMI"])

# The raw Insulin and BMI columns are dropped from the feature matrix; their
# log-transformed versions are kept instead.
drop_cols = ["Insulin", "BMI"]
X = df_cleaned.drop(columns=["Outcome"] + drop_cols)
y = df_cleaned["Outcome"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the same scaling can be reapplied later.
joblib.dump(scaler, "scaler.pkl")

## Train Machine Learning Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve

# Candidate classifiers, keyed by the display name used in printouts and plots.
# The tree ensembles are seeded so the reported AUCs and the saved model are
# reproducible across runs. `use_label_encoder` is no longer passed: it is
# deprecated in XGBoost and only produces an "unused parameter" warning in
# current releases.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
}

# Fit each model on the scaled training data and keep its positive-class
# probabilities on the test set for the ROC comparison.
predictions = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]  # column 1 = P(Outcome == 1)
    predictions[name] = y_prob
    auc = roc_auc_score(y_test, y_prob)
    print(f"{name}: ROC AUC = {auc:.4f}")

# Persist the fitted XGBoost model to disk.
joblib.dump(models["XGBoost"], "xgboost_diabetes_model.pkl")

## ROC Curve Comparison

In [None]:
# One ROC curve per fitted model, drawn on a shared set of axes.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
for name, y_prob in predictions.items():
    # roc_curve returns (fpr, tpr, thresholds); the thresholds are not needed here.
    fpr, tpr = roc_curve(y_test, y_prob)[:2]
    roc_ax.plot(fpr, tpr, label=f"{name}")

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curves for All Models")
roc_ax.legend()
roc_ax.grid(True)
plt.show()

## Explainability with SHAP

In [None]:
import shap
# The scaled training matrix serves as background data for the explainer.
# The scaled matrices carry no column labels, so the feature names are taken
# from X_train and passed explicitly; without them the beeswarm plot labels
# its rows "Feature 0", "Feature 1", ... instead of the real column names.
explainer = shap.Explainer(
    models["XGBoost"],
    X_train_scaled,
    feature_names=list(X_train.columns),
)

# Explain only the first 100 test rows.
shap_values = explainer(X_test_scaled[:100])
shap.plots.beeswarm(shap_values)

## Neural Network Model (PyTorch)

In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

class MLP(nn.Module):
    """Small feed-forward binary classifier that outputs a probability.

    Architecture: Linear(in_features, 16) -> ReLU -> Linear(16, 8) -> ReLU
    -> Linear(8, 1) -> Sigmoid.

    Args:
        in_features: Number of input features. When omitted, the width is
            taken from the notebook-level ``X_train_scaled`` array, so the
            existing ``MLP()`` call keeps working unchanged.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the input layer from the
            # scaled training matrix defined earlier in the notebook.
            in_features = X_train_scaled.shape[1]
        self.net = nn.Sequential(
            nn.Linear(in_features, 16), nn.ReLU(),
            nn.Linear(16, 8), nn.ReLU(),
            nn.Linear(8, 1), nn.Sigmoid()
        )

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, 1) sigmoid outputs."""
        return self.net(x)

# Seed PyTorch so weight initialisation and DataLoader shuffling are
# repeatable when the notebook is re-run from a fresh kernel.
torch.manual_seed(42)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
# Reshape the labels to (N, 1) so they match the model's (N, 1) output, as
# BCELoss requires.
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
train_dl = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=32, shuffle=True)

nn_model = MLP()
loss_fn = nn.BCELoss()  # the model already ends in a Sigmoid, so it emits probabilities
optimizer = torch.optim.Adam(nn_model.parameters(), lr=0.01)

n_train = len(train_dl.dataset)
for epoch in range(10):
    running_loss = 0.0
    for xb, yb in train_dl:
        preds = nn_model(xb)
        loss = loss_fn(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight each batch's mean loss by its size so the (possibly smaller)
        # final batch does not skew the epoch average.
        running_loss += loss.item() * xb.size(0)
    # Report the mean loss over the whole epoch, not just the loss of the
    # last mini-batch.
    print(f"Epoch {epoch+1}: Loss = {running_loss / n_train:.4f}")