# Diabetes Risk Prediction
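This notebook builds a pipeline to predict diabetes risk from the Pima Indians Diabetes dataset. The steps include exploratory plots, imputation of zero-coded missing values, feature engineering and scaling, and a small PyTorch MLP classifier.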

In [None]:
!pip install scikit-learn xgboost shap matplotlib pandas seaborn joblib torch --quiet

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Pima Indians Diabetes CSV. Column names are passed explicitly via
# `names=`, so pandas treats every row of the file (including the first) as data.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]
df = pd.read_csv(url, names=columns)

# Bar chart of how many rows fall into each Outcome class.
class_ax = sns.countplot(x="Outcome", data=df)
class_ax.set_title("Class Distribution")
plt.show()

# Annotated heatmap of the pairwise correlations between all columns.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=corr_ax)
corr_ax.set_title("Correlation Matrix")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib

# Work on a copy so the raw frame loaded earlier stays untouched.
df_cleaned = df.copy()

# In these columns a value of 0 is treated as "missing": it is replaced with
# NaN and then imputed with the column median.
zero_features = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
zeros_as_nan = df_cleaned[zero_features].replace(0, np.nan)
# Assign the filled result back instead of calling
# `df_cleaned[col].fillna(..., inplace=True)`: that form operates on a
# temporary column object, emits a FutureWarning on pandas 2.x and silently
# leaves the NaNs in place once Copy-on-Write is enabled.
df_cleaned[zero_features] = zeros_as_nan.fillna(zeros_as_nan.median())
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split below, so test rows influence the imputation values.

# Engineered features derived from the imputed columns.
df_cleaned["BMI_Age"] = df_cleaned["BMI"] * df_cleaned["Age"]
# +1 in the denominator avoids division by zero when Pregnancies == 0.
df_cleaned["GlucosePerPreg"] = df_cleaned["Glucose"] / (df_cleaned["Pregnancies"] + 1)
df_cleaned["LogInsulin"] = np.log1p(df_cleaned["Insulin"])
df_cleaned["LogBMI"] = np.log1p(df_cleaned["BMI"])

# The raw Insulin and BMI columns are dropped; their log-transformed versions
# (and BMI_Age) remain in the feature matrix.
drop_cols = ["Insulin", "BMI"]
X = df_cleaned.drop(columns=["Outcome"] + drop_cols)
y = df_cleaned["Outcome"]

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the same transform can be reloaded later.
joblib.dump(scaler, "scaler.pkl")

In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

class MLP(nn.Module):
    """Small feed-forward binary classifier that outputs one raw logit per row.

    Architecture: in_features -> 16 -> 8 -> 1, with ReLU after the two hidden
    layers. No sigmoid is applied here; the caller is expected to apply it
    (or use a logit-based loss such as ``nn.BCEWithLogitsLoss``).

    Args:
        in_features: Number of input features. When omitted, it falls back to
            the column count of the notebook-level ``X_train_scaled`` array,
            which preserves the original ``MLP()`` call.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: infer the width from the scaled
            # training matrix defined earlier in the notebook.
            in_features = X_train_scaled.shape[1]
        self.net = nn.Sequential(
            nn.Linear(in_features, 16), nn.ReLU(),
            nn.Linear(16, 8), nn.ReLU(),
            nn.Linear(8, 1)
        )

    def forward(self, x):
        """Return the logits produced by the network for the batch ``x``."""
        return self.net(x)

# --- Training data as tensors, served in shuffled mini-batches of 32 ---
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
# Targets are reshaped to a single column so they line up with the model's
# one-logit-per-row output.
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
train_dl = DataLoader(
    TensorDataset(X_train_tensor, y_train_tensor),
    batch_size=32,
    shuffle=True,
)

# --- Model, loss (operates on raw logits) and optimizer ---
nn_model = MLP()
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(nn_model.parameters(), lr=0.001)

# --- Train for 10 epochs, reporting the mean batch loss after each one ---
for epoch in range(10):
    total_loss = 0
    for features, targets in train_dl:
        # Clear gradients left over from the previous step before computing
        # new ones for this batch.
        optimizer.zero_grad()
        batch_loss = loss_fn(nn_model(features), targets)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
    mean_loss = total_loss / len(train_dl)
    print(f"Epoch {epoch+1}: Loss = {mean_loss:.4f}")

# --- Inference on the held-out split; no gradients are needed here ---
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
with torch.no_grad():
    test_logits = nn_model(X_test_tensor)
    # Sigmoid turns the logits into probabilities in [0, 1].
    test_preds = torch.sigmoid(test_logits).numpy().flatten()
    print(f'Sample predictions: {test_preds[:5]}')