## 📦 1. Install ucimlrepo and Load Dataset

In [1]:
!pip install ucimlrepo xgboost shap streamlit gradio -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m962.0 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.1/54.1 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.9/322.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from ucimlrepo import fetch_ucirepo
import pandas as pd

# fetch dataset
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# data (as pandas dataframes)
X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets['Diabetes_binary']

# metadata
print(cdc_diabetes_health_indicators.metadata)

# variable information
print(cdc_diabetes_health_indicators.variables)

# # Load dataset
# X = cdc_diabetes_health_indicators.data.features
# y = cdc_diabetes_health_indicators.data.targets['Diabetes_binary']

# Sanity check
print("Columns in X:\n", X.columns)
print("Class distribution:\n", y.value_counts())


{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

## 🧹 2. Basic Preprocessing

In [3]:
# Class distribution
print(y.value_counts())

# Quick overview
df = pd.concat([X, y], axis=1)
df.head()


Diabetes_binary
0    218334
1     35346
Name: count, dtype: int64


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


## 🔀 3. Train-Test Split

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 🧠 4. Train a Classifier

In [5]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
# Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
# XGBoost
from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.



## 📊 5. Evaluate the Model

In [6]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate(model, name):
    y_pred = model.predict(X_test)
    print(f"\n--- {name} ---")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

evaluate(rf, "Random Forest")
evaluate(lr, "Logistic Regression")
evaluate(xgb, "XGBoost")



--- Random Forest ---
              precision    recall  f1-score   support

           0       0.88      0.97      0.92     43667
           1       0.49      0.18      0.26      7069

    accuracy                           0.86     50736
   macro avg       0.68      0.57      0.59     50736
weighted avg       0.82      0.86      0.83     50736

Confusion Matrix:
[[42342  1325]
 [ 5808  1261]]

--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.88      0.98      0.92     43667
           1       0.52      0.16      0.24      7069

    accuracy                           0.86     50736
   macro avg       0.70      0.57      0.58     50736
weighted avg       0.83      0.86      0.83     50736

Confusion Matrix:
[[42626  1041]
 [ 5950  1119]]

--- XGBoost ---
              precision    recall  f1-score   support

           0       0.88      0.98      0.93     43667
           1       0.54      0.17      0.26      7069

    accuracy   

## 📈 6. Feature Importance

In [7]:
from sklearn.model_selection import cross_val_score

def cross_val(model, name):
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"{name} Cross-Validation Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

cross_val(rf, "Random Forest")
cross_val(lr, "Logistic Regression")
cross_val(xgb, "XGBoost")


Random Forest Cross-Validation Accuracy: 0.8594 ± 0.0009
Logistic Regression Cross-Validation Accuracy: 0.8632 ± 0.0022


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost Cross-Validation Accuracy: 0.8656 ± 0.0003


In [None]:
# import shap
# import matplotlib
# matplotlib.use('Agg')  # Use non-interactive backend (important for servers/Colab)

# # Create SHAP explainer for XGBoost model
# explainer = shap.Explainer(xgb, X_test)
# shap_values = explainer(X_test)

# # Generate SHAP summary plot
# shap.summary_plot(shap_values, X_test)


## 🧑‍💻 9. Simple Gradio App (for Local Prediction)
🩺 Diabetes Prediction App - Input Descriptions
Use the following options to input lifestyle, demographic, and health data for a prediction:

🔲 Checkboxes (Binary: 0 = No, 1 = Yes)
HighBP: High Blood Pressure diagnosed

HighChol: High Cholesterol diagnosed

CholCheck: Cholesterol checked in past 5 years

Smoker: Smoked 100+ cigarettes in life

Stroke: Ever had a stroke

HeartDiseaseorAttack: CHD or heart attack diagnosis

PhysActivity: Physical activity in past 30 days (non-work)

Fruits: Eat fruit 1+ times per day

Veggies: Eat vegetables 1+ times per day

HvyAlcoholConsump: Heavy alcohol consumption

AnyHealthcare: Has health insurance or coverage

NoDocbcCost: Could not see doctor due to cost in last year

DiffWalk: Serious difficulty walking/climbing stairs

Sex: 0 = Female, 1 = Male

🔢 Numeric Inputs
BMI: Body Mass Index (e.g., 22-50)

GenHlth: General health (1 = Excellent to 5 = Poor)

MentHlth: Days mental health was not good (0-30)

PhysHlth: Days physical health was not good (0-30)

Age: Age group

1 = 18-24

2 = 25-29

…

13 = 80+

🎓 Education Level (1-6)
1 = Never attended school

2 = Grades 1-8

3 = Grades 9-11

4 = Grade 12 or GED

5 = College 1-3 years

6 = College 4+ years

💰 Income Level (1-8)
1 = Less than $10,000

2 = $10,000-14,999

3 = $15,000-19,999

4 = $20,000-24,999

5 = $25,000-34,999

6 = $35,000-49,999

7 = $50,000-74,999

8 = $75,000 or more

In [8]:
import gradio as gr

def predict_diabetes(HighBP, HighChol, CholCheck, BMI, Smoker, Stroke, HeartDiseaseorAttack,
                     PhysActivity, Fruits, Veggies, HvyAlcoholConsump, AnyHealthcare,
                     NoDocbcCost, GenHlth, MentHlth, PhysHlth, DiffWalk, Sex, Age,
                     Education, Income):

    input_data = pd.DataFrame([[
        HighBP, HighChol, CholCheck, BMI, Smoker, Stroke, HeartDiseaseorAttack,
        PhysActivity, Fruits, Veggies, HvyAlcoholConsump, AnyHealthcare,
        NoDocbcCost, GenHlth, MentHlth, PhysHlth, DiffWalk, Sex, Age,
        Education, Income
    ]], columns=X.columns)

    prediction = xgb.predict(input_data)[0]
    return "Diabetic / Prediabetic" if prediction == 1 else "Healthy"

inputs = [
    gr.Checkbox(label=col) if X[col].nunique() == 2 else gr.Slider(minimum=int(X[col].min()), maximum=int(X[col].max()), step=1, label=col)
    for col in X.columns
]

gr.Interface(fn=predict_diabetes, inputs=inputs, outputs="text", title="Diabetes Risk Classifier").launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ee1fe322d1d574e3bc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


