# PEDIL-Forge Day 2: Machine Learning for Public Health
### Hands-On Tutorial – Predicting Diabetes with BRFSS 2011 Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load dataset (Google Colab): prompt for a file upload and read it into `df`.
import io

from google.colab import files

EXPECTED_CSV = 'brfss_diabetes_clean.csv'

uploaded = files.upload()  # dict: uploaded file name -> raw file bytes
if not uploaded:
    raise ValueError(f"No file was uploaded; please upload {EXPECTED_CSV}.")

# The upload may be stored under a different name (e.g. a repeat upload saved
# as 'name (1).csv'), so fall back to the first uploaded file instead of
# failing with a KeyError on the hard-coded name.
csv_name = EXPECTED_CSV if EXPECTED_CSV in uploaded else next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
df.head()

In [None]:
# List the predictor columns (every column except the target) of the loaded data.
# Uses `df` because no resampled split exists yet at this point in the notebook.
print(df.drop(columns='has_diabetes').columns.tolist())


In [None]:
# Preprocess data: separate the predictors from the target column
X = df.drop('has_diabetes', axis=1)
y = df['has_diabetes']

# One-hot encode the categorical predictors. drop_first=True drops one dummy
# column per variable, so each variable with k levels yields k-1 columns.
X_encoded = pd.get_dummies(X, columns=['sex', 'income_level', 'education_level'], drop_first=True)

# Split the encoded data: 80% train / 20% test, fixed seed for reproducibility.
# Note: this baseline split is not stratified by class.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [None]:
# Fit the baseline classifier: a 100-tree random forest with a fixed seed.
# `fit` returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
model

In [None]:
# Predict on the held-out split and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Draw the baseline confusion matrix as an annotated heatmap
# (rows = actual class, columns = predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Reds', ax=ax)
ax.set(title="Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

We have a fantastic model. Let's interpret, publish and make decisions based on this model. Right?

# **Maybe not!** ***Why?***

Model barely catches Class 0!
   * Only 1 out of 9 non-diabetic cases was predicted correctly.
   * Most predictions go to the majority class (diabetes).
   * Accuracy is inflated because the dataset is ~95% diabetic.

# **Key points to note**

   1. Imbalanced datasets fool accuracy → Show how it fails to capture minority class.
   2. Precision vs Recall trade-off → Would a diabetes screening tool that misses most non-diabetics be acceptable?
   3. Macro vs Weighted Averages → Macro average treats both classes equally.

# *So what can we do?*
   1. Stratified sampling to preserve class proportions in the train/test split
   2.   Class weights in models: ***model = RandomForestClassifier(class_weight='balanced')***
   3. SMOTE or undersampling

In [None]:
# Refit the same forest, this time with class_weight='balanced', and score it
# on the same held-out split as the baseline model.
model_weighted = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)
y_pred_weighted = model_weighted.predict(X_test)

# Evaluation
print(
    "Classification Report (with class_weight='balanced'):",
    classification_report(y_test, y_pred_weighted),
    sep="\n",
)

# What Changed?
  * Precision for Class 0 improved slightly → Model makes slightly fewer false positives.
  * Recall for Class 0 stayed very low → Still missing most non-diabetics.
  * Class 1 (majority) still dominates.

Remember:
Macro vs Weighted Averages
  * Macro avg: 0.65 (precision), 0.55 (recall) → these treat both classes equally.
  * Weighted avg: still high because Class 1 dominates.

Conclusion:
1. Accuracy is still misleading → A 95% accuracy looks good… but it fails for Class 0
2. Minority class recall → Model catches <12% of non-diabetics
3. Better balance needed → Explore other balancing options
  * Random undersampling of Class 1
  * Random oversampling of Class 0
  * SMOTE (Synthetic Minority Oversampling Technique)


# Handling Imbalanced Data: Resampling Techniques
In this section, we’ll compare Undersampling, Oversampling, and SMOTE to improve classification for imbalanced outcomes.


In [None]:
# Install the necessary package
!pip install -q imbalanced-learn

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt



In [None]:
# Reusable evaluation function
def evaluate_model(X_resampled, y_resampled, title, sampler=None):
    """Train a random forest on an 80/20 split and report test-set metrics.

    Prints a classification report and shows a confusion-matrix heatmap.

    Parameters
    ----------
    X_resampled, y_resampled
        Features and labels to split. Without ``sampler`` the data is split
        exactly as given; if it was oversampled beforehand, copies of the same
        original row can land in both the training and the test portion, which
        can make the reported scores optimistic.
    title : str
        Label used in the printed header and in the plot title.
    sampler : optional
        Object exposing ``fit_resample(X, y)``. When given, pass the
        *un-resampled* data as the first two arguments: the split is made
        first and only the training portion is resampled, so the test portion
        is left exactly as split.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
    if sampler is not None:
        # Resample after splitting so no resampled row can reach the test set.
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n📊 {title} - Classification Report:")
    print(classification_report(y_test, y_pred))

    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="coolwarm")
    plt.title(f"{title} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()


In [None]:
# Prepare data: target vector plus a fresh predictor frame for resampling.
y = df["has_diabetes"]

# Drop the target and recode 'sex' as Male -> 0, Female -> 1.
# Only 'sex' is recoded here; update if you have other categorical variables.
X = df.drop(columns=["has_diabetes"]).assign(
    sex=lambda frame: frame["sex"].map({"Male": 0, "Female": 1})
)

1. Undersampling

In [None]:
# Random Undersampling: resample X/y with a seeded RandomUnderSampler,
# then train and report on the resampled data.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)
evaluate_model(X_resampled=X_rus, y_resampled=y_rus, title="Random Undersampling")


2. Oversampling

In [None]:
# Random Oversampling: resample X/y with a seeded RandomOverSampler,
# then train and report on the resampled data.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)
evaluate_model(X_resampled=X_ros, y_resampled=y_ros, title="Random Oversampling")


3. SMOTE

In [None]:
# 🔄 SMOTE: resample X/y with a seeded SMOTE sampler,
# then train and report on the resampled data.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)
evaluate_model(
    X_resampled=X_smote,
    y_resampled=y_smote,
    title="SMOTE (Synthetic Minority Oversampling Technique)",
)


# Discussion:
1. What imbalance does to model performance
2. How resampling can improve minority class detection
3. Why we compare metrics beyond just accuracy

Let's get the best model and interpret what we get and what it means for our data. Which was the best model?
* Random Oversampling

So we rerun the model and save it for interpretation and potential deployment.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# This cell builds everything it needs (imports, preprocessor, split) so it
# runs top-to-bottom on a fresh kernel; it only relies on `X` and `y` from the
# "Prepare data" cell.
categorical_cols = ['sex', 'income_level', 'education_level']
numeric_cols = ['age', 'BMI']

# One-hot encode the categorical columns and standardize the numeric ones.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ("num", StandardScaler(), numeric_cols)
])

# Split the raw (un-encoded) columns: the pipeline does the encoding itself,
# so it must be given the original columns rather than dummy-encoded ones.
# stratify=y keeps the class proportions the same in both portions.
X_grid_train, X_grid_test, y_grid_train, y_grid_test = train_test_split(
    X[categorical_cols + numeric_cols], y,
    test_size=0.2, random_state=42, stratify=y
)

# Define pipeline: preprocessing followed by the classifier
clf_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# Define hyperparameter grid to search ('clf__' targets the classifier step)
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5],
    'clf__min_samples_leaf': [1, 2],
    'clf__class_weight': [None, 'balanced']
}

# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=clf_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='f1',  # F1 of the positive class; or 'f1_macro', 'roc_auc', 'accuracy', etc.
    verbose=1,
    n_jobs=-1
)

# Fit the grid search
grid_search.fit(X_grid_train, y_grid_train)

# Best model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate on the held-out portion
y_pred = best_model.predict(X_grid_test)
print("Classification Report (Tuned Model):")
print(classification_report(y_grid_test, y_pred))


In [None]:
# Rebuild the randomly oversampled dataset, split it 80/20 with a fixed seed,
# and keep the fitted forest as `model_ros` for the interpretation cells.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)
X_train_ros, X_test_ros, y_train_ros, y_test_ros = train_test_split(
    X_ros, y_ros, random_state=42, test_size=0.2
)

model_ros = RandomForestClassifier(random_state=42, n_estimators=100)
model_ros.fit(X_train_ros, y_train_ros)


In [None]:
# Feature Importance (Random Oversampling Model)
import numpy as np

feature_importances = model_ros.feature_importances_
# Take the names from the frame the model was fitted on so that names and
# importances are guaranteed to line up.
features = X_train_ros.columns

# Get top 10 features, most important first (argsort is ascending, so reverse)
indices = np.argsort(feature_importances)[::-1][:10]

# File the figure is written to; the optional download cell reads this name.
plot_filename = "feature_importances_random_oversampling.png"

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=feature_importances[indices], y=features[indices], ax=ax)
ax.set_title("Top 10 Feature Importances (Random Oversampling)")
ax.set_xlabel("Relative Importance")
ax.set_ylabel("Features")
fig.tight_layout()
fig.savefig(plot_filename, dpi=150)  # save before showing the figure
plt.show()


In [None]:
# Download the saved image to your computer (optional)
import os

# `plot_filename` must name an image that has already been written to disk
# (e.g. with plt.savefig). Print a hint instead of raising when it is missing.
saved_plot = globals().get("plot_filename")
if saved_plot and os.path.exists(saved_plot):
    files.download(saved_plot)
else:
    print("No saved plot to download: save the figure with plt.savefig(plot_filename) first.")

Feature importance = how much each variable helps the model split classes correctly.

Tasks:
 1. Discuss why age, BMI, education, or income might be ranked high.
 2.  Would these be your top predictors of diabetes? Why?

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import PartialDependenceDisplay
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt

# Step 1: Name the columns by type; categorical ones come first in the subset
numeric_cols = ['age', 'BMI']
categorical_cols = ['sex', 'income_level', 'education_level']
all_features = categorical_cols + numeric_cols

# Restrict the predictors to those columns; work on a copy of the target
X_subset = X[all_features]
y_subset = y.copy()

# Step 2: One-hot encode the categorical columns, standardize the numeric ones
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ("num", StandardScaler(), numeric_cols),
    ]
)

# Step 3: Chain preprocessing and the classifier into a single estimator
clf_pipeline = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

# Step 4: Randomly oversample, then split 80/20 with a fixed seed
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_subset, y_subset)
X_train, X_test, y_train, y_test = train_test_split(
    X_ros, y_ros, random_state=42, test_size=0.2
)

# Step 5: Fit the whole pipeline on the training portion
clf_pipeline.fit(X_train, y_train)

# Step 6: One partial dependence figure per feature, computed through the
# pipeline so the raw (un-encoded) column names can be used directly
features_for_plot = ['age', 'BMI', 'sex', 'income_level', 'education_level']

for feat in features_for_plot:
    fig, ax = plt.subplots(figsize=(6, 4))
    PartialDependenceDisplay.from_estimator(
        estimator=clf_pipeline,
        X=X_test,
        features=[feat],
        grid_resolution=50,
        ax=ax,
    )
    plt.title(f"PDP: {feat}")
    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt

# The five features with the highest importance in the oversampling model,
# most important first (argsort is ascending, so reverse before slicing).
top5_order = np.argsort(model_ros.feature_importances_)[::-1][:5]
top5_features = X_test_ros.columns[top5_order].tolist()

# One partial dependence figure per top feature
for feature in top5_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    PartialDependenceDisplay.from_estimator(
        model_ros, X_test_ros, [feature],
        grid_resolution=50, ax=ax
    )
    plt.title(f"Partial Dependence of {feature}")
    plt.tight_layout()
    plt.show()


# Wrap-Up

1. What did we learn about imbalance?
2. Why isn’t accuracy enough?
3. Which method worked best for our case?
4. Which variable is most important in predicting risk of diabetes?

Challenge
* Compare the interpretation to a different classifier (e.g., LogisticRegression).
* What happens when you increase the number of trees?


