In [39]:
# load in libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plotnine import *
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, cohen_kappa_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingRegressor, StackingRegressor, StackingClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import kagglehub
from kagglehub import KaggleDatasetAdapter
from xgboost import XGBClassifier

Predicting Heart Disease

In this lab we'll be using a dataset from kaggle yet again...it's just so fun and rich! We're using publicly available data from the Centers for Disease Control and Prevention (CDC), and in particular the Behavioral Risk Factor Surveillance System (BRFSS).

Primary Goals:

Predict heart disease.
One of the questions posted on the kaggle page is, "Can you indicate which variables have a significant effect on the likelihood of heart disease?" So, if your work allows you to comment on this question then please do!
Assignment Specs:

You need to use at least one boosting model in your work to answer the questions above, but you should explore at least two other models in order to answer the above questions as best you can. You may use multiple boosting models if you like, but I'd encourage you to consider past model types we've discussed.
The kaggle page indicates that the classes are extremely unbalanced in this dataset. You should keep this in mind as you work and if appropriate, take steps to adjust for it. You may need to look up how to adjust for this yourself, but I could probably make some suggestions...as a starting place check out under- and oversampling ideas described here: https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/Links to an external site. .
Your submission should be built and written with non-experts as the target audience. All of your code should still be included, but do your best to narrate your work in accessible ways.
Again, submit an HTML, ipynb, or Colab link. Be sure to rerun your entire notebook fresh before submitting!


In [12]:
# load in data

heart = pd.read_csv("heart_2020_cleaned.csv")
heart = heart.dropna()
heart.head()


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [33]:
heart_minority = heart[heart["HeartDisease"] == "Yes"]
heart_majority = heart[heart["HeartDisease"] == "No"].sample(n=len(heart_minority), random_state=42)

# deal with imbalanced data
heart_balanced = pd.concat([heart_minority, heart_majority])
X = heart_balanced.drop(columns="HeartDisease")
y = heart_balanced["HeartDisease"]
y = y.map({"No": 0, "Yes": 1})

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# define x and y
X = heart_balanced.drop(columns = ["HeartDisease"])
y = heart_balanced["HeartDisease"]

# Boosting Model

In [35]:

# define column transformer
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
        
    ],
    remainder="passthrough"
).set_output(transform="pandas")

# define model
boosting_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

# Define pipeline
pipeline = Pipeline([
    ("preprocessor", ct),
    ("model", boosting_model)
])

# Fit the pipeline directly (no grid search)
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

recall = recall_score(y_test, y_pred)
print(f"Recall (HeartDisease = Yes): {recall:.2f}")

              precision    recall  f1-score   support

           0       0.80      0.74      0.77      5516
           1       0.76      0.81      0.78      5434

    accuracy                           0.77     10950
   macro avg       0.78      0.78      0.77     10950
weighted avg       0.78      0.77      0.77     10950

Recall (HeartDisease = Yes): 0.81


Since the dataset is highly imbalanced, accuracy is not a reliable metric since the model could predict "No" for almost everyone and still appear accurate. Instead, recall for the "Yes" (heart disease) class is more important because it tells us how well the model is identifying actual cases of heart disease. A higher recall helps reduce the number of missed positive cases, which is critical in a health-related prediction task.

A recall score of 0.81 means the boosting model correctly identified 81% of patients who actually have heart disease.

# Bagging Model

In [36]:
# using the same column transformer as above

bagging_model = BaggingClassifier(
    n_estimators=100,
    random_state=42
)

# Define pipeline
pipeline = Pipeline([
    ("preprocessor", ct),
    ("model", bagging_model)
])

# Fit the pipeline directly (no grid search)
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall (HeartDisease = Yes): {recall:.2f}")

              precision    recall  f1-score   support

           0       0.76      0.71      0.73      5516
           1       0.72      0.77      0.74      5434

    accuracy                           0.74     10950
   macro avg       0.74      0.74      0.74     10950
weighted avg       0.74      0.74      0.74     10950

Recall (HeartDisease = Yes): 0.77


The recall score of 0.77 for the "Yes" class (heart disease) means that the model successfully identified 77% of the actual positive cases of heart disease. In a healthcare context, this is a good outcome, as it indicates the model is fairly effective at detecting patients who truly have heart disease, helping to minimize the number of missed cases.

# XG Boosting

In [40]:
# Define model (XGBoost)
xgboost_model = XGBClassifier(
    n_estimators=100,
    random_state=42,
    use_label_encoder=False,  # To avoid warnings in newer versions
    eval_metric='logloss'     # To avoid warnings related to eval_metric
)

# Define pipeline
pipeline = Pipeline([
    ("preprocessor", ct),
    ("model", xgboost_model)
])

# Fit the pipeline directly (no grid search)
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall (HeartDisease = Yes): {recall:.2f}")

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.78      0.73      0.75      5516
           1       0.74      0.80      0.77      5434

    accuracy                           0.76     10950
   macro avg       0.76      0.76      0.76     10950
weighted avg       0.76      0.76      0.76     10950

Recall (HeartDisease = Yes): 0.80


The recall score of 0.80 for the "Yes" (heart disease) class in the XGBoost model indicates that the model is identifying 80% of all the actual cases of heart disease. This is a strong result, as it means that the model is successfully capturing the majority of heart disease cases.

# Stacking Model

In [41]:
base_models = [
    ("rf", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("xgb", XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss'))
]

# Define the meta-model
meta_model = LogisticRegression()

# Define the stacking model
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)

# Define pipeline
pipeline = Pipeline([
    ("preprocessor", ct),
    ("model", stacking_model)
])

# Fit the pipeline directly
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall (HeartDisease = Yes): {recall:.2f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.78      0.73      0.75      5516
           1       0.74      0.79      0.77      5434

    accuracy                           0.76     10950
   macro avg       0.76      0.76      0.76     10950
weighted avg       0.76      0.76      0.76     10950

Recall (HeartDisease = Yes): 0.79


The recall score for the "Yes" (heart disease) class is 0.79, which means that the stacking model is correctly identifying 79% of the actual heart disease cases. 


Overall, the gradient boosting model produced the best recall score of 0.81. The other models produced slightly similar recall values in the 0.75 - 0.82 range. This indicates that while all ensemble models performed reasonably well in identifying heart disease cases, the gradient boosting model was slightly more effective at minimizing false negatives, making it the most reliable choice in a context where detecting actual cases is critical.