## 1. Import What We Need

In [1]:
#import regular functionalities that we need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from IPython.display import display

#stats imports
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

#classifiers to be used
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble

#Model Selection Bits
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, KFold
from sklearn.model_selection import learning_curve, validation_curve

#preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import mode

#evaluation
from sklearn.metrics import f1_score

#plotting
from sklearn.model_selection import learning_curve, validation_curve

#dataset
from sklearn.datasets import load_digits, make_classification

#pipelines
from sklearn.pipeline import Pipeline

## 2. Read the file

In [4]:
# Load the converted heart-disease CSV (path is relative to the notebook's working directory)
heart_disease_converted_df = pd.read_csv(filepath_or_buffer='heart_2020_converted.csv')

## 4. Clean the Data

In [7]:
# Count missing (NaN) values in each column and display the per-column totals
missing_per_column = heart_disease_converted_df.isna().sum()
missing_per_column

HeartDisease                           0
BMI                                    0
Smoking                                0
AlcoholDrinking                        0
Stroke                                 0
PhysicalHealth                         0
MentalHealth                           0
DiffWalking                            0
Sex                                    0
AgeCategory                            0
PhysicalActivity                       0
GenHealth                              0
SleepTime                              0
Asthma                                 0
KidneyDisease                          0
SkinCancer                             0
Race_American Indian/Alaskan Native    0
Race_Asian                             0
Race_Black                             0
Race_Hispanic                          0
Race_Other                             0
Race_White                             0
Diabetic_No                            0
Diabetic_No, borderline diabetes       0
Diabetic_Yes    

In [9]:
# There are no missing (NaN) values in the data

In [22]:
# Separate the label from the predictors.
# Assumes 'HeartDisease' is the binary target (0 = No disease, 1 = Disease).
target_column = "HeartDisease"
y = heart_disease_converted_df[target_column]  # Target
X = heart_disease_converted_df.drop(columns=[target_column])  # Features

In [24]:
from sklearn.metrics import classification_report, confusion_matrix

In [26]:
# Check class distribution
print("Class distribution before SMOTE:\n", y.value_counts())

# Split data before applying SMOTE.
# The seed is the same one used by the later split cell (random_state=1010), which is
# the partition every model below is trained and evaluated on. Using a different seed
# here would build the oversampled set from a different partition, whose rows overlap
# the later X_test (test leakage if X_train_sm is ever scored against that X_test).
SPLIT_SEED = 1010
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SPLIT_SEED
)

# Apply SMOTE only on training data so no synthetic samples are derived from test rows
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
# NOTE(review): X_train_sm / y_train_sm are not consumed by any model in the cells
# shown below -- confirm whether a SMOTE-trained model was intended.

# Check class distribution after SMOTE
print("Class distribution after SMOTE:\n", pd.Series(y_train_sm).value_counts())

Class distribution before SMOTE:
 HeartDisease
0    292422
1     27373
Name: count, dtype: int64
Class distribution after SMOTE:
 HeartDisease
0    204695
1    204695
Name: count, dtype: int64


In [28]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 hold-out split. This rebinds X_train / X_test / y_train / y_test,
# and it is this partition that the models below are fitted and scored on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1010,
)

In [30]:
# Normalize/scale the data: fit the scaler on the training split only, then apply
# the same fitted transformation to both the training and the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
# Train a random forest classifier (100 trees, fixed seed) on the scaled training split
rfc_params = {"n_estimators": 100, "random_state": 1010}
model = RandomForestClassifier(**rfc_params)
model.fit(X_train_scaled, y_train)

In [34]:
# Predict class labels for the scaled hold-out set
y_pred = model.predict(X=X_test_scaled)

In [35]:
# Overall accuracy plus per-class precision / recall / F1 on the hold-out set
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Model Accuracy: 0.91
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95     87727
           1       0.37      0.12      0.18      8212

    accuracy                           0.91     95939
   macro avg       0.65      0.55      0.57     95939
weighted avg       0.88      0.91      0.88     95939



In [36]:
# Evaluate model: per-class metrics followed by the raw confusion matrix
# (rows = true class, columns = predicted class)
report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_mat)


Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95     87727
           1       0.37      0.12      0.18      8212

    accuracy                           0.91     95939
   macro avg       0.65      0.55      0.57     95939
weighted avg       0.88      0.91      0.88     95939


Confusion Matrix:
 [[86054  1673]
 [ 7236   976]]


In [40]:
#Try again but with ensemble (quick easy)
from imblearn.ensemble import BalancedRandomForestClassifier

# Train a Balanced Random Forest Classifier.
# sampling_strategy / replacement / bootstrap are pinned to the legacy defaults so the
# model keeps the same behaviour when imbalanced-learn changes those defaults.
# NOTE(review): the three `warn(` lines in the recorded output are presumably the
# FutureWarnings announcing that change -- confirm against the installed version.
brf = BalancedRandomForestClassifier(
    n_estimators=100,
    sampling_strategy="auto",
    replacement=False,
    bootstrap=True,
    random_state=42,
)
brf.fit(X_train, y_train)  # No SMOTE needed as this model balances data internally

# Predictions on the (unscaled) hold-out set
y_pred_brf = brf.predict(X_test)

# Evaluate
print("\nBalanced Random Forest - Classification Report:\n", classification_report(y_test, y_pred_brf))

  warn(
  warn(
  warn(



Balanced Random Forest - Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.71      0.82     87727
           1       0.21      0.80      0.33      8212

    accuracy                           0.72     95939
   macro avg       0.59      0.75      0.58     95939
weighted avg       0.91      0.72      0.78     95939



In [52]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [47]:
# The Balanced Random Forest improved recall for class 1 to 0.80. Let's try to increase it further with XGBoost.
import xgboost as xgb

In [48]:
# Weight for the positive class: (# of class-0 rows) / (# of class-1 rows) in the
# training labels, so the rare class counts proportionally more during training
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

# Fit an XGBoost classifier with that weight and a fixed seed
xgb_model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict labels for the hold-out set
y_pred_xgb = xgb_model.predict(X_test)

# Per-class precision / recall / F1
xgb_report = classification_report(y_test, y_pred_xgb)
print("\nXGBoost - Classification Report:\n", xgb_report)


XGBoost - Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.75      0.85     87727
           1       0.22      0.77      0.34      8212

    accuracy                           0.75     95939
   macro avg       0.60      0.76      0.59     95939
weighted avg       0.91      0.75      0.80     95939



In [54]:
# XGBoost performed worse than the Balanced Random Forest on class-1 recall (0.77 vs. 0.80), although its accuracy and F1 were slightly higher.

In [56]:
#Let's try with Ensemble, XGBoost, Logistic Regression
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

# Define base classifiers
# NOTE: `rf` is a plain RandomForestClassifier without class balancing, not the
# BalancedRandomForestClassifier fitted in the earlier cell.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# Keep the estimator under its own name: binding it to `xgb` would overwrite the
# `xgboost` module alias, so re-running this cell (or any other cell that calls
# xgb.XGBClassifier) would raise AttributeError.
xgb_clf = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
lr = LogisticRegression(class_weight="balanced", max_iter=1000)

# Voting classifier (soft voting averages the predicted class probabilities of the base models)
voting_clf = VotingClassifier(estimators=[('rf', rf), ('xgb', xgb_clf), ('lr', lr)], voting='soft')
voting_clf.fit(X_train, y_train)

# Predictions
y_pred_voting = voting_clf.predict(X_test)

# Evaluate
print("\nVoting Classifier - Classification Report:\n", classification_report(y_test, y_pred_voting))


Voting Classifier - Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.87      0.91     87727
           1       0.29      0.58      0.39      8212

    accuracy                           0.84     95939
   macro avg       0.63      0.73      0.65     95939
weighted avg       0.90      0.84      0.87     95939



In [None]:
# Even worse on class-1 recall (0.58, vs. 0.80 for the Balanced Random Forest). Thus, keep it simple with just the Balanced Random Forest ensemble.