## 1. Import What We Need

In [49]:
#import regular functionalities that we need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from IPython.display import display

#stats imports
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#classifiers to be used
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble

#Model Selection Bits
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, KFold
from sklearn.model_selection import learning_curve, validation_curve

#preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import mode

#evaluation
from sklearn.metrics import f1_score

#plotting
from sklearn.model_selection import learning_curve, validation_curve

#dataset
from sklearn.datasets import load_digits, make_classification

#pipelines
from sklearn.pipeline import Pipeline

## 2. Read the file

In [7]:
heart_disease_converted_df=pd.read_csv('heart_2020_converted.csv')

## 3. Inspect the data (info, describe, head, tail)

In [9]:
heart_disease_converted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 26 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   HeartDisease                         319795 non-null  int64  
 1   BMI                                  319795 non-null  float64
 2   Smoking                              319795 non-null  int64  
 3   AlcoholDrinking                      319795 non-null  int64  
 4   Stroke                               319795 non-null  int64  
 5   PhysicalHealth                       319795 non-null  float64
 6   MentalHealth                         319795 non-null  float64
 7   DiffWalking                          319795 non-null  int64  
 8   Sex                                  319795 non-null  int64  
 9   AgeCategory                          319795 non-null  int64  
 10  PhysicalActivity                     319795 non-null  int64  
 11  GenHealth    

In [10]:
#The results above indicate that there are 3 columns of floats (actual scores/decimals).
#The results also indicate that most of the results are 0,1 (No, Yes)

In [11]:
heart_disease_converted_df.describe()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
count,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,...,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0
mean,0.085595,28.325399,0.412477,0.068097,0.03774,3.37171,3.898366,0.13887,0.524727,6.514536,...,0.016267,0.025229,0.07173,0.085824,0.034172,0.766779,0.843206,0.021204,0.127588,0.008002
std,0.279766,6.3561,0.492281,0.251912,0.190567,7.95085,7.955235,0.345812,0.499389,3.564759,...,0.126499,0.156819,0.258041,0.280104,0.181671,0.422883,0.363607,0.144065,0.333631,0.089095
min,0.0,12.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,24.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
50%,0.0,27.34,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,0.0,31.42,1.0,0.0,0.0,2.0,3.0,0.0,1.0,9.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
max,1.0,94.85,1.0,1.0,1.0,30.0,30.0,1.0,1.0,12.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
heart_disease_converted_df.head(10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,0,16.6,1,0,0,3.0,30.0,0,1,7,...,0,0,0,0,0,1,0,0,1,0
1,0,20.34,0,0,1,0.0,0.0,0,1,12,...,0,0,0,0,0,1,1,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,0,9,...,0,0,0,0,0,1,0,0,1,0
3,0,24.21,0,0,0,0.0,0.0,0,1,11,...,0,0,0,0,0,1,1,0,0,0
4,0,23.71,0,0,0,28.0,0.0,1,1,4,...,0,0,0,0,0,1,1,0,0,0
5,1,28.87,1,0,0,6.0,0.0,1,1,11,...,0,0,1,0,0,0,1,0,0,0
6,0,21.63,0,0,0,15.0,0.0,0,1,10,...,0,0,0,0,0,1,1,0,0,0
7,0,31.64,1,0,0,5.0,0.0,1,1,12,...,0,0,0,0,0,1,0,0,1,0
8,0,26.45,0,0,0,0.0,0.0,0,1,12,...,0,0,0,0,0,1,0,1,0,0
9,0,40.69,0,0,0,0.0,0.0,1,0,9,...,0,0,0,0,0,1,1,0,0,0


In [13]:
heart_disease_converted_df.tail(10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
319785,0,31.93,0,1,0,0.0,0.0,0,0,9,...,0,0,0,1,0,0,1,0,0,0
319786,1,33.2,1,0,0,0.0,0.0,0,1,8,...,0,0,0,1,0,0,0,0,1,0
319787,0,36.54,0,0,0,7.0,0.0,0,0,2,...,0,0,0,1,0,0,1,0,0,0
319788,0,23.38,0,0,0,0.0,0.0,0,1,8,...,0,0,0,1,0,0,1,0,0,0
319789,0,22.22,0,0,0,0.0,0.0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
319790,1,27.41,1,0,0,7.0,0.0,1,0,8,...,0,0,0,1,0,0,0,0,1,0
319791,0,29.84,1,0,0,0.0,0.0,0,0,3,...,0,0,0,1,0,0,1,0,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,1,5,...,0,0,0,1,0,0,1,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,1,1,...,0,0,0,1,0,0,1,0,0,0
319794,0,46.56,0,0,0,0.0,0.0,0,1,12,...,0,0,0,1,0,0,1,0,0,0


## 4. Clean the Data

In [15]:
heart_disease_converted_df.isna().sum()

HeartDisease                           0
BMI                                    0
Smoking                                0
AlcoholDrinking                        0
Stroke                                 0
PhysicalHealth                         0
MentalHealth                           0
DiffWalking                            0
Sex                                    0
AgeCategory                            0
PhysicalActivity                       0
GenHealth                              0
SleepTime                              0
Asthma                                 0
KidneyDisease                          0
SkinCancer                             0
Race_American Indian/Alaskan Native    0
Race_Asian                             0
Race_Black                             0
Race_Hispanic                          0
Race_Other                             0
Race_White                             0
Diabetic_No                            0
Diabetic_No, borderline diabetes       0
Diabetic_Yes    

In [16]:
#There are no blanks in the data

In [17]:
#1. Split the data into X and y (70% training, 30% testing)
X = heart_disease_converted_df.drop(columns=["HeartDisease"])  # Features  
y = heart_disease_converted_df["HeartDisease"]  # Target  

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1010, stratify=y)

In [19]:
#Nomralize/Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
#Train RFC 
model = RandomForestClassifier(n_estimators=100, random_state=1010)
model.fit(X_train_scaled, y_train)

In [18]:
#Predict
y_pred = model.predict(X_test_scaled)

In [19]:
#Calculate accuracy, precision, f1
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.91
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95     87727
           1       0.37      0.12      0.18      8212

    accuracy                           0.91     95939
   macro avg       0.65      0.55      0.57     95939
weighted avg       0.88      0.91      0.88     95939



In [20]:
#2. Redo to improve the scoring
#Split the data into X and y (70% training, 30% testing)
X = heart_disease_converted_df.drop(columns=["HeartDisease", "AlcoholDrinking", "PhysicalActivity"])  # Features  
y = heart_disease_converted_df["HeartDisease"]  # Target  

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1010, stratify=y)

In [22]:
#Nomralize/Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
#Train RFC 
model = RandomForestClassifier(n_estimators=100, random_state=1010)
model.fit(X_train_scaled, y_train)

In [24]:
#Predict
y_pred = model.predict(X_test_scaled)

In [25]:
#Calculate accuracy, precision, f1
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.90
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95     87727
           1       0.35      0.12      0.18      8212

    accuracy                           0.90     95939
   macro avg       0.63      0.55      0.57     95939
weighted avg       0.87      0.90      0.88     95939



In [26]:
#3. Redo to improve the scoring
#Split the data into X and y (70% training, 30% testing)
X = heart_disease_converted_df.drop(columns=["HeartDisease", "AlcoholDrinking", "PhysicalActivity", 
                                             "Race_Asian", "Race_Black", "Race_Hispanic", "Race_Other", 
                                             "Race_White", "Race_American Indian/Alaskan Native"])  # Features  
y = heart_disease_converted_df["HeartDisease"]  # Target  

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1000, stratify=y)

In [28]:
#Nomralize/Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [29]:
#Train RFC 
model = RandomForestClassifier(n_estimators=100, random_state=1010)
model.fit(X_train_scaled, y_train)

In [30]:
#Predict
y_pred = model.predict(X_test_scaled)

In [31]:
#Calculate accuracy, precision, f1
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.90
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.95     87727
           1       0.33      0.13      0.19      8212

    accuracy                           0.90     95939
   macro avg       0.63      0.55      0.57     95939
weighted avg       0.87      0.90      0.88     95939



In [32]:
#4. Redo to improve scoring with logistic regression
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=500)

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = logreg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.99      0.96     87886
           1       0.53      0.11      0.18      8053

    accuracy                           0.92     95939
   macro avg       0.73      0.55      0.57     95939
weighted avg       0.89      0.92      0.89     95939



In [33]:
#5. Redo to improve scoring with logistic regression, but higher max_iteration
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=500)

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression
logreg = LogisticRegression(max_iter=1200)
logreg.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = logreg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.99      0.96     87886
           1       0.53      0.11      0.18      8053

    accuracy                           0.92     95939
   macro avg       0.73      0.55      0.57     95939
weighted avg       0.89      0.92      0.89     95939



In [34]:
#6. with Neural Network (MLP Classifier)
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=500)
mlp.fit(X_train_scaled, y_train)

y_pred_mlp = mlp.predict(X_test_scaled)
print(classification_report(y_test, y_pred_mlp))

              precision    recall  f1-score   support

           0       0.92      0.99      0.96     87886
           1       0.47      0.11      0.18      8053

    accuracy                           0.91     95939
   macro avg       0.70      0.55      0.57     95939
weighted avg       0.89      0.91      0.89     95939



In [35]:
#7. with Isolation Forest
from sklearn.ensemble import IsolationForest

# Train model
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest.fit(X_train)

# Predict anomalies (1 = normal, -1 = anomaly)
y_pred_iso = iso_forest.predict(X_test)

# Convert -1 (anomalies) to 1 (disease) and 1 (normal) to 0 (no disease)
y_pred_iso = [1 if x == -1 else 0 for x in y_pred_iso]

print(classification_report(y_test, y_pred_iso))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93     87886
           1       0.27      0.33      0.30      8053

    accuracy                           0.87     95939
   macro avg       0.61      0.62      0.61     95939
weighted avg       0.88      0.87      0.88     95939



In [33]:
print(8053/95939)

0.08393875274914268


In [34]:
y_train.value_counts()

HeartDisease
0    204695
1     19161
Name: count, dtype: int64

In [38]:
print(19161/(19161+204695))

0.0855952040597527


In [41]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

In [51]:
# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95     87727
           1       0.32      0.11      0.16      8212

    accuracy                           0.90     95939
   macro avg       0.62      0.54      0.55     95939
weighted avg       0.87      0.90      0.88     95939

