# Final Modeling Notebook

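This notebook contains the final modeling and statistical analysis steps: loading the cleaned dataset, fitting several classifiers (logistic regression, random forest, k-nearest neighbors, and XGBoost, with and without SMOTEENN resampling) to predict heart attack, and plotting how individual features relate to the target. Run the cells in order from top to bottom to reproduce the outputs.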

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
!pip install imblearn
import imblearn

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data_cleanm.csv')
# Display the number of non-null values in every column as a quick completeness check.
df.count()

In [None]:
#Heart Attack is our target variable and is set to Y. The classification variables from our dataset are set to X. We will be using these to predict.
#Managing Diabetes is not statistically significant, so we dropped it from modeling.
# Y is selected with double brackets, so it is a one-column DataFrame rather than a Series;
# later cells flatten it with .values.ravel() where a 1-D target is needed.
Y = df[['Heart_Attack']]
X = df[['Gender','Exercise', 'Insulin', 'Overweight_or_Obese_Calculated', 'Dly_Sugar_Check','Diabetes_Checkup','Current_Smokers',"Glycosylated_Hemoglobin"]]

In [None]:
#Logistic regression modeling (baseline, no resampling).

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# y_train is a one-column DataFrame, so flatten it to the 1-D vector the estimator expects.
train_labels = y_train.values.ravel()
logReg = LogisticRegression().fit(X_train, train_labels)

# Keep the hard predictions and the test accuracy; later cells read both names.
predictions = logReg.predict(X_test)
score = logReg.score(X_test, y_test)
print(score)

In [None]:
#We are able to make reliable predictions in our negative class but nothing in the positive class

from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the baseline logistic regression predictions.
print(classification_report(y_test, predictions))

In [None]:
#Visualization of modeling: confusion matrix for the baseline logistic regression.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# confusion_matrix orders rows/columns by sorted label value. Heart_Attack is
# coded 1 = heart attack, 2 = no heart attack (the coding used by the charts at
# the end of this notebook), so the tick labels below follow that order instead
# of seaborn's default 0/1 positions.
class_names = ['Heart Attack', 'No Heart Attack']
cm = metrics.confusion_matrix(y_test, predictions)

plt.figure(figsize=(9,9))
# The cells are integer counts, so annotate them as integers rather than "123.000".
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r',
            xticklabels=class_names, yticklabels=class_names);
plt.ylabel('Actual label')
plt.xlabel('Predicted label');
# `score` is the test accuracy computed in the logistic regression cell above.
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

In [None]:
#ROC Curve for Logistic Regression modeling

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_curve, auc

#Had to convert the 1/2 values to 0/1
# Fitting explicitly on [1, 2] maps label 1 -> 0 and label 2 -> 1, so label 2
# is the "positive" class for the ROC computation.
lb = LabelBinarizer()
lb.fit([1, 2])
y_test_binary = lb.transform(y_test)

# Second predict_proba column = probability of the second entry in logReg.classes_.
predictions_proba = logReg.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test_binary, predictions_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve with the chance diagonal for reference.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()

In [None]:
#Unable to predict a positive in the dependent class, trying Random Forest
# training RandomForest Classifier
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported scores are reproducible on a fresh
# Restart & Run All (0 matches the random_state of the split used here).
rfc = RandomForestClassifier(random_state=0)
# y_train is a one-column DataFrame; flatten it to 1-D, as the logistic
# regression cell does, to avoid scikit-learn's column-vector warning.
rfc.fit(X_train, y_train.values.ravel())

# Hard class predictions on the held-out set, plus the matching ground truth.
rfc_predicted = rfc.predict(X_test)
rfc_expected = y_test



In [None]:
# Reshape to a column without hard-coding the number of test rows, so the cell
# keeps working if the dataset or the split size changes.
rfc_predicted = rfc_predicted.reshape(-1, 1)

# Compare plain arrays of the same shape; comparing against the DataFrame
# directly made .sum() return a Series, so a Series was printed instead of a
# single accuracy number.
rfc_matches = (rfc_predicted == np.asarray(rfc_expected).reshape(-1, 1))
print("Overall Accuracy for RFC = ", rfc_matches.sum()/float(len(rfc_matches)))

print(metrics.classification_report(rfc_expected, rfc_predicted))

#Scores still poor with RFC

In [None]:
# ROC curve for the random forest fitted above.
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Binarize the test labels; the larger label value becomes 1.
lb = LabelBinarizer()
y_test_binary = lb.fit_transform(y_test)

# Score = 1 - P(first class), i.e. the complement of the first predict_proba column.
first_class_proba = rfc.predict_proba(X_test)[:, 0]
predictions_proba = 1- first_class_proba

fpr, tpr, _ = roc_curve(y_test_binary, predictions_proba)
roc_auc = auc(fpr, tpr)

# Curve plus the chance diagonal.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()

In [None]:
# K-nearest-neighbours classifier on the original (non-resampled) training split.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=2)

# y_train is a one-column DataFrame; pass a flat 1-D target, as the logistic
# regression cell does, so scikit-learn does not warn about a column vector.
knn.fit(X_train, y_train.values.ravel())

# NOTE: this rebinds `predictions`, which previously held the logistic
# regression output; the confusion-matrix cell further down reads this value.
predictions = knn.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

print(classification_report(y_test, predictions))

#KNN becomes one of our better performing models but the ROC Curve shows that our predictions are not better than random.

In [None]:
# ROC curve for the KNN model evaluated in the previous cell.
# The earlier version of this cell refitted a *different* classifier
# (n_neighbors=3) on binarized labels, so the curve did not describe the
# n_neighbors=2 model whose accuracy and report are shown above. Reuse the
# already-fitted `knn` instead.
lb = LabelBinarizer()
lb.fit(y_train)
# Labels are coded 1/2, so the binarizer maps 1 -> 0 and 2 -> 1.
y_test_binary = lb.transform(y_test).ravel()

# knn was fitted on the 1/2 labels, so column 1 of predict_proba is the
# probability of label 2 -- the class the binarizer encodes as 1.
predictions_proba = knn.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test_binary, predictions_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for KNN')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Confusion matrix for whatever `predictions` currently holds (the KNN output
# when the notebook is run top to bottom).
conf_matrix = confusion_matrix(y_test, predictions)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
#Resampling the dependent class to see if that will lead to better results

from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


# NOTE: this re-splits the data with a different test_size (0.2) and seed (42)
# than the first split (0.25, seed 0) and rebinds X_train/X_test/y_train/y_test,
# so every model below is trained and scored on a different partition than the
# models above.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Only the training portion is resampled; X_test / y_test keep their original
# class distribution.
smote_enn = SMOTEENN(random_state=42)
X_train, y_train = smote_enn.fit_resample(X_train, y_train)


In [None]:
# Logistic regression refitted on the resampled training data.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Flatten the target to 1-D before fitting.
resampled_labels = y_train.values.ravel()
logReg = LogisticRegression().fit(X_train, resampled_labels)

# Rebinds `predictions` to this model's output on the current test split.
predictions = logReg.predict(X_test)


In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the current `predictions` on the current test split.
print(classification_report(y_test, predictions))

#Scores are better when resampled but still not satisfactory

In [None]:
#Confusion Matrix for Resampled Logistic Regression Modeling
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

# confusion_matrix orders rows/columns by sorted label value; Heart_Attack is
# coded 1 = heart attack, 2 = no heart attack (the coding used by the charts at
# the end of this notebook), so label the ticks in that order.
class_names = ['Heart Attack', 'No Heart Attack']
cm = metrics.confusion_matrix(y_test, predictions)

# Compute the accuracy of the predictions actually plotted here. The title
# previously reused `score`, which still held the accuracy of the first
# (non-resampled) logistic regression on a different test split.
resampled_score = metrics.accuracy_score(y_test, predictions)

plt.figure(figsize=(9,9))
# The cells are integer counts, so annotate them as integers rather than "123.000".
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r',
            xticklabels=class_names, yticklabels=class_names);
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(resampled_score)
plt.title(all_sample_title, size = 15);

In [None]:
#ROC Curve for logistic regression resampled
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_curve, auc

# Binarize the test labels; the larger label value becomes 1.
lb = LabelBinarizer()
y_test_binary = lb.fit_transform(y_test)

# Second predict_proba column = probability of the second entry in logReg.classes_.
predictions_proba = logReg.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test_binary, predictions_proba)
roc_auc = auc(fpr, tpr)

# Curve plus the chance diagonal.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()

In [None]:
#Trying Random Forest resampled

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Seed the forest so the reported scores are reproducible on a fresh
# Restart & Run All (42 matches the seed used for the resampled split).
rfc = RandomForestClassifier(random_state=42)
# Flatten the one-column target to 1-D, as the logistic regression cells do,
# to avoid scikit-learn's column-vector warning.
rfc.fit(X_train, y_train.values.ravel())

rfc_predicted = rfc.predict(X_test)

print(metrics.classification_report(y_test, rfc_predicted))

#Still poor scores

In [None]:
#Trying XGBoost
import xgboost as xgb

# Wrap the current train / test data in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# multi:softmax returns the predicted class label directly.
# NOTE(review): num_class is 3 although there are two classes -- presumably
# because the labels are 1 and 2 and must be smaller than num_class; confirm.
params = dict(
    max_depth=3,
    eta=0.1,
    objective='multi:softmax',
    num_class=3,
)

num_round = 100
model = xgb.train(params, dtrain, num_boost_round=num_round)

y_pred = model.predict(dtest)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of XGBoost: {accuracy * 100:.2f}%")

report1 = classification_report(y_test, y_pred)
print(report1)

#Best scores yet, but unsatisfactory

In [None]:
#Finally KNN (trained on the resampled training data)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=2)

# Flatten the one-column target to 1-D, as the logistic regression cells do,
# to avoid scikit-learn's column-vector warning.
knn.fit(X_train, y_train.values.ravel())

# Rebinds `predictions`; the confusion-matrix cell below reads this value.
predictions = knn.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

print(classification_report(y_test, predictions))

In [None]:
# ROC curve for the resampled KNN model fitted in the previous cell.
# NOTE(review): this cell previously scored `rfc` (it appears to have been
# copied from the random-forest ROC cell) even though it sits between the KNN
# fit and the KNN confusion matrix. It now scores `knn`; switch back to `rfc`
# if the random-forest curve was really intended here.
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Labels are coded 1/2, so the binarizer maps 1 -> 0 and 2 -> 1.
lb = LabelBinarizer()
y_test_binary = lb.fit_transform(y_test)

# knn was fitted on the 1/2 labels, so column 1 of predict_proba is the
# probability of label 2 -- the class the binarizer encodes as 1.
predictions_proba = knn.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test_binary, predictions_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for KNN')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Confusion matrix for whatever `predictions` currently holds (the resampled
# KNN output when the notebook is run top to bottom).
conf_matrix = confusion_matrix(y_test, predictions)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
#The visualizations below are an attempt to show why we got poor scores. We wanted to show the ratios of positive/negative values for our features
#in some of the classes. They show that the feature ratios do not differ much between the heart-attack and no-heart-attack groups, which led to our model
#having trouble predicting.

# Boolean masks for each (Heart_Attack, Insulin) combination; the chart below
# is built from the grouped counts, not from these masks.
heart_attack_is_1 = df['Heart_Attack'] == 1.0
heart_attack_is_2 = df['Heart_Attack'] == 2.0
insulin_is_1 = df['Insulin'] == 1.0
insulin_is_2 = df['Insulin'] == 2.0
insulin_condition_1 = heart_attack_is_1 & insulin_is_1
insulin_condition_2 = heart_attack_is_2 & insulin_is_1
insulin_condition_3 = heart_attack_is_1 & insulin_is_2
insulin_condition_4 = heart_attack_is_2 & insulin_is_2

# Rows: Heart_Attack value; columns: Insulin value; cells: row counts (0 where a pair is absent).
counts_insulin = (
    df.groupby(['Heart_Attack', 'Insulin'])
    .size()
    .unstack()
    .fillna(0)
)

fig, axs = plt.subplots(figsize=(10, 8))

# Stacked bars: the 1.0 counts at the bottom, the 2.0 counts on top.
bottom_counts = counts_insulin[1.0]
top_counts = counts_insulin[2.0]
axs.bar(counts_insulin.index, bottom_counts, label='Taking Insulin')
axs.bar(counts_insulin.index, top_counts, bottom=bottom_counts, label='Not taking Insulin')

axs.set_title('Insulin and Heart Attack Relationship')
axs.set_ylabel('Count')
axs.set_xticks([1, 2])
axs.set_xticklabels(['Heart Attack', 'No Heart Attack'])
axs.legend()

plt.show()

In [None]:

# Boolean masks for each (Heart_Attack, Managing_Diabetes) combination; the
# chart below is built from the grouped counts, not from these masks.
heart_attack_is_1 = df['Heart_Attack'] == 1.0
heart_attack_is_2 = df['Heart_Attack'] == 2.0
managing_is_1 = df['Managing_Diabetes'] == 1.0
managing_is_2 = df['Managing_Diabetes'] == 2.0
managing_diabetes_condition_1 = heart_attack_is_1 & managing_is_1
managing_diabetes_condition_2 = heart_attack_is_2 & managing_is_1
managing_diabetes_condition_3 = heart_attack_is_1 & managing_is_2
managing_diabetes_condition_4 = heart_attack_is_2 & managing_is_2

# Rows: Heart_Attack value; columns: Managing_Diabetes value; cells: row counts.
counts_managing_diabetes = (
    df.groupby(['Heart_Attack', 'Managing_Diabetes'])
    .size()
    .unstack()
    .fillna(0)
)

fig, axs = plt.subplots(figsize=(10, 8))

# Stacked bars: the 1.0 counts at the bottom, the 2.0 counts on top.
# NOTE(review): legend text assumes 1.0 = did not take a class -- confirm against the codebook.
bottom_counts = counts_managing_diabetes[1.0]
top_counts = counts_managing_diabetes[2.0]
axs.bar(counts_managing_diabetes.index, bottom_counts, label='Did not take class')
axs.bar(counts_managing_diabetes.index, top_counts, bottom=bottom_counts, label='Took a class')

axs.set_title('Managing Diabetes and Heart Attack Relationship')
axs.set_ylabel('Count')
axs.set_xticks([1, 2])
axs.set_xticklabels(['Heart Attack', 'No Heart Attack'])
axs.legend()

plt.show()

In [None]:

# Boolean masks for each (Heart_Attack, Overweight_or_Obese_Calculated)
# combination. The last two were previously named exercise_condition_3/4 (a
# copy-paste slip) although they are built from the overweight column; they are
# renamed so they no longer masquerade as exercise masks. The chart below is
# built from the grouped counts, not from these masks.
overweight_condition_1 = (df['Heart_Attack'] == 1.0) & (df['Overweight_or_Obese_Calculated'] == 1.0)
overweight_condition_2 = (df['Heart_Attack'] == 2.0) & (df['Overweight_or_Obese_Calculated'] == 1.0)
overweight_condition_3 = (df['Heart_Attack'] == 1.0) & (df['Overweight_or_Obese_Calculated'] == 2.0)
overweight_condition_4 = (df['Heart_Attack'] == 2.0) & (df['Overweight_or_Obese_Calculated'] == 2.0)

# Rows: Heart_Attack value; columns: overweight flag value; cells: row counts.
counts_overweight = df.groupby(['Heart_Attack', 'Overweight_or_Obese_Calculated']).size().unstack().fillna(0)


fig, axs = plt.subplots(figsize=(10, 8))

# Stacked bars: the 1.0 counts at the bottom, the 2.0 counts on top.
# NOTE(review): legend text assumes 1.0 = high BMI -- confirm against the codebook.
axs.bar(counts_overweight.index, counts_overweight[1.0], label='High BMI')
axs.bar(counts_overweight.index, counts_overweight[2.0], bottom=counts_overweight[1.0], label='Normal BMI')

axs.set_title('Overweight and Heart Attack Relationship')
axs.set_ylabel('Count')
axs.set_xticks([1, 2])
axs.set_xticklabels(['Heart Attack', 'No Heart Attack'])
axs.legend()

plt.show()

In [None]:

# Boolean masks for each (Heart_Attack, Exercise) combination; the chart below
# is built from the grouped counts, not from these masks.
heart_attack_is_1 = df['Heart_Attack'] == 1.0
heart_attack_is_2 = df['Heart_Attack'] == 2.0
exercise_is_1 = df['Exercise'] == 1.0
exercise_is_2 = df['Exercise'] == 2.0
exercise_condition_1 = heart_attack_is_1 & exercise_is_1
exercise_condition_2 = heart_attack_is_2 & exercise_is_1
exercise_condition_3 = heart_attack_is_1 & exercise_is_2
exercise_condition_4 = heart_attack_is_2 & exercise_is_2

# Rows: Heart_Attack value; columns: Exercise value; cells: row counts.
counts_exercise = (
    df.groupby(['Heart_Attack', 'Exercise'])
    .size()
    .unstack()
    .fillna(0)
)

fig, axs = plt.subplots(figsize=(10, 8))

# Stacked bars: the 1.0 counts at the bottom, the 2.0 counts on top.
# NOTE(review): legend text assumes 1.0 = no exercise -- confirm against the codebook.
bottom_counts = counts_exercise[1.0]
top_counts = counts_exercise[2.0]
axs.bar(counts_exercise.index, bottom_counts, label='No Exercise')
axs.bar(counts_exercise.index, top_counts, bottom=bottom_counts, label='Some Exercise')

axs.set_title('Exercise and Heart Attack Relationship')
axs.set_ylabel('Count')
axs.set_xticks([1, 2])
axs.set_xticklabels(['Heart Attack', 'No Heart Attack'])
axs.legend()

plt.show()

In [None]:

# Boolean masks for each (Heart_Attack, Current_Smokers) combination; the chart
# below is built from the grouped counts, not from these masks.
heart_attack_is_1 = df['Heart_Attack'] == 1.0
heart_attack_is_2 = df['Heart_Attack'] == 2.0
smokers_is_1 = df['Current_Smokers'] == 1.0
smokers_is_2 = df['Current_Smokers'] == 2.0
smokers_condition_1 = heart_attack_is_1 & smokers_is_1
smokers_condition_2 = heart_attack_is_2 & smokers_is_1
smokers_condition_3 = heart_attack_is_1 & smokers_is_2
smokers_condition_4 = heart_attack_is_2 & smokers_is_2

# Rows: Heart_Attack value; columns: Current_Smokers value; cells: row counts.
counts_smokers = (
    df.groupby(['Heart_Attack', 'Current_Smokers'])
    .size()
    .unstack()
    .fillna(0)
)

fig, axs = plt.subplots(figsize=(10, 8))

# Stacked bars: the 1.0 counts at the bottom, the 2.0 counts on top.
# NOTE(review): legend text assumes 1.0 = non-smoker -- confirm against the codebook.
bottom_counts = counts_smokers[1.0]
top_counts = counts_smokers[2.0]
axs.bar(counts_smokers.index, bottom_counts, label='Non-Smoker')
axs.bar(counts_smokers.index, top_counts, bottom=bottom_counts, label='Smoker')

axs.set_title('Smoking Habits and Heart Attack Relationship')
axs.set_ylabel('Count')
axs.set_xticks([1, 2])
axs.set_xticklabels(['Heart Attack', 'No Heart Attack'])
axs.legend()

plt.show()