# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,ConfusionMatrixDisplay, precision_recall_fscore_support, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import plotly.express as px
from ipywidgets import interact, widgets
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Data

In [None]:
# Seed reused as `random_state` (e.g. by the train/test split) so that runs are repeatable
rs=123

In [None]:
# Load the dataset; '?' entries in the CSV are parsed as missing values (NaN)
csv_path = r"C:\Users\HP\Desktop\Online Courses\IBM Machine Learning Engineer\2. Classification\All data\heartattack.csv"
data = pd.read_csv(csv_path, na_values='?')

# Exploring and Cleaning Data

In [None]:
# Column labels of the raw dataset
data.columns

In [None]:
# Per-column overview: non-null counts and dtypes
data.info()

In [None]:
# How many columns there are of each dtype
data.dtypes.value_counts()

In [None]:
# Number of missing values in each column
data.isnull().sum()

In [None]:
# Keep the cleaning simple:
#   1. drop the columns with many missing values (slope, ca, thal);
#   2. drop every remaining row that still has a missing value.
cols_to_drop = ['slope', 'ca', 'thal']
data = data.drop(columns=cols_to_drop).dropna().copy()

In [None]:
# Re-count missing values after the cleanup step
data.isnull().sum()

In [None]:
# #check data types of columns with missing values
# columns=list(data.columns)
# col_mv=[]
# for col in columns:
#     if data[col].isnull().sum()>0:
#         col_mv.append(col)
# col_mv
# data[col_mv].dtypes

In [None]:
# # replace all missing values with means of respective columns
# for col in col_mv:
#     data[col] = data[col].fillna(data[col].mean())
# data.isnull().sum()

In [None]:
# Outliers: collect the names of all numeric columns and show how many there are
numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
len(numeric_columns)

In [None]:
# Check the Python type of a column label
type(numeric_columns[0])

In [None]:
# #box plot
# # lets resolve this issue later
# ncols=3
# nrows=math.ceil(len(numeric_columns)/ncols)
# fig, axes = plt.subplots(nrows, ncols, figsize=(15, 6))  # Create subplots

# for i, column in enumerate(numeric_columns):
#     axes[i].boxplot(data[column])  # Create a box plot for the column in the i-th subplot
#     axes[i].set_title(f'Box Plot for {column}')  # Set the title for the subplot
#     axes[i].set_xlabel(column)  # Set the x-axis label

# plt.tight_layout()  # Adjust subplot layout for better spacing
# plt.show()  # Display the figure with subplots

In [None]:
# #using z-score

# treshold=3
# for column in numeric_columns:
#     # Calculate the z-scores
#     z_scores = stats.zscore(data[column])
    
#     # Find the rows where z-scores are greater than treshold
#     outliers = np.abs(z_scores) > treshold
    
#     # Replace outliers with the mean value of the column
#     mean_value = data[column].mean()
#     data.loc[outliers, column] = mean_value

# NOTE: the z-score outlier replacement above is commented out, so `data` is shown unchanged here
data

In [None]:
# Rename the target column to 'heart_attack'.
# The CSV header stores it as 'num' padded with trailing spaces; matching on the
# stripped label avoids depending on the exact amount of padding (a mismatched
# key would be silently ignored by `rename`).
data = data.rename(columns=lambda name: 'heart_attack' if name.strip() == 'num' else name)

# Class balance of the target (including any missing values)
data['heart_attack'].value_counts(dropna=False)

In [None]:
# Preview the frame after renaming the target column
data

In [None]:
# age: age in years
# sex: sex (1 = male; 0 = female)
# cp: chest pain type
# – 1: typical angina
# – 2: atypical angina
# – 3: non-anginal pain
# – 4: asymptomatic
# trestbps: resting blood pressure (in mm Hg on admission to the hospital)
# chol: serum cholesterol in mg/dl
# fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
# restecg: resting electrocardiographic results
# – 0: normal
# – 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
# – 2: showing probable or definite left ventricular hypertrophy by Estes’ criteria
# thalach: maximum heart rate achieved
# exang: exercise-induced angina (1 = yes; 0 = no)
# oldpeak: ST depression induced by exercise relative to rest


## Transform the Categorical Variables: Creating Dummy Variables

In [None]:
# Of the five categorical variables, sex, fbs and exang only take the values 0 and 1,
# so they are already in dummy-variable form. cp and restecg have more levels and
# still need to be converted; show the level counts of each first.
for multi_level_col in ('cp', 'restecg'):
    print(data[multi_level_col].value_counts(dropna=False))

In [None]:
# One-hot encode the multi-level categoricals; the first level of each is dropped
# and acts as the reference category
multi_level_cols = ['cp', 'restecg']
data = pd.get_dummies(data, columns=multi_level_cols, drop_first=True)
data

In [None]:
# Split the feature names into numeric and categorical (dummy) columns.
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
# The target column is named 'heart_attack' (not 'target'); exclude it so the
# label is not listed among the categorical features.
cat_cols = sorted(set(data.columns) - set(numeric_cols) - {'heart_attack'})

print(numeric_cols)
print(cat_cols)

## Transform the Numerical Variables: Scaling

In [None]:
# Standardise the numeric features (zero mean, unit variance per column).
# Fitting once on all numeric columns yields the same values as fitting column by
# column, but leaves `scaler` holding the statistics of every numeric column so it
# can later transform new observations (a per-column refit keeps only the last one).
# NOTE(review): the scaler is fit on the full dataset before the train/test split,
# so test-set statistics leak into training — consider fitting on X_train only.
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

## Defining Target and Features

In [None]:
# Separate the feature matrix from the label column
target_col = 'heart_attack'
X = data.drop(columns=target_col)
y = data[target_col]

## Split Training and Test Datasets

When the dataset is imbalanced, it is good practice to use stratified sampling. That way, both the training and test datasets contain roughly the same proportions of the target classes as the complete dataset.

In [None]:
# Hold out 20% of the rows as a test set, stratified on the target and seeded with `rs`
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=rs,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# (rows, columns) of the training and test feature matrices
X_train.shape, X_test.shape

## Define Logistic Regression Model

In [None]:
# Logistic regression with no penalty term in the cost function.
# `penalty=None` is used instead of the string 'none', which recent scikit-learn
# releases no longer accept.
model_lr = LogisticRegression(penalty=None)
model_lr.fit(X_train, y_train)

## Evaluating Model 

In [None]:
# Class predictions of the logistic regression on the held-out test set
predictions_lr=model_lr.predict(X_test)

In [None]:
def evaluations(y, yhat):
    """Print accuracy plus per-class precision, recall and F-score.

    Parameters
    ----------
    y : array-like
        True labels.
    yhat : array-like
        Predicted labels, aligned with ``y``.
    """
    # Score the arguments that were passed in rather than the notebook globals,
    # so the function can be reused for any model's predictions.
    accuracy = accuracy_score(y, yhat)
    precision, recall, f_beta, _ = precision_recall_fscore_support(y, yhat)
    print('Accuracy Score = {}'.format(accuracy))
    print('Precision Score = {}'.format(precision))
    print('Recall Score = {}'.format(recall))
    print('f_beta Score = {}'.format(f_beta))
evaluations(y_test, predictions_lr)

In [None]:
# Confusion matrix of the logistic regression, normalised over the true labels
cm_lr=confusion_matrix(y_test, predictions_lr, normalize='true')

In [None]:
# Plot the logistic-regression confusion matrix, labelled with the model's raw class values
sns.set_context('talk')
disp = ConfusionMatrixDisplay(cm_lr, display_labels=model_lr.classes_)
disp.plot()
plt.show()

In [None]:
# Same confusion matrix, with human-readable class names on the axes
sns.set_context('talk')
lr_class_names = ['No Heart Attack', 'Heart Attack']
disp = ConfusionMatrixDisplay(cm_lr, display_labels=lr_class_names)
disp.plot()
plt.show()

In [None]:
# Per-class precision / recall / F1 report for the logistic regression
print(classification_report(y_test, predictions_lr))

## Interpretation of Results

In [None]:
# Fitted coefficients, one per feature column
model_lr.coef_

In [None]:

# Pair each feature name with its fitted logistic-regression coefficient and print the table
coef_df = pd.DataFrame({
    'Variable': X.columns,
    'Coefficient': model_lr.coef_[0],
})
print(coef_df)


## KNN MODEL

In [None]:
# Fit a 5-nearest-neighbours classifier and report its test-set performance
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
predictions_knn = knn.predict(X_test)

# Precision, recall and F-score per class
print(classification_report(y_test, predictions_knn))

knn_accuracy = accuracy_score(y_test, predictions_knn)
knn_f1 = f1_score(y_test, predictions_knn)
print('Accuracy score: ', round(knn_accuracy, 2))
print('F1 Score: ', round(knn_f1, 2))

In [None]:
# Confusion matrix of the KNN predictions, normalised over the true labels
cm_knn = confusion_matrix(y_test, predictions_knn, normalize='true')

sns.set_context('talk')
knn_class_names = ['No Heart Attack', 'Heart Attack']
disp = ConfusionMatrixDisplay(cm_knn, display_labels=knn_class_names)
disp.plot()
plt.show()

## Choosing Optimal k

In [None]:
# Sweep k = 1 .. max_k-1 and record the test-set F1 score and error rate for each k.
# NOTE(review): k is being selected on the test set; a validation split or
# cross-validation would give an unbiased choice.
max_k = 40
f1_scores = list()
error_rates = list() # 1-accuracy

for k in range(1, max_k):

    # Use a loop-local model so the 5-NN model stored in `knn` (and its
    # `predictions_knn`) is not overwritten by the last model of the sweep.
    knn_k = KNeighborsClassifier(n_neighbors=k, weights='distance').fit(X_train, y_train)

    y_pred = knn_k.predict(X_test)
    f1_scores.append((k, round(f1_score(y_test, y_pred), 4)))
    # Round after subtracting to avoid float artefacts such as 0.15000000000000002
    error_rates.append((k, round(1 - accuracy_score(y_test, y_pred), 4)))

f1_results = pd.DataFrame(f1_scores, columns=['K', 'F1 Score'])
error_results = pd.DataFrame(error_rates, columns=['K', 'Error Rate'])

In [None]:
# Plot F1 results
sns.set_context('talk')
sns.set_style('ticks')

# Create the figure once and hand its axes to pandas. Calling plt.figure(dpi=300)
# and then DataFrame.plot(figsize=...) would leave an empty 300-dpi figure behind
# and draw (and save) the curve on a second figure at the default dpi.
fig, ax = plt.subplots(figsize=(12, 12), dpi=300)
f1_results.set_index('K').plot(ax=ax, linewidth=6)
ax.set(xlabel='K', ylabel='F1 Score')
ax.set_xticks(range(1, max_k, 2))
ax.set_title('KNN F1 Score')
fig.savefig('knn_f1.png')

In [None]:
# Plot Accuracy (Error Rate) results
sns.set_context('talk')
sns.set_style('ticks')

# Create the figure once and hand its axes to pandas. Calling plt.figure(dpi=300)
# and then DataFrame.plot(figsize=...) would leave an empty 300-dpi figure behind
# and draw (and save) the curve on a second figure at the default dpi.
fig, ax = plt.subplots(figsize=(12, 12), dpi=300)
error_results.set_index('K').plot(ax=ax, linewidth=6)
ax.set(xlabel='K', ylabel='Error Rate')
ax.set_xticks(range(1, max_k, 2))
ax.set_title('KNN Elbow Curve')
fig.savefig('knn_elbow.png')

## Support Vector Machines (SVM)

In [None]:
# Baseline support vector classifier with a linear kernel.
# NOTE(review): per the scikit-learn docs `gamma` only applies to the 'rbf', 'poly'
# and 'sigmoid' kernels, so it should have no effect here — confirm.
model_svm = SVC(C=1, gamma=0.1, kernel='linear')
model_svm.fit(X_train, y_train)


In [None]:

# Test-set predictions of the SVM and their confusion matrix (normalised over the true labels)
predictions_svm = model_svm.predict(X_test)
cm_svm = confusion_matrix(y_test, predictions_svm, normalize='true')

sns.set_context('talk')
svm_class_names = ['No Heart Attack', 'Heart Attack']
disp = ConfusionMatrixDisplay(cm_svm, display_labels=svm_class_names)
disp.plot()
plt.show()

In [None]:
# Finding the best hyperparameters
params_svm = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf','linear']
}

search_smv = GridSearchCV(
    estimator=SVC(),
    param_grid=params_svm,
    cv=5,
    n_jobs=5,
    verbose=1
)

model_svm=search_smv.fit(X_train, y_train)
print(model_svm.best_params_)

## Decision Tree

In [None]:
# Decision tree with default settings, evaluated on the test set
model_dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
predictions_dt = model_dt.predict(X_test)

# Confusion matrix normalised over the true labels
cm_dt = confusion_matrix(y_test, predictions_dt, normalize='true')

sns.set_context('talk')
dt_class_names = ['No Heart Attack', 'Heart Attack']
disp = ConfusionMatrixDisplay(cm_dt, display_labels=dt_class_names)
disp.plot()
plt.show()

In [None]:

# Tune the tree: odd depths up to the depth of the previously fitted tree, and
# every possible number of features considered per split
depth_grid = range(1, model_dt.tree_.max_depth + 1, 2)
feature_grid = range(1, len(model_dt.feature_importances_) + 1)
param_grid = {'max_depth': depth_grid, 'max_features': feature_grid}

# `model_dt` is rebound to the fitted grid search from here on
model_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# model_dt.best_estimator_.tree_.node_count, model_dt.best_estimator_.tree_.max_depth

# Evaluate the tuned tree on the test set
predictions_dt = model_dt.predict(X_test)
cm_dt = confusion_matrix(y_test, predictions_dt, normalize='true')

sns.set_context('talk')
tuned_dt_class_names = ['No Heart Attack', 'Heart Attack']
disp = ConfusionMatrixDisplay(cm_dt, display_labels=tuned_dt_class_names)
disp.plot()
plt.show()

## Random Forest

In [None]:
# Baseline random forest: 20 trees, each split considering round(sqrt(#features)) - 1 features
n_estimators = 20
_, M_features = X.shape
max_features = round(np.sqrt(M_features)) - 1

model_rf = RandomForestClassifier(
    n_estimators=n_estimators,
    max_features=max_features,
    random_state=0,
)
model_rf.fit(X_train, y_train)

# Names of the hyperparameters that can be tuned
model_rf.get_params().keys()

### Grid Search

In [None]:
# Fresh forest used as the base estimator for the grid search; seeded with `rs`
# so the search results are reproducible between runs
model_rf = RandomForestClassifier(random_state=rs)

In [None]:

# Grid for the random forest: odd tree counts 1..39, odd depths 1..19, and the
# feature-subset rule. "auto" is left out: recent scikit-learn releases reject it,
# and for classifiers it was equivalent to "sqrt" anyway.
param_grid = {'n_estimators': [2*n+1 for n in range(20)],
              'max_depth': [2*n+1 for n in range(10)],
              'max_features': ["sqrt", "log2"]}

model_rf_gs = GridSearchCV(estimator=model_rf, param_grid=param_grid, scoring='accuracy')
model_rf_gs.fit(X_train, y_train)

#model_rf_gs.best_score_


In [None]:

# Test-set predictions from the grid-searched random forest
predictions_rf=model_rf_gs.predict(X_test)

In [None]:
# Confusion matrix of the random-forest predictions, normalised over the true labels
cm_rf = confusion_matrix(y_test, predictions_rf, normalize='true')

sns.set_context('talk')
rf_class_names = ['No Heart Attack', 'Heart Attack']
disp = ConfusionMatrixDisplay(cm_rf, display_labels=rf_class_names)
disp.plot()
plt.show()

In [None]:
# Side-by-side comparison of the models on the test set.
# Each model is scored on its OWN predictions; the SVM row previously reused the
# logistic-regression predictions, which made the two rows identical.
# NOTE: `predictions_svm` are those of the SVC fitted before the SVM grid search.
model_predictions = {
    'Logistic Regression': predictions_lr,
    'K-Nearest Neigbour': predictions_knn,
    'Support Vector Machines': predictions_svm,
    # 'Decision Tree': predictions_dt,
    'Random Forest': predictions_rf,
}

# Value order per model: accuracy, precision, F1, recall (matches the rename below)
metrics = {
    model_name: [accuracy_score(y_test, preds), precision_score(y_test, preds),
                 f1_score(y_test, preds), recall_score(y_test, preds)]
    for model_name, preds in model_predictions.items()
}

df = pd.DataFrame(metrics).T.rename(columns={0: 'Accuracy', 1: 'Precision', 2: 'F', 3: 'Recall'})
df


In [None]:
# Dropdown captions -> 0/1 dummy codes used in the training data
sex_lable={'Male': "1", 'Female':"0"}
exang_lable={'Yes': "1", 'No':"0"}
fbs_lable={'True':'1', 'False':'0'}

# Precision of the model that actually serves the widget (`knn`), computed once
# so the figure shown always belongs to the predicting model.
knn_widget_precision = precision_score(y_test, knn.predict(X_test))

@interact
def predictions(sex=list(sex_lable.keys()),
                Exercise_induced_angina=list(exang_lable.keys()),
                High_fbs=list(fbs_lable.keys()),
                age=widgets.IntSlider(min=0, max=200, step=1, value=0),
                resting_BP = widgets.FloatText(value=0.0),
                Cholesterol = widgets.FloatText(value=0.0),
                Max_Heart_Rate = widgets.FloatText(value=0.0),
                ST_Depression = widgets.FloatText(value=0.0),
                Moderate_restecg=list(X['restecg_1.0'].unique()),
                High_restecg=list(X['restecg_2.0'].unique()),
                Chest_pains_atypical_angina=list(X['cp_2'].unique()),
                Chest_pains_non_anginal_pain=list(X['cp_3'].unique()),
                Chest_pains_asymptomatic=list(X['cp_4'].unique())):
    """Interactive widget: predict heart attack / no heart attack for one patient.

    The widget values are mapped onto the feature columns of ``X``, passed to the
    fitted ``knn`` model, and the prediction is printed together with the model's
    test-set precision.

    NOTE(review): the model was trained on standardised numeric features, while the
    values entered here are raw (years, mm Hg, mg/dl, ...). They should be passed
    through the same scaling before prediction — TODO confirm and wire in.
    """
    sex_dummy=int(sex_lable[sex])
    exang_dummy=int(exang_lable[Exercise_induced_angina])
    fbs_dummy=int(fbs_lable[High_fbs])

    # Key every input by its column name and order the frame like X, so a column
    # mismatch raises a KeyError instead of silently feeding values to the wrong feature.
    user_inputs = {
        'age': age,
        'sex': sex_dummy,
        'trestbps': resting_BP,
        'chol': Cholesterol,
        'fbs': fbs_dummy,
        'thalach': Max_Heart_Rate,
        'exang': exang_dummy,
        'oldpeak': ST_Depression,
        'cp_2': Chest_pains_atypical_angina,
        'cp_3': Chest_pains_non_anginal_pain,
        'cp_4': Chest_pains_asymptomatic,
        'restecg_1.0': Moderate_restecg,
        'restecg_2.0': High_restecg,
    }
    user_frame = pd.DataFrame([user_inputs])[list(X.columns)]

    heart_attack=knn.predict(user_frame)[0]
    # Class 0 is "no heart attack" and class 1 is "heart attack" (same order as the
    # confusion-matrix labels used throughout the notebook).
    if heart_attack==0:
        heart_attack = "NO HEART ATTACK"
    else:
        heart_attack = "HEART ATTACK"
    print()
    print("PREDICTION: {}".format(heart_attack))
    # precision_score returns a fraction in [0, 1]; scale it to match the '%' sign
    print("PRECISION SCORE: {}%".format(round(100 * knn_widget_precision, 2)))

In [None]:
# age: age in years
# sex: sex (1 = male; 0 = female)
# cp: chest pain type
# – 1: typical angina
# – 2: atypical angina
# – 3: non-anginal pain
# – 4: asymptomatic
# trestbps: resting blood pressure (in mm Hg on admission to the hospital)
# chol: serum cholesterol in mg/dl
# fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
# restecg: resting electrocardiographic results
# – 0: normal
# – 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
# – 2: showing probable or definite left ventricular hypertrophy by Estes’ criteria
# thalach: maximum heart rate achieved
# exang: exercise-induced angina (1 = yes; 0 = no)
# oldpeak: ST depression induced by exercise relative to rest
