# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,ConfusionMatrixDisplay, precision_recall_fscore_support, precision_score, recall_score


In [None]:
# Seed passed as random_state further down so the train/test split is reproducible.
rs = 123

# Data

In [None]:
# Absolute location of the heart-attack CSV on the author's machine.
DATA_PATH = r"C:\Users\HP\Desktop\Online Courses\IBM Machine Learning Engineer\2. Classification\All data\heartattack.csv"
# '?' entries in the file are parsed as missing values (NaN).
data = pd.read_csv(DATA_PATH, na_values='?')

# Exploring and Cleaning Data

In [None]:
# Show the column labels of the loaded DataFrame.
data.columns

In [None]:
# Print the per-column summary (dtype and non-null count) of the DataFrame.
data.info()

In [None]:
# Count how many columns there are of each dtype.
data.dtypes.value_counts()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Keep the cleaning simple:
#   1. drop slope, ca and thal, the columns with many missing values;
#   2. drop every row that still contains a missing value.
sparse_columns = ['slope', 'ca', 'thal']
data = data.drop(columns=sparse_columns).dropna().copy()

In [None]:
# Re-count the missing values per column.
data.isna().sum()

In [None]:
# #check data types of columns with missing values
# columns=list(data.columns)
# col_mv=[]
# for col in columns:
#     if data[col].isnull().sum()>0:
#         col_mv.append(col)
# col_mv
# data[col_mv].dtypes

In [None]:
# # replace all missing values with means of respective columns
# for col in col_mv:
#     data[col] = data[col].fillna(data[col].mean())
# data.isnull().sum()

In [None]:
# Outliers: collect the labels of all numeric columns and show how many there are.
numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
len(numeric_columns)

In [None]:
# Check the Python type of the first numeric column label.
type(numeric_columns[0])

In [None]:
# #box plot
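# # let's resolve this issue later (likely cause, to confirm: with more than one row plt.subplots returns a 2-D axes array, so axes[i] is a row of axes rather than a single one; flatten it first)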
# ncols=3
# nrows=math.ceil(len(numeric_columns)/ncols)
# fig, axes = plt.subplots(nrows, ncols, figsize=(15, 6))  # Create subplots

# for i, column in enumerate(numeric_columns):
#     axes[i].boxplot(data[column])  # Create a box plot for the column in the i-th subplot
#     axes[i].set_title(f'Box Plot for {column}')  # Set the title for the subplot
#     axes[i].set_xlabel(column)  # Set the x-axis label

# plt.tight_layout()  # Adjust subplot layout for better spacing
# plt.show()  # Display the figure with subplots

In [None]:
# #using z-score

# threshold=3
# for column in numeric_columns:
#     # Calculate the z-scores
#     z_scores = stats.zscore(data[column])
    
#     # Find the rows where z-scores are greater than threshold
#     outliers = np.abs(z_scores) > threshold
    
#     # Replace outliers with the mean value of the column
#     mean_value = data[column].mean()
#     data.loc[outliers, column] = mean_value

# # Now, data contains the data with outliers replaced by the mean value for each numeric column
# Display the DataFrame; the z-score replacement code in this cell is commented out, so this cell does not modify it.
data

In [None]:
# Rename the target column to a descriptive name.
# The raw header of the target is padded with trailing spaces ('num       ').
# DataFrame.rename silently ignores labels it cannot find, so matching on an
# exact number of spaces is fragile: strip whitespace from every column label
# first, then rename the clean 'num' label.
data.columns = data.columns.str.strip()
data = data.rename(columns={'num': 'heart_attack'})

# Class distribution of the target (NaN counted as well).
data['heart_attack'].value_counts(dropna=False)

In [None]:
# Display the current state of the DataFrame.
data

In [None]:
# age: age in years
# sex: sex (1 = male; 0 = female)
# cp: chest pain type
# – 1: typical angina
# – 2: atypical angina
# – 3: non-anginal pain
# – 4: asymptomatic
# trestbps: resting blood pressure (in mm Hg on admission to the hospital)
# chol: serum cholesterol in mg/dl
# fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
# restecg: resting electrocardiographic results
# – 0: normal
# – 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
# – 2: showing probable or definite left ventricular hypertrophy by Estes’ criteria
# thalach: maximum heart rate achieved
# exang: exercise-induced angina (1 = yes; 0 = no)
# oldpeak: ST depression induced by exercise relative to rest


## Transform the Categorical Variables: Creating Dummy Variables

In [None]:
# Of the five categorical variables, sex, fbs and exang only take the values
# 0 and 1, so they are already in dummy-variable form. cp and restecg still
# need converting; print their level counts (NaN included) first.
for col_name in ('cp', 'restecg'):
    print(data[col_name].value_counts(dropna=False))

In [None]:
# Replace cp and restecg with dummy columns; drop_first=True leaves out the
# first level of each variable.
encoded_cols = ['cp', 'restecg']
data = pd.get_dummies(data, columns=encoded_cols, drop_first=True)
data

In [None]:
# List the numeric feature columns and the categorical (dummy) feature columns.
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Everything that is neither numeric nor the label is categorical. The label
# column is called 'heart_attack' (not 'target'), so that is the name to
# exclude; otherwise the label itself is reported as a categorical feature.
cat_cols = sorted(set(data.columns) - set(numeric_cols) - {'heart_attack'})

print(numeric_cols)
print(cat_cols)

## Transform the Numerical Variables: Scaling

In [None]:
# Standardize the numeric features in a single pass.
# StandardScaler works column by column, so fitting once on all numeric columns
# yields the same scaled values as refitting per column, while leaving `scaler`
# fitted on every numeric feature instead of only the last one in the list.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the training features; consider
# fitting it on the training split only.
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

## Defining Target and Features

In [None]:
# Separate the label column from the predictor columns.
target_col = 'heart_attack'
X = data.drop(columns=[target_col])
y = data[target_col]

## Split Training and Test Datasets

When the dataset is imbalanced, it’s good practice to use stratified sampling. That way, both the training and test datasets will have proportions of the target classes similar to those of the complete dataset.

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on y keeps the class
# proportions alike in both splits, and rs fixes the random state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rs,
    stratify=y,
)

In [None]:
# Show the shapes of the training and test feature sets.
X_train.shape, X_test.shape

## Define Logistic Regression Model

In [None]:
# Logistic regression with no penalty term in the cost function.
# `penalty=None` is the supported spelling: the string 'none' was deprecated in
# scikit-learn 1.2 and removed in 1.4, where it is rejected as an invalid
# parameter value. (Requires scikit-learn >= 1.2.)
model = LogisticRegression(penalty=None)
model.fit(X_train, y_train)

## Evaluating Model 

In [None]:
# Predict class labels for the held-out test features.
predictions=model.predict(X_test)

In [None]:
def evaluations(y, yhat):
    """Print accuracy, precision, recall and F-score for a set of predictions.

    The metrics are computed from the arguments, not from the notebook-level
    ``y_test`` / ``predictions`` variables, so the function can be reused for
    any pair of label vectors (e.g. the training split or another model).

    Parameters
    ----------
    y : array-like
        True labels.
    yhat : array-like
        Predicted labels, aligned with ``y``.

    Returns
    -------
    None
        The scores are printed only.
    """
    accuracy = accuracy_score(y, yhat)
    # No `average` is passed, so precision/recall/F-score come back as one
    # value per class rather than a single aggregate number.
    precision, recall, f_beta, _ = precision_recall_fscore_support(y, yhat)
    print(f'Accuracy Score = {accuracy}')
    print(f'Precision Score = {precision}')
    print(f'Recall Score = {recall}')
    print(f'f_beta Score = {f_beta}')


evaluations(y_test, predictions)

In [None]:
# Confusion matrix for the test predictions; normalize='true' normalizes over the true labels (rows).
cm=confusion_matrix(y_test, predictions, normalize='true')

In [None]:
# Use seaborn's 'talk' context for the figure.
sns.set_context('talk')

# Draw the confusion matrix, labelling the axes with the classes the model learned.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=model.classes_,
)
disp.plot()
plt.show()

In [None]:
# Print the text report built by classification_report for the test predictions.
print(classification_report(y_test, predictions))