## Project Name: Heart Attack Risk Predictor

### Let us import the necessary libraries and read our dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Let us import our Data Set


In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
df= pd.read_csv(r"/content/drive/MyDrive/heart/heart.csv")

In [None]:
# Remove the features that are not used in this analysis.
# errors='ignore' keeps the cell re-runnable: once the columns are gone a
# second execution would otherwise raise KeyError.
df = df.drop(columns=['oldpeak', 'slp', 'thall'], errors='ignore')

In [None]:
df.head()

### Data Analysis

### Understanding our DataSet:

#### Age : Age of the patient

#### Sex : Sex of the patient

#### exng : exercise induced angina (1 = yes; 0 = no)

#### caa : number of major vessels (0-3)

#### cp : chest pain type

- Value 0: typical angina
- Value 1: atypical angina
- Value 2: non-anginal pain
- Value 3: asymptomatic

#### trtbps : resting blood pressure (in mm Hg)

#### chol : cholesterol in mg/dl, fetched via BMI sensor

#### fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

#### restecg : resting electrocardiographic results

- Value 0: normal
- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

#### thalachh : maximum heart rate achieved

#### output : 0 = less chance of heart attack, 1 = more chance of heart attack

In [None]:
df.shape

In [None]:
df.isnull().sum()

#### As we can see there are no null values in our Data Set

In [None]:
df.corr()

In [None]:
sns.heatmap(df.corr())

#### As we can see our variables are not highly correlated to each other

#### We will do Uni and Bi variate analysis on our Features

In [None]:

# Number of patients at each age, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Age of Patients")
ax.set_xlabel("Age")
sns.countplot(x='age', data=df, ax=ax)

#### As we can see, the majority of the patients are in the 51-67 years age group

In [None]:
# Number of patients per value of the `sex` column (coding as given in the title).
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Sex of Patients,0=Female and 1=Male")
sns.countplot(x='sex', data=df, ax=ax)

In [None]:
# Frequency table of chest-pain types with readable labels.
#
# value_counts() orders rows by frequency, not by code, so labels must be
# attached by looking up each code rather than by row position. The mapping
# follows the `cp` description given earlier in this notebook.
cp_labels = {
    0: 'Typical Angina',
    1: 'Atypical Angina',
    2: 'non-anginal',
    3: 'asymptomatic',
}

# rename_axis/reset_index(name=...) pins the column names to 'index' (label)
# and 'cp' (count) regardless of the installed pandas version, which is what
# the plotting cell below expects.
cp_data = (
    df['cp']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='cp')
)
# Unknown codes are kept as-is instead of silently turning into NaN.
cp_data['index'] = cp_data['index'].map(lambda code: cp_labels.get(code, code))
cp_data

In [None]:
# Bar chart of the chest-pain frequency table built in the previous cell.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Chest Pain of Patients")
sns.barplot(x=cp_data['index'], y=cp_data['cp'], ax=ax)

#### We have seen how the Chest Pain category is distributed

In [None]:
# Frequency table of resting-ECG results with readable labels.
#
# value_counts() orders rows by frequency, not by code, so labels are looked
# up per code (mapping taken from the `restecg` description earlier in this
# notebook) instead of being assigned by row position.
ecg_labels = {
    0: 'normal',
    1: 'having ST-T wave abnormality',
    2: 'showing probable or definite left ventricular hypertrophy by Estes',
}

# Column names are pinned to 'index' (label) and 'restecg' (count) so the
# plotting cell below works on any pandas version.
ecg_data = (
    df['restecg']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='restecg')
)
# Unknown codes are kept as-is instead of silently turning into NaN.
ecg_data['index'] = ecg_data['index'].map(lambda code: ecg_labels.get(code, code))

ecg_data

In [None]:
# Bar chart of the resting-ECG frequency table built in the previous cell.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("ECG data of Patients")
sns.barplot(x=ecg_data['index'], y=ecg_data['restecg'], ax=ax)

#### This is our ECG Data

In [None]:
sns.pairplot(df,hue='output')

### Let us see for our Continuous Variable

In [None]:
# Side-by-side distributions of resting blood pressure and maximum heart rate.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# replacement recommended by seaborn (density-normalised histogram + KDE curve).
fig, (ax_bp, ax_hr) = plt.subplots(1, 2, figsize=(20, 10))

sns.histplot(df['trtbps'], kde=True, stat='density', color='magenta', ax=ax_bp)
ax_bp.set_xlabel("Resting Blood Pressure (mmHg)")

sns.histplot(df['thalachh'], kde=True, stat='density', color='teal', ax=ax_hr)
ax_hr.set_xlabel("Maximum Heart Rate Achieved (bpm)")

In [None]:
# Distribution of serum cholesterol.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# replacement recommended by seaborn (density-normalised histogram + KDE curve).
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(df['chol'], kde=True, stat='density', color='red', ax=ax)
ax.set_xlabel("Cholestrol")

#### We have done the analysis of the data; now let's have a look at our data

In [None]:
df.head()

### Let us do Standardisation

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scale=StandardScaler()

In [None]:
# Fit the StandardScaler on every column of `df`.
# NOTE(review): at this point `df` still contains the `output` target column,
# and the fit happens before train_test_split is called further down, so the
# target is standardised too and test rows contribute to the scaling
# statistics. Consider fitting on the training features only.
scale.fit(df)

In [None]:
df= scale.transform(df)

In [None]:
# scale.transform returned a plain array; wrap it back into a DataFrame and
# restore the column names (features first, `output` target last).
scaled_column_names = [
    'age', 'sex', 'cp', 'trtbps', 'chol', 'fbs',
    'restecg', 'thalachh', 'exng', 'caa', 'output',
]
df = pd.DataFrame(df, columns=scaled_column_names)

In [None]:
df.head()

### We can insert this data into our ML Models

### We will use the following models for our predictions :
- Logistic Regression
- Decision Tree
- Random Forest
- K Nearest Neighbour
- SVM

#### Then we will use the ensembling techniques

#### Let us split our data

In [None]:
x= df.iloc[:,:-1]
x

In [None]:
# Target vector: the last column of `df`, taken as a 1-D Series.
# A one-column DataFrame (iloc[:, -1:]) makes LabelEncoder emit a
# column-vector DataConversionWarning; a Series is the expected 1-D shape.
y = df.iloc[:, -1]
y

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
lbl= LabelEncoder()

In [None]:
encoded_y= lbl.fit_transform(y_train)

In [None]:
logreg= LogisticRegression()

In [None]:
logreg = LogisticRegression()
logreg.fit(x_train, encoded_y)


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Encode the test labels with the mapping already learned from y_train.
# Re-fitting the encoder on the test labels could silently produce a different
# class-to-integer mapping (e.g. if a class were missing from the test split).
encoded_ytest = lbl.transform(y_test)

In [None]:
Y_pred1 = logreg.predict(x_test)
lr_conf_matrix = confusion_matrix(encoded_ytest,Y_pred1 )
lr_acc_score = accuracy_score(encoded_ytest, Y_pred1)

In [None]:
lr_conf_matrix

In [None]:
print(lr_acc_score*100,"%")

#### As we see, the Logistic Regression model has an accuracy of 85%

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so the reported
# accuracy is reproducible across runs.
tree = DecisionTreeClassifier(random_state=101)

In [None]:
tree.fit(x_train,encoded_y)

In [None]:
ypred2=tree.predict(x_test)

In [None]:
# Encode the test labels with the encoder's existing mapping; do not re-fit the
# encoder on test data.
encoded_ytest = lbl.transform(y_test)

In [None]:
tree_conf_matrix = confusion_matrix(encoded_ytest,ypred2 )
tree_acc_score = accuracy_score(encoded_ytest, ypred2)

In [None]:
tree_conf_matrix

In [None]:
print(tree_acc_score*100,"%")

#### As we see our Decision Tree Model does not perform well as it gives a score of only 69%

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so the reported
# accuracy is reproducible across runs.
rf = RandomForestClassifier(random_state=101)

In [None]:
rf.fit(x_train,encoded_y)

In [None]:
ypred3 = rf.predict(x_test)

In [None]:
rf_conf_matrix = confusion_matrix(encoded_ytest,ypred3 )
rf_acc_score = accuracy_score(encoded_ytest, ypred3)

In [None]:
rf_conf_matrix

In [None]:
print(rf_acc_score*100,"%")

#### RF also gives us an accuracy of around 80%

### K Nearest Neighbour

#### We have to select the value of k that gives the maximum accuracy
#### Let's loop over candidate values of k and record the error rate for each

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Misclassification rate on the test split for every k from 1 to 39.
# NOTE(review): k is being chosen by looking at the test set, so the accuracy
# reported for the selected k is optimistic; cross-validating on the training
# split would avoid this.
error_rate = []
for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, encoded_y)
    error_rate.append((knn.predict(x_test) != encoded_ytest).mean())

In [None]:
# Error rate as a function of k, to read off a suitable neighbourhood size.
fig, ax = plt.subplots(figsize=(10, 6))
marker_style = dict(marker='o', markerfacecolor='red', markersize=10)
ax.plot(range(1, 40), error_rate, color='blue', linestyle='dashed', **marker_style)
ax.set_xlabel('K Vlaue')
ax.set_ylabel('Error rate')
ax.set_title('To check the correct value of k')
plt.show()

#### As we see from the graph we should select K= 12 as it gives the best error rate

In [None]:
knn= KNeighborsClassifier(n_neighbors=12)
knn.fit(x_train,encoded_y)
ypred4= knn.predict(x_test)


In [None]:
knn_conf_matrix = confusion_matrix(encoded_ytest,ypred4 )
knn_acc_score = accuracy_score(encoded_ytest, ypred4)

In [None]:
knn_conf_matrix

In [None]:
print(knn_acc_score*100,"%")

#### As we see KNN gives us an accuracy of around 85% which is good

### Support Vector Machine(SVM)

In [None]:
from sklearn import svm

In [None]:
# Support-vector classifier with default hyper-parameters.
# The original `svm = svm.SVC()` rebinds the module name `svm` to the
# estimator, so running the cell a second time fails with AttributeError.
# Importing SVC directly keeps the variable name used downstream while making
# the cell safe to re-run.
from sklearn.svm import SVC
svm = SVC()

In [None]:
svm.fit(x_train,encoded_y)

In [None]:
ypred5= svm.predict(x_test)

In [None]:
svm_conf_matrix = confusion_matrix(encoded_ytest,ypred5)
svm_acc_score = accuracy_score(encoded_ytest, ypred5)

In [None]:
svm_conf_matrix

In [None]:
print(svm_acc_score*100,"%")

#### We get an accuracy of 80% in SVM

#### Let us see our model accuracy in Table form

In [None]:
# Summary table: one row per baseline model with its test accuracy in percent.
baseline_names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'K Nearest Neighbor',
    'SVM',
]
baseline_scores = [lr_acc_score, tree_acc_score, rf_acc_score, knn_acc_score, svm_acc_score]
model_acc = pd.DataFrame({
    'Model': baseline_names,
    'Accuracy': [score * 100 for score in baseline_scores],
})

In [None]:
model_acc = model_acc.sort_values(by=['Accuracy'],ascending=False)

In [None]:
model_acc

#### Let us use one more technique known as AdaBoost. This is a boosting technique which combines multiple models for better accuracy.

### Adaboost  Classifier

#### Let us first use some random parameters for training the model without Hypertuning.

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost over the SVC fitted above, with un-tuned parameters.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, whereas the first positional slot works on both.
# algorithm='SAMME' is kept because SVC (without probability=True) exposes no
# predict_proba.
adab = AdaBoostClassifier(
    svm,
    n_estimators=100,
    algorithm='SAMME',
    learning_rate=0.01,
    random_state=0,
)

In [None]:
adab.fit(x_train,encoded_y)

In [None]:
ypred6=adab.predict(x_test)

In [None]:
adab_conf_matrix = confusion_matrix(encoded_ytest,ypred6)
adab_acc_score = accuracy_score(encoded_ytest, ypred6)

In [None]:
adab_conf_matrix

In [None]:
print(adab_acc_score*100,"%")

In [None]:
adab.score(x_train,encoded_y)

In [None]:
adab.score(x_test,encoded_ytest)

#### As we see our model has performed very poorly with just 50% accuracy

#### We will use Grid Search CV for hyperparameter tuning

### Grid Search CV

#### Let us try Grid Search CV for our top 3 performing Algorithms for HyperParameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model_acc

### Logistic Regression

In [None]:
# Search space for LogisticRegression, split so that every solver is only
# paired with penalties it supports. The flat grid combined e.g. lbfgs with
# l1, which makes the fit fail. Two entries of the original grid are dropped:
#   * 'none'       - the string form is no longer accepted by recent
#                    scikit-learn; the large C values approximate it.
#   * 'elasticnet' - additionally requires `l1_ratio`, which was never given.
_C_VALUES = [100, 10, 1.0, 0.1, 0.01]

param_grid = [
    # Solvers that only support an L2 penalty.
    {
        'solver': ['newton-cg', 'lbfgs', 'sag'],
        'penalty': ['l2'],
        'C': _C_VALUES,
    },
    # Solvers that support both L1 and L2 penalties.
    {
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
        'C': _C_VALUES,
    },
]

In [None]:
grid1= GridSearchCV(LogisticRegression(),param_grid)

In [None]:
grid1.fit(x_train,encoded_y)

In [None]:
grid1.best_params_

#### Let us apply these parameters in our model

In [None]:
logreg1= LogisticRegression(C=0.01,penalty='l2',solver='liblinear')

In [None]:
logreg1.fit(x_train,encoded_y)

In [None]:
logreg_pred= logreg1.predict(x_test)

In [None]:
logreg_pred_conf_matrix = confusion_matrix(encoded_ytest,logreg_pred)
logreg_pred_acc_score = accuracy_score(encoded_ytest, logreg_pred)

In [None]:
logreg_pred_conf_matrix

In [None]:
print(logreg_pred_acc_score*100,"%")

#### We got an accuracy of 81%

### KNN

In [None]:
# Candidate hyper-parameter values for the KNN grid search.
metric = ['euclidean', 'manhattan', 'minkowski']   # distance functions
weights = ['uniform', 'distance']                  # neighbour weighting schemes
n_neighbors = range(1, 21, 2)                      # odd k: 1, 3, ..., 19

In [None]:
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [None]:
grid_search = GridSearchCV(estimator=knn, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

In [None]:
grid_search.fit(x_train,encoded_y)

In [None]:
grid_search.best_params_

#### Let's apply

In [None]:
# Refit KNN with the parameters selected by the grid search above.
# The previous hard-coded n_neighbors=12 cannot have come from that search
# (only odd k from 1 to 19 were tried), so the winning combination is taken
# directly from grid_search.best_params_.
knn = KNeighborsClassifier(**grid_search.best_params_)
knn.fit(x_train, encoded_y)
knn_pred = knn.predict(x_test)

In [None]:
knn_pred_conf_matrix = confusion_matrix(encoded_ytest,knn_pred)
knn_pred_acc_score = accuracy_score(encoded_ytest, knn_pred)

In [None]:
knn_pred_conf_matrix

In [None]:
print(knn_pred_acc_score*100,"%")

#### We have an Accuracy of 82.5%

### SVM

In [None]:

# Candidate hyper-parameter values for the SVM grid search.
gamma = ['scale']                      # kernel coefficient setting
C = [50, 10, 1.0, 0.1, 0.01]           # regularisation strengths
kernel = ['poly', 'rbf', 'sigmoid']    # kernel functions

In [None]:
# Grid search for the SVM: accuracy scored with 10-fold stratified CV repeated
# three times; parameter combinations whose fit fails are scored as 0.
grid = {'kernel': kernel, 'C': C, 'gamma': gamma}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    svm,
    grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)

In [None]:
grid_search.fit(x_train,encoded_y)

In [None]:
grid_search.best_params_

#### Let us apply these

In [None]:
from sklearn.svm import SVC

In [None]:
svc= SVC(C= 0.1, gamma= 'scale',kernel= 'sigmoid')

In [None]:
svc.fit(x_train,encoded_y)

In [None]:
svm_pred= svc.predict(x_test)

In [None]:
svm_pred_conf_matrix = confusion_matrix(encoded_ytest,svm_pred)
svm_pred_acc_score = accuracy_score(encoded_ytest, svm_pred)

In [None]:
svm_pred_conf_matrix

In [None]:
print(svm_pred_acc_score*100,"%")

#### Accuracy is 81%

## Final Verdict

#### After comparing all the models the best performing model is :
### Logistic Regression with no Hyperparameter tuning

In [None]:
# Refit the selected model: logistic regression with default hyper-parameters.
# (The constructor was previously called twice in a row; once is enough.)
logreg = LogisticRegression()
logreg.fit(x_train, encoded_y)

In [None]:
Y_pred1

In [None]:
lr_conf_matrix

In [None]:
print(lr_acc_score*100,"%")

#### Let us build a proper confusion matrix for our model

In [None]:
# Enlarged, annotated confusion matrix for the logistic-regression model.
#
# confusion_matrix orders its rows and columns by sorted class label, so
# position 0 is encoded class 0 and position 1 is encoded class 1. Per the
# dataset description in this notebook, 0 means "less chance of heart attack",
# so the tick labels must read No Disease first, then Disease.
options = ["No Disease", "Disease"]

fig, ax = plt.subplots()
im = ax.imshow(lr_conf_matrix, cmap='Set3', interpolation='nearest')

# Show one tick per class and label it with the class name.
ax.set_xticks(np.arange(len(options)))
ax.set_yticks(np.arange(len(options)))
ax.set_xticklabels(options)
ax.set_yticklabels(options)

# Rotate the x tick labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Write the count inside each cell (row i = actual, column j = predicted).
for i in range(len(options)):
    for j in range(len(options)):
        ax.text(j, i, lr_conf_matrix[i, j],
                ha="center", va="center", color="black")

ax.set_title("Confusion Matrix of Logistic Regression Model")
ax.set_xlabel('Model Prediction')
ax.set_ylabel('Actual Result')
# Lay out after all labels are set so nothing gets clipped.
fig.tight_layout()
plt.show()
print("ACCURACY of our model is ",lr_acc_score*100,"%")

## We have successfully built a model which predicts whether a person is at risk of heart disease or not, with 85.7% accuracy
