# Building a classification model for Cardiac Arrest Prediction 

In [5]:
import pandas as  pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import numpy as np
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the cardio dataset from the working directory and preview it.
# NOTE(review): confirm the file's delimiter — read_csv assumes ',' here.
data = pd.read_csv(filepath_or_buffer='cardio_train.csv')
data

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
data = data.drop_duplicates(keep='first')
data

In [None]:
# Discard the 'id' and 'active' columns; they are not used further on.
data = data.drop(columns=['id', 'active'])

In [None]:
# Divide age by 365 and round to a whole number
# (presumably converting days to years — confirm against the dataset docs).
data['age'] = (data['age'] / 365).round()

In [None]:
# Preview the frame after the age conversion.
data

In [None]:
# We can derive one more feature here: BMI.
# BMI needs height in metres (squared), but the dataset gives height in cm.

In [None]:
# BMI = weight / height², with height converted from cm to metres first.
data['BMI'] = data['weight'].div(data['height'].div(100).pow(2))
data

In [None]:
# Annotated heatmap of the pairwise column correlations.
plt.figure(figsize=(12, 9))
cm = data.corr()
sns.heatmap(data=cm, annot=True, linewidths=2)

In [None]:
# Summary statistics per column, used to spot implausible values.
data.describe()

In [None]:
# There are some unrealistic values in ap_lo and ap_hi;
# drop the rows that contain them.

In [None]:
# Keep only rows whose blood-pressure readings fall strictly inside fixed
# bounds: 60 < ap_hi < 250 and 50 < ap_lo < 150 (ap_lo > 50 also implies > 0).
data = data[
    (data['ap_hi'] > 60) & (data['ap_hi'] < 250)
    & (data['ap_lo'] > 50) & (data['ap_lo'] < 150)
]

## Univariate analysis

In [None]:
# Number of rows per value of the target column `cardio`.
# The series is passed as `x=` because seaborn >= 0.12 accepts only `data`
# positionally.
sns.countplot(x=data['cardio'])

In [None]:
# Rows per cholesterol level, split by `cardio`.
# Keyword `x=` is required: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=data['cholesterol'], hue=data['cardio'])

In [None]:
# Rows per `smoke` value, split by `cardio`.
# Keyword `x=` is required: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=data['smoke'], hue=data['cardio'])

In [None]:
# Rows per `alco` value, split by `cardio`.
# Keyword `x=` is required: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=data['alco'], hue=data['cardio'])

In [None]:
# Rows per cholesterol level, split by `gender`.
# Keyword `x=` is required: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=data['cholesterol'], hue=data['gender'])

In [None]:
# Mean of `cardio` per cholesterol level, split by `gender`.
# `x=`/`y=` keywords are required: seaborn >= 0.12 rejects positional vectors.
sns.barplot(x=data['cholesterol'], y=data['cardio'], hue=data['gender'])

In [None]:
# Rows with BMI below 25 versus the rest, split by `cardio`.
# Keyword `x=` is required: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=data['BMI'] < 25, hue=data['cardio'])

In [None]:
# Scatter of ap_lo against ap_hi with a fitted regression line.
# `x=`/`y=` keywords are required: seaborn >= 0.12 rejects positional vectors.
sns.regplot(x=data['ap_hi'], y=data['ap_lo'])

In [None]:
# Scatter of age against ap_hi.
# `x=`/`y=` keywords are required: seaborn >= 0.12 rejects positional vectors.
sns.scatterplot(x=data['ap_hi'], y=data['age'])

In [None]:
# Plain matplotlib scatter of ap_lo against ap_hi.
plt.scatter(x=data['ap_hi'], y=data['ap_lo'])

In [None]:
# Histograms for every column except the last six, one panel each on a
# 2x3 grid.
col = data.columns
plt.figure(figsize=(15, 10))
for i, name in zip(range(len(col) - 6), col):
    plt.subplot(2, 3, i + 1)
    plt.hist(data[name])
    plt.xlabel(name)
    

In [None]:
# Box plots for every column except the last six, one panel each on a
# 2x3 grid.
col = data.columns
plt.figure(figsize=(15, 10))
for i, name in zip(range(len(col) - 6), col):
    plt.subplot(2, 3, i + 1)
    plt.boxplot(data[name])
    plt.xlabel(name)

## Result of Univariate Analysis
* The histograms and box plots show a large number of outliers in height, weight, ap_hi and ap_lo.
* To deal with them, values lying more than 2 or 3 standard deviations from the mean are removed.

In [None]:
# Weight bounds at mean ± 2 standard deviations.
weight_mean = data['weight'].mean()
weight_std = data['weight'].std()
upper_limit = weight_mean + 2 * weight_std
print(upper_limit)
lower_limit = weight_mean - 2 * weight_std
lower_limit

In [None]:
# Drop rows whose weight is at or above the upper bound.
data = data.loc[data['weight'] < upper_limit]

In [None]:
# Drop rows whose weight is at or below the lower bound.
data = data.loc[data['weight'] > lower_limit]

In [None]:
# Re-check the summary statistics after the weight filter.
data.describe()

In [None]:
# Height bounds at mean ± 2 standard deviations; rows on or outside the
# bounds are dropped.
height_mean = data['height'].mean()
height_std = data['height'].std()
upper_limit = height_mean + 2 * height_std
print('upper limit: ', upper_limit)
lower_limit = height_mean - 2 * height_std
print('Lower limit: ', lower_limit)

data = data[(data['height'] < upper_limit) & (data['height'] > lower_limit)]


In [None]:
# ap_hi bounds at mean ± 3 standard deviations; rows on or outside the
# bounds are dropped.
ap_hi_mean = data['ap_hi'].mean()
ap_hi_std = data['ap_hi'].std()
upper_limit = ap_hi_mean + 3 * ap_hi_std
print('upper limit: ', upper_limit)
lower_limit = ap_hi_mean - 3 * ap_hi_std
print('Lower limit: ', lower_limit)

data = data[(data['ap_hi'] < upper_limit) & (data['ap_hi'] > lower_limit)]


In [None]:
# ap_lo bounds at mean ± 3 standard deviations, applied the same way as the
# ap_hi filter. The limits computed here are the ones used for filtering;
# the fixed 50/150 bounds were already enforced earlier in the notebook, so
# re-applying them here would leave the data unchanged.
upper_limit= data.ap_lo.mean() + 3*data.ap_lo.std()
print('upper limit: ',upper_limit)
lower_limit= data.ap_lo.mean() - 3*data.ap_lo.std()
print('Lower limit: ',lower_limit)

data=data[data['ap_lo']<upper_limit]
data=data[data['ap_lo']>lower_limit]

In [None]:
# Box plots of the same columns (all but the last six) after outlier removal.
col = data.columns
plt.figure(figsize=(15, 10))
for i, name in zip(range(len(col) - 6), col):
    plt.subplot(2, 3, i + 1)
    plt.boxplot(data[name])
    plt.xlabel(name)

In [None]:
# Histograms of the same columns (all but the last six) after outlier removal.
col = data.columns
plt.figure(figsize=(15, 10))
for i, name in zip(range(len(col) - 6), col):
    plt.subplot(2, 3, i + 1)
    plt.hist(data[name])
    plt.xlabel(name)

In [None]:
# Summary statistics of the cleaned data that goes into modelling.
data.describe()

# Building the model

In [None]:
# Target vector: the `cardio` column.
y = data['cardio']

In [None]:
# Feature matrix: ten input columns (the derived BMI column is not included).
x = data.loc[:, [
    'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
    'cholesterol', 'gluc', 'smoke', 'alco',
]]

In [None]:
# Preview the feature matrix.
x

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=1
)

### Logistic Regression

In [4]:
# Fit logistic regression with the newton-cg solver and report test accuracy.
lgr = LogisticRegression(solver='newton-cg')
lgr.fit(x_train, y_train)
print('Accuracy of Logistic Regression:', accuracy_score(y_true=y_test, y_pred=lgr.predict(x_test)))

NameError: name 'LogisticRegression' is not defined

In [None]:
# Per-class precision/recall/F1 for logistic regression on the test split.
print('Classification Report:\n', classification_report(y_true=y_test, y_pred=lgr.predict(x_test)))

### Gradient Boosting Classifier

In [None]:
# Fit a gradient boosting classifier with default hyper-parameters.
model = GradientBoostingClassifier().fit(X=x_train, y=y_train)

In [None]:
# Gradient boosting predictions for the test split; reused by the cells below.
pred = model.predict(X=x_test)

In [None]:
# Test accuracy of the gradient boosting model.
print('Accuracy of GradientBoostingClassifier:', accuracy_score(y_true=y_test, y_pred=pred))

In [None]:
# Per-class precision/recall/F1 for gradient boosting on the test split.
print('Classification Report:\n', classification_report(y_true=y_test, y_pred=pred))

### Random Forest Classifier

In [None]:
# Fit a random forest with a fixed seed so the result is reproducible.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(X=x_train, y=y_train)

In [None]:
# Test accuracy of the random forest model.
print('Accuracy of random forrest classifier:', accuracy_score(y_true=y_test, y_pred=rfc.predict(x_test)))

In [None]:
# Per-class precision/recall/F1 for the random forest on the test split.
print('Classification Report:\n', classification_report(y_true=y_test, y_pred=rfc.predict(x_test)))

## Evaluation of the above models

In [None]:
# `plot_confusion_matrix`, `plot_roc_curve` and `plot_precision_recall_curve`
# were removed in scikit-learn 1.2. Import them when they exist; otherwise
# bind the same names to the Display.from_estimator constructors, which take
# the same (estimator, X, y, ...) arguments and return a display object with
# a .plot(ax=...) method, so the cells below run unchanged on either version.
try:
    from sklearn.metrics import plot_confusion_matrix,plot_roc_curve,plot_precision_recall_curve
except ImportError:
    from sklearn.metrics import (
        ConfusionMatrixDisplay,
        PrecisionRecallDisplay,
        RocCurveDisplay,
    )

    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
    plot_roc_curve = RocCurveDisplay.from_estimator
    plot_precision_recall_curve = PrecisionRecallDisplay.from_estimator

### Based on confusion matrix

In [None]:
# Confusion matrix of the logistic regression model on the test split.
print('Confusion matrix of Logistic Regresssion model:')
lgr_cf = plot_confusion_matrix(
    lgr,
    x_test,
    y_test,
    cmap='Blues_r',
)

In [None]:
# Confusion matrix of the random forest model on the test split.
# Stored under its own name so it does not overwrite the logistic
# regression display held in `lgr_cf`.
print('Confusion matrix of Random Forest Classifier model:')
rfc_cf = plot_confusion_matrix(rfc, x_test, y_test, cmap='Blues_r')

In [None]:
# Confusion matrix of the gradient boosting model on the test split.
# Stored under its own name so it does not overwrite the logistic
# regression display held in `lgr_cf`.
print('Confusion matrix of Gradient Boosting classifier model:')
gbc_cf = plot_confusion_matrix(model, x_test, y_test, cmap='Blues_r')

### ROC and precision-recall curves

In [None]:
# ROC curve of the gradient boosting model; the returned display is kept so
# it can be redrawn on shared axes in the next cell.
gbc_disp = plot_roc_curve(
    model,
    x_test,
    y_test,
)

In [None]:
# Overlay the ROC curves of all three models on one set of axes.
# The logistic regression curve is drawn first; its axes are then fetched as
# the current axes and reused for the other two models.
lgr_disp=plot_roc_curve(lgr,x_test,y_test)
ax = plt.gca()
rfc_disp = plot_roc_curve(rfc, x_test, y_test, ax=ax)
# Redraw the gradient boosting curve computed in the previous cell.
gbc_disp.plot(ax=ax)
plt.show()

In [None]:
# Precision-recall curve of the gradient boosting model; the returned display
# is kept so it can be redrawn on shared axes in the next cell.
gbc_prc = plot_precision_recall_curve(
    model,
    x_test,
    y_test,
)

In [None]:
# Overlay the precision-recall curves of all three models on one set of axes.
# The logistic regression curve is drawn first; its axes are then fetched as
# the current axes and reused for the other two models.
lgr_prc=plot_precision_recall_curve(lgr,x_test,y_test)
ax = plt.gca()
rfc_prc = plot_precision_recall_curve(rfc, x_test, y_test, ax=ax)
# Redraw the gradient boosting curve computed in the previous cell.
gbc_prc.plot(ax=ax)
plt.show()

# Result of evaluation
* Considering all the observations and metric results, Logistic Regression and Gradient Boosting perform similarly.
* However, on the ROC and precision-recall curves, Gradient Boosting ranks above the other two models.
* Hence Gradient Boosting is chosen as the final model.

In [None]:
# Persist the chosen gradient boosting model. The context manager closes the
# file handle (flushing the write) even if pickling raises.
with open('Healtcare.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the saved model. The context manager closes the file handle after
# reading. Unpickling can execute arbitrary code, so only load pickle files
# from a trusted source.
with open('Healtcare.pkl', 'rb') as model_file:
    pic = pickle.load(model_file)

In [None]:
# Predict for one hand-written record. The values are labelled with the
# training feature names, in training order, so each number is tied to its
# column and the model receives the same feature names it was fitted with.
sample = pd.DataFrame(
    [[52.0, 1, 165, 64.0, 130, 70, 3, 1, 0, 0]],
    columns=['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
             'cholesterol', 'gluc', 'smoke', 'alco'],
)
pic.predict(sample)

In [None]:
# Print the installed scikit-learn version. The notebook only imports names
# *from* sklearn sub-modules, so the top-level package must be imported here
# before `sklearn.__version__` can be read.
import sklearn

sklearn_version = sklearn.__version__

print(sklearn_version)