In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [114]:
# Getting our data ready
data = pd.read_csv('data/cardio.csv',sep=';').drop('id', axis=1)
data.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [115]:
# Splitting our data into training and testing data
from sklearn.model_selection import train_test_split
x = data.drop('cardio',axis = 1)
y = data['cardio']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [116]:
# Choosing the right estimator for our data sets
# We are going to use ensemble classifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [117]:
# Now we train our model on the training data
clf.fit(x_train,y_train)

In [118]:
# Now we predict our model
cardio_pred = clf.predict(x_test)
pd.DataFrame(cardio_pred,columns=['cardio']).head()

Unnamed: 0,cardio
0,1
1,1
2,0
3,1
4,1


In [119]:
# Now we score our model on the testing data
s1 = clf.score(x_test,y_test)
s2 = np.mean(cardio_pred==y_test)

In [120]:
s1,s2

(0.7120714285714286, 0.7120714285714286)

In [121]:
# Accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test,cardio_pred)

0.7120714285714286

In [122]:
# Using Cross Validation Score
from sklearn.model_selection import cross_val_score
cross_val_score(clf,x,y,cv=10)

array([0.71414286, 0.70714286, 0.72228571, 0.72128571, 0.70828571,
       0.72757143, 0.71314286, 0.719     , 0.70685714, 0.71314286])

In [123]:
# Saving our model using pickle
import pickle
pickle.dump(clf,open('clf_model.pkl','wb'))

In [124]:
# Loading the saved model and predicting the cardio disease
loaded_model = pickle.load(open('clf_model.pkl','rb'))
y_pred = loaded_model.predict(x_test)
df = pd.DataFrame(y_pred,columns=['cardio'])
df.head()

Unnamed: 0,cardio
0,1
1,1
2,0
3,1
4,1


In [125]:
# Score
s1 = loaded_model.score(x_test,y_test)
s2 = np.mean(y_test==y_pred)
s1,s2

(0.7120714285714286, 0.7120714285714286)

In [126]:
# Accuracy Score
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7120714285714286

In [127]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_pred,y_test)

array([[5105, 2116],
       [1915, 4864]], dtype=int64)

In [128]:
# Classification report
from sklearn.metrics import classification_report
rep = classification_report(y_pred,y_test)
rep

'              precision    recall  f1-score   support\n\n           0       0.73      0.71      0.72      7221\n           1       0.70      0.72      0.71      6779\n\n    accuracy                           0.71     14000\n   macro avg       0.71      0.71      0.71     14000\nweighted avg       0.71      0.71      0.71     14000\n'

In [130]:
# Saving the model using joblib
from joblib import dump,load
dump(clf,filename='joblib_model.joblib')

['joblib_model.joblib']

In [131]:
# loading the model
load_joblib_model = load(filename='joblib_model.joblib')

In [132]:
# Checking score
load_joblib_model.score(x_test,y_test)

0.7120714285714286

In [133]:
# Predicting the cardio
new_pred = load_joblib_model.predict(x_test)

In [135]:
# Checking the accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test,new_pred)

0.7120714285714286