In [35]:
import pickle
import gzip
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [9]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [10]:
# Fixed seed for NumPy's global RNG. Calling np.random.seed() with no
# argument reseeds from OS entropy, which makes every run different.
# scikit-learn estimators and splitters that are not given an explicit
# random_state fall back to this global RNG, so seeding it here makes
# Restart & Run All reproducible.
SEED = 42
np.random.seed(SEED)

In [11]:
# Exploratory data analysis

In [12]:
# Load the semicolon-delimited CSV; the 'id' column becomes the row index.
csv_path = './cardio_data.csv'
data = pd.read_csv(
    csv_path,
    sep=';',
    index_col='id',
)

In [13]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [14]:
# Summary statistics, transposed so that each feature is a row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,70000.0,19468.865814,2467.251667,10798.0,17664.0,19703.0,21327.0,23713.0
gender,70000.0,1.349571,0.476838,1.0,1.0,1.0,2.0,2.0
height,70000.0,164.359229,8.210126,55.0,159.0,165.0,170.0,250.0
weight,70000.0,74.20569,14.395757,10.0,65.0,72.0,82.0,200.0
ap_hi,70000.0,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,70000.0,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0
cholesterol,70000.0,1.366871,0.68025,1.0,1.0,1.0,2.0,3.0
gluc,70000.0,1.226457,0.57227,1.0,1.0,1.0,1.0,3.0
smoke,70000.0,0.088129,0.283484,0.0,0.0,0.0,0.0,1.0
alco,70000.0,0.053771,0.225568,0.0,0.0,0.0,0.0,1.0


In [15]:
# Count missing values per column.
data.isna().sum()

age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [16]:
# Floor-divide 'age' by 365 (the raw values look like ages in days, so this
# presumably yields whole years -- confirm against the dataset description).
# NOTE: the column is overwritten in place, so re-running this cell without
# reloading `data` divides the values again.
data['age'] = data['age'].floordiv(365)

In [22]:
# Train test split

In [23]:
# Feature matrix: every column except the target 'cardio' and the derived
# 'age_group' column. No cell in this notebook creates 'age_group', so a
# strict drop raises KeyError after Restart & Run All; errors='ignore'
# drops the column when it exists and is a no-op otherwise.
feature_frame = data.drop(columns=['cardio', 'age_group'], errors='ignore')
X = feature_frame.values

# Target vector: the 'cardio' column.
y = data['cardio'].values

In [24]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the same rows land in each split on every run and the reported metrics are
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Creating the model

In [26]:
# Standardizer with its default settings spelled out: centre each feature
# and scale it to unit variance, working on a copy of the input.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [27]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics leak into training.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
# Random forest with 300 trees. random_state pins the bootstrap sampling and
# feature selection so that retraining yields the same model and metrics.
model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
)

In [29]:
# Train the classifier on the training features and labels.
model.fit(X=X_train, y=y_train)

In [30]:
# Evaluating the model performance

In [31]:
# Predict labels for the held-out test features.
predictions = model.predict(X=X_test)

In [32]:
# Compare the held-out predictions with the true labels.
# Results are bound to their own names (conf_matrix, class_report) instead of
# confusion_matrix / classification_report: rebinding those names replaced
# the imported sklearn functions, so running this cell a second time raised
# TypeError (calling an ndarray / str).
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.4f}')

conf_matrix = confusion_matrix(y_test, predictions)
print('Confusion Matrix: ')
print(conf_matrix)

class_report = classification_report(y_test, predictions)
print('Classification Report: ')
print(class_report)

Accuracy: 0.7110
Confusion Matrix: 
[[5057 2072]
 [1974 4897]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.72      0.71      0.71      7129
           1       0.70      0.71      0.71      6871

    accuracy                           0.71     14000
   macro avg       0.71      0.71      0.71     14000
weighted avg       0.71      0.71      0.71     14000



In [39]:
# Persist the trained classifier as a gzip-compressed pickle.
with gzip.open('model.pkl.gz', 'wb') as file:
    pickle.dump(model, file)

# The classifier was fitted on features transformed by `scaler`, so new
# inputs must go through the same fitted scaler before predict(). Save it
# next to the model; without it the model artifact is not usable on its own.
with gzip.open('scaler.pkl.gz', 'wb') as file:
    pickle.dump(scaler, file)