## 1. Import the cleaned dataset

In [1]:
import pandas as pd

In [2]:
# Load the cleaned cardio dataset (path is relative to the working directory)
# and show it as the cell output; pandas truncates long frames when rendering.
df = pd.read_csv('cleaned_cardio.csv')
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
68413,19240,2,168,76.0,120,80,1,1,1,0,1,0
68414,22601,1,158,126.0,140,90,2,2,0,0,1,1
68415,19066,2,183,105.0,180,90,3,1,0,1,0,1
68416,22431,1,163,72.0,135,80,1,2,0,0,0,1


## 2. Convert the 'age' column from days to years and store it as 'age_years'
Find the min, mean and max so we can detect outliers

In [3]:
# Work on a copy so the engineered columns added below never touch the frame as loaded.
df2 = df.copy()

In [4]:
# 'age' is stored in days; floor-dividing by a 365-day year gives completed years.
age_days = df2['age']
df2['age_years'] = age_days // 365

# Print the extremes and the average of the new column for a quick outlier check.
age_years_col = df2['age_years']
for label, value in (
    ("Min: ", age_years_col.min()),
    ("Max: ", age_years_col.max()),
    ("Mean: ", age_years_col.mean()),
):
    print(label, value)

Min:  29
Max:  64
Mean:  52.829284691163146


## 3. From the 'weight' and 'height' columns, generate a new 'bmi' column using the formula `weight / (height / 100)**2`
Find the min, mean and max so we can detect outliers

In [5]:
# BMI = weight / height^2. The division by 100 presumably converts height from
# centimetres to metres -- confirm against the dataset description.
height_scaled = df2['height'] / 100
df2['bmi'] = df2['weight'] / (height_scaled ** 2)

# Print the extremes and the average of the new column for a quick outlier check.
bmi_col = df2['bmi']
for label, value in (
    ("Min: ", bmi_col.min()),
    ("Max: ", bmi_col.max()),
    ("Mean: ", bmi_col.mean()),
):
    print(label, value)

Min:  13.520822065981614
Max:  65.35672120087705
Mean:  27.428436983277347


## 4. Create a new column 'bp_diff' from the 'ap_hi' and 'ap_lo' columns, holding the blood-pressure difference (ap_hi - ap_lo)
Find the min, mean and max so we can detect outliers

In [6]:
# Difference between the two blood-pressure readings: ap_hi minus ap_lo.
ap_hi_col = df2['ap_hi']
ap_lo_col = df2['ap_lo']
df2['bp_diff'] = ap_hi_col - ap_lo_col

# Print the extremes and the average of the new column for a quick outlier check.
bp_diff_col = df2['bp_diff']
for label, value in (
    ("Min: ", bp_diff_col.min()),
    ("Max: ", bp_diff_col.max()),
    ("Mean: ", bp_diff_col.mean()),
):
    print(label, value)

Min:  5
Max:  140
Mean:  45.37270893624485


In [7]:
# Sanity check: (rows, columns) of the working frame after adding the engineered columns.
df2.shape

(68418, 15)

## 5. Medically, BMI is generally categorised into 4 categories:
below 18.5                        --> 'Underweight'\
18.5 up to (but not including) 25 --> 'Normal'\
25 up to (but not including) 30   --> 'Overweight'\
30 and above                      --> 'Obese'

In [8]:
def bmi_cat_4(bmi):
    """Map a BMI value to one of four ordinal category codes.

    Codes: 0 = Underweight (bmi < 18.5), 1 = Normal (18.5 <= bmi < 25),
    2 = Overweight (25 <= bmi < 30), 3 = Obese (everything else).

    A value that compares false against every bound (e.g. NaN) falls
    through to code 3.
    """
    # Exclusive upper bounds of categories 0, 1 and 2, in ascending order.
    upper_bounds = (18.5, 25, 30)
    for code, bound in enumerate(upper_bounds):
        if bmi < bound:
            return code
    return len(upper_bounds)   # Obese

# Encode every BMI value as its category code (0-3) in a new column.
bmi_codes = df2["bmi"].apply(bmi_cat_4)
df2["bmi_cat"] = bmi_codes

# Share of positive 'cardio' labels within each BMI category.
cardio_rate_by_bmi_cat = df2.groupby("bmi_cat")["cardio"].mean()
cardio_rate_by_bmi_cat

bmi_cat
0    0.265886
1    0.398055
2    0.505528
3    0.625743
Name: cardio, dtype: float64

## 6. Split the data into train and test part.

In [9]:
from sklearn.model_selection import train_test_split

# Features: every column except the target and the raw day-count 'age'
# (its derived 'age_years' column stays in).
x = df2.drop(columns=['cardio', 'age'])
y = df2['cardio']

# Hold out 20% of the rows for testing. Stratifying on the target keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

for split in (X_train, X_test):
    print(split.shape)

(54734, 14)
(13684, 14)


## 7. Apply scaling and encoding
StandardScaler on numeric features \
OneHotEncoder on categorical features

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns to standardise versus columns to one-hot encode.
num_features = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi', 'bp_diff']
cat_features = ['gender', 'cholesterol', 'gluc', 'smoke', 'active', 'alco', 'bmi_cat']

# One (name, transformer, columns) triple per feature group.
numeric_step = ('num', StandardScaler(), num_features)
# handle_unknown='ignore': a category unseen during fit does not raise at predict time.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)

preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])

## 8. Build a pipeline so preprocessing and model training happen in a single call
A random forest is used as the prediction model

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Random forest with capped depth and a minimum leaf size, balanced class weights
# and a fixed seed so repeated runs grow the same trees.
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=12,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=1,
)

# Chain the column preprocessing and the classifier so both are fitted and applied together.
pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', forest),
])


## 9. Train the random-forest pipeline on the training data.

In [13]:
# Fit the preprocessing step and the random forest together, on the training split only.
pipeline.fit(X_train, y_train)

## 10. Now predict on both the test split and the training split so we can compare them.
Accuracy\
Recall and F1 score\
Check for overfitting and underfitting\
Confusion matrix for the test data

In [14]:
# Predict on both splits so train and test scores can be compared for over-/under-fitting.
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)


In [15]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix

# Print the same three scores for each split; the test header carries a leading
# newline so the two reports are separated by a blank line.
for header, y_true, y_pred in (
    ("TRAIN METRICS", y_train, y_train_pred),
    ("\nTEST METRICS", y_test, y_test_pred),
):
    print(header)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1:", f1_score(y_true, y_pred))

# The confusion matrix is reported for the held-out test split only.
print("Confusion matrix of test data:\n", confusion_matrix(y_test, y_test_pred))

TRAIN METRICS
Accuracy: 0.7598019512551614
Recall: 0.7064426804504338
F1: 0.7442964115530487

TEST METRICS
Accuracy: 0.7335574393452207
Recall: 0.6742468989958653
F1: 0.7146658318985757
Confusion matrix of test data:
 [[5472 1440]
 [2206 4566]]


## 11. Now save the pipeline for further use.
joblib library will be used for that

In [19]:
import joblib

# Persist the whole fitted pipeline (preprocessing + model) as one file in the working directory.
joblib.dump(pipeline, "cardio_pipeline.pkl")

['cardio_pipeline.pkl']

## 12. Load the saved pipeline as a model and check that it still works.

In [20]:
# Reload the pipeline saved above. joblib.load unpickles the file, so only load
# artifacts that come from a trusted source.
model = joblib.load("cardio_pipeline.pkl")

In [21]:
# Score the reloaded pipeline on the same test split; the recall should equal the
# test recall reported for the in-memory pipeline.
y_model_pred = model.predict(X_test)
recall_score(y_test, y_model_pred)

0.6742468989958653