# Data Processing

In [1]:
# import library
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned heart-disease dataset.
# note: heart_clean.csv is heart.csv after the cleaning done in heart_diagnosis_EDA.ipynb
df = pd.read_csv(filepath_or_buffer='heart_clean.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,age,sex,chest_pain,systolic,cholesterol,fasting_blood_sugar,restECG,max_heart_rate,exercise_induced_angina,st_depression,slope,number_vessel,thallium_stress,diagnose
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Correlation of every feature with the target, strongest positive first.
# The target's own entry is removed by label instead of by position, so the
# result does not depend on 'diagnose' sorting to the very top.
df.corr()['diagnose'].drop('diagnose').sort_values(ascending=False)

max_heart_rate             0.426655
chest_pain                 0.423425
slope                      0.337825
restECG                    0.131716
fasting_blood_sugar       -0.004680
cholesterol               -0.076541
systolic                  -0.148922
age                       -0.225453
sex                       -0.285322
thallium_stress           -0.364399
exercise_induced_angina   -0.425085
st_depression             -0.428804
number_vessel             -0.467158
Name: diagnose, dtype: float64

# Feature Engineering

In [4]:
# Feature selection

# fasting_blood_sugar is dropped: its correlation with diagnose is close to zero.
# cholesterol is dropped as well because of its low correlation with diagnose.
# NOTE(review): the earlier remark here said the EDA found that cholesterol matters,
# which conflicts with dropping it — confirm which is intended.
df = df.drop(labels=['fasting_blood_sugar', 'cholesterol'], axis='columns')

In [5]:
# One-hot encode the non-binary nominal features.
# restECG and thallium_stress are encoded in a single call; their dummy columns
# are appended after the remaining columns, in the order listed here.
df = pd.get_dummies(df, columns=['restECG', 'thallium_stress'], prefix_sep='_')

df.head()

Unnamed: 0,age,sex,chest_pain,systolic,max_heart_rate,exercise_induced_angina,st_depression,slope,number_vessel,diagnose,restECG_0,restECG_1,restECG_2,thallium_stress_1,thallium_stress_2,thallium_stress_3
0,63,1,3,145,150,0,2.3,0,0,1,1,0,0,1,0,0
1,37,1,2,130,187,0,3.5,0,0,1,0,1,0,0,1,0
2,41,0,1,130,172,0,1.4,2,0,1,1,0,0,0,1,0
3,56,1,1,120,178,0,0.8,2,0,1,0,1,0,0,1,0
4,57,0,0,120,163,1,0.6,2,0,1,0,1,0,0,1,0


In [6]:
# Correlation of every feature (including the one-hot columns) with the target,
# strongest positive first. The target's own entry is removed by label instead of
# by position, so the result does not depend on 'diagnose' sorting to the very top.
df.corr()['diagnose'].drop('diagnose').sort_values(ascending=False)

thallium_stress_2          0.530032
max_heart_rate             0.426655
chest_pain                 0.423425
slope                      0.337825
restECG_1                  0.170030
restECG_2                 -0.068235
thallium_stress_1         -0.105799
systolic                  -0.148922
restECG_0                 -0.154302
age                       -0.225453
sex                       -0.285322
exercise_induced_angina   -0.425085
st_depression             -0.428804
number_vessel             -0.467158
thallium_stress_3         -0.489046
Name: diagnose, dtype: float64

In [8]:
# Feature scaling

# RobustScaler is applied to these columns to reduce the effect of outliers.
# Each column is transformed by its own freshly fitted scaler.
# NOTE(review): the scalers are fitted on the full dataset, before the train/test
# split, and the fitted scaler objects are not kept — confirm this is intended.
from sklearn.preprocessing import RobustScaler

for column in ('systolic', 'max_heart_rate'):
    df[column] = RobustScaler().fit_transform(df[[column]])

df.head()

Unnamed: 0,age,sex,chest_pain,systolic,max_heart_rate,exercise_induced_angina,st_depression,slope,number_vessel,diagnose,restECG_0,restECG_1,restECG_2,thallium_stress_1,thallium_stress_2,thallium_stress_3
0,63,1,3,0.75,-0.075758,0,2.3,0,0,1,1,0,0,1,0,0
1,37,1,2,0.0,1.045455,0,3.5,0,0,1,0,1,0,0,1,0
2,41,0,1,0.0,0.590909,0,1.4,2,0,1,1,0,0,0,1,0
3,56,1,1,-0.5,0.772727,0,0.8,2,0,1,0,1,0,0,1,0
4,57,0,0,-0.5,0.318182,1,0.6,2,0,1,0,1,0,0,1,0


# Data Processing into Machine Learning

In [9]:
# Separate the target column from the feature matrix
y = df.loc[:, 'diagnose']
X = df.drop('diagnose', axis=1)

In [10]:
# Stratified train/test split (75% train) with a fixed seed
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    stratify=y,
    random_state=99,
)

# Training features and target side by side in a single frame
df_train = pd.concat([X_train, y_train], axis='columns')

In [11]:
# check the class balance of the training target
y_train.value_counts()

# the classes are imbalanced: there is a gap between the heart-disease and non-heart-disease counts
# majority class: diagnosed with heart disease
# minority class: diagnosed without heart disease

1    120
0    102
Name: diagnose, dtype: int64

In [12]:
# Oversample the minority class with SMOTE; only the training split is resampled.
import imblearn
from imblearn.over_sampling import SMOTE

# A fixed random_state makes the synthetic samples reproducible across runs.
sm = SMOTE(random_state=99)
# fit_resample is the current API; the older fit_sample alias was deprecated and
# later removed from imbalanced-learn.
X_sm, y_sm = sm.fit_resample(X_train, y_train)
df_sm = pd.concat([X_sm, y_sm], axis=1)

# check that the classes are now balanced
y_sm.value_counts()

1    120
0    120
Name: diagnose, dtype: int64

# Machine Learning

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In this machine learning process we need to focus on the recall score. We do not want patients at risk of heart disease to be missed, because a missed diagnosis can put the patient's life at risk.

## Logistic Regression (base)

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
# Baseline logistic regression with default hyperparameters, trained on the
# SMOTE-resampled training data. fit() returns the estimator, so construction
# and training are chained; the bare name then displays the fitted model.
LR = LogisticRegression().fit(X_sm, y_sm)
LR

LogisticRegression()

# Export Model

In [16]:
import joblib

In [17]:
# Persist the trained logistic regression model to disk with joblib.
filename = 'heart_disease_model.sav'
joblib.dump(value=LR, filename=filename)

['heart_disease_model.sav']