In [73]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

from sklearn.metrics import accuracy_score, recall_score, precision_score

import pickle

import warnings

warnings.filterwarnings('ignore')

# Load data

### Initialize path constant

In [6]:
DATA_PATH = '../data'
RAW_DATA_PATH = '{}/raw'.format(DATA_PATH)
PROCESSED_DATA_PATH = '{}/processed'.format(DATA_PATH)

MODEL_PATH = '../models'

### Read processed data from CSV

In [48]:
data = pd.read_csv('{}/processed_data.csv'.format(PROCESSED_DATA_PATH))
X = data.drop(['heart_disease_diagnosis'], axis=1).values
y = data['heart_disease_diagnosis'].values

# Cross Validation

### CV Function

In [57]:
def get_kfold():
    return KFold(n_splits=5, shuffle=True, random_state=1)

def print_cv_result(model, X, y):
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    
    kfold = get_kfold()
    
    for train_idx, validation_idx in kfold.split(X, y):
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_validation = X[validation_idx]
        y_validation = y[validation_idx]

        model.fit(X_train, y_train)

        prediction = model.predict(X_validation)

        accuracy = accuracy_score(y_validation, prediction)
        precision = precision_score(y_validation, prediction, average='macro')
        recall = recall_score(y_validation, prediction, average='macro')

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
    
    print('--- Validation Metrics ---')
    print('Accuracy  = {:.3f}'.format(np.mean(accuracy_scores)))
    print('Precision = {:.3f}'.format(np.mean(precision_scores)))
    print('Recall    = {:.3f}'.format(np.mean(recall_scores)))

## Models

### Naive Bayes

In [58]:
model_name = 'Naive Bayes'
nb_model = GaussianNB()

print('=== {} ===\n'.format(model_name))
print_cv_result(nb_model, X, y)

=== Naive Bayes ===

--- Validation Metrics ---
Accuracy  = 0.531
Precision = 0.348
Recall    = 0.343


### KNN

In K-Nearest Neighbors algorithm, it's really important to scale the features first (feature scaling).
Since the range of values of raw data varies widely, in K-Nearest Neighbors algoritm, objective functions will not work properly without normalization. For example, the majority of classifiers calculate the distance between two points by the Euclidean distance. If one of the features has a broad range of values, the distance will be governed by this particular feature. Therefore, the range of all features should be normalized so that each feature contributes approximately proportionately to the final distance.

In [59]:
model_name = 'K-Nearest Neighbor'
knn_model = KNeighborsClassifier()
X_Scaled = preprocessing.scale(X)

print('=== {} ===\n'.format(model_name))
print_cv_result(knn_model, X_Scaled, y)

=== K-Nearest Neighbor ===

--- Validation Metrics ---
Accuracy  = 0.546
Precision = 0.350
Recall    = 0.331


### Decision Tree

In [60]:
model_name = 'Decision Tree'
dtc_model = DecisionTreeClassifier(criterion='entropy', random_state=1)

print('=== {} ===\n'.format(model_name))
print_cv_result(dtc_model, X, y)

=== Decision Tree ===

--- Validation Metrics ---
Accuracy  = 0.451
Precision = 0.320
Recall    = 0.299


### ANN

In [77]:
model_name = 'ANN'
ann_model = MLPClassifier(random_state=1, activation='logistic')
X_Scaled = preprocessing.scale(X)

print('=== {} ===\n'.format(model_name))
print_cv_result(ann_model, X_Scaled, y)

=== ANN ===

--- Validation Metrics ---
Accuracy  = 0.570
Precision = 0.367
Recall    = 0.349


# Tune Best Base Model

In [86]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)]
}

best_model = GridSearchCV(MLPClassifier(random_state=1, activation='logistic'),
                          param_grid,
                          cv=get_kfold(),
                          scoring='accuracy')

best_model.fit(X_Scaled, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
       error_score='raise-deprecating',
       estimator=MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [87]:
print('Best accuracy:', best_model.best_score_)

Best accuracy: 0.5815147625160462


## Save Best Model

In [88]:
pickle.dump(best_model, open('{}/best_model.pkl'.format(MODEL_PATH), 'wb'))