In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, KFold

from sklearn.metrics import accuracy_score, recall_score, precision_score

import pickle

import warnings

warnings.filterwarnings('ignore')

# Load data

### Initialize path constant

In [2]:
DATA_PATH = '../data'
RAW_DATA_PATH = '{}/raw'.format(DATA_PATH)
PROCESSED_DATA_PATH = '{}/processed'.format(DATA_PATH)

MODEL_PATH = '../models'

### Read processed data from CSV

In [3]:
data = pd.read_csv('{}/processed_data.csv'.format(PROCESSED_DATA_PATH))
X = data.drop(['heart_disease_diagnosis','sex','num_of_major_vessels'], axis=1)
y = data['heart_disease_diagnosis']

# Cross Validation

### CV Function

In [4]:
def get_cv_data_partition(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=1)

def print_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_pred=y_pred, y_true=y_true)
    precision = precision_score(y_pred=y_pred, y_true=y_true, average='macro')
    recall = recall_score(y_pred=y_pred, y_true=y_true, average='macro')
    
    print('--- Validation Metrics ---')
    print('Accuracy  = {:.3f}'.format(accuracy))
    print('Precision = {:.3f}'.format(precision))
    print('Recall    = {:.3f}'.format(recall))

def print_cv_result(model, X, y):
    # Partition data
    X_train, X_validation, y_train, y_validation = get_cv_data_partition(X, y)
    
    # Train model
    model.fit(X_train, y_train)
    
    # Predict validation
    prediction = model.predict(X_validation)
    
    # Print validation metrics
    print_metrics(y_validation, prediction)

## Models

### Naive Bayes

In [10]:
model_name = 'Naive Bayes'
nb_model = GaussianNB()

print('=== {} ===\n'.format(model_name))
print_cv_result(nb_model, X, y)

=== Naive Bayes ===

--- Validation Metrics ---
Accuracy  = 0.571
Precision = 0.408
Recall    = 0.432


### KNN

In K-Nearest Neighbors algorithm, it's really important to scale the features first (feature scaling).
Since the range of values of raw data varies widely, in K-Nearest Neighbors algoritm, objective functions will not work properly without normalization. For example, the majority of classifiers calculate the distance between two points by the Euclidean distance. If one of the features has a broad range of values, the distance will be governed by this particular feature. Therefore, the range of all features should be normalized so that each feature contributes approximately proportionately to the final distance.

In [6]:
model_name = 'K-Nearest Neighbor'
knn_model = KNeighborsClassifier()
X_Scaled = preprocessing.scale(X)

print('=== {} ===\n'.format(model_name))
print_cv_result(knn_model, X_Scaled, y)

=== K-Nearest Neighbor ===

--- Validation Metrics ---
Accuracy  = 0.545
Precision = 0.387
Recall    = 0.327


### Decision Tree

In [7]:
model_name = 'Decision Tree'
dtc_model = DecisionTreeClassifier(criterion='entropy', random_state=1)

print('=== {} ===\n'.format(model_name))
print_cv_result(dtc_model, X, y)

=== Decision Tree ===

--- Validation Metrics ---
Accuracy  = 0.455
Precision = 0.335
Recall    = 0.321


### ANN

In [8]:
model_name = 'ANN'
ann_model = MLPClassifier(random_state=1, alpha=0.00210, epsilon=1e-8, activation='logistic', verbose=True)

print('=== {} ===\n'.format(model_name))
print_cv_result(ann_model, X, y)

=== ANN ===

Iteration 1, loss = 1.44603137
Iteration 2, loss = 1.36179690
Iteration 3, loss = 1.31641627
Iteration 4, loss = 1.28611168
Iteration 5, loss = 1.26701383
Iteration 6, loss = 1.25047268
Iteration 7, loss = 1.23935273
Iteration 8, loss = 1.23155788
Iteration 9, loss = 1.22591125
Iteration 10, loss = 1.22150550
Iteration 11, loss = 1.21664599
Iteration 12, loss = 1.21147469
Iteration 13, loss = 1.20848573
Iteration 14, loss = 1.20550590
Iteration 15, loss = 1.20034981
Iteration 16, loss = 1.19591337
Iteration 17, loss = 1.19271237
Iteration 18, loss = 1.18916824
Iteration 19, loss = 1.18239896
Iteration 20, loss = 1.17976465
Iteration 21, loss = 1.18163414
Iteration 22, loss = 1.17997689
Iteration 23, loss = 1.17493432
Iteration 24, loss = 1.17041438
Iteration 25, loss = 1.16788758
Iteration 26, loss = 1.16317273
Iteration 27, loss = 1.16441787
Iteration 28, loss = 1.16055251
Iteration 29, loss = 1.15655046
Iteration 30, loss = 1.15507686
Iteration 31, loss = 1.15293015
Iter

## Save Best Model

In [9]:
best_model = ann_model
pickle.dump(best_model, open('{}/best_model.pkl'.format(MODEL_PATH), 'wb'))