In [24]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, KFold

from sklearn.metrics import accuracy_score, recall_score, precision_score

import pickle

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides anything scikit-learn might
# report while fitting or scoring in the cells below (e.g. convergence or
# undefined-metric warnings) - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Load data

### Initialize path constants

In [26]:
# Directory that serialized models are written to.
MODEL_PATH = '../models'

# Root data directory and the raw / processed sub-directories derived from it.
DATA_PATH = '../data'
RAW_DATA_PATH, PROCESSED_DATA_PATH = (
    template.format(DATA_PATH) for template in ('{}/raw', '{}/processed')
)

### Read processed data from CSV

In [3]:
# Read the processed dataset from the processed-data directory.
data = pd.read_csv(
    '{}/processed_data.csv'.format(PROCESSED_DATA_PATH)
)

# Target is the diagnosis column; features are every other column except 'sex'.
# NOTE(review): the reason 'sex' is excluded from the features is not stated
# here - confirm it is intentional.
y = data['heart_disease_diagnosis']
X = data.drop(labels=['heart_disease_diagnosis', 'sex'], axis='columns')

# Cross Validation (single 80/20 hold-out split)

### CV Function

In [18]:
def get_cv_data_partition(X, y, test_size=0.2, random_state=1):
    """Split features and target into a training and a validation partition.

    Thin wrapper around ``train_test_split`` that performs one hold-out
    split. The split fraction and the seed used to be hard-coded; they are
    now keyword parameters whose defaults reproduce the previous behaviour.

    Parameters
    ----------
    X
        Feature data, forwarded to ``train_test_split`` unchanged.
    y
        Target data, forwarded to ``train_test_split`` unchanged.
    test_size
        Forwarded as ``test_size``; defaults to 0.2.
    random_state
        Forwarded as ``random_state``; defaults to 1 so repeated calls with
        the same inputs use the same seed.

    Returns
    -------
    The value returned by ``train_test_split``. Callers in this notebook
    unpack it as ``(X_train, X_validation, y_train, y_validation)``.
    """
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

def print_metrics(y_true, y_pred):
    """Print accuracy, precision and recall for a set of predictions.

    Accuracy comes from ``accuracy_score``; precision and recall come from
    ``precision_score`` / ``recall_score`` with ``average='macro'``. All three
    scores are computed before anything is printed, and each is shown with
    three decimal places under a fixed header line.

    Parameters
    ----------
    y_true
        Reference labels, passed to each scorer as ``y_true``.
    y_pred
        Predicted labels, passed to each scorer as ``y_pred``.

    Returns
    -------
    None. The function only writes to standard output.
    """
    # Pair every output template with its score, in display order.
    report_lines = [
        ('Accuracy  = {:.3f}',
         accuracy_score(y_true=y_true, y_pred=y_pred)),
        ('Precision = {:.3f}',
         precision_score(y_true=y_true, y_pred=y_pred, average='macro')),
        ('Recall    = {:.3f}',
         recall_score(y_true=y_true, y_pred=y_pred, average='macro')),
    ]

    print('--- Validation Metrics ---')
    for template, score in report_lines:
        print(template.format(score))

def print_cv_result(model, X, y):
    """Fit ``model`` on a training partition and print its validation metrics.

    The data is partitioned once with ``get_cv_data_partition``; the model is
    fitted on the training part, used to predict the validation part, and the
    resulting scores are printed through ``print_metrics``.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's instance is trained after this call.
    X
        Feature data handed to ``get_cv_data_partition``.
    y
        Target data handed to ``get_cv_data_partition``.

    Returns
    -------
    None. Results are only printed.
    """
    partition = get_cv_data_partition(X, y)
    train_features, held_out_features, train_target, held_out_target = partition

    # Fit on the training partition only; the held-out rows are used for scoring.
    model.fit(train_features, train_target)

    print_metrics(held_out_target, model.predict(held_out_features))

## Models

### Logistic Regression

In [19]:
# Logistic regression with scikit-learn's default constructor arguments.
model_name = 'Logistic Regression'
print('=== {} ===\n'.format(model_name))

model = LogisticRegression()
print_cv_result(model, X, y)

=== Logistic Regression ===

--- Validation Metrics ---
Accuracy  = 0.564
Precision = 0.356
Recall    = 0.320


### KNN

In [21]:
# K-nearest neighbours with scikit-learn's default constructor arguments.
model_name = 'K-Nearest Neighbor'
print('=== {} ===\n'.format(model_name))

model = KNeighborsClassifier()
print_cv_result(model, X, y)

=== K-Nearest Neighbor ===

--- Validation Metrics ---
Accuracy  = 0.429
Precision = 0.184
Recall    = 0.218


### Decision Tree

In [22]:
# Decision tree using the entropy split criterion, with a fixed random_state.
model_name = 'Decision Tree'
print('=== {} ===\n'.format(model_name))

model = DecisionTreeClassifier(criterion='entropy', random_state=1)
print_cv_result(model, X, y)

=== Decision Tree ===

--- Validation Metrics ---
Accuracy  = 0.417
Precision = 0.296
Recall    = 0.276


### ANN

In [23]:
# Multi-layer perceptron with default arguments apart from a fixed random_state.
model_name = 'ANN'
print('=== {} ===\n'.format(model_name))

model = MLPClassifier(random_state=1)
print_cv_result(model, X, y)

=== ANN ===

--- Validation Metrics ---
Accuracy  = 0.532
Precision = 0.477
Recall    = 0.322


## Save Best Model

In [27]:
# Refit a fresh logistic regression on the full dataset (no hold-out) and
# persist it to MODEL_PATH.
# NOTE(review): in the recorded outputs logistic regression has the highest
# validation accuracy (0.564), but the ANN scores higher on macro precision
# and recall - confirm which criterion defines "best".
best_model = LogisticRegression()
best_model.fit(X, y)

# Open the target through a context manager so the file handle is flushed and
# closed even if pickling fails; previously the handle was never closed.
with open('{}/best_model.pkl'.format(MODEL_PATH), 'wb') as model_file:
    pickle.dump(best_model, model_file)