In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

from sklearn.metrics import accuracy_score, recall_score, precision_score

import pickle

import warnings

warnings.filterwarnings('ignore')

# Load data

### Initialize path constant

In [2]:
DATA_PATH = '../data'
RAW_DATA_PATH = '{}/raw'.format(DATA_PATH)
PROCESSED_DATA_PATH = '{}/processed'.format(DATA_PATH)

MODEL_PATH = '../models'

### Read processed data from CSV

In [17]:
# data = pd.read_csv('{}/processed_data.csv'.format(PROCESSED_DATA_PATH))
data = pd.read_csv('{}/processed_data.csv'.format(PROCESSED_DATA_PATH))
X_df = data.drop(['heart_disease_diagnosis'], axis=1)
X = X_df.values
y = data['heart_disease_diagnosis'].values

# Feature Elimination Function

In [59]:
from itertools import chain, combinations

# RFE-ish with no rank. kwargs = arguments to be passed to the scorer
# Maybe create a custom scoring function for combining multiple score?
# X HAS TO BE DATAFRAME, because it supports .drop()
def rfe_no_rank(model, X_train, X_vali, y_train, y_vali, scorer, **kwargs):
    attr_to_drop = []
    best_target_yet = 0
    attributes = X_train.columns.values.tolist()
    # For every element in the attribute powerset...
    for subset in chain.from_iterable(combinations(attributes, r) for r in range(len(attributes))):
        # ...fit, predict, and calculate score without them
        model.fit(X_train.drop(list(subset), axis=1), y_train)
        y_pred = model.predict(X_vali.drop(list(subset), axis=1))
        target_now = scorer(y_pred, y_vali, **kwargs)
        if target_now > best_target_yet:
            best_target_yet = target_now
            attr_to_drop = list(subset)
    return attr_to_drop, best_target_yet

# Cross Validation

### CV Function

In [4]:
def get_kfold():
    return KFold(n_splits=5, shuffle=True, random_state=1)

def print_cv_result(model, X, y):
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    
    kfold = get_kfold()
    
    for train_idx, validation_idx in kfold.split(X, y):
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_validation = X[validation_idx]
        y_validation = y[validation_idx]

        model.fit(X_train, y_train)

        prediction = model.predict(X_validation)

        accuracy = accuracy_score(y_validation, prediction)
        precision = precision_score(y_validation, prediction, average='macro')
        recall = recall_score(y_validation, prediction, average='macro')

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
    
    print('--- Validation Metrics ---')
    print('Accuracy  = {:.3f}'.format(np.mean(accuracy_scores)))
    print('Precision = {:.3f}'.format(np.mean(precision_scores)))
    print('Recall    = {:.3f}'.format(np.mean(recall_scores)))

## Models

### Naive Bayes

In [5]:
model_name = 'Naive Bayes'
nb_model = GaussianNB()

print('=== {} ===\n'.format(model_name))
print_cv_result(nb_model, X, y)

=== Naive Bayes ===

--- Validation Metrics ---
Accuracy  = 0.531
Precision = 0.348
Recall    = 0.343


### KNN

In K-Nearest Neighbors algorithm, it's really important to scale the features first (feature scaling).
Since the range of values of raw data varies widely, in K-Nearest Neighbors algoritm, objective functions will not work properly without normalization. For example, the majority of classifiers calculate the distance between two points by the Euclidean distance. If one of the features has a broad range of values, the distance will be governed by this particular feature. Therefore, the range of all features should be normalized so that each feature contributes approximately proportionately to the final distance.

In [6]:
model_name = 'K-Nearest Neighbor'
knn_model = KNeighborsClassifier()
X_Scaled = preprocessing.scale(X)

print('=== {} ===\n'.format(model_name))
print_cv_result(knn_model, X_Scaled, y)

=== K-Nearest Neighbor ===

--- Validation Metrics ---
Accuracy  = 0.546
Precision = 0.350
Recall    = 0.331


### Decision Tree

In [60]:
model_name = 'Decision Tree'
dtc_model = DecisionTreeClassifier(criterion='entropy', random_state=1)

print('=== {} ===\n'.format(model_name))
print_cv_result(dtc_model, X, y)

rfe_no_rank(dtc_model, X_df[:500], X_df[500:], y[:500], y[500:], accuracy_score)

=== Decision Tree ===

--- Validation Metrics ---
Accuracy  = 0.451
Precision = 0.320
Recall    = 0.299


(['age',
  'resting_blood_pressure',
  'serum_cholestrol',
  'resting_ECG',
  'max_heart_rate_achieved',
  'ST_depression',
  'peak_exercise_ST_segment'],
 0.5770609318996416)

### ANN

In [8]:
model_name = 'ANN'
ann_model = MLPClassifier(random_state=1, activation='logistic')
X_Scaled = preprocessing.scale(X)

print('=== {} ===\n'.format(model_name))
print_cv_result(ann_model, X_Scaled, y)

=== ANN ===

--- Validation Metrics ---
Accuracy  = 0.570
Precision = 0.317
Recall    = 0.329


# Tune Best Base Model

In [9]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

best_model = GridSearchCV(MLPClassifier(random_state=1, activation='logistic'),
                          param_grid,
                          cv=get_kfold(),
                          scoring='accuracy',
                          verbose=20)

best_model.fit(X_Scaled, y)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=lbfgs 
[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=lbfgs, score=0.4551282051282051, total=   0.4s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=lbfgs 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=lbfgs, score=0.5256410256410257, total=   0.4s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=lbfgs 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=lbfgs, score=0.4935897435897436, total=   0.4s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=lbfgs 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=lbfgs, score=0.5, total=   0.4s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=lbfgs 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.6s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=lbfgs, score=0.44516129032258067, total=   0.4s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=sgd 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.0s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=sgd, score=0.5128205128205128, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=sgd 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.8s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=sgd, score=0.5064102564102564, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=sgd 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.6s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=sgd, score=0.5448717948717948, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=sgd 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    4.3s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=sgd, score=0.6217948717948718, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=sgd 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    5.1s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=sgd, score=0.5161290322580645, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam 


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.9s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam, score=0.5897435897435898, total=   0.5s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam 


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    6.4s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam, score=0.5833333333333334, total=   0.6s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam 


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    7.0s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam, score=0.5897435897435898, total=   0.5s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam 


[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    7.5s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam, score=0.5641025641025641, total=   0.6s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam 


[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    8.0s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam, score=0.5483870967741935, total=   0.6s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=lbfgs 


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    8.6s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=lbfgs, score=0.4551282051282051, total=   0.4s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=lbfgs 


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    9.0s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=lbfgs, score=0.5256410256410257, total=   0.4s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=lbfgs 


[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:    9.4s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=lbfgs, score=0.4935897435897436, total=   0.4s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=lbfgs 


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    9.8s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=lbfgs, score=0.5, total=   0.4s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=lbfgs 


[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:   10.2s remaining:    0.0s


[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=lbfgs, score=0.44516129032258067, total=   0.4s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=sgd 
[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=sgd, score=0.33974358974358976, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=sgd 
[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=sgd, score=0.2948717948717949, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=sgd 
[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=sgd, score=0.27564102564102566, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=sgd 
[CV]  alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=invscaling, solver=sgd, score=0.27564102564102566, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(50,

[CV]  alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam, score=0.5483870967741935, total=   0.6s
[CV] alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=invscaling, solver=lbfgs 
[CV]  alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=invscaling, solver=lbfgs, score=0.48717948717948717, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=invscaling, solver=lbfgs 
[CV]  alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=invscaling, solver=lbfgs, score=0.5128205128205128, total=   1.0s
[CV] alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=invscaling, solver=lbfgs 
[CV]  alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=invscaling, solver=lbfgs, score=0.5448717948717948, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=invscaling, solver=lbfgs 
[CV]  alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=invscaling, solver=lbfgs, score=0.4935897435897436, total=   0.8s
[CV] alpha=0.0001, hidd

[CV]  alpha=0.0001, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=sgd, score=0.43870967741935485, total=   0.7s
[CV] alpha=0.0001, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=adam 
[CV]  alpha=0.0001, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=adam, score=0.5705128205128205, total=   0.9s
[CV] alpha=0.0001, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=adam 
[CV]  alpha=0.0001, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=adam, score=0.6089743589743589, total=   1.0s
[CV] alpha=0.0001, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=adam 
[CV]  alpha=0.0001, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=adam, score=0.5769230769230769, total=   1.3s
[CV] alpha=0.0001, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=adam 
[CV]  alpha=0.0001, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=adam, score=0.5769230769230769, total=   1.1s
[CV] alpha=0.0001, hidden_laye

[CV]  alpha=0.0001, hidden_layer_sizes=(100, 100), learning_rate=constant, solver=lbfgs, score=0.47435897435897434, total=   5.1s
[CV] alpha=0.0001, hidden_layer_sizes=(100, 100), learning_rate=constant, solver=lbfgs 
[CV]  alpha=0.0001, hidden_layer_sizes=(100, 100), learning_rate=constant, solver=lbfgs, score=0.43870967741935485, total=   4.9s
[CV] alpha=0.0001, hidden_layer_sizes=(100, 100), learning_rate=constant, solver=sgd 
[CV]  alpha=0.0001, hidden_layer_sizes=(100, 100), learning_rate=constant, solver=sgd, score=0.4358974358974359, total=   0.8s
[CV] alpha=0.0001, hidden_layer_sizes=(100, 100), learning_rate=constant, solver=sgd 
[CV]  alpha=0.0001, hidden_layer_sizes=(100, 100), learning_rate=constant, solver=sgd, score=0.42948717948717946, total=   1.1s
[CV] alpha=0.0001, hidden_layer_sizes=(100, 100), learning_rate=constant, solver=sgd 
[CV]  alpha=0.0001, hidden_layer_sizes=(100, 100), learning_rate=constant, solver=sgd, score=0.4358974358974359, total=   1.0s
[CV] alpha=0

KeyboardInterrupt: 

In [None]:
print('Best accuracy :', best_model.best_score_)
print('Best params   :', best_model.best_params_)

In [None]:
best_params = {'alpha': 0.01, 'hidden_layer_sizes': (100, 100), 'learning_rate': 'constant', 'solver': 'adam'}
model = MLPClassifier(random_state=1, activation='logistic', **best_params)
print_cv_result(model, X_Scaled, y)

# Final Train

In [None]:
best_params = {'alpha': 0.01, 'hidden_layer_sizes': (100, 100), 'learning_rate': 'constant', 'solver': 'adam'}
model = MLPClassifier(random_state=1, activation='logistic', **best_params)
model.fit(X_Scaled, y)

## Save Best Model

In [None]:
pickle.dump(model, open('{}/taki2_best_model.pkl'.format(MODEL_PATH), 'wb'))