In [1]:
# Libraries
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import time
import numpy as np

dataset = pd.read_csv("./datasets/heart_data.csv")
dataset.drop(columns=['index', 'id'], axis=1, inplace=True)


# Preparing data

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

y = dataset['cardio'].values
cardio = dataset.drop(['cardio'], axis=1)
cat_attribs = ['gender','cholesterol', 'gluc', 'smoke', 'alco', 'active']
cardio_num = cardio.drop(cat_attribs, axis=1)
num_attribs = list(cardio_num)

num_pipeline = Pipeline([('std_scaler', StandardScaler())])
full_pipeline = ColumnTransformer([
 ("num", num_pipeline, num_attribs), #num_pipeline
 ("cat", OneHotEncoder(), cat_attribs), #one hot encoder
 ])
cardio_prepared = full_pipeline.fit_transform(cardio)

# Split into train and test set

In [3]:
from sklearn.model_selection import train_test_split

y = dataset['cardio'].values
X = cardio_prepared.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# METRICS FUNCTION

In [4]:
#!/usr/bin/env python3
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import numpy as np
def perfomance_metrics(classifier, y_, y_pred):
    
    accuracy = round(accuracy_score(y_, y_pred), 5)
    precision = round(precision_score(y_, y_pred), 5)
    recall = round(recall_score(y_, y_pred), 5)
    f1 = round(f1_score(y_, y_pred), 5)
    auc = round(roc_auc_score(y_, y_pred), 5)
    
    print("\n", classifier.__class__.__name__)
    if hasattr(classifier, 'best_params_'):
        print("Best Model: ", classifier.best_params_)
    print("\nConfusion Matrix:\n", confusion_matrix(y_, y_pred))
    print("\nAccuracy: " , accuracy)
    print("\nPrecision: ", precision)
    print("\nRecall: ", recall)
    print("\nF1: ", f1)
    print("\nAUC: ", auc)

    return np.array([classifier.__class__.__name__, accuracy, precision, recall, f1, auc])

# LOGISTIC REGRESSION

In [5]:
from sklearn.linear_model import LogisticRegression
start_time = time.time()

#lbfgs - [l2, None]
#liblinear - [l1, l2]
#newton-cg - [l2, None]
#newton-cholesky - [l2, None]
#sag - [l2, None]
#saga - [elasticnet, l1, l2, None]

parameters = {  'penalty': ['l2'],
                'C' : [0.01, 0.1, 0.3, 0.5,0.7, 1],
                'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'saga'],
                #'n_jobs': -1,
                #'max_iter': [100, 120, 150, 200],
            }

logistic_regression_clf = RandomizedSearchCV(LogisticRegression(), parameters, cv=3, random_state=42, n_jobs=-1, error_score=0, n_iter=20)

logistic_regression_clf.fit(X_train, y_train)
y_pred = logistic_regression_clf.predict(X_test)

duration = time.time() - start_time
print("Computation Time = ", duration)
perfomance_metrics(logistic_regression_clf, y_test, y_pred)

Computation Time =  15.646891355514526

 RandomizedSearchCV
Best Model:  {'solver': 'newton-cg', 'penalty': 'l2', 'C': 1}

Confusion Matrix:
 [[5352 1636]
 [2253 4759]]

Accuracy:  0.72221

Precision:  0.74418

Recall:  0.67869

F1:  0.70993

AUC:  0.72229


array(['RandomizedSearchCV', '0.72221', '0.74418', '0.67869', '0.70993',
       '0.72229'], dtype='<U32')

In [6]:
logistic_regression_clf.best_params_['solver']

'newton-cg'

# DECISION TREE

In [7]:
from sklearn.tree import DecisionTreeClassifier

start_time = time.time()

parameters = {  'criterion':['gini','entropy'],
                'splitter': ['best', 'random'],
                'max_depth':np.arange(1,30).tolist()[0::2],
                'min_samples_split':np.arange(1,30).tolist()[1::2],
                'min_samples_leaf': np.arange(1,30).tolist()[0::2],
                'max_leaf_nodes':np.arange(3,30).tolist()[0::2] }

print(parameters)
tree_clf = RandomizedSearchCV(DecisionTreeClassifier(), parameters, cv=3, n_iter=20, random_state=42, n_jobs=-1)

tree_clf.fit(X_train, y_train)
y_pred = tree_clf.predict(X_test)

duration = time.time() - start_time
print("Computation Time = ", duration)

perfomance_metrics(tree_clf, y_test, y_pred)

{'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_depth': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29], 'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28], 'min_samples_leaf': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29], 'max_leaf_nodes': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]}
Computation Time =  3.4483869075775146

 RandomizedSearchCV
Best Model:  {'splitter': 'best', 'min_samples_split': 28, 'min_samples_leaf': 9, 'max_leaf_nodes': 27, 'max_depth': 29, 'criterion': 'gini'}

Confusion Matrix:
 [[5487 1501]
 [2200 4812]]

Accuracy:  0.73564

Precision:  0.76224

Recall:  0.68625

F1:  0.72225

AUC:  0.73573


array(['RandomizedSearchCV', '0.73564', '0.76224', '0.68625', '0.72225',
       '0.73573'], dtype='<U32')

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

start_time = time.time()

parameters = {  'n_estimators':np.arange(1,1000).tolist()[0::200],
                'max_depth':np.arange(1,30).tolist()[0::2],
                'min_samples_split':np.arange(2,30).tolist()[1::2],
                'min_samples_leaf': np.arange(1,30).tolist()[0::2],
                'max_leaf_nodes':np.arange(3,30).tolist()[0::2],
                'max_features':  ['sqrt', 'log2', None] }

random_forest_clf = RandomizedSearchCV(RandomForestClassifier(), parameters, cv=3, n_iter=10, random_state=42, n_jobs=-1)

random_forest_clf.fit(X_train, y_train)
y_pred = random_forest_clf.predict(X_test)

duration = time.time() - start_time
print("Computation Time = ", duration)

perfomance_metrics(random_forest_clf, y_test, y_pred)

Computation Time =  269.1619818210602

 RandomizedSearchCV
Best Model:  {'n_estimators': 601, 'min_samples_split': 17, 'min_samples_leaf': 25, 'max_leaf_nodes': 19, 'max_features': None, 'max_depth': 11}

Confusion Matrix:
 [[5491 1497]
 [2206 4806]]

Accuracy:  0.7355

Precision:  0.76249

Recall:  0.6854

F1:  0.72189

AUC:  0.73559


array(['RandomizedSearchCV', '0.7355', '0.76249', '0.6854', '0.72189',
       '0.73559'], dtype='<U32')

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

start_time = time.time()

parameters = {  'estimator': [DecisionTreeClassifier(),RandomForestClassifier()],
                    'n_estimators':np.arange(1,1000).tolist()[0::50],
                    'max_samples' : np.arange(2,30).tolist()[1::2],
                    'bootstrap_features': [True, False],
                    'oob_score': [True, False]}


bagging_clf = RandomizedSearchCV(BaggingClassifier(), parameters, cv=2, n_iter=10, random_state=42, n_jobs=-1)


bagging_clf.fit(X_train, y_train)
y_pred = bagging_clf.predict(X_test)

duration = time.time() - start_time
print("Computation Time = ", duration)

perfomance_metrics(bagging_clf, y_test, y_pred)

# Ada Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

start_time = time.time()

parameters = {'estimator': [DecisionTreeClassifier(),RandomForestClassifier()],
                #'n_estimators':np.arange(1,1000).tolist()[0::100],
                'learning_rate': [0.0001, 0.001, 0.005, 0.01, 0.02, 0.03, 0.05, 0.07, 1],
                'algorithm' : ['SAMME', 'SAMME.R']}


ada_clf = RandomizedSearchCV(AdaBoostClassifier(), parameters, cv=2, n_iter=10, random_state=42, n_jobs=-1)


ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)

duration = time.time() - start_time
print("Computation Time = ", duration)

perfomance_metrics(ada_clf, y_test, y_pred)


# GRADIENT BOOSTING

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

start_time = time.time()

parameters = {  'loss': ['log_loss', 'exponential'],
                'learning_rate': [0.0001, 0.001, 0.005, 0.01, 0.02, 0.03, 0.05, 0.07, 1],
                'criterion': ['friedman_mse', 'squared_error'],
                'min_samples_split':np.arange(2,30).tolist()[1::2],
                'min_samples_leaf': np.arange(1,30).tolist()[0::2],
                'max_depth':np.arange(1,30).tolist()[0::2],
                'max_features':  ['sqrt', 'log2', None]}


gradient_boosting_clf = RandomizedSearchCV(GradientBoostingClassifier(), parameters, cv=3, n_iter=10, random_state=42, n_jobs=-1)

gradient_boosting_clf.fit(X_train, y_train)
y_pred = gradient_boosting_clf.predict(X_test)

duration = time.time() - start_time
print("Computation Time = ", duration)

perfomance_metrics(gradient_boosting_clf, y_test, y_pred)

Computation Time =  204.11541652679443

 RandomizedSearchCV
Best Model:  {'min_samples_split': 27, 'min_samples_leaf': 19, 'max_features': None, 'max_depth': 7, 'loss': 'exponential', 'learning_rate': 0.02, 'criterion': 'friedman_mse'}

Confusion Matrix:
 [[5504 1484]
 [2156 4856]]

Accuracy:  0.74

Precision:  0.76593

Recall:  0.69253

F1:  0.72738

AUC:  0.74008


array(['RandomizedSearchCV', '0.74', '0.76593', '0.69253', '0.72738',
       '0.74008'], dtype='<U32')

In [None]:
# Voting Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier


vt_lr_clf = LogisticRegression(solver=logistic_regression_clf.best_params_['solver'], 
                                                penalty= logistic_regression_clf.best_params_['penalty'], 
                                                C=logistic_regression_clf.best_params_['C'])

vt_dt_clf = DecisionTreeClassifier(splitter=tree_clf.best_params_['splitter'], 
                                      min_samples_split=tree_clf.best_params_['min_samples_split'],
                                      min_samples_leaf=tree_clf.best_params_['min_samples_leaf'],
                                      max_leaf_nodes=tree_clf.best_params_['max_leaf_nodes'], 
                                      max_depth=tree_clf.best_params_['max_depth'],
                                      criterion=tree_clf.best_params_['criterion']
                                    )

vt_rf_clf = RandomForestClassifier(n_estimators=random_forest_clf.best_params_['n_estimators'],
                                  min_samples_split=random_forest_clf.best_params_['min_samples_split'],
                                  min_samples_leaf=random_forest_clf.best_params_['min_samples_leaf'],
                                  max_leaf_nodes=random_forest_clf.best_params_['max_leaf_nodes'],
                                  max_features=random_forest_clf.best_params_['max_features'],
                                  max_depth=random_forest_clf.best_params_['max_depth']
                                )

vt_gb_clf = GradientBoostingClassifier(loss=gradient_boosting_clf.best_params_['loss'],
                                    learning_rate=gradient_boosting_clf.best_params_['learning_rate'],
                                    #n_estimators=gradient_boosting_clf.best_params_['n_estimators'],
                                    criterion=gradient_boosting_clf.best_params_['criterion'],
                                    min_samples_split=gradient_boosting_clf.best_params_['min_samples_split'],
                                    min_samples_leaf=gradient_boosting_clf.best_params_['min_samples_leaf'],
                                    max_depth=gradient_boosting_clf.best_params_['max_depth'],
                                    max_features=gradient_boosting_clf.best_params_['max_features']
                                  )

voting_clf = VotingClassifier(estimators=[('lr', vt_lr_clf), ('tree', vt_dt_clf), ('rf', vt_rf_clf), ('gb', vt_gb_clf)], voting='hard', n_jobs=-1)
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)

perfomance_metrics(voting_clf, y_test, y_pred)



 VotingClassifier

Confusion Matrix:
 [[5614 1374]
 [2322 4690]]

Accuracy:  0.736

Precision:  0.77342

Recall:  0.66885

F1:  0.71734

AUC:  0.73612


array(['VotingClassifier', '0.736', '0.77342', '0.66885', '0.71734',
       '0.73612'], dtype='<U32')

# XGB Classifier

In [None]:
parameters = {
    'eta': [0.001, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9],
    
}
import xgboost as xgb 
import time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np
    
start_time = time.time()

parameters = {
    'eta': [0.001, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9],
    'max_depth': np.arange(2,20).tolist()[1::2]
}

xgb_clf = RandomizedSearchCV(xgb.XGBClassifier(), parameters, cv=3, n_iter=10, random_state=42, n_jobs=-1)


xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

duration = time.time() - start_time
print("Computation Time = ", duration)

print(perfomance_metrics(xgb_clf, y_test, y_pred))


In [32]:
print(perfomance_metrics(xgb_clf, y_test, y_pred))


 RandomizedSearchCV
Best Model:  {'max_depth': 3, 'eta': 0.5}

Confusion Matrix:
 [[5361 1627]
 [2041 4971]]

Accuracy:  0.738

Precision:  0.75341

Recall:  0.70893

F1:  0.73049

AUC:  0.73805
['RandomizedSearchCV' '0.738' '0.75341' '0.70893' '0.73049' '0.73805']
