In [1]:
# Libraries
import pandas as pd

dataset = pd.read_csv("./datasets/heart_data.csv")
dataset.drop(columns=['index', 'id'], axis=1, inplace=True)


# Preparing data

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

y = dataset['cardio'].values
cardio = dataset.drop(['cardio'], axis=1)
cat_attribs = ['gender','cholesterol', 'gluc', 'smoke', 'alco', 'active']
cardio_num = cardio.drop(cat_attribs, axis=1)
num_attribs = list(cardio_num)

num_pipeline = Pipeline([('std_scaler', StandardScaler())])
full_pipeline = ColumnTransformer([
 ("num", num_pipeline, num_attribs), #num_pipeline
 ("cat", OneHotEncoder(), cat_attribs), #one hot encoder
 ])
cardio_prepared = full_pipeline.fit_transform(cardio)

# Split into train and test set

In [3]:
from sklearn.model_selection import train_test_split

y = dataset['cardio'].values
X = cardio_prepared.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Binary Classifier

In [4]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

# Validating Training Set

In [5]:
# cross validation
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

array([0.7166122 , 0.71548722, 0.72511518])

In [6]:
#confusing matrix
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, y_train_pred)

#-> |a b |   |TN FP|
#-> |c d |   |FN TP|
# Binary classifier, first row is the negative class and the second row is the positive class
# a = true negatives (TN) - correctly classified as negative class
# b = false positives (FP)- wrongly classified as positive class
# c = false negative (FN)- wrongly classified as negative class
# d = true positive (TP) - correctly classified as positive class

array([[22395,  5638],
       [10094, 17873]], dtype=int64)

In [7]:
#precision (accuracy of the positive preditions) =  TP / (TP + FP)
#recall (sensitivity or True Positive Rate TPR, ratio of positive instances that are corretly detected) = TP / (TP + FN)
#f1-score (harmonic mean of precision and recall. Whereas the regular mean treats all values equally, the harmonic mean gives much more weight to low values. As a result, the classifier will only get a high f1 score, if both recall and precision are high)

from sklearn.metrics import precision_score, recall_score, f1_score
precision = precision_score(y_train, y_train_pred)
recall = recall_score(y_train, y_train_pred)
f1 = f1_score(y_train, y_train_pred)

print("Precision: ", precision)
print("Recall: ", recall)
print("f1: ", f1)

Precision:  0.7601973544298414
Recall:  0.6390746236636036
f1:  0.6943937215898053


In [15]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np

In [None]:
#ROC CURVE
#true positive rate x false positive rate (FPR)
#FPR is the ratio of negative instances that are incorrectly classified as positive
#FPR = 1 - true negative ratio (TNR or specificity : ratio of negative instances that are correctly classified as negative)
#Hence, ROC curve plots sensitivity (recall) x 1 - specificity
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

y_scores = cross_val_predict(sgd_clf, X_train, y_train, cv=3,method="decision_function")
fpr, tpr, thresholds = roc_curve(y_train, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--') # dashed diagonal
    plt.xlabel("False Positive Rate (FPR)")
    plt.ylabel("True Positive Rate (TPR)")
    plt.grid(visible=None, which='major', axis='both')
    plt.ylim(ymin=0)  # this line
    plt.xlim(xmin=0)  # this line
plot_roc_curve(fpr, tpr)
plt.show()


from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_train, y_scores)
print("AUC: ", auc)

#ROC CURVE X Precision/Recall (PR)
# PR -> when the positive class is rare or care more about false positives than false negatives


In [None]:
#RANDOM FOREST CLASSIFIER
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train, cv=3, method="predict_proba")

y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train, y_scores_forest)

plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right")
plt.show()

Voting

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()


voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svm', svm_clf)], voting='hard') #soft voting -> gives more weight to highly confident votes (ensure that all classifiers can estimate class probabilities)
voting_clf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

Bagging

In [None]:
#Another approach is to use the same training algorithm for every predictor, but to train them on different random subsets of the training set
#sampling is performed with replacement, this method is called bagging(short forbootstrap aggregating). 
#When sampling is performed without replacement, it is called pasting
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=200, max_samples=200, bootstrap=True, n_jobs=-1, oob_score=True)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(bag_clf.oob_score_)
print(bag_clf.oob_decision_function_)

Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred = rnd_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

NameError: name 'accuracy_score' is not defined

Ada Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)

y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import xgboost

gradBoost_clf = GradientBoostingClassifier(max_depth=2, n_estimators=3, learning_rate=0.5)
gradBoost_clf.fit(X_train, y_train)

y_pred = gradBoost_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))


In [None]:
#xgboost
import xgboost as xgb 
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

In [35]:
#!/usr/bin/env python3
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import numpy as np
def perfomance_metrics(classifier, y_, y_pred):
    
    accuracy = round(accuracy_score(y_, y_pred), 5)
    precision = round(precision_score(y_, y_pred), 5)
    recall = round(recall_score(y_, y_pred), 5)
    f1 = round(f1_score(y_, y_pred), 5)
    auc = round(roc_auc_score(y_, y_pred), 5)
    
    print(classifier.__class__.__name__)
    print("\nConfusion Matrix:\n", confusion_matrix(y_, y_pred))
    print("\nAccuracy: " , accuracy)
    print("\nPrecision: ", precision)
    print("\nRecall: ", recall)
    print("\nF1: ", f1)
    print("\nAUC: ", auc)

    return np.array([classifier.__class__.__name__, accuracy, precision, recall, f1, auc])

In [39]:
parameters = {  'n_estimators':np.arange(1,1000).tolist()[0::100],
                    'max_depth':np.arange(1,30).tolist()[0::2],
                    'min_samples_split':np.arange(2,30).tolist()[1::2],
                    'min_samples_leaf': np.arange(1,30).tolist()[0::2],
                    'max_leaf_nodes':np.arange(3,30).tolist()[0::2],
                    'max_features':  ['sqrt', 'log2', None] }
random_forest_clf = RandomizedSearchCV(RandomForestClassifier(), parameters, cv=5, n_iter=150, random_state=42, n_jobs=-1)

random_forest_clf.fit(X_train, y_train)
y_pred = random_forest_clf.predict(X_test)
print("Best Estimator: ", random_forest_clf.best_params_)

perfomance_metrics(random_forest_clf, y_test, y_pred)

KeyboardInterrupt: 