In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model    import *
from sklearn.metrics         import mean_squared_error
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import StandardScaler

from sklearn.ensemble import *
from sklearn.naive_bayes import *
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
import numpy as np
from sklearn.model_selection  import RandomizedSearchCV
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [31]:
# upload data into pandas df - no missing values, all categorical vars are numerical 
#heart = pd.read_csv('heart.csv')
heart = pd.read_csv('https://github.com/danny-carrera/ml_lab_project_dcarrera/blob/main/heart.csv')
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [32]:
# get features and target
y = heart['target']
X = heart.drop(columns=['target'])

In [33]:
X.shape

(303, 13)

In [34]:
# ensure that all features contain enough variance to be predictive - none dropped
sel = VarianceThreshold(threshold=(.9 * (1 - .0)))
sel.fit_transform(X)
X.shape

(303, 13)

In [35]:
# get test set 
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=11)

In [36]:
# get train and validation set
X_train, X_valid, y_train, y_valid= train_test_split(X, y)

In [37]:
# classifiers im interested in checking out
classifiers = [LogisticRegression(), \
               GaussianNB(), \
               tree.DecisionTreeClassifier(), \
               RandomForestClassifier(), \
               GradientBoostingClassifier()]

In [38]:
# try out a few classifier with default parameters and standardized data
for model in classifiers:
    
    pipe = Pipeline([('scalar', StandardScaler()),
                     ('clf', model)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print(model, accuracy)

LogisticRegression() 0.7763157894736842
GaussianNB() 0.8157894736842105
DecisionTreeClassifier() 0.7631578947368421
RandomForestClassifier() 0.7894736842105263
GradientBoostingClassifier() 0.7631578947368421


Based on accuracy results above, I will move foward with Logistic Regression, Gradient Boosting Classifier, and Random Forest and try to improve from their baseline performances.

In [39]:
# define models and their parameters to be considered
search_space = [
                 {'clf': [RandomForestClassifier()], # Actual Estimator
                 'clf__n_estimators' : list(range(10,101,10)), # of trees
                 'clf__max_features' : list(range(6,32,5)),
                 'clf__min_samples_leaf' : list(range(1,7,2))}, 
                
                {'clf': [LogisticRegression()],
                 'clf__penalty': ['l1', 'l2'],
                 'clf__C': np.logspace(0, 4, 10)},
               
                {'clf': [GradientBoostingClassifier()],
                'clf__learning_rate': [1, 0.5, 0.25, 0.1, 0.05, 0.01],
                'clf__n_estimators' : [1, 2, 4, 8, 16, 32, 64, 100, 200]}]

# Use Random Search to go through models and different combinations of their parameters listed above
clf_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                    param_distributions=search_space, 
                                    n_iter=25,
                                    cv=5, 
                                    n_jobs=-1,
                                    verbose=1)


# get best model from Random Search
best_model = clf_algos_rand.fit(X_train, y_train);

best_model.best_estimator_.get_params()['clf']

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 102 out of 125 | elapsed:    3.4s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    3.5s finished


RandomForestClassifier(max_features=6, min_samples_leaf=3)

In [40]:
hyperparameters = best_model.best_estimator_.get_params()

In [41]:
# view best model
final_model = hyperparameters['clf']
final_model

RandomForestClassifier(max_features=6, min_samples_leaf=3)

In [42]:
# view best model's parameters
final_params = best_model.best_estimator_['clf'].get_params()
final_params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 6,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [43]:
# Train best model and apply get predictions for test set
pipe = Pipeline([('scaler', StandardScaler()), 
                 ('lr',     final_model)
                ])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

In [44]:
# get accuracy score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9210526315789473

In [45]:
# get confusion matrix
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[27,  4],
       [ 2, 43]])

In [46]:
# get recall score for true 0 - no heart condition
recall_score(y_test, y_pred, pos_label=0)

0.8709677419354839

In [47]:
recall_score(y_test, y_pred)

0.9555555555555556

In [48]:
precision_score(y_test, y_pred, pos_label=0)

0.9310344827586207

In [49]:
precision_score(y_test, y_pred)

0.9148936170212766