In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model    import *
from sklearn.metrics         import mean_squared_error
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import StandardScaler

from sklearn.ensemble import *
from sklearn.naive_bayes import *
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
import numpy as np
from sklearn.model_selection  import RandomizedSearchCV
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [2]:
# upload data into pandas df - no missing values, all categorical vars are numerical 
heart = pd.read_csv('heart.csv')
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [4]:
# get features and target
y = heart['target']
X = heart.drop(columns=['target'])

In [5]:
X.shape

(303, 13)

In [6]:
# ensure that all features contain enough variance to be predictive - none dropped
sel = VarianceThreshold(threshold=(.9 * (1 - .0)))
sel.fit_transform(X)
X.shape

(303, 13)

In [7]:
# get test set 
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=11)

In [9]:
# get train and validation set
X_train, X_valid, y_train, y_valid= train_test_split(X, y)

In [11]:
# classifiers im interested in checking out
classifiers = [LogisticRegression(), \
               GaussianNB(), \
               tree.DecisionTreeClassifier(), \
               RandomForestClassifier(), \
               GradientBoostingClassifier()]

In [12]:
# try out a few classifier with default parameters and standardized data
for model in classifiers:
    
    pipe = Pipeline([('scalar', StandardScaler()),
                     ('clf', model)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print(model, accuracy)

LogisticRegression() 0.8157894736842105
GaussianNB() 0.8026315789473685
DecisionTreeClassifier() 0.7368421052631579
RandomForestClassifier() 0.7763157894736842
GradientBoostingClassifier() 0.7894736842105263


Based on accuracy results above, I will move foward with Logistic Regression, Gaussian NB, and Random Forest and try to improve from their baseline performances.

In [13]:
search_space = [
                 {'clf': [RandomForestClassifier()], # Actual Estimator
                 'clf__n_estimators' : list(range(10,101,10)), # of trees
                 'clf__max_features' : list(range(6,32,5)),
                 'clf__min_samples_leaf' : list(range(1,7,2))}, 
                
                {'clf': [LogisticRegression()],
                 'clf__penalty': ['l1', 'l2'],
                 'clf__C': np.logspace(0, 4, 10)},
               
                {'clf': [GradientBoostingClassifier()],
                'clf__learning_rate': [1, 0.5, 0.25, 0.1, 0.05, 0.01],
                'clf__n_estimators' : [1, 2, 4, 8, 16, 32, 64, 100, 200]}]

# Good ol' random search
clf_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                    param_distributions=search_space, 
                                    n_iter=25,
                                    cv=5, 
                                    n_jobs=-1,
                                    verbose=1)


#  Fit grid search
best_model = clf_algos_rand.fit(X_train, y_train);

# View best model
best_model.best_estimator_.get_params()['clf']

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 102 out of 125 | elapsed:    2.6s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    2.7s finished


RandomForestClassifier(max_features=6, min_samples_leaf=5, n_estimators=40)

In [14]:
# # Create space of candidate learning algorithms and their hyperparameters
# search_space = [{'clf': [LogisticRegression()], # Actual Estimator
#                  'clf__penalty': ['l1', 'l2'],
#                  'clf__C': np.logspace(0, 4, 10)},
                
#                 {'clf': [RandomForestClassifier()]},
               
#                {'clf': [GaussianNB()]}]

# # Good ol' random search
# clf_algos_rand = RandomizedSearchCV(estimator=pipe, 
#                                     param_distributions=search_space, 
#                                     n_iter=25,
#                                     cv=5, 
#                                     n_jobs=-1,
#                                     verbose=1)


# #  Fit grid search
# best_model = clf_algos_rand.fit(X_train, y_train);

# # View best model
# best_model.best_estimator_.get_params()['clf']

In [15]:
hyperparameters = best_model.best_estimator_.get_params()
hyperparameters

{'memory': None,
 'steps': [('scalar', StandardScaler()),
  ('clf',
   RandomForestClassifier(max_features=6, min_samples_leaf=5, n_estimators=40))],
 'verbose': False,
 'scalar': StandardScaler(),
 'clf': RandomForestClassifier(max_features=6, min_samples_leaf=5, n_estimators=40),
 'scalar__copy': True,
 'scalar__with_mean': True,
 'scalar__with_std': True,
 'clf__bootstrap': True,
 'clf__ccp_alpha': 0.0,
 'clf__class_weight': None,
 'clf__criterion': 'gini',
 'clf__max_depth': None,
 'clf__max_features': 6,
 'clf__max_leaf_nodes': None,
 'clf__max_samples': None,
 'clf__min_impurity_decrease': 0.0,
 'clf__min_impurity_split': None,
 'clf__min_samples_leaf': 5,
 'clf__min_samples_split': 2,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__n_estimators': 40,
 'clf__n_jobs': None,
 'clf__oob_score': False,
 'clf__random_state': None,
 'clf__verbose': 0,
 'clf__warm_start': False}

In [16]:
final_model = hyperparameters['clf']
final_model

RandomForestClassifier(max_features=6, min_samples_leaf=5, n_estimators=40)

In [17]:
final_params = best_model.best_estimator_['clf'].get_params()
final_params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 6,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 40,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [18]:
pipe = Pipeline([('scaler', StandardScaler()), 
                 ('lr',     final_model)
                ])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)


In [19]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8552631578947368

In [20]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[27,  4],
       [ 7, 38]])

In [21]:
recall_score(y_test, y_pred, pos_label=0)

0.8709677419354839

In [22]:
recall_score(y_test, y_pred)

0.8444444444444444

In [23]:
precision_score(y_test, y_pred, pos_label=0)

0.7941176470588235

In [24]:
precision_score(y_test, y_pred)

0.9047619047619048

In [25]:
# input best parameters

# viola!