In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model    import *
from sklearn.metrics         import mean_squared_error
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import StandardScaler

from sklearn.ensemble import *
from sklearn.naive_bayes import *
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
import numpy as np
from sklearn.model_selection  import RandomizedSearchCV
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [76]:
# upload data into pandas df - no missing values, all categorical vars are numerical 
heart = pd.read_csv('https://github.com/danny-carrera/ml_lab_project_dcarrera/blob/main/heart.csv')
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [77]:
# get features and target
y = heart['target']
X = heart.drop(columns=['target'])

In [78]:
X.shape

(303, 13)

In [79]:
# ensure that all features contain enough variance to be predictive - none dropped
sel = VarianceThreshold(threshold=(.9 * (1 - .0)))
sel.fit_transform(X)
X.shape

(303, 13)

In [80]:
# get test set 
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=11)

In [81]:
# get train and validation set
X_train, X_valid, y_train, y_valid= train_test_split(X, y)

In [82]:
# classifiers im interested in checking out
classifiers = [LogisticRegression(), \
               GaussianNB(), \
               tree.DecisionTreeClassifier(), \
               RandomForestClassifier(), \
               GradientBoostingClassifier()]

In [83]:
# try out a few classifier with default parameters and standardized data
for model in classifiers:
    
    pipe = Pipeline([('scalar', StandardScaler()),
                     ('clf', model)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print(model, accuracy)

LogisticRegression() 0.7894736842105263
GaussianNB() 0.8026315789473685
DecisionTreeClassifier() 0.7368421052631579
RandomForestClassifier() 0.8421052631578947
GradientBoostingClassifier() 0.8026315789473685


Based on accuracy results above, I will move foward with Logistic Regression, Gradient Boosting Classifier, and Random Forest and try to improve from their baseline performances.

In [84]:
# define models and their parameters to be considered
search_space = [
                 {'clf': [RandomForestClassifier()], # Actual Estimator
                 'clf__n_estimators' : list(range(10,101,10)), # of trees
                 'clf__max_features' : list(range(6,32,5)), # of features to consider for split
                 'clf__min_samples_leaf' : list(range(1,7,2))}, # of min samples allowed per leaf
                
                {'clf': [LogisticRegression()],
                 'clf__penalty': ['l1', 'l2'],
                 'clf__C': np.logspace(0, 4, 10)},
               
                {'clf': [GradientBoostingClassifier()],
                'clf__learning_rate': [1, 0.5, 0.25, 0.1, 0.05, 0.01],
                'clf__n_estimators' : [1, 2, 4, 8, 16, 32, 64, 100, 200]}] # of trees

# Use Random Search to go through models and different combinations of their parameters listed above
clf_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                    param_distributions=search_space, 
                                    n_iter=25,
                                    cv=5, 
                                    n_jobs=-1,
                                    verbose=1)


# get best model from Random Search
best_model = clf_algos_rand.fit(X_train, y_train);

best_model.best_estimator_.get_params()['clf']

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    2.9s finished


RandomForestClassifier(max_features=11, n_estimators=40)

## Display Best model

In [85]:
hyperparameters = best_model.best_estimator_.get_params()

In [86]:
# view best model
final_model = hyperparameters['clf']
final_model

RandomForestClassifier(max_features=11, n_estimators=40)

In [87]:
# view best model's parameters
final_params = best_model.best_estimator_['clf'].get_params()
final_params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 11,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 40,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [88]:
# Train best model and apply get predictions for test set
pipe = Pipeline([('scaler', StandardScaler()), 
                 ('lr',     final_model)
                ])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

## Model Evaluation

In [89]:
# get accuracy score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9078947368421053

In [90]:
# get confusion matrix
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[28,  3],
       [ 4, 41]])

In [91]:
# get recall score for true 0's - no heart condition
recall_score(y_test, y_pred, pos_label=0)

0.9032258064516129

In [92]:
# get recall score for true 1's - heart condition

recall_score(y_test, y_pred)

0.9111111111111111

In [93]:
# get precision score for false predictions
precision_score(y_test, y_pred, pos_label=0)

0.875

In [94]:
# get precision score for true predictions
precision_score(y_test, y_pred)

0.9318181818181818

An accuracy score for a classification problem shows the percentage of accurate predictions. For this model, it is .907 which inidcates that is a good model, but other metrics should also be taken into account. 

A recall score shows that of true class labels how many were predicted correctly. This metric is a better indicator of how good your model is and lets you hone in on a label that may be more important to predict right. For this model, a recall score for no heart condition is .903, which means that of those who truely did not have a heart condition about 90.3% were accurately predicted to not have a heart condition. The recall score for heart condition is .911, which means that of those who truely did have a heart condition about 91.1% were accurately predicted to have a heart condition.

On the other hand a precision score, shows the percentage of predictions that were correct. For no heart condition, this model's predictions were coorect 87.5% of the time. For heart conditions, it was 93.2%.

In short, recall focuses on how many labels were predicted correctly, while prediction focuses on how many predictions were correct.

A confusion matrix shows this information numerically, showing the count of False Negatives, True Negatives, False Positives, and True Positives.

## Conclusion

The goal of this model was to be able to predict whether someone has a heart condition based on their demographic information and lab test results. In the end, our best model was a Random forest classifer made up of 40 descision trees who only considered a maximum 11 features when looking for a best split. The model's predictions were accurate 90.8% of the time. More specifically, no heart condition predictions were correct 87.5% of the time and heart conditions predictions were correct 93.2% of the time. The model also correctly identified those with a heart condition 91.1% of the time. 

Given that implications of a missed diagnosis, we would want to improve this model with a focus on being able to identify 100% of patients with a heart condition. That is, aim for a recall score for 1.0 for true heart conditions. A patient with a missed diagnosis may go on with out any treatment, potentially costing this person quality of life or even their life. 

This model can be used in both medical office and hospital settings by physicians, nurses, and other allied health professionals in order to identify those most at risk of having a heart condition. For example, in the ER, this model can be used to help the triage nurse assess the severity of a patient's condition. It can also be used by physicians as a guide to ordering tests to assess a patient's cardiovascular health.
