In [1]:
from sklearn import tree
import pandas as pd
import os
import numpy as np

In [2]:
# Load the heart-disease table from the sibling Resources folder and preview
# the first rows (up to 30) as the cell's rich output.
heart_csv_path = os.path.join("..", "Resources", "heart.csv")
df = pd.read_csv(heart_csv_path)
df.head(30)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0
8,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
9,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0


In [3]:
# Display names for the two classes; they are handed to classification_report
# further down as the labels for its per-class rows.
target_names = ["negative", "positive"]

# Label column the classifiers are trained to predict.
target = df.loc[:, "target"]

print(target_names)

['negative', 'positive']


In [4]:
# Feature matrix: every column except the label. The column index is kept so
# feature importances can be paired with readable names later on.
data = df.drop(columns=["target"])
feature_names = data.columns
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2


In [5]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation (train_test_split's default test
# size); the fixed seed keeps the partition identical between runs.
# NOTE(review): the displayed outputs suggest the CSV contains repeated
# records (index 7 in the head() preview and index 447 in this test split show
# the same values in every visible column). If so, copies of one record can
# land on both sides of the split and inflate the test score -- confirm with
# df.duplicated().sum().
split_parts = train_test_split(data, target, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for held_out in (X_test, y_test):
    print(held_out)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
527   62    0   0       124   209    0        1      163      0      0.0   
359   53    0   2       128   216    0        0      115      0      0.0   
447   55    1   0       160   289    0        0      145      1      0.8   
31    50    0   1       120   244    0        1      162      0      1.1   
621   48    1   0       130   256    1        0      150      1      0.0   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
940   57    0   0       140   241    0        1      123      1      0.2   
787   51    1   0       140   298    0        1      122      1      4.2   
926   54    1   0       110   206    0        0      108      1      0.0   
249   42    1   2       130   180    0        1      150      0      0.0   
552   43    0   0       132   341    1        0      136      1      3.0   

     slope  ca  thal  
527      2   0     2  
359      2   0     0  
447      1   1    

In [6]:
# Baseline model: a single decision tree fitted on the training split and
# scored (mean accuracy) on the held-out split.
# random_state is pinned, using the same seed as the train/test split, because
# the tree builder permutes candidate features at every split; without a seed
# two runs on identical data can produce different trees and scores.
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)  # fit() returns the estimator itself; no reassignment needed
clf.score(X_test, y_test)

0.9766536964980544

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 200-tree random forest on the training split and report its mean
# accuracy on the held-out split.
# random_state is pinned, using the same seed as the train/test split, so the
# bootstrap samples and per-split feature draws are repeatable: the score, the
# feature importances and the model that is saved later no longer change from
# one kernel restart to the next.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)  # fit() returns the estimator itself; no reassignment needed
rf.score(X_test, y_test)

0.9883268482490273

In [8]:
# Pair each importance score with its column name and list the pairs from most
# to least important (tuples sort on the score first).
importance_pairs = zip(rf.feature_importances_, feature_names)
sorted(importance_pairs, reverse=True)

[(0.13212762228805586, 'thal'),
 (0.1299762739133624, 'ca'),
 (0.12921861548267685, 'cp'),
 (0.1146693175231822, 'oldpeak'),
 (0.1133501712983204, 'thalach'),
 (0.08215537849286786, 'age'),
 (0.07550870409173505, 'chol'),
 (0.06906678032526622, 'trestbps'),
 (0.04978043898357801, 'exang'),
 (0.04883058697665301, 'slope'),
 (0.029446637471108057, 'sex'),
 (0.017406230149564255, 'restecg'),
 (0.008463243003629745, 'fbs')]

In [9]:
from sklearn.metrics import classification_report

# Overall accuracy of the random forest on the held-out split.
test_accuracy = rf.score(X_test, y_test)
print('Test Acc: %.3f' % test_accuracy)

# Per-class precision / recall / F1, labelled with the readable class names.
predictions = rf.predict(X_test)
report = classification_report(y_test, predictions, target_names=target_names)
print(report)

Test Acc: 0.988
              precision    recall  f1-score   support

    negative       0.98      1.00      0.99       132
    positive       1.00      0.98      0.99       125

    accuracy                           0.99       257
   macro avg       0.99      0.99      0.99       257
weighted avg       0.99      0.99      0.99       257



In [10]:
# Use the fitted model on a new, unseen record.
#
# The forest was fitted on a DataFrame, so the sample is wrapped in a
# one-row DataFrame carrying the same column labels instead of a bare ndarray.
# That lets scikit-learn check the columns against the ones seen during fit
# (a silently mis-ordered value list would otherwise go unnoticed) and avoids
# the "X does not have valid feature names" warning on scikit-learn >= 1.0.
# Values are listed in the order of feature_names.
X_new = pd.DataFrame(
    [[82, 1, 1, 154, 260, 1, 1, 163, 1, 2.6, 2, 3, 3]],
    columns=feature_names,
)
new_predictions = rf.predict(X_new)
print(new_predictions)

[0]


In [11]:
import joblib

# Persist the fitted random forest to the working directory; the cell output
# is the list of file names joblib wrote.
heart_model_filename = 'cristina_heart_model.sav'
joblib.dump(value=rf, filename=heart_model_filename)

['cristina_heart_model.sav']