In [1]:
import numpy as np
import pandas as pd

# Read in data

In [2]:
# Column labels for the CSV, listed in file order; the final entry is the
# target column and the thirteen before it are the features.
col_names = (
    'age sex cp trestbps chol fbs restecg '
    'thalach exang oldpeak slope ca thal '
    'heart_disease_type'
).split()

In [3]:
# header=None tells pandas the file has no header row, so the labels in
# col_names are applied to the columns instead.
data_path = 'processed.cleveland.data.csv'
heart_data = pd.read_csv(data_path, names=col_names, header=None)

In [4]:
# Preview the first rows to check that the columns were labelled as intended.
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease_type
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [5]:
# Collapse the target into a boolean flag: True for any value above 0.5,
# False otherwise. The comparison is applied to the whole column at once.
heart_data['heart_disease_type'] = heart_data['heart_disease_type'] > .5

In [6]:
# Drop the rows whose 'ca' entry is the '?' missing-value marker, then make
# the column numeric.
# .loc + .assign builds a fresh frame rather than writing a column into a
# filtered slice (which triggers pandas' SettingWithCopyWarning), and
# astype(float) converts the whole column in one step instead of calling
# float() on every element.
heart_data = (
    heart_data.loc[heart_data['ca'] != '?']
    .assign(ca=lambda frame: frame['ca'].astype(float))
)

In [7]:
# Drop the rows whose 'thal' entry is the '?' missing-value marker, then make
# the column numeric.
# .loc + .assign builds a fresh frame rather than writing a column into a
# filtered slice (which triggers pandas' SettingWithCopyWarning), and
# astype(float) converts the whole column in one step instead of calling
# float() on every element.
heart_data = (
    heart_data.loc[heart_data['thal'] != '?']
    .assign(thal=lambda frame: frame['thal'].astype(float))
)

In [8]:
# Check the column dtypes after the '?' rows were removed and 'ca'/'thal'
# were converted to float.
heart_data.dtypes

age                   float64
sex                   float64
cp                    float64
trestbps              float64
chol                  float64
fbs                   float64
restecg               float64
thalach               float64
exang                 float64
oldpeak               float64
slope                 float64
ca                    float64
thal                  float64
heart_disease_type       bool
dtype: object

In [9]:
# Look up the target value stored under index label 1 and show its type.
type(heart_data.heart_disease_type[1])

numpy.bool_

In [10]:
# Show the first rows again now that the target column holds booleans.
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease_type
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,False
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,True
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,True
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,False
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,False


In [11]:
# Show the type of the cleaned heart_data object.
type(heart_data)

pandas.core.frame.DataFrame

In [12]:
# List the column labels; all but 'heart_disease_type' are used as features below.
heart_data.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'heart_disease_type'],
      dtype='object')

# Build a tree

In [13]:
from sklearn import tree

In [14]:
# Split the frame into the model inputs (the thirteen feature columns) and the
# boolean label column the classifier should predict.
feature_cols = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]
X = heart_data[feature_cols]
Y = heart_data['heart_disease_type']

In [15]:
# Preview the feature matrix; the target column should no longer be present.
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0


In [16]:
# Show the type of the feature matrix object.
type(X)

pandas.core.frame.DataFrame

In [17]:
# Fit a shallow decision tree (at most three levels of questions) on the
# whole data set.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a fixed seed, equally good splits can be chosen
# differently from run to run, so the exported tree and the bot built from it
# would not be reproducible.
clf = tree.DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, Y)

# Visualize your tree

In [18]:
# Write the fitted tree to heart.dot in Graphviz DOT format.
# The result of export_graphviz is deliberately not assigned: the data goes
# to out_file, and rebinding `f` inside the with-block (as the earlier version
# did) only shadows the open file handle.
# feature_names is passed as a plain list because some scikit-learn releases
# validate this argument and reject a pandas Index.
with open("heart.dot", 'w') as f:
    tree.export_graphviz(clf,
                         out_file=f,
                         feature_names=list(
                             heart_data.columns[heart_data.columns
                                                != 'heart_disease_type']),
                         class_names=['no disease', 'disease'],
                         filled=True)

In [19]:
# Paste the contents of heart.dot into http://webgraphviz.com/ to render the tree.

# Build a bot from your tree

In [20]:
def traverse_tree(left, right, threshold, features, value, target_names, node, depth):
    """Interactively walk a decision tree and print the predicted class.

    Starting at ``node``, the user is asked for the value of the feature tested
    at each internal node. The walk moves to the left child when the answer is
    ``<=`` that node's threshold and to the right child otherwise. On reaching
    a leaf, the class with the largest entry in ``value`` is reported.

    Parameters
    ----------
    left, right : sequence of int
        Child node ids, indexed by node id. A node whose two child ids are
        equal (scikit-learn stores -1 in both) is treated as a leaf.
    threshold : sequence of float
        Split threshold, indexed by node id.
    features : sequence of str
        Name of the feature tested at each node. Only entries belonging to
        internal nodes are read.
    value : array-like
        Per-node class totals, indexed as ``value[node][0][class_index]``.
        Only the first output row is used (assumes a single-output tree).
    target_names : sequence of str
        Class names, indexed by class index.
    node : int
        Id of the node to start from (0 for the root).
    depth : int
        Unused; kept so existing callers do not have to change.

    Returns
    -------
    str
        The class name that was printed as the diagnosis.
    """
    # Decide leaf-vs-split from the child ids rather than by comparing the
    # threshold with the -2 placeholder: a genuine split whose threshold is
    # exactly -2.0 would otherwise be mistaken for a leaf.
    while left[node] != right[node]:
        prompt = "Input the value of " + features[node] + " "

        # Keep asking until the answer is a usable number. Text that is not a
        # number would crash float(), and NaN would silently fall through to
        # the right-hand branch because every comparison with NaN is False.
        while True:
            try:
                answer = float(input(prompt))
            except ValueError:
                print("Please enter a number.")
                continue
            if np.isfinite(answer):
                break
            print("Please enter a finite number.")

        node = left[node] if answer <= threshold[node] else right[node]

    # argmax picks the first largest entry, matching the previous
    # "strictly greater than" scan over the class totals.
    class_totals = np.asarray(value[node])[0]
    diagnosis = target_names[int(np.argmax(class_totals))]

    print("Your diagnosis: " + diagnosis)
    return diagnosis

In [21]:
def bot_from_tree(tree, feature_names, target_names):
    """Run an interactive question-and-answer session driven by a fitted tree.

    Pulls the node arrays out of ``tree.tree_`` and hands them to
    ``traverse_tree``, starting at the root (node 0, depth 0).

    Parameters
    ----------
    tree : fitted estimator exposing a ``tree_`` attribute
        Must provide ``children_left``, ``children_right``, ``threshold``,
        ``feature`` and ``value`` on ``tree_``. Inside this function the name
        ``tree`` refers to this argument, not to the ``sklearn.tree`` module
        imported elsewhere in the notebook.
    feature_names : sequence of str
        Feature names, indexed by the feature ids stored in ``tree_.feature``.
    target_names : sequence of str
        Class names, indexed by class index.
    """
    fitted = tree.tree_

    # Leaf nodes carry a negative placeholder in tree_.feature. Using it as an
    # index would silently pick a name from the end of feature_names, or raise
    # IndexError when fewer than two names are supplied. Leaves therefore get
    # None; traverse_tree only reads names for internal nodes.
    features = [feature_names[i] if i >= 0 else None for i in fitted.feature]

    traverse_tree(fitted.children_left, fitted.children_right, fitted.threshold,
                  features, fitted.value, target_names, 0, 0)

In [22]:
# Start an interactive session with the fitted tree. This cell waits for
# keyboard input, so it will pause a "Run All" until the questions are answered.
bot_from_tree(clf, heart_data.columns[heart_data.columns != 'heart_disease_type'], [
              'no disease', 'disease'])

Input the value of thal  1
Input the value of ca  1
Input the value of cp  1


Your diagnosis: no disease


In [23]:
# lots missing
# for example what if user enters illegal inputs? (strings)
# what if user inputs unphysical inputs?
# you can see how this starts to get complicated