In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
#from dtreeviz.trees import dtreeviz

In [2]:
# Read the diabetes CSV. It is parsed with header=None, so the column
# names are supplied explicitly, in file order.
col_names = [
    'pregnant',
    'glucose',
    'bp',
    'skin',
    'insulin',
    'bmi',
    'pedigree',
    'age',
    'label',
]
pima = pd.read_csv(
    "data/pima-indians-diabetes.csv",
    names=col_names,
    header=None,
)

# Preview the first rows as a rich table.
display(pima.head())

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Separate the model inputs from the target column.
# 'skin' is not listed below, so the model never sees that column.
feature_cols = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree']

X = pima[feature_cols]  # model inputs
y = pima['label']       # target variable

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [4]:
# Build a depth-limited decision tree and fit it on the training split.
# fit() returns the estimator itself, so the result is not re-assigned.
clf = DecisionTreeClassifier(random_state=1234, max_depth=3)
clf.fit(X_train, y_train)

# Predict the held-out rows and report the share classified correctly.
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Model accuracy (initial model): {accuracy:.1%}")

Model accuracy (initial model): 75.8%


```
viz = dtreeviz(clf, 
               X_train, 
               y_train,
               target_name='label',
               feature_names=X.columns.to_list(), 
               class_names=["red", "white"],
               scale=1.4)

viz.save("dtreeviz.svg")

viz
```

![dtreeviz.png](attachment:dtreeviz.png)

In [5]:
# Classify the rows stored in a separate CSV, parsed with the same column names.
predict = pd.read_csv(
    "data/pima-indians-diabetes-predict.csv",
    names=col_names,
    header=None,
)
display(predict.head(len(predict)))  # show every row of the file

# Keep only the columns the model was trained on, then predict.
X_predict = predict[feature_cols]
y_predict = clf.predict(X_predict)
print(y_predict)

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


[1 0 0 0 1 0 0 0 1 0]


In [6]:
# Make individual predictions
def predict_individual_diabetes(pregnant, insulin, bmi, age, glucose, bp, pedigree):
    """Print the fitted tree's diabetes prediction for one set of feature values.

    The arguments follow the same order as ``feature_cols``. They are wrapped
    in a one-row DataFrame labelled with ``feature_cols`` so the classifier
    receives the same named columns it was fitted on; passing an unlabelled
    nested list relies on positional order and makes recent scikit-learn
    versions warn about missing feature names.

    Example: ``predict_individual_diabetes(6, 0, 33.6, 50, 148, 72, 0.627)``
    is expected to report 'yes' (first row of the data set, label 1).

    Reads the module-level ``clf`` and ``feature_cols``. Returns None; the
    result is printed.
    """
    sample = pd.DataFrame(
        [[pregnant, insulin, bmi, age, glucose, bp, pedigree]],
        columns=feature_cols,
    )
    y_predict = clf.predict(sample)[0]  # single row in, single class label out
    predicted_diabetes = "yes" if y_predict == 1 else "no"
    print(f"pregnant={pregnant}, insulin={insulin}, bmi={bmi}, age={age}, glucose={glucose}, bp={bp}, pedigree={pedigree}, predcition of diabetes='{predicted_diabetes}'")

In [7]:
# Step BMI down from 31 to 26 with every other input held fixed, to see
# where the predicted class changes.
for bmi_value in (31.0, 30.0, 29.0, 28.0, 27.0, 26.0):
    predict_individual_diabetes(6, 0, bmi_value, 50, 148, 72, 0.627)

pregnant=6, insulin=0, bmi=31.0, age=50, glucose=148, bp=72, pedigree=0.627, predcition of diabetes='yes'
pregnant=6, insulin=0, bmi=30.0, age=50, glucose=148, bp=72, pedigree=0.627, predcition of diabetes='yes'
pregnant=6, insulin=0, bmi=29.0, age=50, glucose=148, bp=72, pedigree=0.627, predcition of diabetes='yes'
pregnant=6, insulin=0, bmi=28.0, age=50, glucose=148, bp=72, pedigree=0.627, predcition of diabetes='yes'
pregnant=6, insulin=0, bmi=27.0, age=50, glucose=148, bp=72, pedigree=0.627, predcition of diabetes='no'
pregnant=6, insulin=0, bmi=26.0, age=50, glucose=148, bp=72, pedigree=0.627, predcition of diabetes='no'
