In [38]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics
import matplotlib.pyplot as plt
#import seaborn for statistical plots
import seaborn as sns
#to enable plotting graph in jupter notebook
%matplotlib inline

from sklearn.model_selection import train_test_split

In [39]:
colnames=['preg','glu','bp','sft','ins','bmi','dpf','age','outcome']

In [40]:
df=pd.read_csv('diabetics_pima.csv',names=colnames )

In [41]:
df.head()

Unnamed: 0,preg,glu,bp,sft,ins,bmi,dpf,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [42]:
df['outcome'].value_counts()

0    500
1    268
Name: outcome, dtype: int64

In [43]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
preg,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
glu,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
bp,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
sft,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
ins,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
bmi,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
dpf,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [44]:
df['outcome']=df['outcome'].replace({0:'Healthy',1:'diabetic'})

In [45]:
df['outcome']=df['outcome'].astype('category')
df.dtypes

preg          int64
glu           int64
bp            int64
sft           int64
ins           int64
bmi         float64
dpf         float64
age           int64
outcome    category
dtype: object

In [46]:
x=df.drop('outcome',axis=1)
y=df['outcome']

In [47]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=2)

In [48]:
X_train=np.array(x_train)
Y_train=np.array(y_train)

In [49]:
X_test=np.array(x_test)
Y_test=np.array(y_test)

In [50]:
x_train.shape

(537, 8)

In [51]:
x_test.shape

(231, 8)

In [52]:
model=DecisionTreeClassifier(criterion='entropy',random_state=0)
model.fit(X_train,Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [53]:
y_pred=model.predict(X_test)

In [54]:
acc=metrics.accuracy_score(Y_test,y_pred)
print(acc)

0.6796536796536796


In [55]:
cm=metrics.confusion_matrix(Y_test,y_pred)
print(cm)

[[122  33]
 [ 41  35]]


In [56]:
print(pd.DataFrame(model.feature_importances_,columns=['Imp'],index=x_train.columns))

           Imp
preg  0.089866
glu   0.257847
bp    0.112757
sft   0.036992
ins   0.052421
bmi   0.194523
dpf   0.168654
age   0.086940


In [57]:
list(y_train)

['Healthy',
 'Healthy',
 'Healthy',
 'diabetic',
 'Healthy',
 'diabetic',
 'diabetic',
 'Healthy',
 'diabetic',
 'Healthy',
 'diabetic',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'diabetic',
 'Healthy',
 'Healthy',
 'diabetic',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'diabetic',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'diabetic',
 'Healthy',
 'Healthy',
 'diabetic',
 'Healthy',
 'diabetic',
 'diabetic',
 'diabetic',
 'Healthy',
 'diabetic',
 'Healthy',
 'Healthy',
 'diabetic',
 'diabetic',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 'diabetic',
 'Healthy',
 'diabetic',
 'Healthy',
 'Healthy',
 'diabetic',
 'Healthy',
 'Healthy',
 'Healthy',
 'diabetic',
 'Healthy',
 'Healthy',
 'Healthy',
 'diabetic',
 'diabetic',
 'Healthy',
 'diabetic',
 'diabetic',
 'diabetic',
 'Healthy',
 'diabetic',
 'Healthy',
 'diabetic',
 'Healthy',
 'Healthy',
 'Healthy',
 'Healthy',
 

In [58]:
#from IPython.display import Image
#from sklearn import tree
#from os import system

'''Diabetic_Tree_File=open('diabetic_tree.dot','w')
dot_data=tree.export_graphviz(model,out_file=Diabetic_Tree_File,feature_names=list(x_train),class_names=list(y_train))
Diabetic_Tree_File.close()'''

"Diabetic_Tree_File=open('diabetic_tree.dot','w')\ndot_data=tree.export_graphviz(model,out_file=Diabetic_Tree_File,feature_names=list(x_train),class_names=list(y_train))\nDiabetic_Tree_File.close()"

In [59]:
#system('dot -Tpng diabetic_tree.dot -o diabetic_tree.png')
#Image('diabetic_tree.png')

In [60]:
#pip install ipython
#conda install-c anaconda ipython
!pip install graphviz



twisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.
You are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [61]:
from sklearn.tree import export_graphviz
from sklearn import tree
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
import os
os.environ["PATH"] += os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/'

#class labels
labels=x.columns

graph=Source(tree.export_graphviz(model,out_file=None,feature_names=labels,class_names=['0','1'],filled=True))
display(SVG(graph.pipe(format='svg')))

ExecutableNotFound: failed to execute ['dot', '-Tsvg'], make sure the Graphviz executables are on your systems' PATH