# Problem statement : Decision Tree

## step: 1--root node

In [1]:
## importing libraries:

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [2]:
## loading dataset:

df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
#1st five datasets:

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
## to change column names:

col_names = ['pregnant','glocose','BP','skin','insulin','BMI','pedigree','age','label']

In [5]:
df = pd.read_csv('diabetes.csv',header = None, names = col_names)
df.head()

Unnamed: 0,pregnant,glocose,BP,skin,insulin,BMI,pedigree,age,label
0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0


In [6]:
## droping 1st row:

df = df.drop(0)
df.head()

Unnamed: 0,pregnant,glocose,BP,skin,insulin,BMI,pedigree,age,label
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0
5,0,137,40,35,168,43.1,2.288,33,1


In [7]:
df.shape

(768, 9)

In [8]:
## to check numeric columns:

df.describe()

Unnamed: 0,pregnant,glocose,BP,skin,insulin,BMI,pedigree,age,label
count,768,768,768,768,768,768,768.0,768,768
unique,17,136,47,51,186,248,517.0,52,2
top,1,100,70,0,0,32,0.258,22,0
freq,135,17,57,227,374,13,6.0,72,500


In [9]:
 ## splitting the data:
feature_cols = ['pregnant','glocose','BP','skin','insulin','BMI','pedigree','age']
x = df[feature_cols]
x.head()

Unnamed: 0,pregnant,glocose,BP,skin,insulin,BMI,pedigree,age
1,6,148,72,35,0,33.6,0.627,50
2,1,85,66,29,0,26.6,0.351,31
3,8,183,64,0,0,23.3,0.672,32
4,1,89,66,23,94,28.1,0.167,21
5,0,137,40,35,168,43.1,2.288,33


In [10]:
y = df['label']
y.head()

1    1
2    0
3    1
4    0
5    1
Name: label, dtype: object

In [11]:
## splitting data into train set and test set:

x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.3, random_state= 1)
# (70% training data and 30% test data)

In [12]:
## decision tree classifier:

clf = DecisionTreeClassifier()

## train decision tree classifier:

clf = clf.fit(x_train,y_train)

In [13]:
## predicting test data:

y_pred = clf.predict(x_test)

In [14]:
## model accuracy:

metrics.accuracy_score(y_test,y_pred)

0.658008658008658

## visualizing decision tree:

In [15]:
# importing libraries:

from IPython.display import Image
from six import StringIO
from sklearn.tree import export_graphviz
import graphviz
import pydotplus

In [16]:
dot_data = StringIO()
export_graphviz(clf, out_file= dot_data, 
                filled=True, 
                rounded= True, 
                feature_names= feature_cols, class_names=['0','1'])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

Image(graph.create_png())

InvocationException: Program terminated with status: 1. stderr follows: 'C:\Users\Swati' is not recognized as an internal or external command,
operable program or batch file.


## step: 2- optimizing decision tree:

### 1- Entropy:

In [None]:
## creating decision tree:

clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)

In [None]:
# training decision tree:

clf = clf.fit(x_train,y_train)

In [None]:
#predit:

y_pred = clf.predict(x_test)

In [None]:
## accuracy score:

metrics.accuracy_score(y_test,y_pred)

In [None]:
## visualizing decision tree:

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data, 
                filled=True, 
                rounded=True, 
                feature_names=feature_cols, class_names=['0','1'])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

Image(graph.create_png())

### 2- Gini_index:

In [None]:
## creating decision tree:

clf = DecisionTreeClassifier(criterion='gini', max_depth=3)

In [None]:
# training decision tree:

clf = clf.fit(x_train,y_train)

In [None]:
#predit:

y_pred = clf.predict(x_test)

In [None]:
#model accuracy (using metrics model) to ckeck how often the classifer is correct.

metrics.accuracy_score(y_test,y_pred)

In [None]:
## visualizing decision tree:

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data, filled=True, rounded=True, feature_names=feature_cols, class_names=['0','1'])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

## Hyperparameters for decision tree

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
## dictionary of hyperparameters as keys and its value as values:

parameters = {'max_depth':[2,3,5,6,8],
         'min_samples_leaf':[5,10,15,20,25],
         'criterion':['gini','entropy']}

In [None]:
parameters

In [None]:
## instance of decision tree:

Dt = DecisionTreeClassifier()

In [1]:
## instance of gridsearchCV:

gridsearch = GridSearchCV(estimator=Dt, param_grid=parameters,cv=4,n_jobs=-1,verbose=1, scoring='accuracy', )

NameError: name 'GridSearchCV' is not defined

In [None]:
gridsearch.fit(x_train,y_train)

In [None]:
## best estimator:

gridsearch.best_estimator_

In [None]:
Dt_best = gridsearch.best_estimator_

In [None]:
# predict

y_pred = Dt_best.predict(x_test)
y_pred

In [None]:
metrics.accuracy_score(y_test,y_pred)