In [23]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import pandas as pd

In [106]:
### Load dataset

# load_iris() returns a dict-like bundle whose keys are
# ['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'].
iris = load_iris()

# Reference values observed on this bundle:
#   iris.target_names  -> ['setosa' 'versicolor' 'virginica']
#   iris.feature_names -> ['sepal length (cm)', 'sepal width (cm)',
#                          'petal length (cm)', 'petal width (cm)']

# Feature matrix (type(iris_data) is numpy.ndarray) and the matching class labels.
iris_data = iris.data
iris_label = iris.target

# Tabular view of the same data: one column per feature plus a trailing 'label' column.
iris_df = pd.DataFrame(
    data=iris_data,
    columns=iris.feature_names,
).assign(label=iris_label)

# Left as the last expression so the notebook renders the first five rows.
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [51]:
### Split dataset into training and test dataset

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0.2,
    random_state=11,
)

In [52]:
### Train data

# Decision tree with a fixed random_state so repeated fits give the same tree.
dt_clf = DecisionTreeClassifier(
    random_state=11,
)

# Kept as the final expression so the notebook echoes the fitted estimator.
dt_clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=11)

In [53]:
### Predict test dataset

# Predicted class labels for the held-out samples.
pred = dt_clf.predict(X=X_test)

# Echo the prediction array as the cell output.
pred

array([2, 2, 1, 1, 2, 0, 1, 0, 0, 1, 1, 1, 1, 2, 2, 0, 2, 1, 2, 2, 1, 0,
       0, 1, 0, 0, 2, 1, 0, 1])

In [54]:
### Evaluate the prediction

from sklearn.metrics import accuracy_score

# Compare the hold-out predictions against the true test labels and report
# the accuracy to four decimal places.
print(
    "Accuracy : {0:.4f}".format(
        accuracy_score(y_true=y_test, y_pred=pred)
    )
)

Accuracy : 0.9333


In [77]:
### K fold cross validation

from sklearn.base import clone
from sklearn.model_selection import KFold
import numpy as np

# print(iris_data.shape)
# (150, 4)

# Five consecutive, unshuffled folds over the 150 samples.
kfold = KFold(n_splits=5)
cv_accuracy2 = []

# Fold-local names are used on purpose: the hold-out split stored in
# X_train / X_test / y_train / y_test must not be overwritten by this loop,
# otherwise re-running an earlier predict/evaluate cell would silently score
# against the last fold instead of the original split.
for train_index, test_index in kfold.split(iris_data):
    X_fold_train, X_fold_test = iris_data[train_index], iris_data[test_index]
    y_fold_train, y_fold_test = iris_label[train_index], iris_label[test_index]

    # Train an unfitted copy with the same hyperparameters so the shared
    # dt_clf keeps whatever model it was last fitted with.
    fold_clf = clone(dt_clf)
    fold_clf.fit(X_fold_train, y_fold_train)
    pred2 = fold_clf.predict(X_fold_test)

    # Keep the exact per-fold score; rounding happens only when printing so
    # the average is not computed from already-rounded values.
    accuracy2 = accuracy_score(y_fold_test, pred2)
    cv_accuracy2.append(accuracy2)

print("Accuracy AVG : {0:.4f}".format(np.mean(cv_accuracy2)))
print("Accuracy Backdata : {}".format(np.round(cv_accuracy2, 4)))

Accuracy AVG : 0.9200
Accuracy Backdata : [1.     0.9667 0.8667 0.9333 0.8333]


In [84]:
### Stratified K fold cross validation

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Three folds; the labels are passed to split() so the splitter can stratify on them.
skf = StratifiedKFold(n_splits=3)
cv_accuracy3 = []

# Fold-local names are used on purpose: the hold-out split stored in
# X_train / X_test / y_train / y_test must not be overwritten by this loop,
# otherwise re-running an earlier predict/evaluate cell would silently score
# against the last fold instead of the original split.
for train_index, test_index in skf.split(iris_data, iris_label):
    X_fold_train, X_fold_test = iris_data[train_index], iris_data[test_index]
    y_fold_train, y_fold_test = iris_label[train_index], iris_label[test_index]

    # Train an unfitted copy with the same hyperparameters so the shared
    # dt_clf keeps whatever model it was last fitted with.
    fold_clf = clone(dt_clf)
    fold_clf.fit(X_fold_train, y_fold_train)
    pred3 = fold_clf.predict(X_fold_test)

    # Keep the exact per-fold score; rounding happens only when printing so
    # the average is not computed from already-rounded values.
    accuracy3 = accuracy_score(y_fold_test, pred3)
    cv_accuracy3.append(accuracy3)

print("Accuracy AVG : {0:.4f}".format(np.mean(cv_accuracy3)))
print("Accuracy Backdata : {}".format(np.round(cv_accuracy3, 4)))

Accuracy AVG : 0.9600
Accuracy Backdata : [0.98 0.92 0.98]


In [86]:
### cross_val_score

from sklearn.model_selection import cross_val_score, cross_validate

# One accuracy value per fold (cv=3) for dt_clf evaluated on the full dataset.
scores = cross_val_score(
    estimator=dt_clf,
    X=iris_data,
    y=iris_label,
    scoring='accuracy',
    cv=3,
)

# Report the mean first, then the individual fold scores rounded to 4 places.
print("Accuracy AVG : {0:.4f}".format(scores.mean()))
print("Accuracy Backdata : {}".format(scores.round(4)))

Accuracy AVG : 0.9600
Accuracy Backdata : [0.98 0.92 0.98]


In [120]:
### GridSearchCV

from sklearn.model_selection import GridSearchCV

# Fresh 80/20 hold-out split (different seed from the earlier split); the
# search below only sees the training part.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0.2,
    random_state=121,
)

# Candidate hyperparameters: 3 depths x 2 split thresholds = 6 combinations.
parameters = {
    'max_depth': [1, 2, 3],
    'min_samples_split': [2, 3],
}

# 3-fold search over the grid; refit=True retrains the best combination on
# the whole training set once the search finishes.
grid_dtree = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    cv=3,
    refit=True,
    return_train_score=True,
)
grid_dtree.fit(X_train, y_train)

# Show the per-combination results, restricted to the test-score columns.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df[
    [
        'params',
        'mean_test_score',
        'rank_test_score',
        'split0_test_score',
        'split1_test_score',
        'split2_test_score',
    ]
]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


In [118]:
# Best hyperparameter combination found by the search and its mean CV accuracy.
print('GridSearchCV Best Parameter :', grid_dtree.best_params_)
print(
    'GridSearchCV Best Accuracy : {0:.4f}'.format(
        grid_dtree.best_score_
    )
)

# The search object was built with refit=True, so it can predict directly;
# score those predictions against the held-out test labels.
pred = grid_dtree.predict(X=X_test)
print(
    'Accuracy : {0:.4f}'.format(
        accuracy_score(y_true=y_test, y_pred=pred)
    )
)

GridSearchCV Best Parameter : {'max_depth': 3, 'min_samples_split': 2}
GridSearchCV Best Accuracy : 0.9750
Accuracy : 0.9667


In [119]:
# Pull the refitted best model out of the search object and use it directly.
estimator = grid_dtree.best_estimator_

# Score its predictions on the held-out test set.
pred = estimator.predict(X=X_test)
print(
    'Accuracy : {0:.4f}'.format(
        accuracy_score(y_true=y_test, y_pred=pred)
    )
)

Accuracy : 0.9667
