In [1]:
import pandas as pd
import numpy as np
# Seed NumPy's global random number generator with a fixed value (42) so that
# re-running the notebook from a fresh kernel starts from the same RNG state.
np.random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
# Apply matplotlib's 'fivethirtyeight' style sheet to figures drawn after this point.
plt.style.use('fivethirtyeight')

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read the zoo dataset and use each animal's name as the row label.
df = pd.read_csv(
    'zoo.csv',
    index_col='animal_name',
)

In [4]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0_level_0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
animal_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [5]:
# Feature matrix: every column from 'hair' through 'catsize'
# (label-based slices are inclusive on both ends).
X = df.loc[:, slice('hair', 'catsize')]
# Target vector: the 'class_type' column.
y = df.loc[:, 'class_type']

In [6]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

In [8]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0_level_0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
animal_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
clam,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
tortoise,0,0,1,0,0,0,0,0,1,1,0,0,4,1,0,1
gull,0,1,1,0,1,1,1,0,1,1,0,0,2,1,0,0
piranha,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0
dogfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,1


In [9]:
# Preview the first five training labels.
y_train.head(n=5)

animal_name
clam        7
tortoise    3
gull        2
piranha     4
dogfish     4
Name: class_type, dtype: int64

In [10]:
# Scale the features to [0, 1] with a MinMaxScaler.
#
# The scaler is fitted on the training split ONLY and that same fitted
# transform is applied to both splits. Fitting a second scaler on X_test (as
# this cell previously did) maps the test features with different min/max
# values than the ones the model was trained on, so the same raw value can
# land on a different scaled value in train and test.
sc = MinMaxScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# `sc1` used to be a separate scaler fitted on the test split. The name is kept
# as an alias of the train-fitted scaler so any later reference still resolves.
sc1 = sc

In [11]:
# Search space for the decision-tree hyper-parameter search.
# Each `range` is half-open: range(3, 15) yields 3..14, and so on.
params = dict(
    max_depth=range(3, 15),
    max_features=range(4, 10),
    min_samples_leaf=range(2, 20),
    criterion=["gini", "entropy"],
)

In [17]:
# Randomized hyper-parameter search for a decision tree: sample 100 candidate
# settings from `params`, score each with 3-fold cross-validated accuracy on
# the scaled training data, and keep the results in `model_cv`.
model = DecisionTreeClassifier()
model_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=100,
    cv=3,
    scoring='accuracy',
    random_state=42,
)
model_cv.fit(X_train_std, y_train)

# Report the winning parameter combination and its mean cross-validated score.
print("Tuned Decision Tree Parameters: {}".format(model_cv.best_params_))
print("Best score is {}".format(model_cv.best_score_))

Tuned Decision Tree Parameters: {'min_samples_leaf': 2, 'max_features': 9, 'max_depth': 6, 'criterion': 'gini'}
Best score is 0.9


In [18]:
# Train the final decision tree with the hyper-parameters selected by the
# randomized search. The values are read from `model_cv.best_params_` instead
# of being copied by hand from the printed output, so this cell cannot drift
# out of sync if the search space, seed, or data changes.
model = DecisionTreeClassifier(**model_cv.best_params_)
model.fit(X_train_std, y_train)

# Predictions on both splits, used by the confusion matrices below.
y_train_pred = model.predict(X_train_std)
y_pred = model.predict(X_test_std)

In [19]:
# Confusion matrix on the training split (rows: true class, columns: predicted class).
confusion = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
confusion

array([[32,  0,  0,  0,  1,  0,  0],
       [ 0, 16,  0,  0,  0,  0,  0],
       [ 0,  0,  4,  0,  0,  0,  0],
       [ 0,  0,  0, 10,  0,  0,  0],
       [ 0,  0,  0,  0,  3,  0,  0],
       [ 0,  0,  0,  0,  0,  6,  0],
       [ 0,  0,  1,  0,  0,  0,  7]])

In [20]:
# Mean accuracy of the fitted tree on the training split.
model.score(X=X_train_std, y=y_train)

0.97499999999999998

In [24]:
# Confusion matrix on the held-out test split (rows: true class, columns: predicted class).
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
confusion

array([[8, 0, 0, 0, 0, 0, 0],
       [0, 4, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 3, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 2, 0],
       [0, 0, 0, 0, 0, 0, 2]])

In [25]:
# Mean accuracy on the held-out test split. The score must be computed against
# the true labels (`y_test`); comparing the model's predictions with `y_pred`
# (its own predictions) always yields 1.0 regardless of model quality.
model.score(X_test_std, y_test)

1.0