In [1]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
%matplotlib inline

## Dataset

In [2]:
class_ref = pd.read_csv('dataset/class.csv').loc[:, ['Class_Number', 'Class_Type']]
zoo = pd.read_csv('dataset/zoo.csv').iloc[:, 1:]
zoo.head()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [3]:
class_ref

Unnamed: 0,Class_Number,Class_Type
0,1,Mammal
1,2,Bird
2,3,Reptile
3,4,Fish
4,5,Amphibian
5,6,Bug
6,7,Invertebrate


#### Setting the dependent and independent variables

In [4]:
zoo_X = zoo.drop('class_type', axis=1)
zoo_y = zoo.loc[:, 'class_type']

## Splitting to train and test sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(zoo_X, zoo_y, test_size=0.15)

## Initialize Decision Tree

In [7]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

**Use X_test for prediction**

In [8]:
predicted_val_tree = tree_model.predict(X_test)

**Since the index has been randomized, reset it**

In [9]:
X_test.reset_index(drop=True, inplace=True)

In [10]:
predicted_data_tree = pd.concat((X_test, pd.Series(predicted_val_tree, name='predicted')), axis=1)
predicted_data_tree.head()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,predicted
0,0,1,1,0,1,0,1,0,1,1,0,0,2,1,0,0,2
1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,7
2,0,0,1,0,0,0,0,0,1,1,0,0,4,1,0,1,3
3,1,0,0,1,0,1,1,1,1,1,0,1,0,0,0,1,1
4,0,1,1,0,0,0,1,0,1,1,0,0,2,1,0,0,2


**Replace the class number with the class type**

In [12]:
class_ref_dct = dict(zip(class_ref['Class_Number'], class_ref['Class_Type']))
predicted_data_tree['predicted'].replace(class_ref_dct, inplace=True)

In [13]:
class_ref_dct

{1: 'Mammal',
 2: 'Bird',
 3: 'Reptile',
 4: 'Fish',
 5: 'Amphibian',
 6: 'Bug',
 7: 'Invertebrate'}

In [14]:
predicted_data_tree.head()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,predicted
0,0,1,1,0,1,0,1,0,1,1,0,0,2,1,0,0,Bird
1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,Invertebrate
2,0,0,1,0,0,0,0,0,1,1,0,0,4,1,0,1,Reptile
3,1,0,0,1,0,1,1,1,1,1,0,1,0,0,0,1,Mammal
4,0,1,1,0,0,0,1,0,1,1,0,0,2,1,0,0,Bird


In [11]:
tree_model.score(X_test, y_test)

0.9375