# Decision Tree Classification using Scikit-learn

In [21]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer

In [23]:
# Raw table for the example: 14 observations, four categorical weather
# attributes and the Yes/No 'play_tennis' target. Every list is row-aligned,
# i.e. position i in each list belongs to the same observation.
dataset = {
    'outlook': [
        'Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
        'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain',
    ],
    'temp': [
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
        'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
    ],
    'humidity': [
        'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    'wind': [
        'Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Weak',
        'Weak', 'Weak', 'Strong', 'Strong', 'Strong', 'Weak', 'Strong',
    ],
    # Target column.
    'play_tennis': [
        'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}

In [24]:
# Materialise the table as a DataFrame with an explicit column order:
# the four feature columns first, the target column last.
df = pd.DataFrame(
    data=dataset,
    columns=['outlook', 'temp', 'humidity', 'wind', 'play_tennis'],
)

In [39]:
# One-hot encode the categorical feature columns and hold out the last row.
cols_to_retain = ['outlook', 'temp', 'humidity', 'wind']
X_feature = df[cols_to_retain]
# One {column: value} dict per row, in row order. orient='records' builds this
# directly, without transposing the frame, and — unlike keying the dicts by
# index label — cannot lose rows if index labels ever repeat.
X_dict = X_feature.to_dict(orient='records')

# turn list of dicts into a numpy array
vect = DictVectorizer(sparse=False)
X_vector = vect.fit_transform(X_dict)
# Every row except the last is used for training; the last row is kept as a
# single-sample, still two-dimensional, test matrix.
X_train = X_vector[:-1]
X_test = X_vector[-1:]

In [40]:
# List the one-hot column names learned by the vectorizer.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() when the installed version
# provides it and fall back to the old method otherwise.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
# Convert to a plain list of str so the display is the same on either path.
[str(name) for name in feature_names]

['humidity=High',
 'humidity=Normal',
 'outlook=Overcast',
 'outlook=Rain',
 'outlook=Sunny',
 'temp=Cool',
 'temp=Hot',
 'temp=Mild',
 'wind=Strong',
 'wind=Weak']

In [41]:
# Show the full encoded matrix returned by vect.fit_transform (all rows,
# i.e. before it is sliced into X_train / X_test).
X_vector

array([[1., 0., 0., 0., 1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 1., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 1., 0., 1., 0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 1., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 1., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 1., 0., 0., 1., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 1., 0.],
       [0., 1., 1., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0., 0., 0., 1., 1., 0.]])

In [44]:
# Integer-encode the target column. The encoder is fitted on every row except
# the last one, matching the rows that make up X_train.
train_labels = df['play_tennis'].iloc[:-1]
le = LabelEncoder()
y_train = le.fit_transform(train_labels)

In [45]:
# Inspect the integer-encoded training labels produced by the LabelEncoder.
y_train

array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1])

In [47]:
# Fit a decision tree that chooses splits by entropy (information gain).
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split: when several splits tie on the criterion, the one
# chosen — and therefore the fitted tree and its predictions — can otherwise
# change from one run to the next.
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
# fit() returns the estimator itself; assigning it back also keeps the cell
# from echoing the estimator repr as output.
clf = clf.fit(X_train, y_train)

In [49]:
# Predict the class of the held-out row, then map the integer class back to
# its original string label for display.
test_pred = clf.predict(X_test)
le.inverse_transform(test_pred)

array(['Yes'], dtype=object)

In [52]:
# Metrics related to the DecisionTreeClassifier
# NOTE: both metrics below are computed on X_train / y_train, the same rows
# the tree was fitted on, so they describe how well the tree fits its
# training data rather than how it performs on unseen rows.
from sklearn.metrics import accuracy_score, classification_report

# Predict once and reuse the result for both reports.
train_pred = clf.predict(X_train)
print('Accuracy is:', accuracy_score(y_train, train_pred))
print(classification_report(y_train, train_pred))

Accuracy is: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         9

   micro avg       1.00      1.00      1.00        13
   macro avg       1.00      1.00      1.00        13
weighted avg       1.00      1.00      1.00        13

