# Decision Tree Classifiers

In [None]:
import numpy as no
import pandas as pd
import sklearn
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Read the iris data from the CSV that sits next to this notebook
df = pd.read_csv(filepath_or_buffer='iris.csv')

# Preview the first five rows
df.head(n=5)

### Train/Test Sets

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# identical on every Restart & Run All, so downstream results are comparable.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# train sets: features are every column except the 'Name' label.
# Use the columns= keyword; the positional-axis form drop('Name', 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onward.
train_x = train.drop(columns='Name')
train_y = train['Name']

# test sets
test_x = test.drop(columns='Name')
test_y = test['Name']

# Decision Tree

In [None]:
# model: splits are chosen using the entropy criterion
decision_tree = DecisionTreeClassifier(criterion="entropy")

# train
# fit() returns the fitted estimator itself, so `clf` is a second handle on the
# same trained tree for the visualization cell. (`tree` is the sklearn.tree
# module and has no fit(), so calling tree.fit(...) raises AttributeError.)
clf = decision_tree.fit(train_x, train_y)  # for viz later

# predict
decision_tree_predictions = decision_tree.predict(test_x)

In [None]:
# 5-fold cross-validation of the decision tree over the full data set.
# drop(columns=...) replaces the positional-axis form, which pandas 2.0 rejects.
cross_val_score(decision_tree, df.drop(columns='Name'), df['Name'], cv=5)

### Feature Importances

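The `.feature_importances_` attribute of the fitted `DecisionTreeClassifier` gives, for each feature, the normalized total reduction in the split criterion that the feature contributes. Because this tree uses entropy as its criterion, that reduction is the information gain, which we use as a measure of importance.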

In [None]:
# Tabulate each feature name next to its importance score so the result is
# easy to read, ordered from the largest score to the smallest.
(
    pd.DataFrame(
        data={'Gain': decision_tree.feature_importances_},
        index=train_x.columns,
    )
    .sort_values(by='Gain', ascending=False)
)

### Confusion Matrix

A confusion matrix is a good way to check the accuracy of your model and to see in what ways your model may be predicting incorrectly.

We do this using the pandas `crosstab()` function…

In [None]:
# Rows are the true labels and columns are the decision tree's predictions;
# margins=True appends row and column totals.
# The predictions variable is decision_tree_predictions (the name
# tree_predictions is never defined and raises NameError).
pd.crosstab(test_y, decision_tree_predictions, rownames=['Actual'], colnames=['Predicted:'], margins=True)

In [None]:
# Optional: render the fitted decision tree with graphviz.
# This cell is disabled; uncomment every line below to run it. It needs the
# `graphviz` package and the `clf` estimator from the decision tree cell.
# NOTE(review): class_names is given the single string "Species" here; confirm
# whether it should instead be the list of class labels before enabling.
#
# import graphviz
# dot_data = tree.export_graphviz(clf, out_file=None,
#                          feature_names=train_x.columns,
#                          class_names="Species",
#                          filled=True, rounded=True,
#                          special_characters=True)
# graph = graphviz.Source(dot_data)
# graph

# Bagging

![](bagging.png)

In [None]:
# Build a bagging ensemble with default settings and fit it on the training
# split; fit() hands back the fitted estimator, so it can be bound directly.
bag = BaggingClassifier().fit(train_x, train_y)

# Predicted labels for the held-out rows
bag_predictions = bag.predict(test_x)

# Confusion matrix: actual labels (rows) against bagging predictions (columns),
# with totals in the margins
pd.crosstab(
    index=test_y,
    columns=bag_predictions,
    rownames=['Actual'],
    colnames=['Predicted:'],
    margins=True,
)

# Random Forest

In [None]:
# Random forest using the entropy criterion, fitted on the training split;
# fit() returns the fitted estimator, so it can be bound directly.
forest = RandomForestClassifier(criterion='entropy').fit(train_x, train_y)

# Predicted labels for the held-out rows
forest_predictions = forest.predict(test_x)

# Feature importances, highest first. This is printed explicitly because only
# the last expression of a cell is displayed automatically.
print(
    pd.DataFrame(
        data={'Importance': forest.feature_importances_},
        index=train_x.columns,
    ).sort_values(by='Importance', ascending=False)
)

# Confusion matrix: actual labels (rows) against forest predictions (columns),
# with totals in the margins
pd.crosstab(
    index=test_y,
    columns=forest_predictions,
    rownames=['Actual'],
    colnames=['Predicted:'],
    margins=True,
)

# Out of Bag Error

In [None]:
# model: oob_score=True makes the forest compute an out-of-bag score during
# fitting; 500 trees are used
oob_forest = RandomForestClassifier(criterion='entropy', oob_score=True, n_estimators=500)

# train on every row of df (no separate test split is used in this cell).
# drop(columns=...) replaces the positional-axis form drop('Name', 1), which
# raises TypeError from pandas 2.0 onward.
oob_forest.fit(df.drop(columns='Name'), df['Name'])

# Out of bag score
oob_forest.oob_score_

# On your own

In [None]:
# Location of the red-wine quality CSV
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

# The file separates fields with semicolons rather than commas
data = pd.read_csv(dataset_url, delimiter=';')

Recode the `quality` column (the value we want to predict) into a categorical variable with three classes: good, average, and bad.

Create a decision tree, bagging classifier, and random forest to predict quality. Compare results.