# Decision Tree Example

This notebook uses a sample of data from the BuzzFeed News GitHub repository for its analysis of surveillance planes, found [here](https://github.com/BuzzFeedNews/2016-04-federal-surveillance-planes). The class labels were assigned arbitrarily for this example.

In [None]:
# After running this cell to install graphviz, restart the python kernel
!sudo add-apt-repository universe 
!sudo apt update
!sudo apt install -y graphviz
!pip3 install --user graphviz

In [None]:
#get the packages we need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from IPython import display
#from sklearn.externals.six import StringIO  
import graphviz 

In [None]:
# Load the example CSV into a DataFrame
csv_path = '../data/demo-notebooks-data/simple_example_classes.csv'
data = pd.read_csv(csv_path)

In [None]:
#look at a few rows of data
# A fixed random_state makes the previewed rows the same on every run
data.sample(5, random_state=42)

In [None]:
# Bar chart of how many rows fall into each class
class_counts = data['class'].value_counts()
class_counts.plot.bar();

In [None]:
# Separate the model inputs (feature columns) from the target labels
feature_columns = ['speed', 'altitude']
X = data[feature_columns]
y = data['class']

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
#create and fit decision tree model
# random_state fixes how ties between equally good splits are broken, so the
# fitted tree (and its split thresholds) is the same on every run
decision_tree_model = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Generate a plot of the decision tree
columns = list(X_train.columns)
# Show the real class labels in each node rather than the symbolic y[0], y[1], ...
# that class_names=True produces; classes_ holds the labels in sorted order
class_labels = [str(label) for label in decision_tree_model.classes_]
graphviz.Source(
    tree.export_graphviz(
        decision_tree_model,
        out_file=None,
        feature_names=columns,
        class_names=class_labels,
    )
)

In [None]:
# Predict a class label for every row of the held-out test features
predictions = decision_tree_model.predict(X=X_test)

In [None]:
# Put the test features and true class side by side, renumber the rows from 0,
# then append the model's predicted class as a new column for comparison
test_data = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
test_data['predicted_class'] = predictions
test_data.head()

In [None]:
# Test rows at or below both cut-offs (speed <= 80.5 and altitude <= 4987)
# NOTE(review): these cut-offs presumably mirror split thresholds of the fitted tree — confirm
at_or_below_both = (test_data['speed'] <= 80.5) & (test_data['altitude'] <= 4987)
test_data[at_or_below_both]

In [None]:
# Test rows strictly above both cut-offs (speed > 80.5 and altitude > 4987)
# NOTE(review): these cut-offs presumably mirror split thresholds of the fitted tree — confirm
above_both = (test_data['speed'] > 80.5) & (test_data['altitude'] > 4987)
test_data[above_both]

In [None]:
# Display the model's score (mean accuracy) on the held-out test set
decision_tree_model.score(X=X_test, y=y_test)