# Solution: Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

## Read in the Breast Cancer dataset, __`bcan.csv`__

In [None]:
# Load the raw CSV; any '?' entry is parsed as a missing value (NaN).
data = pd.read_csv(
    'data/bcan.csv',
    na_values=['?'],
)

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

## Drop the __`id`__ column since it's not going into the model

In [None]:
# Remove the 'id' column; every other column is kept unchanged.
data = data.drop(columns=['id'])

In [None]:
# Preview the first five rows again, now without the 'id' column.
data.head(n=5)

## Remember that __`sklearn`__ wants the features in a 2-d matrix, and the targets in a 1-d array

In [None]:
# Discard every row that contains a missing value (there aren't that many).
data = data.dropna()

# The 'Diag' column holds the targets: pull it out as a 1-d Series.
y = data.loc[:, 'Diag']

# Everything that remains is a feature; `data` stays a DataFrame so that its
# column headers are still available after this cell.
data = data.drop(columns=['Diag'])

# 2-d array of feature values taken from the dataframe.
X = data.to_numpy()

## Fit the model
* if you get an error here, figure out why

In [None]:
# Shallow tree (at most two levels of splits) so the exported graph stays readable.
# random_state is fixed so that ties between equally good splits are resolved
# the same way on every run, making the fitted tree reproducible.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=0)
tree_clf.fit(X, y)

## Use export_graphviz to generate the dot file
* __`feature_names`__ should be the list of column headers
* __`class_names`__ should be the names of the target classes, listed in ascending order of the class labels

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to bcan.dot in Graphviz format.
export_graphviz(
    tree_clf,
    out_file="bcan.dot",
    # Pass a plain list of column headers rather than the pandas Index itself,
    # so the call does not depend on an Index being accepted for this argument.
    feature_names=list(data.columns),
    # NOTE(review): class_names are matched to the class labels in ascending
    # order -- this assumes the benign label sorts before the malignant one in
    # 'Diag'; confirm against the data.
    class_names=['benign', 'malignant'],
    rounded=True,
    filled=True,
)

## Generate a PNG image from the dot file and open it

In [None]:
!dot -Tpng bcan.dot -o bcan.png
from IPython.display import Image
Image('bcan.png')

## How well did the model do?

In [None]:
# Mean accuracy on X, y -- the same rows the tree was fitted on, so this is
# training accuracy rather than performance on unseen data.
tree_clf.score(X=X, y=y)

## Do we do better if we increase the tree depth?

In [None]:
# Refit with a much deeper tree (up to eight levels of splits).
# random_state is fixed so that ties between equally good splits are resolved
# the same way on every run, making the fitted tree and its score reproducible.
tree_clf = DecisionTreeClassifier(max_depth=8, random_state=0)
tree_clf.fit(X, y)

# Mean accuracy on the same rows used for fitting (training accuracy).
tree_clf.score(X, y)

In [None]:
from sklearn.tree import export_graphviz
export_graphviz(tree_clf, out_file="bcan.dot",
               feature_names=data.columns,
               class_names='benign malignant'.split(),
               rounded=True,
               filled=True)
!dot -Tpng bcan.dot -o bcan.png
from IPython.display import Image
Image('bcan.png')

## Why might we not want to do that?