# GridSearch

In this lab we will try to visualise the effect of increasing
depth in a decision tree, and then try to create the best possible decision tree.


In [1]:
# Load the Titanic passenger table and derive the model inputs.
import pandas

titanic = pandas.read_csv('../../data/titanic.csv', index_col='PassengerId')

# Numeric encoding of sex: 0 for 'male', 1 for every other value.
titanic['Gender'] = titanic['Sex'].ne('male').astype('int64')

# Only rows with a recorded age are used for modelling.
has_age = titanic['Age'].notnull()
good_data = titanic.loc[has_age]

# Feature matrix and target vector.
feature_cols = ['Pclass', 'Parch', 'Age', 'Gender']
X = good_data[feature_cols]
y = good_data['Survived']

In [2]:
# Import the sklearn libraries for grid_search and tree
import sklearn.tree
import sklearn.grid_search

In [3]:
# Create a decision tree classifier with default settings.
# max_depth is not set here; the grid search in the following cells supplies it.
# NOTE(review): no random_state is passed, so scores may differ slightly
# between runs - confirm whether a fixed seed is wanted.
c = sklearn.tree.DecisionTreeClassifier()

In [5]:
# Initially, we will just explore the effect of increasing
# the max_depth parameter.
# Create a dictionary which has a key of 'max_depth' and
# a value of the candidate depths to try.
# Note: range(1,20) stops before 20, so the depths searched are 1 to 19.
params = {'max_depth': range(1,20)}

In [6]:
# Create a GridSearchCV object. It will need two parameters:
# - the decision tree classifier (from two cells back)
# - the parameters dictionary (from the previous cell)
# There is an optional "cv" parameter which you might want
# to explore later. It is not passed here, so the library default applies.
# NOTE(review): sklearn.grid_search is the legacy module location; newer
# scikit-learn releases provide GridSearchCV in sklearn.model_selection -
# confirm against the installed version.
brute_force = sklearn.grid_search.GridSearchCV(c, params)

In [7]:
# Use the GridSearchCV to fit the X and y data.
# The repr displayed afterwards echoes the search configuration
# (the 19 max_depth candidates, cv=None, refit=True).
brute_force.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [8]:
# What does the best_params_ attribute say?
# In the recorded run it reports max_depth 2, even though max_depth 10
# reaches the same mean score in the grid_scores_ listing further down.
brute_force.best_params_

{'max_depth': 2}

In [9]:
# What about the best_score_ attribute?
# In the recorded run it equals the mean score listed for max_depth 2
# in grid_scores_.
brute_force.best_score_

0.79411764705882348

In [10]:
# We can see how this compares with other values.
# The grid_scores_ attribute of the GridSearchCV object
# is now a list with one entry per candidate; each entry displays
# the mean score, its standard deviation and the parameters used.
# Print it out
brute_force.grid_scores_

[mean: 0.78011, std: 0.01121, params: {'max_depth': 1},
 mean: 0.79412, std: 0.01436, params: {'max_depth': 2},
 mean: 0.77171, std: 0.02918, params: {'max_depth': 3},
 mean: 0.77451, std: 0.03016, params: {'max_depth': 4},
 mean: 0.78151, std: 0.02200, params: {'max_depth': 5},
 mean: 0.79132, std: 0.01000, params: {'max_depth': 6},
 mean: 0.77731, std: 0.01496, params: {'max_depth': 7},
 mean: 0.78431, std: 0.02158, params: {'max_depth': 8},
 mean: 0.78151, std: 0.01838, params: {'max_depth': 9},
 mean: 0.79412, std: 0.01988, params: {'max_depth': 10},
 mean: 0.78711, std: 0.01653, params: {'max_depth': 11},
 mean: 0.78151, std: 0.02015, params: {'max_depth': 12},
 mean: 0.79132, std: 0.00645, params: {'max_depth': 13},
 mean: 0.78571, std: 0.01426, params: {'max_depth': 14},
 mean: 0.78711, std: 0.01229, params: {'max_depth': 15},
 mean: 0.78571, std: 0.01716, params: {'max_depth': 16},
 mean: 0.78711, std: 0.01518, params: {'max_depth': 17},
 mean: 0.78711, std: 0.01518, params: {'max_depth': 18},
 ... (remaining output truncated)

In [32]:
# Each element of this list is a 3-tuple. The middle element
# is the score. You can use
#  [result[1] for result in YOUR_GRID_VARIABLE.grid_scores_]
# to display it
x = brute_force.grid_scores_
# Collect the mean score (index 1) of every candidate, in grid order.
li = [result[1] for result in x]
li

[0.78011204481792717,
 0.79411764705882348,
 0.77170868347338939,
 0.77450980392156865,
 0.78151260504201681,
 0.79131652661064422,
 0.77731092436974791,
 0.78431372549019607,
 0.78151260504201681,
 0.79411764705882348,
 0.78711484593837533,
 0.78151260504201681,
 0.79131652661064422,
 0.7857142857142857,
 0.78711484593837533,
 0.7857142857142857,
 0.78711484593837533,
 0.78711484593837533,
 0.79271708683473385]

In [14]:
# Import matplotlib.pyplot; remember to do %matplotlib inline first
%matplotlib inline
import matplotlib.pyplot

In [16]:
# Use matplotlib.pyplot.plot to plot these grid_scores_
# Once you have that working, use pyplot.plot to also add
# a marker on the best_score (pass 'ro' as the third argument
# to get a red circle on it)
#
# grid_scores_ is a plain Python list, so it has no pandas-style .plot
# accessor. Pull the depth and the mean score out of every entry and hand
# the two sequences to pyplot instead.
depths = [entry.parameters['max_depth'] for entry in brute_force.grid_scores_]
mean_scores = [entry.mean_validation_score for entry in brute_force.grid_scores_]

matplotlib.pyplot.plot(depths, mean_scores)
# Highlight the winning candidate with a red circle.
matplotlib.pyplot.plot(brute_force.best_params_['max_depth'],
                       brute_force.best_score_, 'ro')
matplotlib.pyplot.xlabel('max_depth')
matplotlib.pyplot.ylabel('mean cross-validation score')
matplotlib.pyplot.title('Grid search score by tree depth');

In [None]:
# If you have Graphviz and pydot installed and working,
# display the resulting decision tree

## Brute-force every option (optional)

In [None]:
# Create a new GridSearchCV object and a new parameters
# dictionary. This time, try all combinations of the parameters
# max_features (from 1 to 4 inclusive), max_depth (from 1 to 20)
# and criterion ('gini' or 'entropy').
# What is the best decision tree you can make?