### Randomized Parameter Search

### Randomized Parameter Search

When you cannot do a comprehensive parameter search due to the number of parameters growing _really_ fast, then, we need to be _smarter_ about grid search. I.e. decrease the parameter search space without (hopefully) giving up too much performance. The performance of the classifier may not match to the grid search, but it may often approach it, for a fraction of the computation time. 

RandomizedParameterSearch in scikit-learn does the hard work for us and without losing too much performance, we have similar results to the GridSearch in a much more efficient and faster way.

In [1]:
%matplotlib inline 

import matplotlib as mlp
import matplotlib.pyplot as plt
import numpy as np
from operator import itemgetter
import os
import pandas as pd
from scipy.stats import randint as sp_randint
import sklearn
from sklearn import cross_validation
from sklearn import datasets
from sklearn import ensemble
from sklearn import grid_search
from sklearn import metrics
from sklearn import preprocessing
import time

plt.style.use('fivethirtyeight')

_DATA_DIR = 'data'
_DATA_PATH = os.path.join(_DATA_DIR, 'titanic.csv')

# To encode categorical variables
label_encoder = preprocessing.LabelEncoder()

In [2]:
df = pd.read_csv(_DATA_PATH)

### Preprocessing Variables

In [3]:
del df['row.names']
df['pclass'] = label_encoder.fit_transform(df.pclass)
df['embarked'] = label_encoder.fit_transform(df.embarked)
df['sex'] = label_encoder.fit_transform(df.sex)
def convert_age(number):
    try:
        number = int(number)
    except ValueError:
        number = 0
    return number

def extract_home_destination(address):
    try:
        address = address.split(',')[-1]
    except AttributeError:
        address = ''
    return address

df.age = df.age.apply(convert_age)
# Preprocess first
df['home.dest'] = df['home.dest'].apply(extract_home_destination)
df['destination'] = label_encoder.fit_transform(df['home.dest'])
del df['home.dest']
del df['boat']
del df['ticket']
del df['name']
del df['room']
y = np.array(df.survived.tolist())
del df['survived']

In [4]:
feature_names = ['pclass', 'age', 'embarked', 'sex', 'destination']
X = df[feature_names].as_matrix()

In [5]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=0)

In [6]:
param_grid = {
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 6],
              'min_samples_leaf': [1, 2, 3, 4, 9, 15],
              'n_estimators': [1000, 2000, 3000],
              }

est = ensemble.GradientBoostingClassifier()

start_time = time.time()
# run randomized search
n_iter_search = 20
randomized_search = grid_search.RandomizedSearchCV(est, param_distributions=param_grid,
    n_iter=n_iter_search, n_jobs=4).fit(X_train, y_train)

gs_cv = grid_search.GridSearchCV(est, param_grid, n_jobs=4).fit(X_train, y_train)
end_time = time.time()

print('It took {} seconds'.format(end_time - start_time))

It took 995.277484894 seconds


In [7]:
# best hyperparameter setting
randomized_search.best_params_

{'learning_rate': 0.01,
 'max_depth': 4,
 'min_samples_leaf': 1,
 'n_estimators': 2000}

In [8]:
randomized_search.best_score_

0.80476190476190479

In [None]:
randomized_search.grid_scores_

[mean: 0.77905, std: 0.01553, params: {'n_estimators': 3000, 'learning_rate': 0.1, 'max_depth': 6, 'min_samples_leaf': 3},
 mean: 0.78000, std: 0.02100, params: {'n_estimators': 3000, 'learning_rate': 0.1, 'max_depth': 4, 'min_samples_leaf': 9},
 mean: 0.78286, std: 0.00841, params: {'n_estimators': 3000, 'learning_rate': 0.05, 'max_depth': 4, 'min_samples_leaf': 2},
 mean: 0.80286, std: 0.00233, params: {'n_estimators': 1000, 'learning_rate': 0.01, 'max_depth': 4, 'min_samples_leaf': 3},
 mean: 0.79238, std: 0.00943, params: {'n_estimators': 1000, 'learning_rate': 0.05, 'max_depth': 6, 'min_samples_leaf': 2},
 mean: 0.78571, std: 0.00404, params: {'n_estimators': 3000, 'learning_rate': 0.05, 'max_depth': 6, 'min_samples_leaf': 2},
 mean: 0.77714, std: 0.01234, params: {'n_estimators': 3000, 'learning_rate': 0.1, 'max_depth': 4, 'min_samples_leaf': 2},
 mean: 0.80286, std: 0.00467, params: {'n_estimators': 1000, 'learning_rate': 0.01, 'max_depth': 4, 'min_samples_leaf': 1},
 mean: 0.78

### Comparison between Grid Search and Randomized Parameter Search

In [None]:
clf = ensemble.RandomForestClassifier()

def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for ii, score in enumerate(top_scores):
        print("Model with rank: {}".format(ii + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {}\n".format(score.parameters))

param_grid = {"max_depth": [3, 2, 1, None],
              "max_features": [1, 2, 3, 4],
              "min_samples_split": [1, 2, 3, 4],
              "min_samples_leaf": [1, 2, 3, 4],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
              'n_estimators': [50, 100, 1000, 2000, 3000],
}

n_iter_search = 20
random_search = grid_search.RandomizedSearchCV(clf, param_distributions=param_grid,
                                   n_iter=n_iter_search)

start = time.time()
random_search.fit(X, y)
end = time.time()
print("RandomizedSearchCV took {0:.2f} seconds for {1} candidates"
      " parameter settings.".format((end - start), n_iter_search))
report(random_search.grid_scores_)s


param_grid = {"max_depth": [3, 2, 1, None],
              "max_features": [1, 2, 3, 4],
              "min_samples_split": [1, 2, 3, 4],
              "min_samples_leaf": [1, 2, 3, 4],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
              'n_estimators': [50, 100, 1000, 2000, 3000],
}

gs = grid_search.GridSearchCV(clf, param_grid=param_grid)
start = time.time()
gs.fit(X, y)
end = time.time()

print("GridSearchCV took {0:.2f} seconds for {1} candidate parameter settings.".format(
      (end - start, len(gs.grid_scores_))))
report(gs.grid_scores_)