# Grid Search, Random Search

In [78]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns

# Dataset
from sklearn.datasets import make_regression

# sklearn preprocess
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# Search
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Metrics
from sklearn.metrics import mean_squared_error

In [47]:
# Dataframe
df = make_regression(n_samples=2000, n_features=5,
                     n_informative=4, noise=1, random_state=12)

# Split X and y
X= df[0]
y= df[1]

In [33]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random)

In [34]:
# Creating the steps for the pipeline
steps = [ ('scale', StandardScaler()),
          ('model', DecisionTreeRegressor())  ]

# Creating pipeline for Decision Tree Regressor
pipe = Pipeline(steps)

# Fit the model
pipe.fit(X_train, y_train)

Pipeline(steps=[('scale', StandardScaler()),
                ('model', DecisionTreeRegressor())])

### Grid Search

In [86]:
%%timeit
# Creating dictionary of parameters to be tested
params= {'model__max_features': [2,5],
         'model__min_samples_split':[2, 5, 10],
         'model__criterion': ['friedman_mse', 'absolute_error']}

# Applying the Grid Search
grid = GridSearchCV(pipe, param_grid=params, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

# Best model
grid.best_estimator_

1.89 s ± 228 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [75]:
%%timeit
# Creating dictionary of parameters to be tested
params= {'model__max_features': [2,3,4,5],
         'model__min_samples_split':[2,5,6,7,8,10],
         'model__criterion': ['friedman_mse', 'absolute_error']}

# Applying the Grid Search
grid = GridSearchCV(pipe, param_grid=params, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

# Best model
grid.best_estimator_

7.71 s ± 681 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [85]:
# Taking the best estimator
best_grid = grid.best_estimator_

preds_grid = best_grid.predict(X_test)

# RMSE
np.sqrt( mean_squared_error(y_test, preds_grid) )

53.70886778489411

In [64]:
# Best Score
np.sqrt(-grid.cv_results_['mean_test_score'])

array([74.77529745, 71.43364645, 70.35861139, 62.12057004, 61.75300427,
       63.68961232, 70.77323106, 72.78083116, 72.12092212, 60.27517609,
       59.27527033, 61.29225372])

### Randomized Search

In [76]:
%%timeit
# Creating dictionary of parameters to be tested
params= {'model__max_features': [2,5],
         'model__min_samples_split':[2, 5, 10],
         'model__criterion': ['friedman_mse', 'absolute_error']}

# Applying the Grid Search
randcv = RandomizedSearchCV(pipe, param_distributions=params, cv=5, scoring='neg_mean_squared_error')
randcv.fit(X_train, y_train)

# Best model
randcv.best_estimator_

1.47 s ± 140 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [91]:
%%timeit
# Creating dictionary of parameters to be tested
params= {'model__max_features': [2,3,4,5],
         'model__min_samples_split':[2,5,6,7,8,9,10],
         'model__criterion': ['friedman_mse', 'absolute_error']}

# Applying the Grid Search
randcv = RandomizedSearchCV(pipe, param_distributions=params, cv=5, scoring='neg_mean_squared_error')
randcv.fit(X_train, y_train)

# Best model
randcv.best_estimator_

1.52 s ± 352 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [90]:
# Taking the best estimator
best_rand = randcv.best_estimator_

preds_rand = best_rand.predict(X_test)

# RMSE
np.sqrt( mean_squared_error(y_test, preds_rand) )

55.35583215782757

In [73]:
# Best Score
np.sqrt(-randcv.cv_results_['mean_test_score'])

array([60.90828553, 71.58519132, 73.54790483, 61.4266836 , 72.81570668,
       72.91009214, 60.4132181 , 60.64637689, 72.06737796, 61.54023208])

In [92]:
#Difference
(55.35583215782757 - 53.70886778489411)/53.70886778489411

0.030664663785682663