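# Cross-validated hyperparameter search experimentation

Compares a `GradientBoostingRegressor` tuned with `RandomizedSearchCV` against a default and a slow-learning-rate baseline on the scikit-learn diabetes dataset.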

In [16]:
from sklearn import datasets
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import xgboost
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from scipy import stats
import pandas as pd

## Load and split the data

In [5]:
# Load the scikit-learn diabetes regression dataset and separate features from target.
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [20]:
# Base estimator for the search: many small boosting steps (low learning rate),
# with early stopping once 20 consecutive iterations bring no improvement.
reg = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    n_iter_no_change=20,
    random_state=7,
)

# Search space: tree depth drawn from the integers 2..8, subsample fraction
# drawn uniformly from [0, 1].
param_dist = dict(
    max_depth=np.arange(2, 9),
    subsample=stats.uniform(loc=0, scale=1),
)

In [30]:
n_iter = 40


def _prediction_frame(model, X, y, label):
    """Predict with a fitted model and tabulate predictions against observations.

    Parameters
    ----------
    model : fitted estimator (or fitted search object) exposing ``predict``.
    X : features to predict on.
    y : observed targets aligned with ``X``.
    label : value written to the ``params`` column to identify the model.

    Returns
    -------
    (predictions, frame) : the raw predictions and a DataFrame with the
        columns ``prediction``, ``observation`` and ``params``.
    """
    predictions = model.predict(X)
    frame = pd.DataFrame({'prediction': predictions, 'observation': y})
    frame['params'] = label
    return predictions, frame


# Random: sample n_iter candidates from param_dist and score each with 5-fold CV.
# random_state seeds the candidate sampling so the search (and therefore the
# reported "Randomized" result) is the same on every Restart & Run All.
random_search = RandomizedSearchCV(reg, param_distributions=param_dist,
                                   n_iter=n_iter, cv=5, n_jobs=6,
                                   random_state=7)
random_search.fit(X_train, y_train)
# predict() on the search object delegates to the refitted best estimator.
random_predictions, random_prediction_df = _prediction_frame(
    random_search, X_test, y_test, 'Randomized')

# Default: library-default hyperparameters as a baseline.
default_reg = GradientBoostingRegressor(random_state=7)
default_reg.fit(X_train, y_train)
default_predictions, default_prediction_df = _prediction_frame(
    default_reg, X_test, y_test, 'default')

# Slow: same settings as the search's base estimator (low learning rate, many
# estimators, early stopping) but without tuning max_depth / subsample.
slow_reg = GradientBoostingRegressor(random_state=7, n_estimators=1000,
                                     learning_rate=0.01, n_iter_no_change=20)
slow_reg.fit(X_train, y_train)
slow_predictions, slow_prediction_df = _prediction_frame(
    slow_reg, X_test, y_test, 'slow')


In [32]:
# Stack the per-model prediction tables; the 'params' column identifies the model.
prediction_df = pd.concat([default_prediction_df, random_prediction_df, slow_prediction_df])
# Pearson correlation (r, p-value) between predictions and observations, per model.
# The two needed columns are selected explicitly so apply() does not operate on
# the grouping column, which is deprecated in pandas 2.2+.
(prediction_df.groupby('params')[['prediction', 'observation']]
 .apply(lambda df: stats.pearsonr(df['prediction'], df['observation'])))

params
Randomized    (0.6813995602419678, 1.9951285718558276e-13)
default       (0.6429897932742135, 1.0908872232829323e-11)
slow            (0.6668616770656229, 9.72299029031537e-13)
dtype: object