# Подбор параметров

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# read in the data
boston = load_boston()

# create X (features) and y (response)
X = boston.data
y = boston.target

In [3]:
for i in range(20):
    rand_state = np.random.randint(100)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rand_state)
    model = LinearRegression() 
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = (np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = round(model.score(X_train, y_train),2)
    print('№{} RMSE: {}, R2: {}'.format(i, round(rmse, 5),r2))

№0 RMSE: 4.43993, R2: 0.74
№1 RMSE: 3.9561, R2: 0.73
№2 RMSE: 6.23628, R2: 0.77
№3 RMSE: 4.65018, R2: 0.75
№4 RMSE: 4.35126, R2: 0.72
№5 RMSE: 6.23628, R2: 0.77
№6 RMSE: 4.52315, R2: 0.74
№7 RMSE: 4.06019, R2: 0.74
№8 RMSE: 5.35729, R2: 0.76
№9 RMSE: 4.76798, R2: 0.73
№10 RMSE: 4.7628, R2: 0.74
№11 RMSE: 5.21824, R2: 0.75
№12 RMSE: 5.10872, R2: 0.75
№13 RMSE: 4.7909, R2: 0.73
№14 RMSE: 5.1324, R2: 0.76
№15 RMSE: 4.87158, R2: 0.73
№16 RMSE: 4.13814, R2: 0.72
№17 RMSE: 4.89864, R2: 0.75
№18 RMSE: 5.01652, R2: 0.75
№19 RMSE: 4.77999, R2: 0.75


In [4]:
model = LinearRegression()
results = cross_val_score(model, X, y, cv=10)
print(results)
print('Average result: {}'.format(round(np.mean(results), 3)))

[ 0.73376082  0.4730725  -1.00631454  0.64113984  0.54766046  0.73640292
  0.37828386 -0.12922703 -0.76843243  0.4189435 ]
Average result: 0.203


## GridSearchCV

In [5]:
# instantiate the grid
aran = np.linspace(0.0001,0.1,10)
param_grid = dict(alpha=aran)
grid = GridSearchCV(Lasso(), param_grid=param_grid, cv=10, scoring='r2')

In [6]:
# fit the grid with data
grid.fit(X, y)

GridSearchCV(cv=10, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([0.0001, 0.0112, 0.0223, 0.0334, 0.0445, 0.0556, 0.0667, 0.0778,
       0.0889, 0.1   ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='r2', verbose=0)

In [7]:
# view the complete results (list of named tuples)
for k in grid.cv_results_:
    print(k, ":", grid.cv_results_[k][0])

mean_fit_time : 0.0015040874481201173
std_fit_time : 0.0006769046526173155
mean_score_time : 0.0008833169937133789
std_score_time : 0.0012777896998235677
param_alpha : 0.0001
params : {'alpha': 0.0001}
split0_test_score : 0.7337764275910776
split1_test_score : 0.47329435215804827
split2_test_score : -1.0035617942756199
split3_test_score : 0.6410720314067737
split4_test_score : 0.5478399328225411
split5_test_score : 0.7364729336872822
split6_test_score : 0.378485255506969
split7_test_score : -0.12927075368030905
split8_test_score : -0.7684845464039076
split9_test_score : 0.41888881923551535
mean_test_score : 0.20285126580483714
std_test_score : 0.5947754185281068
rank_test_score : 10


In [8]:
test_scores = grid.cv_results_['mean_test_score']
print(test_scores)

[0.20285127 0.23299723 0.25192648 0.25951362 0.25901309 0.25677857
 0.26001547 0.26163897 0.26307594 0.26432739]


In [9]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.2643273893487372
{'alpha': 0.1}
Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)


## Перебор нескольких параметров одновременно

In [10]:
params = { 'loss': ['ls', 'lad', 'huber', 'quantile'],'learning_rate':aran}

In [11]:
grid = GridSearchCV(GradientBoostingRegressor(), param_grid=params, cv=10, scoring='r2')
grid.fit(X, y)

GridSearchCV(cv=10, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_...
                            

In [12]:
for k in grid.cv_results_:
    print(k, ":", grid.cv_results_[k][0])

mean_fit_time : 0.10801830291748046
std_fit_time : 0.0032307991482699765
mean_score_time : 0.0005620956420898437
std_score_time : 8.36326838203006e-05
param_learning_rate : 0.0001
param_loss : ls
params : {'learning_rate': 0.0001, 'loss': 'ls'}
split0_test_score : -0.13739050778255368
split1_test_score : -0.12629540638553727
split2_test_score : -3.3852487860507265
split3_test_score : -0.9261278570644325
split4_test_score : -0.44782839001877206
split5_test_score : -1.26916273843092
split6_test_score : -0.03166396969029517
split7_test_score : -0.20462510557993152
split8_test_score : -5.134671794072899
split9_test_score : -0.8755988009052735
mean_test_score : -1.253861335598134
std_test_score : 1.600515304539295
rank_test_score : 36


In [13]:
grid.best_estimator_

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.07780000000000001,
                          loss='lad', max_depth=3, max_features=None,
                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                          min_impurity_split=None, min_samples_leaf=1,
                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                          n_estimators=100, n_iter_no_change=None,
                          presort='deprecated', random_state=None,
                          subsample=1.0, tol=0.0001, validation_fraction=0.1,
                          verbose=0, warm_start=False)

## RandomizedSearchCV

In [14]:
rnd_search = RandomizedSearchCV(GradientBoostingRegressor(), param_distributions=params, 
                                scoring='r2')
rnd_search.fit(X, y)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                          

In [15]:
print(rnd_search.best_estimator_)
print(rnd_search.best_params_)
print(rnd_search.best_score_)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.07780000000000001,
                          loss='ls', max_depth=3, max_features=None,
                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                          min_impurity_split=None, min_samples_leaf=1,
                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                          n_estimators=100, n_iter_no_change=None,
                          presort='deprecated', random_state=None,
                          subsample=1.0, tol=0.0001, validation_fraction=0.1,
                          verbose=0, warm_start=False)
{'loss': 'ls', 'learning_rate': 0.07780000000000001}
0.669949761523501


## OOB Score

In [16]:
regr = RandomForestRegressor(oob_score=True, n_estimators=20, max_depth=5)

In [17]:
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=True,
                      random_state=None, verbose=0, warm_start=False)

In [18]:
print(regr.oob_score_)
print(regr.score(X_test,y_test))

0.7810767405002199
0.8495205505333759
