#### Random Search with Cross Validation 

Here we perform a cross-validated random search over the model hyperparameters. The best parameters found by the random search are then refined with a grid search around them.

Using **k-fold cross validation** below:

![](https://i.imgur.com/HLbgMSS.png)

Source: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from src.models.ml_utils import predict_ML

# Build, for the selected model type, the hyperparameter search space
# (random_grid), an untuned baseline (base_model) and the cross-validated
# randomized search (random_model). Nothing is fitted here.
if model_type == 'RF':

    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    # Number of features to consider at every split.
    # 1.0 (all features) replaces the former 'auto' alias: for regressors it
    # meant the same thing, but it was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where every candidate using it fails to fit.
    max_features = [1.0, 'sqrt']
    # Maximum number of levels in tree (None = grow until leaves are pure)
    max_depth = [int(x) for x in np.linspace(10, 100, num=10)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}

    # Untuned baseline used as the reference for the tuned models
    base_model = RandomForestRegressor(n_estimators=1000, random_state=42)

    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores
    random_model = RandomizedSearchCV(estimator=RandomForestRegressor(),
                                      param_distributions=random_grid,
                                      n_iter=100, scoring='neg_mean_absolute_error',
                                      cv=3, verbose=2, random_state=42, n_jobs=-1)
elif model_type == 'SVR':

    # Search space for the support vector regressor
    random_grid = {"C": [1e0, 1e1, 1e2, 1e3],
                   "gamma": np.logspace(-2, 2, 5),
                   "kernel": ['rbf', 'sigmoid'],
                   "shrinking": [True, False]}

    # Untuned baseline used as the reference for the tuned models
    base_model = SVR(kernel='rbf', gamma=0.1)

    # The grid only holds 4 * 5 * 2 * 2 = 80 distinct combinations. Asking for
    # more iterations than that triggers a warning (or an error in older
    # scikit-learn releases), so cap the request at the grid size.
    n_candidates = int(np.prod([len(values) for values in random_grid.values()]))

    # Random search of parameters, using 5 fold cross validation,
    # search across at most 100 different combinations, and use all available cores
    random_model = RandomizedSearchCV(estimator=SVR(), cv=5,
                                      n_iter=min(100, n_candidates),
                                      scoring='neg_mean_absolute_error',
                                      param_distributions=random_grid,
                                      verbose=2, random_state=42, n_jobs=-1)
else:
    # Fail here rather than later with a NameError (or a stale model left over
    # from a previous run of this cell).
    raise ValueError("Unsupported model_type: {!r} (expected 'RF' or 'SVR')".format(model_type))
    
# Fit the untuned baseline model
base_model.fit(train_X, train_y)
# Baseline predictions: the first n_train_periods rows are the training span,
# the remaining rows the test span.
# NOTE(review): predict_ML is a project helper; presumably it returns a
# dataframe of reference vs. predicted values indexed by the given index - confirm.
dataFrameTrain_base = predict_ML(base_model, features[:n_train_periods], labels[:n_train_periods], dataframeModel.index[:n_train_periods])
dataFrameTest_base = predict_ML(base_model, features[n_train_periods:], labels[n_train_periods:], dataframeModel.index[n_train_periods:])

# Run the cross-validated random search
random_model.fit(train_X, train_y)
# Print the winning combination explicitly: a bare expression in the middle of
# a cell is evaluated but never displayed.
print(random_model.best_params_)
best_random = random_model.best_estimator_
# Predictions of the best estimator found by the random search
dataFrameTrain_best = predict_ML(best_random, features[:n_train_periods], labels[:n_train_periods], dataframeModel.index[:n_train_periods])
dataFrameTest_best = predict_ML(best_random, features[n_train_periods:], labels[n_train_periods:], dataframeModel.index[n_train_periods:])

In [None]:
from sklearn.model_selection import GridSearchCV

# Build the exhaustive grid search (grid_search) for the selected model type.
# NOTE: the candidate values below are hard-coded; adjust them by hand so they
# bracket the best parameters reported by the random search.
if model_type == 'RF':
    grid_estimator = RandomForestRegressor()
    # Create the parameter grid based on the results of random search
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110],
        'max_features': [2, 3],
        'min_samples_leaf': [1, 2],
        'min_samples_split': [2, 3],
        'n_estimators': [200, 300, 400, 1000],
    }
elif model_type == 'SVR':
    grid_estimator = SVR()
    # Create the parameter grid based on the results of random search
    param_grid = {
        "C": [1e0, 1e1, 1e2, 1e3],
        "gamma": np.logspace(-2, 2, 5),
        "shrinking": [True, False],
    }
else:
    # Fail here rather than later with a NameError (or a stale grid_search left
    # over from a previous run of this cell).
    raise ValueError("Unsupported model_type: {!r} (expected 'RF' or 'SVR')".format(model_type))

# Instantiate the grid search model: 3 fold cross validation, scored by
# negative mean absolute error, using all available cores
grid_search = GridSearchCV(estimator=grid_estimator, param_grid=param_grid,
                           scoring='neg_mean_absolute_error', cv=3,
                           n_jobs=-1, verbose=2)

# Fit the grid search to the data. Do this only once: every call re-trains
# each parameter combination on each fold and overwrites the previous result.
grid_search.fit(train_X, train_y)

Inspect the best parameters found by the grid search and compute the train and test predictions of the best estimator.

In [None]:
# Show the winning parameter combination and the corresponding estimator
print(grid_search.best_params_)
best_grid = grid_search.best_estimator_
print(best_grid)

# Predict with the best estimator on the training span (rows before
# n_train_periods) and then on the test span (rows from n_train_periods on).
dataFrameTrain_best_grid, dataFrameTest_best_grid = (
    predict_ML(best_grid, features[span], labels[span], dataframeModel.index[span])
    for span in (slice(None, n_train_periods), slice(n_train_periods, None))
)

If you are happy with the best predictions of the grid search, put them in a dataframe for plotting and archiving.

In [None]:
# Merge the train and test predictions into a single frame for export;
# combine_first keeps the train values wherever both frames have an entry.
dataFrameExport = dataFrameTrain_best_grid.copy().combine_first(dataFrameTest_best_grid)

# Score the reference values against the predictions, separately per split
metrics_model = {
    'train': metrics(dataFrameTrain_best_grid['reference'], dataFrameTrain_best_grid['prediction']),
    'test': metrics(dataFrameTest_best_grid['reference'], dataFrameTest_best_grid['prediction']),
}

# Archive the tuned model together with its metrics and predictions
records.archive_model(
    test_model,
    model_full_name + '_best_grid_search',
    metrics_model,
    dataFrameExport,
    best_grid,
    model_type,
    model_target,
    ratio_train,
)

# Print one row per metric with the train and test value side by side
print('Metrics Summary:')
print("{:<23} {:<7} {:<5}".format('Metric', 'Train', 'Test'))
for metric in metrics_model['train'].keys():
    print("\t".join((
        "{:<20}".format(metric),
        "{:0.3f}".format(metrics_model['train'][metric]),
        "{:0.3f}".format(metrics_model['test'][metric]),
    )))