### Sample program for grid search of hyperparameters  

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from joblib import dump

#### Parameters  

In [None]:
# Input CSV file that is read into a DataFrame further down.
csv_in = "winequality-white_small.csv"

#### Read CSV file  

In [None]:
# Load the dataset; the first row of the file supplies the column names.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly -- wrapping it in print() would emit a stray "None" line.
df.info()
display(df.head())

#### Get X and y  

In [None]:
# Pick the target by name and build the features by dropping that same column,
# so X and y stay consistent wherever 'quality' sits in the file (slicing off
# the last column would leak the target into X if the column order differed).
target_col = 'quality'
X = df.drop(columns=target_col)  # explanatory variables
y = df[target_col]  # objective variable
print('X:', X.shape)
display(X.head())
print('y:', y.shape)
print(y.head())

#### Define the estimator and set parameters for grid search  

In [None]:
# Base regressor tuned by the grid search; the seed is fixed at 0.
rfr = RandomForestRegressor(random_state=0)

# Candidate hyperparameter values for the grid search
# (3 forest sizes x 4 depth limits = 12 combinations).
param_grid = dict(
    n_estimators=[50, 100, 500],
    max_depth=[2, 4, 6, None],
)

#### Preparation of objects for cross validation  

In [None]:
# Two 4-fold shuffled splitters with different seeds.
# Inner splitter: used by the grid search to choose hyperparameters.
grid_cv = KFold(n_splits=4, random_state=7, shuffle=True)
# Outer splitter: used to estimate generalization performance.
gen_cv = KFold(n_splits=4, random_state=11, shuffle=True)

#### Define the grid search for hyperparameters  

In [None]:
# Grid search over param_grid, scored by negated MSE on the inner CV splits.
gs = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=grid_cv,
)

#### Estimation of generalization performance  

In [None]:
%%time
# Nested cross-validation: the whole grid search (gs) is re-run inside each
# outer fold of gen_cv and scored on that fold's held-out part.
nested_score = cross_val_score(
    gs, X, y, scoring='neg_mean_squared_error', cv=gen_cv
)
print(nested_score)
# The scores are negated MSE values: flip the sign of their mean and take the
# square root to report an RMSE.
nested_rmse = np.sqrt(-np.mean(nested_score))
print(nested_rmse)

**Generalization performance (RMSE) / 汎化性能 (平均2乗誤差平方根): 0.675**  

#### Cross-validation to obtain the model with the best hyperparameter set (best estimator)  
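- Note: GridSearchCV refits the best hyperparameter set on the whole data by default (refit=True), so gs_best is already fit to the whole data (X, y) after gs.fit(X, y)  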

In [None]:
%%time
# Run the grid search on all of the data and keep the winning model.
# fit() returns the search object itself, so the best estimator can be read
# straight off the call.
gs_best = gs.fit(X, y).best_estimator_

In [None]:
# Show the estimator selected by the grid search.
print(gs_best)

#### Show feature importances of the best model  

In [None]:
# Label each importance value with the name of its feature column.
importances = pd.Series(data=gs_best.feature_importances_, index=X.columns)
print(importances)

In [None]:
# Bar chart of the feature importances, one bar per explanatory variable.
feature_names = X.columns
importance_values = gs_best.feature_importances_
plt.bar(feature_names, importance_values)
plt.ylabel('Importance')
plt.xticks(rotation=90)  # draw the feature names vertically
plt.show()

**Feature (explanatory variable) with the largest importance: alcohol**  
**重要度が最大の説明変数(特徴量): alcohol**  

#### Save the best predictor  

In [None]:
# Build the output file name from a short tag.
tag = 'wine'
model_file = f'rfr_best_{tag}.joblib'
# Write the best estimator to model_file with joblib.
dump(gs_best, model_file)