## ***Training (hyperparameter optimization)***

In [None]:
# Baseline model: a random forest with fixed, hand-picked hyperparameters.
# Its train/test accuracy is reported first in the status message below.
rf1 = RandomForestClassifier(max_depth=3, n_estimators=100, random_state=0)
rf1.fit(X_train_diabetes, y_train_diabetes)

# Model instance used as the template estimator for the grid search
rfc_diabetes = RandomForestClassifier(random_state=42)

# Hyperparameter definition
#   bootstrap                 : Method for sampling data points (default is True)
#   ccp_alpha                 : Complexity parameter used for minimal cost-complexity pruning (default is 0.0)
#   class_weight              : Weights associated with classes. If not given, all classes are supposed to have weight one (default is None)
#   criterion                 : Function to measure the quality of a split (default is 'gini')
#   max_depth                 : Maximum number of levels (depth) in each decision tree (default is None)
#   max_features              : Maximum number of features considered for splitting a node (default is 'sqrt')
#   max_leaf_nodes            : Grow trees with max_leaf_nodes in best-first fashion (default is None)
#   max_samples               : If bootstrap is True, the number of samples to draw from X to train each base estimator (default is None)
#   min_impurity_decrease     : A node will be split if this split induces a decrease of the impurity (default is 0.0)
#   min_samples_leaf          : Minimum number of data points allowed in a leaf node (default is 1)
#   min_samples_split         : Minimum number of data points placed in a node before the node is split (default is 2)
#   min_weight_fraction_leaf  : Minimum weighted fraction of the sum total of weights (of all the input samples) (default is 0.0)
#   monotonic_cst             : Indicates the monotonicity constraint to enforce on each feature (default is None)
#   n_estimators              : Number of trees in the forest (default value is 100)
#   n_jobs                    : The number of jobs to run in parallel (default is None)
#   oob_score                 : Whether to use out-of-bag samples to estimate the generalization score. Only available if bootstrap=True (default is False)
#   random_state              : Controls both the randomness of the bootstrapping of the samples used when building trees
#                               and the sampling of the features considered at each split (default is None)
#   verbose                   : Controls the verbosity when fitting and predicting (default is 0)
#   warm_start                : Reuse the solution of the previous call to fit and add more estimators to the ensemble (default is False)
#   For more details go to https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#
# NOTE: 'mse' and 'mae' are regression criteria and are not accepted by
# RandomForestClassifier; including them makes every candidate that uses them
# fail to fit (scored as NaN with a FitFailedWarning), wasting 2/5 of the search.
param_grid = {
    'n_estimators'     : [10, 25, 50],                      # Number of trees in the forest
    'max_depth'        : [5, 10, 15],                       # Maximum number of levels (depth) in each decision tree
    'criterion'        : ['gini', 'entropy', 'log_loss'],   # Function to measure the quality of a split (classification criteria only)
    'min_samples_split': [2, 4, 6],                         # Minimum number of data points placed in a node before the node is split
    'min_samples_leaf' : [1, 2, 4],                         # Minimum number of data points allowed in a leaf node
}

# Grid search evaluates the Cartesian product of the value lists:
#   candidates = 3 * 3 * 3 * 3 * 3 = 243
#   fits       = 243 candidates * 5 CV folds = 1215 (plus one final refit of the best candidate)

# Create the GridSearchCV object
#   cv                  : Determines the cross-validation splitting strategy (default is None). That is, the number of cross-validation splits
#   error_score         : Value to assign to the score if an error occurs in estimator fitting (default is np.nan)
#   estimator           : This is assumed to implement the scikit-learn estimator interface (template)
#   n_jobs              : Number of jobs to run in parallel (default is None)
#   param_grid          : Dictionary with parameters names (str) as keys and lists of parameter settings to try
#   pre_dispatch        : Controls the number of jobs that get dispatched during parallel execution (default is '2*n_jobs')
#   refit               : Refit an estimator using the best found parameters on the whole dataset (default is True)
#   return_train_score  : If False, the cv_results_ attribute will not include training scores (default is False)
#   scoring             : Strategy to evaluate the performance of the cross-validated model on the test set (default is None)
#   verbose             : Controls the verbosity. That is, the higher, the more messages
#   For more details go to https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
grid_search = GridSearchCV(estimator=rfc_diabetes, param_grid=param_grid, cv=5, scoring='accuracy')

# Model fitting with GridSearchCV
grid_search.fit(X_train_diabetes, y_train_diabetes)

# Best performing model (refit on the whole training set because refit=True by default)
best_model_diabetes = grid_search.best_estimator_

# Status message: the two accuracy lines describe the baseline model rf1;
# the lines after the "OPTIMIZING HYPERPARAMETERS" heading describe the grid-search winner.
stat = (
    f"accuracy on training dataset : {round(rf1.score(X_train_diabetes, y_train_diabetes), 3)!s}"
    f"\naccuracy on test dataset     : {round(rf1.score(X_test_diabetes, y_test_diabetes), 3)!s}"
    "\n\nOPTIMIZING HYPERPARAMETERS     "
    f"\n\nbest performing model        : {best_model_diabetes!s}"
    f"\nbest model hyperparameters   : {grid_search.best_params_!s}"
)

print(stat)