<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

<br><h2>Script 05 | Hyperparameter Tuning and Ensemble Modeling</h2>
<br>
Written by Chase Kusterer<br>
<a href="https://github.com/chase-kusterer">GitHub</a> | <a href="https://www.linkedin.com/in/kusterer/">LinkedIn</a>
<br><br><br>

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

<h2>Part I: Preparation</h2>
<br>
Run the following code to import necessary packages, load data, and set display options for pandas. 

In [None]:
########################################
# importing packages
########################################

# essentials
import matplotlib.pyplot as plt # data visualization
import pandas            as pd  # data science essentials
import numpy             as np  # mathematical essentials
import warnings


# model preparation
from sklearn.preprocessing import StandardScaler       # standard scaler
from sklearn.model_selection import train_test_split   # train-test split
from sklearn.model_selection import RandomizedSearchCV # hp tuning


# model results
from sklearn.metrics import roc_auc_score              # auc score
from sklearn.metrics import make_scorer                # customizable scorer
from sklearn.metrics import confusion_matrix           # confusion matrix


# machine learning
from sklearn.tree import DecisionTreeClassifier         # classification trees
from sklearn.tree import plot_tree                      # tree plots
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm


########################################
# loading data and setting display options
########################################
# loading data
titanic = pd.read_excel(io = './datasets/titanic_feature_rich.xlsx')


## Options ##
# setting pandas print options and suppressing warnings
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)
warnings.simplefilter(action = 'ignore', category = UserWarning)


## this code will not produce an output ##

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<strong>User-Defined Functions</strong><br>
Run the following code to load the user-defined functions used throughout this Notebook.

In [None]:
####################
## tuning_results ##
####################
def tuning_results(cv_results, n=5):
    """
    Summarizes hyperparameter tuning output by returning the "n" best-ranked
    candidate models, ordered by "rank_test_score".

    PARAMETERS
    ----------
    cv_results = results dictionary from the attribute ".cv_results_"
    n          = number of models to display

    RETURNS
    -------
    DataFrame with the columns Model_Rank, Mean_Test_Score, SD_Test_Score,
    and Parameters (one row per candidate, best rank first).
    """
    # rendering each parameter dictionary as text, swapping ":" for "=" and
    # trimming the surrounding curly braces
    formatted_params = [str(candidate).replace(":", "=")[1:-1]
                        for candidate in cv_results["params"]]

    # one row per candidate model
    summary = pd.DataFrame(data = {
        "Model_Rank"      : cv_results["rank_test_score"],
        "Mean_Test_Score" : cv_results["mean_test_score"],
        "SD_Test_Score"   : cv_results["std_test_score"],
        "Parameters"      : formatted_params,
    })

    # best-ranked models first, keeping only the top n
    return summary.sort_values(by = "Model_Rank", axis = 0).head(n = n)


############################
## classification_summary ##
############################
def classification_summary(x,
                           y,
                           model,
                           model_name   = "",
                           results_df   = None,
                           tt_split     = True,
                           test_size    = 0.25,
                           scale        = False,
                           full_tree    = False,
                           random_state = 702):
    """
    This function is designed to generate summary statistics for the following
    classification models from scikit-learn:
    * LogisticRegression         - Logistic Regression
    * DecisionTreeClassifier     - Classification Tree
    * RandomForestClassifier     - Random Forest
    * GradientBoostingClassifier - Gradient Boosted Machine


    Additional Functionality
    ------------------------
    This function can standardize the data using StandardScaler() and creates
    training and testing sets using train-test split, stratifying the
    y-variable.

    It will also output a tabular confusion matrix, calculate area under the
    ROC curve (AUC) for the training and testing sets, as well as the train-
    test gap. AUC is computed from hard class predictions (.predict()),
    matching the predict-based AUC scorer used during hyperparameter tuning
    in this notebook.


    PARAMETERS
    ----------
    x            | DataFrame | X-data before train-test split | No default.
    y            | array     | y-data before train-test split | No default.
    model        | model     | model object to fit            | No default.
    model_name   | str       | option to name the model       | Default = ""
    results_df   | DataFrame | earlier results to append to   | Default = None
    tt_split     | bool      | currently unused (kept so      | Default = True
                 |           | existing calls keep working)   |
    test_size    | float     | test set proportion            | Default = 0.25
    scale        | bool      | whether to scale the data      | Default = False
    full_tree    | bool      | currently unused (kept so      | Default = False
                 |           | existing calls keep working)   |
    random_state | int       | seed for train-test split      | Default = 702


    RETURNS
    -------
    DataFrame with the columns 'Model Name', 'train_auc', 'test_auc', and
    'tt_gap'. When results_df is provided, the new model's row is appended
    to it; otherwise a one-row DataFrame is returned.
    """

    ###########
    # scaling #
    ###########

    if scale:
        # instantiating a StandardScaler() object
        scaler = StandardScaler(copy = True)

        # FITTING the scaler and TRANSFORMING the data, then rebuilding a
        # DataFrame with the original column names
        x = pd.DataFrame(data    = scaler.fit_transform(x),
                         columns = list(x.columns))


    ####################
    # train-test split #
    ####################
    # standard train-test split, stratified on the response variable
    x_train, x_test, y_train, y_test = train_test_split(x, # x
                                                        y, # y
                                                        test_size    = test_size,
                                                        random_state = random_state,
                                                        stratify     = y)


    #########################
    # fit - predict - score #
    #########################
    # fitting to training data (fit() also updates the model object in place)
    model.fit(x_train, y_train)


    # predicting on training and testing data
    train_pred = model.predict(x_train)
    model_pred = model.predict(x_test)


    # scoring results
    model_train_auc = round(roc_auc_score(y_true  = y_train,
                                          y_score = train_pred), ndigits = 4)

    model_test_auc  = round(roc_auc_score(y_true  = y_test,
                                          y_score = model_pred), ndigits = 4)

    model_gap       = round(abs(model_train_auc - model_test_auc), ndigits = 4)


    ####################
    # confusion matrix #
    ####################
    # unpacking into four values assumes a binary response variable
    tn, fp, fn, tp = confusion_matrix(y_true = y_test,
                                      y_pred = model_pred).ravel()


    ###########################
    # storing/showing results #
    ###########################
    # column names for the results table
    results_columns = ['Model Name', 'train_auc', 'test_auc', 'tt_gap']

    # building the new row WITH its column names so that it lines up with a
    # previously returned results DataFrame when concatenating
    new_row = pd.DataFrame(data    = [[model_name,
                                       model_train_auc,
                                       model_test_auc,
                                       model_gap]],
                           columns = results_columns)

    # if no results DataFrame provided ("is None" is required here because
    # comparing a DataFrame with "==" is element-wise and cannot be used in
    # an if statement)
    if results_df is None:
        results_df = new_row

    # if results DataFrame provided
    else:
        results_df = pd.concat(objs         = [results_df, new_row],
                               axis         = 0,
                               ignore_index = True)


    print(f"""
    Results for {model_name}
    {'=' * 20}
    Model Type: {model}
    Training Samples: {len(x_train)} 
    Testing  Samples: {len(x_test)}
    
    
    Summary Statistics
    ------------------
    AUC (Train): {model_train_auc}
    AUC (Test) : {model_test_auc}
    TT Gap     : {model_gap}
    
    
    Confusion Matrix (test set)
    ---------------------------
    True Negatives : {tn}
    False Positives: {fp}
    False Negatives: {fn}
    True Positives : {tp}
    """)

    # returning the results so that they can be stored and passed back in
    # through results_df on the next call
    return results_df
    

########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export = False):
    """
    Plots the feature importances of a fitted model as a horizontal bar
    chart.

    PARAMETERS
    ----------
    model  : fitted model exposing the attribute "feature_importances_"
    train  : DataFrame of x-features (i.e., x_data); supplies the number of
             features and the y-axis labels
    export : whether or not to export as a .png image, default False
    """

    # counting features from the DataFrame passed in as an argument (not from
    # a notebook-level variable), so the function works with any x-data
    n_features = train.shape[1]

    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

    if export == True:
        plt.savefig('Feature_Importance_Plot.png')
        
        
########################################
# visual_cm
########################################
def visual_cm(true_y, pred_y, labels = None):
    """
    Creates a visualization of a confusion matrix.

    The heatmap is drawn with matplotlib, which this notebook already
    imports, so no additional plotting package is required.

    PARAMETERS
    ----------
    true_y : true values for the response variable
    pred_y : predicted values for the response variable
    labels : tick labels for the classes (one per class, in the order the
             classes appear in the confusion matrix), default None
             (classes are labeled 0, 1, ...)
    """
    # declaring a confusion matrix object
    cm = confusion_matrix(y_true = true_y,
                          y_pred = pred_y)

    n_classes = cm.shape[0]


    # heatmap
    fig, ax = plt.subplots()
    heatmap = ax.imshow(cm, cmap = 'Blues')
    fig.colorbar(heatmap, ax = ax)


    # setting tick positions and labels
    ticks       = np.arange(n_classes)
    tick_labels = ticks if labels is None else labels

    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(tick_labels)
    ax.set_yticklabels(tick_labels)


    # annotating each cell with its count, switching to white text on the
    # darker half of the color scale so the numbers stay readable
    threshold = cm.max() / 2

    for row in range(n_classes):
        for col in range(n_classes):
            ax.text(col, row, format(cm[row, col], 'g'),
                    ha    = 'center',
                    va    = 'center',
                    color = 'white' if cm[row, col] > threshold else 'black')


    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix of the Classifier')
    plt.show()
    

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<strong>Preparing Data</strong><br>
Run the following code cells to prepare the x- and y-data.

In [None]:
# reversing m_boat so that a value of 1 becomes 0 and a value of 0 becomes 1
# (assumes m_boat is a 0/1 flag -- TODO confirm against the dataset)
titanic['lifeboat'] = abs(titanic['m_boat'] - 1)

<br>

In [None]:
# preparing to partition data
# x-features: every column except the ones dropped below. 'lifeboat' is the
# response variable and 'm_boat' is the column it was derived from, so both
# must be excluded from the x-data.
# NOTE(review): 'survived', 'male', and 'pclass_3' are also excluded; the
# reason is not stated in this notebook -- confirm before changing the list.
x_data   =  titanic.drop(['survived', 'm_boat', 'lifeboat',
                          'male', 'pclass_3'],
                               axis = 1)


# response variable
y_data =  titanic['lifeboat']

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h2>Part II: Hyperparameter Tuning</h2>
<br>
<strong>The Analytics Kitchen</strong><br>
Model selection can be thought of as selecting from the various appliances that can be used for cooking. Hyperparameter tuning can be thought of as an extension of this. For example, if we wanted to cook something in the oven, how hot should the oven be in order to get the best results? How does this compare to using a microwave given its best settings for the job (time, wattage, etc.)?<br><br>
In the same way that we might adjust the temperature of an oven, we can make adjustments to the <strong>hyperparameters</strong> of a machine learning algorithm in order to optimize its results. <a href = "https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)">This Wikipedia page</a> does an excellent job of defining a hyperparameter as: <em>a parameter whose value is set before the learning process begins</em>. In other words, these are arguments that are set before a model sees any data. Available hyperparameters can be found in a model's documentation as optional arguments.
<br><br>
<strong>Cross-Validated Randomized Search</strong><br>
We could manually analyze each combination of hyperparameter values one by one, but this would take a very long time. Instead, we can automate this process by using <a href="https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html">RandomizedSearchCV</a> from scikit-learn. Note that there is a similar method to RandomizedSearchCV called GridSearchCV. GridSearchCV performs an exhaustive search, meaning it will try every combination of hyperparameters it is given. Try out this method if you dare, but keep in mind that it is notoriously slow and may not lead to better results than RandomizedSearchCV.<br><br>
<strong>Note:</strong> Make sure your hyperparameter ranges are of a reasonable size to mitigate a processing bottleneck.

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h4>a) Use the documentation below to discover the hyperparameters for classification trees.</h4>
Then, complete the code to tune the listed hyperparameters.

In [None]:
help(DecisionTreeClassifier)

<br>

In [None]:
# developing hyperparameter ranges
criterion_range = ["gini", "entropy", "log_loss"] # criterion
#splitter_range  = _____ # splitter
#depth_range     = _____ # max_depth
#leaf_range      = _____ # min_samples_leaf


# creating a hyperparameter grid
param_grid = {'criterion' : criterion_range,}
              #'NAME OF HYPERPARAMETER' : HYPERPARAMETER_RANGE,
              #'NAME OF HYPERPARAMETER' : HYPERPARAMETER_RANGE,
              #'NAME OF HYPERPARAMETER' : HYPERPARAMETER_RANGE}


# INSTANTIATING the model object without hyperparameters
model = DecisionTreeClassifier(random_state = 708)


# RandomizedSearchCV object
# make_scorer() scores on hard class predictions (.predict()) by default, so
# the needs_threshold argument (deprecated and later removed from
# scikit-learn) is not passed
tuned_model = RandomizedSearchCV(estimator           = model,
                                 param_distributions = param_grid,
                                 cv                  = 5,
                                 n_iter              = 1000,
                                 random_state        = 702,
                                 scoring             = make_scorer(roc_auc_score))


# FITTING to the FULL DATASET (due to cross-validation)
tuned_model.fit(x_data, y_data)


# printing the optimal parameters and best score
# (best_score_ is the mean cross-validated score of the best candidate)
print("Tuned Parameters  :", tuned_model.best_params_)
print("Tuned CV AUC      :", tuned_model.best_score_.round(4))

In [None]:
# developing hyperparameter ranges
criterion_range = ["gini", "entropy", "log_loss"] # criterion
splitter_range  = ['best', 'random']              # splitter
depth_range     = np.arange(1, 11, 1)             # max_depth
leaf_range      = np.arange(1, 1001, 1)           # min_samples_leaf


# creating a hyperparameter grid
param_grid = {'criterion'        : criterion_range,
              'splitter'         : splitter_range,
              'max_depth'        : depth_range,
              'min_samples_leaf' : leaf_range}


# INSTANTIATING the model object without hyperparameters
model = DecisionTreeClassifier(random_state = 708)


# RandomizedSearchCV object
# make_scorer() scores on hard class predictions (.predict()) by default, so
# the needs_threshold argument (deprecated and later removed from
# scikit-learn) is not passed
tuned_model = RandomizedSearchCV(estimator           = model,
                                 param_distributions = param_grid,
                                 cv                  = 5,
                                 n_iter              = 1000,
                                 random_state        = 702,
                                 scoring             = make_scorer(roc_auc_score))


# FITTING to the FULL DATASET (due to cross-validation)
tuned_model.fit(x_data, y_data)


# printing the optimal parameters and best score
# (best_score_ is the mean cross-validated score of the best candidate)
print("Tuned Parameters  :", tuned_model.best_params_)
print("Tuned CV AUC      :", tuned_model.best_score_.round(4))

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h4>b) Build a classification tree based on the hyperparameter tuning results.</h4>

In [None]:
# instantiating a classification tree
# (random_state matches the estimator used during tuning so that results are
#  reproducible from run to run)
model = DecisionTreeClassifier(splitter         = _____,
                               min_samples_leaf = _____,
                               max_depth        = _____,
                               criterion        = _____,
                               random_state     = 708)


# using the classification_summary function
classification_summary(x          = x_data,
                       y          = y_data,
                       model      = model,
                       model_name = _____)

In [None]:
# instantiating a classification tree
# (random_state matches the estimator used during tuning so that results are
#  reproducible from run to run)
model = DecisionTreeClassifier(splitter         = 'best',
                               min_samples_leaf = 10,
                               max_depth        = 9,
                               criterion        = 'entropy',
                               random_state     = 708)


# using the classification_summary function
classification_summary(x          = x_data,
                       y          = y_data,
                       model      = model,
                       model_name = "Tuned Classification Tree")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h4>c) (Optional) Plot the tree graphically.</h4>

In [None]:
# setting figure size
plt.figure(figsize=(48, 12))


# developing a plotted tree
plot_tree(decision_tree = _____, 
          feature_names = _____,
          filled        = True, 
          rounded       = True, 
          fontsize      = 12)


# rendering the plot
plt.show()

In [None]:
# setting figure size
plt.figure(figsize=(48, 12))


# developing a plotted tree
# (feature names are passed as a plain list, since some scikit-learn releases
#  reject a pandas Index for this argument)
plot_tree(decision_tree = model,
          feature_names = list(x_data.columns),
          filled        = True,
          rounded       = True,
          fontsize      = 12)


# rendering the plot
plt.show()

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h2>Part III: Analyzing Hyperparameter Results</h2><br>
The following code will help in analyzing the results of hyperparameter tuning.

In [None]:
tuned_model.cv_results_

<br>

In [None]:
# checking documentation
help(tuning_results)

<br>

In [None]:
# run tuning_results() on the hyperparameter tuning results
tuning_results(cv_results = tuned_model.cv_results_, n = 5)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h2>Part IV: Random Forest</h2>

A random forest can be thought of as a group of decision trees that are all slightly different from each other. This model type starts by randomly selecting a subset of x-features and builds the best decision tree it can given this information. Afterwards, the model randomly selects a different set of x-features and builds another tree. By building several trees, each observation has been predicted several times. Think of this as each tree getting a vote on whether an observation should be a zero or a one (majority wins). After all votes have been cast, whichever class has the most votes wins and prediction on that observation is complete. For example, if an observation was predicted as zero 20% of the time and one 80% of the time, the final prediction for this observation will be one.<br><br>
<h4>a) Build a random forest.</h4>
Build a random forest using the default values for the hyperparameters listed below. Remember, default values are documented in the help(&nbsp;) file.

In [None]:
help(RandomForestClassifier)

<br>

In [None]:
# INSTANTIATING a random forest model with default hyperparameters
model = RandomForestClassifier(n_estimators     = _____,
                               criterion        = _____,
                               max_depth        = _____,
                               min_samples_leaf = _____,
                               bootstrap        = _____,
                               warm_start       = _____,
                               random_state     = 702)


# using the classification_summary function
classification_summary(x          = x_data,
                       y          = y_data,
                       model      = model,
                       model_name = "Default Random Forest")

In [None]:
# INSTANTIATING a random forest model with default hyperparameters
model = RandomForestClassifier(n_estimators     = 100,
                               criterion        = 'gini',
                               max_depth        = None,
                               min_samples_leaf = 1,
                               bootstrap        = True,
                               warm_start       = False,
                               random_state     = 702)


# using the classification_summary function
classification_summary(x          = x_data,
                       y          = y_data,
                       model      = model,
                       model_name = "Default Random Forest")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h4>b) Write and run the <em>plot_feature_importances</em> function in the code cell below.</h4>

In [None]:
# plotting feature importances
_____

In [None]:
# plotting feature importances
plot_feature_importances(model  = model,
                         train  = x_data,
                         export = False )

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

Run the following code to tune random forest hyperparameters using <strong>RandomizedSearchCV</strong>.

In [None]:
# instantiating a hyperparameter space
estimator_range  = np.arange(100, 1501, 100)
leaf_range       = np.arange(1, 31, 10)
criterion_range  = ['gini', 'entropy']
bootstrap_range  = [True, False]
warm_start_range = [True, False]

# NOTE(review): max_depth is defined here but is not included in param_grid
# below, so tree depth is not tuned by this search -- confirm whether that is
# intended before adding it to the grid
max_depth        = np.arange(1, 11, 1)


# creating a hyperparameter grid
param_grid = {'n_estimators'     : estimator_range,
              'min_samples_leaf' : leaf_range,
              'criterion'        : criterion_range,
              'bootstrap'        : bootstrap_range,
              'warm_start'       : warm_start_range}


# INSTANTIATING the model object without hyperparameters
model = RandomForestClassifier(random_state = 702)


# RandomizedSearchCV object
# make_scorer() scores on hard class predictions (.predict()) by default, so
# the needs_threshold argument (deprecated and later removed from
# scikit-learn) is not passed
tuned_model = RandomizedSearchCV(estimator           = model,
                                 param_distributions = param_grid,
                                 cv                  = 5,
                                 n_iter              = 1000,
                                 random_state        = 702,
                                 scoring             = make_scorer(roc_auc_score))


# FITTING to the FULL DATASET (due to cross-validation)
tuned_model.fit(x_data, y_data)


# PREDICT step is not needed


# printing the optimal parameters and best score
# (best_score_ is the mean cross-validated score of the best candidate)
print("Tuned Parameters  :", tuned_model.best_params_)
print("Tuned CV AUC      :", tuned_model.best_score_.round(decimals = 4))

<br>

In [None]:
# best estimators based on RandomizedSearchCV
tuned_model.best_estimator_

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<strong>Improving Processing Efficiency</strong><br>
Automated hyperparameter optimization can take a long time. In order to avoid having to run this each time a script is utilized, it is a good practice to:
* Run an automated hyperparameter optimization technique and record its results
* Comment out (but not delete) the hyperparameter optimization code
* Manually set each hyperparameter when building a tuned model

<br>
This will help alleviate the processing bottleneck while allowing you to uncomment and rerun the optimization code if needed.
<h4>c) Complete the code to create a tuned random forest model.</h4>

In [None]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a random forest
model = _____


# using the classification_summary function
classification_summary(x          = x_data,
                       y          = y_data,
                       model      = model,
                       model_name = "Tuned Random Forest")

In [None]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a random forest
model = RandomForestClassifier(criterion        = 'entropy',
                               min_samples_leaf = 11,
                               n_estimators     = 100,
                               warm_start       = True,
                               bootstrap        = False,
                               random_state     = 702)


# using the classification_summary function
classification_summary(x          = x_data,
                       y          = y_data,
                       model      = model,
                       model_name = "Tuned Random Forest")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

Run the following code and analyze feature importance of the tuned random forest model.

In [None]:
# plotting feature importances
plot_feature_importances(model  = model ,
                         train  = x_data,
                         export = False )

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h2>Part V: Gradient Boosted Machines</h2>

Gradient boosted machines (GBMs) are like random forests, but instead of starting fresh with each iteration, they learn from the results of trees that have already been built. GBMs also use row-wise optimization instead of a column-wise optimization. This is a similar concept to regularization (ridge, lasso, etc.), but is focused on observations instead of coefficients.<br><br>

<h4>a) Develop a gradient boosting classifier model.</h4>
Develop a <a href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html">GradientBoostingClassifier</a> model with default values for the hyperparameters listed below. Remember, default values are documented in the help(&nbsp;) file.

In [None]:
# INSTANTIATING the model object with its default hyperparameter values
# written out explicitly
model = GradientBoostingClassifier(loss          = _____,
                                   learning_rate = _____,
                                   n_estimators  = _____,
                                   criterion     = _____,
                                   max_depth     = _____,
                                   warm_start    = _____,
                                   random_state  = 702)


# using the classification_summary function
classification_summary(x          = x_data,
                       y          = y_data,
                       model      = model,
                       model_name = "Default Gradient Boosted Machine")

In [None]:
# INSTANTIATING the model object with its default hyperparameter values
# written out explicitly
model = GradientBoostingClassifier(loss          = 'log_loss',
                                   learning_rate = 0.1,
                                   n_estimators  = 100,
                                   criterion     = 'friedman_mse',
                                   max_depth     = 3,
                                   warm_start    = False,
                                   random_state  = 702)


# using the classification_summary function
classification_summary(x          = x_data,
                       y          = y_data,
                       model      = model,
                       model_name = "Default Gradient Boosted Machine")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

Notice that we are using <em>friedman_mse</em> as the criterion above. Friedman proposed that instead of focusing on one MSE value for an entire tree, the algorithm should localize and converge on an optimal MSE for each region of the tree.
<br>
<h4>c) Complete the code to optimize the hyperparameters for the GBM model.</h4>

In [None]:
# developing hyperparameter ranges
loss_range       = _____
learn_range      = _____
estimator_range  = _____
criterion_range  = _____
depth_range      = _____
warm_start_range = _____


# creating a hyperparameter grid
param_grid = {_____}


# INSTANTIATING the model object without hyperparameters
model = GradientBoostingClassifier(random_state = 702)


# RandomizedSearchCV object
# make_scorer() scores on hard class predictions (.predict()) by default, so
# the needs_threshold argument (deprecated and later removed from
# scikit-learn) is not passed
tuned_model = RandomizedSearchCV(estimator           = model,
                                 param_distributions = param_grid,
                                 cv                  = 5,
                                 n_iter              = 500,
                                 random_state        = 702,
                                 scoring             = make_scorer(roc_auc_score))


# FITTING to the FULL DATASET (due to cross-validation)
tuned_model.fit(x_data, y_data)


# PREDICT step is not needed


# printing the optimal parameters and best score
# (best_score_ is the mean cross-validated score of the best candidate)
print("Tuned Parameters  :", tuned_model.best_params_)
print("Tuned CV AUC      :", tuned_model.best_score_.round(4))

In [None]:
# developing hyperparameter ranges
loss_range       = ['log_loss', 'exponential']
learn_range      = np.arange(0.1, 2.2, 0.5)
estimator_range  = np.arange(100, 1501, 100)
criterion_range  = ['friedman_mse', 'squared_error']
depth_range      = np.arange(2, 11, 2)
warm_start_range = [True, False]


# creating a hyperparameter grid
param_grid = {'loss'          : loss_range,
              'learning_rate' : learn_range,
              'max_depth'     : depth_range,
              'criterion'     : criterion_range,
              'n_estimators'  : estimator_range,
              'warm_start'    : warm_start_range}


# INSTANTIATING the model object without hyperparameters
model = GradientBoostingClassifier(random_state = 702)


# RandomizedSearchCV object
# make_scorer() scores on hard class predictions (.predict()) by default, so
# the needs_threshold argument (deprecated and later removed from
# scikit-learn) is not passed
tuned_model = RandomizedSearchCV(estimator           = model,
                                 param_distributions = param_grid,
                                 cv                  = 5,
                                 n_iter              = 500,
                                 random_state        = 702,
                                 scoring             = make_scorer(roc_auc_score))


# FITTING to the FULL DATASET (due to cross-validation)
tuned_model.fit(x_data, y_data)


# PREDICT step is not needed


# printing the optimal parameters and best score
# (best_score_ is the mean cross-validated score of the best candidate)
print("Tuned Parameters  :", tuned_model.best_params_)
print("Tuned CV AUC      :", tuned_model.best_score_.round(4))

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h4>d) Complete the code to check the best estimator for the model.</h4>

In [None]:
# checking the best estimator for the model
tuned_model.best_estimator_

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h4>e) Manually input the optimal set of hyperparameters when instantiating the model object.</h4>

In [None]:
# INSTANTIATING with best_estimator
model = _____


# using the classification_summary function
classification_summary(x          = x_data,
                       y          = y_data,
                       model      = model,
                       model_name = "Tuned Gradient Boosted Machine")

In [None]:
# INSTANTIATING with best_estimator
model = GradientBoostingClassifier(loss          = 'exponential',
                                   learning_rate = 0.1,
                                   max_depth     = 2,
                                   criterion     = 'squared_error', 
                                   n_estimators  = 1100,
                                   warm_start    = False,
                                   random_state  = 702)


# using the classification_summary function
classification_summary(x          = x_data,
                       y          = y_data,
                       model      = model,
                       model_name = "Tuned Gradient Boosted Machine")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<br>

~~~
  _____              ____     ____     _____     
 |" ___|    ___   U |  _"\ u / __"| u |_ " _|    
U| |_  u   |_"_|   \| |_) |/<\___ \/    | |      
\|  _|/     | |     |  _ <   u___) |   /| |\     
 |_|      U/| |\u   |_| \_\  |____/>> u |_|U     
 )(\\,-.-,_|___|_,-.//   \\_  )(  (__)_// \\_    
(__)(_/ \_)-' '-(_/(__)  (__)(__)    (__) (__)   
   ____   _         _      ____    ____     _    
U /"___| |"|    U  /"\  u / __"| u/ __"| uU|"|u  
\| | u U | | u   \/ _ \/ <\___ \/<\___ \/ \| |/  
 | |/__ \| |/__  / ___ \  u___) | u___) |  |_|   
  \____| |_____|/_/   \_\ |____/>>|____/>> (_)   
 _// \\  //  \\  \\    >>  )(  (__))(  (__)|||_  
(__)(__)(_")("_)(__)  (__)(__)    (__)    (__)_) 
                                       


~~~

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

 <br>