In [None]:
# cloning GitHub Repo
!git clone https://github.com/chase-kusterer/Computational-Analytics.git


# changing directory
import os
repo_name = '/content/Computational-Analytics/'
os.chdir(repo_name)


# checking results
print(f"Current working directory changed to: {os.getcwd()}")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

<br><h1>Script 7 |  Ensemble Models</h1>
<h4>DAT-5390 | Computational Data Analytics with Python</h4>
Chase Kusterer - Faculty of Analytics<br>
Hult International Business School<br><br><br>

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

<h2>Part I: Preparation and Exploration</h2>
<br><h4>a) Imports and Initial Setup</h4>
Run the following code to import packages and load the dataset into Python.

In [None]:
# installing baserush on colab
# (%pip installs into the environment of the running kernel; no version is
#  pinned here, so whichever release is current gets installed)
%pip install baserush

<br>

In [None]:
# importing critical libraries
import pandas            as pd           # data science essentials
import numpy             as np           # mathematical essentials
import matplotlib.pyplot as plt          # data visualization
import seaborn           as sns          # enhanced data viz
from baserush.optimize import quick_tree # stable tree-based modeling  


# importing machine learning models
from sklearn.tree     import DecisionTreeRegressor     # regression trees
from sklearn.ensemble import RandomForestRegressor     # random forest
from sklearn.ensemble import GradientBoostingRegressor # gbm


# importing machine learning tools
from sklearn.model_selection import train_test_split # train-test split
from sklearn.preprocessing import StandardScaler     # standard scaler
from sklearn.tree import plot_tree                   # tree plots


# reading the housing workbook and discarding the property_id column
file    = './datasets/housing_feature_rich.xlsx'
housing = pd.read_excel(io = file).drop(columns = ['property_id'])


# widening pandas display limits
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)


# showing floats with two decimal places
pd.set_option('display.float_format', '{:.2f}'.format)


# previewing the first five rows
housing.head(n = 5)

<br>

In [None]:
########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, x_data, export = False,
                             filename = 'Tree_Leaf_50_Feature_Importance.png'):
    """
    Plots the importance of features from a fitted tree-based model as a
    horizontal bar chart (one bar per column of x_data).
    
    PARAMETERS
    ----------
    model    : fitted model exposing a feature_importances_ attribute
    x_data   : x-feature data (DataFrame) whose columns match, in number and
               order, the features the model was fit on
    export   : whether or not to export as a .png image, default False
    filename : path of the exported image, used only when export is truthy,
               default 'Tree_Leaf_50_Feature_Importance.png'
    
    RAISES
    ------
    ValueError : if the number of columns in x_data differs from the number
                 of feature importances stored on the model
    """
    
    # declaring the number of features
    n_features  = x_data.shape[1]
    importances = np.asarray(model.feature_importances_)
    
    # guarding against x_data that was redefined after the model was fit
    if len(importances) != n_features:
        raise ValueError(
            f"x_data has {n_features} columns but the model has "
            f"{len(importances)} feature importances; pass the x_data "
            f"the model was fit on.")
    
    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))
    
    # drawing on the axes created above rather than pyplot's implicit figure
    ax.barh(range(n_features), importances, align='center')
    ax.set_yticks(np.arange(n_features))
    ax.set_yticklabels(x_data.columns)
    ax.set_xlabel("Feature importance")
    ax.set_ylabel("Feature")
    
    # exporting to the requested file
    if export:
        fig.savefig(filename)

<br>

In [None]:
#################################
## original data (full models) ##
#################################
# every column except the two response variables
x_all = housing.columns.drop(['Sale_Price', 'log_Sale_Price']).tolist()

# continuous x-data (columns Lot_Area through Porch_Area, inclusive)
x_original = housing.loc[ : , 'Lot_Area' : 'Porch_Area' ].columns.tolist()



################
## original y ##
################
# feature lists labeled for use with the original response (Sale_Price)

# best base model
x_base = ['Mas_Vnr_Area',  'Total_Bsmt_SF', 'First_Flr_SF',
          'Second_Flr_SF', 'Garage_Area']


# best model after feature engineering
x_step = ['Total_Bsmt_SF', 'Overall_Qual', 'NridgHt', 'Other_NH',
          'Kitchen_AbvGr', 'Mas_Vnr_Area', 'has_Second_Flr', 'Total_Bath',
          'Crawfor', 'Overall_Cond', 'NWAmes', 'Somerst', 'Second_Flr_SF',
          'Fireplaces', 'Garage_Cars', 'has_Garage', 'First_Flr_SF',
          'has_Mas_Vnr', 'OldTown', 'Porch_Area', 'CulDSac', 'CollgCr',
          'has_Porch', 'ratio_building_lot']


###################
## logarithmic y ##
###################
# feature list labeled for use with the logarithmic response (log_Sale_Price)

# best model after feature engineering (log y)
x_step_log_y = ['Gr_Liv_Area', 'Overall_Qual', 'Garage_Cars', 'Total_Bsmt_SF',
                'log_Lot_Area', 'OldTown', 'Overall_Cond', 'log_Gr_Liv_Area',
                'Kitchen_AbvGr', 'Total_Bath', 'has_Second_Flr',
                'Second_Flr_SF', 'NridgHt', 'Fireplaces', 'NWAmes', 'Somerst',
                'Porch_Area', 'CollgCr', 'Crawfor', 'First_Flr_SF', 'Edwards',
                'CulDSac', 'm_Mas_Vnr_Area']


########################
## response variables ##
########################
# column names of the two response variables (both are excluded from x_all)
original_y = 'Sale_Price'
log_y      = 'log_Sale_Price'

<br>

In [None]:
# preparing x-features
# (fill in each blank, _____, before running this cell)
x_data = _____


# preparing y-feature
y_data = _____

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

Preparing training and testing sets.

In [None]:
# train-test split
# (test_size = 0.25 holds out a quarter of the rows for testing; the fixed
#  random_state keeps the split the same on every run)
x_train, x_test, y_train, y_test = train_test_split(
            x_data,
            y_data,
            test_size    = 0.25,
            random_state = 702)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h2>Part II: Regression Trees (CART Models)</h2><br>
CART models are very useful in regression problems as they output interesting tools such as <strong>tree plots</strong> and <strong>feature importance</strong>. As they are a nonparametric model type, they have no coefficients. <strong>They also assume no model form, meaning that we do not need to transform any features or engineer new ones.</strong> CART models are meant to work out of the box.<br><br>

<strong>CART Model Highlights</strong><br>

* tend to overfit unless pruned
* tend to be worse at prediction than other model types (after pruning)
* can generate very useful outputs for developing hypotheses and data-driven findings


Run the following code to view the documentation for <strong>quick_tree</strong>, a helper function (imported from baserush) for CART model output.

In [None]:
# displaying the help file for quick_tree (imported from baserush.optimize)
help(quick_tree)

<br>

In [None]:
# running quick_tree with a regression tree
# (fill in each blank, _____; see help(quick_tree) above for the arguments)
quick_tree(x_data           = _____,
           y_data           = _____,
           model_type       = DecisionTreeRegressor,
           max_leaf_samples = _____,
           max_depth        = _____,
           cv_folds         = 3)

<br>

In [None]:
# INSTANTIATING the best decision tree model (fill in its hyperparameters)
model = DecisionTreeRegressor(_____)


# FITTING to the training data
model.fit(x_train, y_train)


# PREDICTING on new data
tree_pred = model.predict(x_test)


# SCORING the results (each value rounded to four decimal places)
tree_score_train = round(model.score(x_train, y_train), 4)
tree_score_test  = round(model.score(x_test, y_test), 4)
tree_gap         = round(abs(tree_score_train - tree_score_test), 4)


# checking results
print("\nRegression Tree\n"
      "---------------\n"
      f"Training Score: {tree_score_train}\n"
      f"Testing Score : {tree_score_test}\n"
      f"Train-Test Gap: {tree_gap}\n")

<br>

In [None]:
# checking feature importance for the regression tree fit above
# (x_data supplies the column names used as bar labels)
plot_feature_importances(model  = model,
                         x_data = x_data,
                         export = False)

<br>

In [None]:
# setting figure size (large canvas so the node text stays legible)
plt.figure(figsize=(60, 36))


# developing a plotted tree
plot_tree(decision_tree = model, # the regression tree fit above
          feature_names = x_data.columns,
          filled        = True, 
          rounded       = True, 
          fontsize      = 14)


# rendering the tree
plt.show()

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h2>Part III: Random Forest</h2><br>
A random forest can be thought of as a group of decision trees that are all slightly different from each other. This model type starts by randomly selecting a subset of explanatory variables and building a decision tree. Then, it takes another random subset of explanatory variables and builds another tree. After building several trees, each observation has several different results for its predicted value. This can be thought of as giving each tree a voice as to what the final prediction should be for each observation.

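For example, in a classification problem, one observation may have been voted positive 80% of the time (the event in question occurred), and voted negative 20% of the time (the event in question did not occur). After all votes have been cast, whichever class has the most votes wins, and prediction on the observation is complete. In a regression problem such as the one in this script, the trees' predictions are averaged instead of being counted as votes.<br><br>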
<h4>a) Build a random forest model.</h4>
Instantiate a random forest model using its default hyperparameters for the options listed in the code below. You know how to do this. Here is a help file to "help" you out. :)

In [None]:
# displaying the help file to look up RandomForestRegressor's default values
help(RandomForestRegressor)

<br>

In [None]:
# INSTANTIATING a random forest model with default values
# (fill in each blank, _____, with the default shown in the help file above)
model = RandomForestRegressor(n_estimators     = _____,
                              criterion        = _____,
                              max_depth        = _____,
                              min_samples_leaf = _____,
                              bootstrap        = _____,
                              warm_start       = _____,
                              random_state     = 702)

<br>

In [None]:
# FITTING the training data
model_fit = model.fit(x_train, y_train)


# PREDICTING based on the testing set
model_pred = model.predict(x_test)


# SCORING the results
# (scores are rounded to four decimal places; the gap is the absolute
#  difference between the rounded training and testing scores)
model_train_score = round(model.score(x_train, y_train), ndigits = 4)
model_test_score  = round(model.score(x_test , y_test) , ndigits = 4)
model_gap         = round(abs(model_train_score - model_test_score), ndigits = 4)


# displaying results
print('Training Score :', model_train_score)
print('Testing Score  :', model_test_score)
print('Train-Test Gap :', model_gap)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<h3>Tuned Random Forest</h3>
<br>
<strong>b) Build a stable random forest model.</strong><br>
Set a limit for <strong>max_depth</strong> (its default, None, lets each tree grow without restriction) and increase the value of <strong>min_samples_leaf</strong>. Recall that a stable model will have a train-test gap of less than or equal to 0.05.

In [None]:
# INSTANTIATING a random forest model with tuned max_depth and
# min_samples_leaf (fill in the two blanks; the target is a train-test gap of
# 0.05 or less)
model = RandomForestRegressor(n_estimators     = 100,
                              criterion        = 'squared_error',
                              max_depth        = _____,
                              min_samples_leaf = _____,
                              bootstrap        = True,
                              warm_start       = False,
                              random_state     = 702)


# FITTING the training data
model_fit = model.fit(x_train, y_train)


# PREDICTING based on the testing set
model_pred = model.predict(x_test)


# SCORING the results (rounded to four decimal places)
model_train_score = round(model.score(x_train, y_train), ndigits = 4)
model_test_score  = round(model.score(x_test , y_test) , ndigits = 4)
model_gap         = round(abs(model_train_score - model_test_score), ndigits = 4)


# displaying results
print('Training Score :', model_train_score)
print('Testing Score  :', model_test_score)
print('Train-Test Gap :', model_gap)

<br>

In [None]:
# plotting feature importance for the tuned random forest
plot_feature_importances(model  = model,
                         x_data = x_data,
                         export = False)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<strong>c) Build a random forest model with all x-features except for Overall Quality.</strong><br>
Notice how Overall Quality has taken over the model. Let's build a random forest model with all features except for this one.

In [None]:
# preparing x-features
# (fill in the blank with all x-features except for Overall Quality)
x_data = _____


# preparing y-feature
y_data = housing[ original_y ]


# train-test split (same test_size and random_state as the earlier splits)
x_train, x_test, y_train, y_test = train_test_split(
            x_data,
            y_data,
            test_size    = 0.25,
            random_state = 702)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<strong>d) Complete the code below to optimize a regression tree based on the new x_data.</strong>

In [None]:
# building a stable tree model
# (fill in the blanks with the x- and y-data prepared in the cell above)
quick_tree(x_data           = _____,
           y_data           = _____,
           model_type       = DecisionTreeRegressor,
           max_leaf_samples = 50,
           max_depth        = 20,
           cv_folds         = 3)

<br>

In [None]:
# INSTANTIATING a random forest model on the x-data without Overall Quality
# (fill in max_depth and min_samples_leaf; presumably these come from the
#  quick_tree results above, as in part g)
model = RandomForestRegressor(n_estimators     = 100,
                              criterion        = 'squared_error',
                              max_depth        = _____,
                              min_samples_leaf = _____,
                              bootstrap        = True,
                              warm_start       = False,
                              random_state     = 702)


# FITTING the training data
model_fit = model.fit(x_train, y_train)


# PREDICTING based on the testing set
model_pred = model.predict(x_test)


# SCORING the results (rounded to four decimal places)
model_train_score = round(model.score(x_train, y_train), ndigits = 4)
model_test_score  = round(model.score(x_test , y_test) , ndigits = 4)
model_gap         = round(abs(model_train_score - model_test_score), ndigits = 4)


# displaying results
print('Training Score :', model_train_score)
print('Testing Score  :', model_test_score)
print('Train-Test Gap :', model_gap)

<br>

In [None]:
# plotting feature importance for the random forest fit above
plot_feature_importances(model  = model,
                         x_data = x_data,
                         export = False)

<br>

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<strong>e) Build a random forest model with all x-features except for Overall Quality and Garage Cars.</strong><br>
Now Garage Cars is taking over the model! Let's remove this feature as well and see what happens.

In [None]:
# preparing x-features
# (fill in the blank with all x-features except for Overall Quality and
#  Garage Cars)
x_data = _____


# preparing y-feature
y_data = housing[ original_y ]


# train-test split (same test_size and random_state as the earlier splits)
x_train, x_test, y_train, y_test = train_test_split(
            x_data,
            y_data,
            test_size    = 0.25,
            random_state = 702)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<strong>f) Complete the code below to optimize a regression tree based on the new x_data.</strong>

In [None]:
# building a stable tree model
# (fill in the blanks with the x- and y-data prepared in the cell above)
quick_tree(x_data           = _____,
           y_data           = _____,
           model_type       = DecisionTreeRegressor,
           max_leaf_samples = 50,
           max_depth        = 20,
           cv_folds         = 3)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<strong>g) Build a random forest model based on the results from the regression tree above.</strong>

In [None]:
# INSTANTIATING a random forest model based on the regression tree results
# above (fill in max_depth and min_samples_leaf)
model = RandomForestRegressor(n_estimators     = 100,
                              criterion        = 'squared_error',
                              max_depth        = _____,
                              min_samples_leaf = _____,
                              bootstrap        = True,
                              warm_start       = False,
                              random_state     = 702)


# FITTING the training data
model_fit = model.fit(x_train, y_train)


# PREDICTING based on the testing set
model_pred = model.predict(x_test)


# SCORING the results (rounded to four decimal places)
model_train_score = round(model.score(x_train, y_train), ndigits = 4)
model_test_score  = round(model.score(x_test , y_test) , ndigits = 4)
model_gap         = round(abs(model_train_score - model_test_score), ndigits = 4)


# displaying results
print('Training Score :', model_train_score)
print('Testing Score  :', model_test_score)
print('Train-Test Gap :', model_gap)

<br>

In [None]:
# plotting feature importance for the random forest fit above
plot_feature_importances(model  = model,
                         x_data = x_data,
                         export = False)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h2>Part IV: Gradient Boosting Machines</h2><br>
Gradient boosting machines (GBMs) are like decision trees, but instead of starting fresh with each iteration, they learn from the performance results of previous iterations. Unlike random forest, GBMs use a row-wise penalty instead of a column-wise penalty, reweighting each row instead of each column. Before getting started, we need to standardize the data due to the learning rate utilized in gradient boosting.<br><br>

In [None]:
# INSTANTIATING a StandardScaler() object
scaler = StandardScaler()


# FITTING and TRANSFORMING
# NOTE(review): the scaler is fit on every row of housing, before any
# train-test split, so rows that later land in a testing set also contribute
# to the scaling statistics
x_scaled = scaler.fit_transform(housing[ x_all ])


# converting scaled data into a DataFrame
# (labeling the columns and keeping housing's row index in the same step so
#  the scaled x-data stays aligned with y-data taken from housing)
x_scaled_df = pd.DataFrame(data    = x_scaled,
                           index   = housing.index,
                           columns = x_all)


# checking the results
x_scaled_df.describe(include = 'number').round(decimals = 2)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<strong>a) Complete the code below to prepare the x- and y-data on the standardized version of the dataset.</strong>

In [None]:
# preparing x-features (standardized)
# (fill in the blank with the standardized DataFrame built in the cell above)
x_data = _____[ x_all ]


# preparing y-feature
y_data = housing[ original_y ]


# train-test split (same test_size and random_state as the earlier splits)
x_train, x_test, y_train, y_test = train_test_split(
            x_data,
            y_data,
            test_size    = 0.25,
            random_state = 702)

<br>

In [None]:
# displaying the help file to look up GradientBoostingRegressor's default values
help(GradientBoostingRegressor)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<strong>b) Build a gradient boosting model based on the default hyperparameters from the documentation above.</strong>

In [None]:
# INSTANTIATING the model object with default values
# (fill in each blank, _____, with the default shown in the help file above)
model = GradientBoostingRegressor(loss             = _____,
                                  learning_rate    = _____,
                                  n_estimators     = _____,
                                  criterion        = _____,
                                  min_samples_leaf = _____,
                                  max_depth        = _____,
                                  warm_start       = _____,
                                  random_state     = 702)


# FITTING the training data
model_fit = model.fit(x_train, y_train)


# PREDICTING based on the testing set
model_pred = model.predict(x_test)


# SCORING the results (rounded to four decimal places)
model_train_score = round(model.score(x_train, y_train), ndigits = 4)
model_test_score  = round(model.score(x_test , y_test) , ndigits = 4)
model_gap         = round(abs(model_train_score - model_test_score), ndigits = 4)


# displaying results
print('Training Score :', model_train_score)
print('Testing Score  :', model_test_score)
print('Train-Test Gap :', model_gap)

<br>

In [None]:
# plotting feature importance for the gradient boosting model fit above
plot_feature_importances(model  = model,
                         x_data = x_data,
                         export = False)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<strong>c) Complete the code below to optimize a regression tree based on the new x_data.</strong>

In [None]:
# building a stable tree model
# (fill in the blanks with the standardized x-data and the y-data prepared
#  in part a)
quick_tree(x_data           = _____,
           y_data           = _____,
           model_type       = DecisionTreeRegressor,
           max_leaf_samples = 50,
           max_depth        = 20,
           cv_folds         = 3)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<strong>d) Build a gradient boosting regressor model based on the results from the regression tree above.</strong>

In [None]:
# INSTANTIATING the model object based on the regression tree results above
# (fill in min_samples_leaf and max_depth)
model = GradientBoostingRegressor(loss             = 'squared_error',
                                  learning_rate    = 0.1,
                                  n_estimators     = 100,
                                  criterion        = 'friedman_mse',
                                  min_samples_leaf = _____,
                                  max_depth        = _____,
                                  warm_start       = False,
                                  random_state     = 702)


# FITTING the training data
model_fit = model.fit(x_train, y_train)


# PREDICTING based on the testing set
model_pred = model.predict(x_test)


# SCORING the results (rounded to four decimal places)
model_train_score = round(model.score(x_train, y_train), ndigits = 4)
model_test_score  = round(model.score(x_test , y_test) , ndigits = 4)
model_gap         = round(abs(model_train_score - model_test_score), ndigits = 4)


# displaying results
print('Training Score :', model_train_score)
print('Testing Score  :', model_test_score)
print('Train-Test Gap :', model_gap)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>
<strong>e) Test out different hyperparameters for the gradient boosting model.</strong><br>
GBMs operate differently than regression trees and random forest. One of the best ways to get to know this model type better is to tinker with it and analyze the results. Try changing your x- and y-data as well.

In [None]:
# INSTANTIATING the model object with hyperparameters of your choosing
# (fill in each blank, _____, with a value to experiment with)
model = GradientBoostingRegressor(loss             = _____,
                                  learning_rate    = _____,
                                  n_estimators     = _____,
                                  criterion        = _____,
                                  min_samples_leaf = _____,
                                  max_depth        = _____,
                                  warm_start       = _____,
                                  random_state     = 702)


# FITTING the training data
model_fit = model.fit(x_train, y_train)


# PREDICTING based on the testing set
model_pred = model.predict(x_test)


# SCORING the results (rounded to four decimal places)
model_train_score = round(model.score(x_train, y_train), ndigits = 4)
model_test_score  = round(model.score(x_test , y_test) , ndigits = 4)
model_gap         = round(abs(model_train_score - model_test_score), ndigits = 4)


# displaying results
print('Training Score :', model_train_score)
print('Testing Score  :', model_test_score)
print('Train-Test Gap :', model_gap)

<br>

In [None]:
# plotting feature importance for the gradient boosting model fit above
plot_feature_importances(model  = model,
                         x_data = x_data,
                         export = False)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

~~~
      ___  ___  __                 
|__/ |__  |__  |__)                
|  \ |___ |___ |                   
                                   
 __   __   __               __    /
/ _` |__) /  \ |  | | |\ | / _`  / 
\__> |  \ \__/ |/\| | | \| \__> .  



~~~

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

<br>