<a href="https://colab.research.google.com/github/abunchoftigers/Prediction-of-Product-Sales/blob/main/Ensemble_Trees_Exercise_(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ensemble Trees Exercise (Core)

 * by: David Dyer

Import modeling tools

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

from sklearn import set_config
set_config(transform_output='pandas')

from google.colab import drive
import warnings

warnings.simplefilter('ignore')

Mount drive

In [None]:
# Mount Google Drive at /content/drive so files stored there (such as the
# dataset CSV read later in this notebook) can be accessed by path
drive.mount('/content/drive')

Configuration

In [None]:
# Notebook-wide configuration: warnings, pandas display, scikit-learn output
warnings.filterwarnings('ignore')

## Display all columns and all rows of a DataFrame.
## The full option names are spelled out: the shorthand 'display.max_column'
## only resolved through pandas' pattern matching of option names, which can
## break if a similarly named option is ever added.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Show estimators as diagrams and have transformers return pandas DataFrames
set_config(display='diagram', transform_output='pandas')

Load Dataset

In [None]:
# Read the housing CSV from the mounted Drive, then preview the first rows
fpath = '/content/drive/MyDrive/Coding Dojo - Data Science/02 - Intro to Machine Learning/Week 2/data/Boston_Housing_from_Sklearn - Boston_Housing_from_Sklearn.csv'
df = pd.read_csv(filepath_or_buffer=fpath)
df.head(n=5)

In [None]:
# explore data: info() prints column dtypes and non-null counts; describe() is
# the cell's last expression, so its summary-statistics table is displayed
df.info()
df.describe()

# For this assignment we're told that the dataset is all numeric and clean,
# which is why the modeling pipelines in this notebook contain no preprocessing step

In [None]:
# Separate the target column (PRICE) from the feature matrix
y = df['PRICE']
X = df.drop(columns='PRICE')

Train/Test Split

In [None]:
# Hold out a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

Define Regression Evaluation Function

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  """Compute MAE, MSE, RMSE and R^2 for a set of regression predictions.

  Args:
    y_true: Ground-truth target values.
    y_pred: Predicted target values, aligned with ``y_true``.
    label: Text shown in the printed header and stored under ``'Label'``.
    verbose: If truthy, print a formatted report of the four metrics.
    output_dict: If truthy, return the metrics as a dictionary.

  Returns:
    A dict with keys ``'Label'``, ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R^2'``
    when ``output_dict`` is truthy; otherwise ``None``.
  """
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # Take the root of the MSE directly: the ``squared=False`` keyword of
  # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
  # so relying on it raises a TypeError on current versions.
  rmse = np.sqrt(mse)
  r_squared = r2_score(y_true, y_pred)
  if verbose:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict:
    metrics = {'Label':label, 'MAE':mae,
               'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
    return metrics
def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  """Report regression metrics for a fitted model on training and test data.

  Args:
    reg: Fitted estimator (or pipeline) exposing ``predict``.
    X_train: Training features.
    y_train: Training target values.
    X_test: Test features.
    y_test: Test target values.
    verbose: If truthy, print the metrics for both splits.
    output_frame: If truthy, return the metrics as a DataFrame.

  Returns:
    A DataFrame rounded to 3 decimals with one row per split
    ('Training Data', 'Test Data') when ``output_frame`` is truthy;
    otherwise ``None``.
  """
  # Get predictions for training data
  y_train_pred = reg.predict(X_train)
  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  # Blank separator between the two printed reports; skipped when nothing is
  # printed so that verbose=False is genuinely silent
  if verbose:
    print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                    output_dict=output_frame,
                                    label='Test Data')
  # Store results in a dataframe if output_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to none to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

## Train and evaluate a default Bagged Trees model

In [None]:
# Baseline: a BaggingRegressor with default hyperparameters (seeded for
# reproducibility) as the only step of a pipeline, fitted on the training data
bagreg = BaggingRegressor(random_state=42)
bagreg_pipe = make_pipeline(bagreg)
bagreg_pipe.fit(X_train, y_train)

# Score the baseline on both splits with the custom evaluation helper
default_bag_result = evaluate_regression(
    bagreg_pipe,
    X_train,
    y_train,
    X_test,
    y_test,
    output_frame=True,
)

In [None]:
# Obtain list of parameters: the step-prefixed names shown here
# (e.g. 'baggingregressor__n_estimators') are the keys a grid search over
# this pipeline must use
bagreg_pipe.get_params()

In [None]:
# Define parameters to tune (keys are prefixed with the pipeline step name)
param_grid = {'baggingregressor__n_estimators': [5, 10, 20, 30, 40, 50],
              'baggingregressor__max_samples' : [.5, .7, .9, ],
              'baggingregressor__max_features': [.5, .7, .9 ]}
# Instantiate the gridsearch over the bagging pipeline, using all CPU cores
gridsearch = GridSearchCV(bagreg_pipe, param_grid, n_jobs=-1, verbose=1)

# Fit the gridsearch on the training data only
gridsearch.fit(X_train, y_train)

# A bare expression in the middle of a cell is not displayed, so print the
# winning parameter combination explicitly
print(gridsearch.best_params_)

# Evaluate the best pipeline found by the search
best_bagreg_grid = gridsearch.best_estimator_

best_bagreg_result = evaluate_regression(best_bagreg_grid, X_train, y_train, X_test, y_test, output_frame=True)

## Train and evaluate a default random forest

In [None]:
# Default random forest (seeded for reproducibility) as the only pipeline step
rf = RandomForestRegressor(random_state=42)
rf_pipe = make_pipeline(rf)

# Train on the training split only; as the last expression, the fitted
# pipeline is also displayed
rf_pipe.fit(X_train, y_train)

In [None]:
# Score the default random forest on both splits and keep the metrics table
default_rf_result = evaluate_regression(
    rf_pipe,
    X_train,
    y_train,
    X_test,
    y_test,
    output_frame=True,
)

## Use GridSearchCV to tune the Random Forest model to optimize performance on the test set

Check rf params

In [None]:
# List the pipeline's parameters; the 'randomforestregressor__'-prefixed names
# are the keys used when defining the search grid
rf_pipe.get_params()

Define params to try

In [None]:
# Search grid for the random forest step of the pipeline.
# oob_score is intentionally not searched: it only controls whether an
# out-of-bag score is computed after fitting and does not change the trees or
# their predictions, so including [True, False] doubled the number of fits
# while producing identical cross-validation scores.
params = {'randomforestregressor__max_depth': [None,10,15,20],
          'randomforestregressor__n_estimators':[10,100,150,200],
          'randomforestregressor__min_samples_leaf':[2,3,4],
          'randomforestregressor__max_features':['sqrt','log2',None],
          }

In [None]:
# 3-fold cross-validated grid search over the random forest pipeline,
# parallelised across all available cores
gridsearch = GridSearchCV(
    estimator=rf_pipe,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
# Run the search using the training split only
gridsearch.fit(X_train, y_train)

Get the best params

In [None]:
# Parameter combination selected by the random forest grid search
gridsearch.best_params_

Evaluate the best model

In [None]:
# Take the pipeline chosen by the grid search and score it on both splits
best_rf = gridsearch.best_estimator_
best_rf_result = evaluate_regression(
    best_rf,
    X_train,
    y_train,
    X_test,
    y_test,
    output_frame=True,
)

# Questions

1. Which model and model params provided the best results?

In [None]:
# Print every model's train/test metrics under a uniformly formatted heading.
# Each table starts at column 0: a leading space before a DataFrame's text
# shifts only its header row and misaligns it from the rows beneath.
results_by_model = {
    'default bagreg metrics': default_bag_result,
    'best bagreg metrics': best_bagreg_result,
    'default rf metrics': default_rf_result,
    'best rf metrics': best_rf_result,
}
for title, result in results_by_model.items():
  print(f'{title}:\n{result}\n')

 - The default RF model did the best, with the highest R^2 score for both training and test data.

2. Explain how your model will perform if deployed by referring to the metrics. Ex. How close can your stakeholders expect its predictions to be to the true value?

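 - With a test R^2 of about 0.834, the model explains approximately 83.4% of the variance in PRICE on unseen data. This does not mean its predictions match the actual outcomes 83.4% of the time; how close a typical prediction is to the true value is given by the test MAE and RMSE reported above, which are in the same units as PRICE.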