# Boosted Trees Regression (BQML)

In [38]:
###########################################################################
#
#  Copyright 2021 Google Inc.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
# This solution, including any related sample code or data, is made available 
# on an “as is,” “as available,” and “with all faults” basis, solely for 
# illustrative purposes, and without warranty or representation of any kind. 
# This solution is experimental, unsupported and provided solely for your 
# convenience. Your use of it is subject to your agreements with Google, as 
# applicable, and may constitute a beta feature as defined under those 
# agreements.  To the extent that you make any data available to Google in 
# connection with your use of the solution, you represent and warrant that you 
# have all necessary and appropriate rights, consents and permissions to permit 
# Google to use and process that data.  By using any portion of this solution, 
# you acknowledge, assume and accept all risks, known and unknown, associated 
# with its usage, including with respect to your deployment of any portion of 
# this solution in your systems, or usage in connection with your business, 
# if at all.
###########################################################################

## 0) Dependencies

In [39]:
################################################################################
######################### CHANGE BQ PROJECT NAME BELOW #########################
################################################################################

# Google Cloud project ID; it becomes the default project for the BigQuery
# client and the %%bigquery magics. Fill it in before running the notebook.
project_name = ""

In [40]:
# Google credentials authentication libraries
from google.colab import auth
auth.authenticate_user()

# BigQuery Magics
'''
BigQuery magics are used to run BigQuery SQL queries in a python environment.
These queries can also be run in the BigQuery UI
'''

from google.cloud import bigquery
from google.cloud.bigquery import magics
magics.context.project = project_name #update project name 
client = bigquery.Client(project=magics.context.project)
%load_ext google.cloud.bigquery
bigquery.USE_LEGACY_SQL = False


# data processing libraries
import numpy as np
import pandas as pd


# modeling and metrics
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm

!pip install relativeImp
from relativeImp import relativeImp
!pip install shap
import shap

# visutalization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px



The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 1) Import dataset

Import the data using the BigQuery cell magic (`%%bigquery`). 
The query pulls all of the data from the cleaned data table and stores it in a dataframe named "df".

In [41]:
################################################################################
######################### CHANGE BQ PROJECT NAME BELOW #########################
################################################################################

In [42]:
%%bigquery df
SELECT
  *
FROM
  `.RBA_demo.cleaned_data`  # update project name (goes in front of the leading dot)
ORDER BY
  date;

In [43]:
# List the column names of the imported dataframe.
df.columns

Index(['x6lag0dreturns0_6adstock1_0', 'x9lag0dreturns0_8adstock0_9',
       'x10lag0dreturns1_0adstock1_0', 'x11lag0dreturns0_9adstock1_0',
       'x12lag0dreturns1_0adstock1_0', 'x14lag0dreturns1_0adstock1_0',
       'x16lag0dreturns1_0adstock0_9', 'x17lag1dreturns0_8adstock0_7',
       'x26lag7dreturns1_0adstock0_9', 'x27lag5dreturns1_0adstock1_0',
       'x28lag3dreturns0_6adstock0_6', 'x29lag0dreturns0_6adstock1_0',
       'x30lag4dreturns1_0adstock1_0', 'x31lag5dreturns1_0adstock1_0',
       'x32lag0dreturns0_6adstock1_0', 'x33lag0dreturns1_0adstock1_0',
       'x34lag0dreturns0_9adstock1_0', 'x35lag5dreturns0_6adstock1_0',
       'x36lag7dreturns1_0adstock0_6', 'x37lag5dreturns0_6adstock1_0',
       'x38lag3dreturns0_6adstock0_7', 'x39lag0dreturns1_0adstock1_0',
       'x40lag12dreturns0_7adstock0_9', 'x41lag0dreturns1_0adstock1_0',
       'x42lag0dreturns0_9adstock1_0', 'x43lag13dreturns0_6adstock1_0',
       'x44lag13dreturns0_6adstock0_8', 'x45lag0dreturns0_8adstock1_0',
     

In [44]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,x6lag0dreturns0_6adstock1_0,x9lag0dreturns0_8adstock0_9,x10lag0dreturns1_0adstock1_0,x11lag0dreturns0_9adstock1_0,x12lag0dreturns1_0adstock1_0,x14lag0dreturns1_0adstock1_0,x16lag0dreturns1_0adstock0_9,x17lag1dreturns0_8adstock0_7,x26lag7dreturns1_0adstock0_9,x27lag5dreturns1_0adstock1_0,...,x41lag0dreturns1_0adstock1_0,x42lag0dreturns0_9adstock1_0,x43lag13dreturns0_6adstock1_0,x44lag13dreturns0_6adstock0_8,x45lag0dreturns0_8adstock1_0,x46lag0dreturns0_6adstock0_9,Is_Monday,Is_Q2Q3,Is_Holiday,y1
count,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,...,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0
mean,-0.043143,-0.014319,-0.022437,-0.025544,0.02448,0.033499,-0.017591,-0.039561,-0.010698,0.007393,...,-0.010304,0.0013,0.025606,0.048297,-0.085438,-0.055465,-0.000657,-0.026786,0.002364,2247.646465
std,0.963787,1.001893,0.987555,0.995327,0.997115,0.984721,1.004118,0.960336,0.985109,1.004638,...,1.007854,1.008975,0.996465,0.956302,0.824624,0.936378,1.000173,0.995013,1.011606,462.351329
min,-3.124418,-2.380165,-2.889233,-1.642394,-2.267662,-2.415384,-1.952998,-0.96324,-0.868016,-1.132744,...,-1.899607,-2.155789,-1.169982,-2.206781,-1.6651,-1.859557,-0.409033,-0.817057,-0.108012,1159.0
25%,-0.715495,-0.795354,-0.734153,-0.922969,-0.760698,-0.667497,-0.348215,-0.530876,-0.868016,-0.660914,...,-0.655588,-0.513776,-1.169982,-0.227804,-0.651179,-0.616428,-0.409033,-0.817057,-0.108012,1922.0
50%,-0.084556,0.095851,0.036462,-0.031874,-0.039264,0.017605,0.216458,-0.273234,-0.401473,-0.184513,...,-0.209045,0.056878,0.211473,-0.009808,-0.218583,-0.354205,-0.409033,-0.817057,-0.108012,2220.0
75%,0.575675,0.739065,0.650312,0.683468,0.668995,0.750616,0.709319,0.068494,0.723165,0.668827,...,0.327292,0.676421,0.901141,0.347088,0.204707,0.186636,-0.409033,1.223905,-0.108012,2458.5
max,3.12187,2.920682,3.200335,4.141827,3.456807,2.869001,3.048266,8.197992,3.198384,5.228973,...,4.080287,5.173991,2.574303,3.395012,4.751312,4.226022,2.444793,1.223905,9.258201,3783.0


In [45]:
# Preview the first rows of the dataframe.
df.head()

Unnamed: 0,x6lag0dreturns0_6adstock1_0,x9lag0dreturns0_8adstock0_9,x10lag0dreturns1_0adstock1_0,x11lag0dreturns0_9adstock1_0,x12lag0dreturns1_0adstock1_0,x14lag0dreturns1_0adstock1_0,x16lag0dreturns1_0adstock0_9,x17lag1dreturns0_8adstock0_7,x26lag7dreturns1_0adstock0_9,x27lag5dreturns1_0adstock1_0,...,x42lag0dreturns0_9adstock1_0,x43lag13dreturns0_6adstock1_0,x44lag13dreturns0_6adstock0_8,x45lag0dreturns0_8adstock1_0,x46lag0dreturns0_6adstock0_9,Is_Monday,Is_Q2Q3,Is_Holiday,y1,date
0,2.190855,0.682876,1.696781,1.563656,-0.99714,-1.406097,0.948862,2.825147,2.578709,0.483575,...,-1.038897,-1.169982,-1.240382,4.751312,2.752565,-0.409033,1.223905,-0.108012,2281,2017-09-14 00:00:00+00:00
1,2.216235,0.149494,1.504506,1.195696,-1.310497,-1.497124,0.788236,3.272654,2.301353,0.340658,...,0.444876,-1.169982,-0.992491,4.621596,2.657537,-0.409033,1.223905,-0.108012,2213,2017-09-15 00:00:00+00:00
2,1.343949,-0.085103,0.102286,0.767741,-1.646644,-1.789371,0.71005,3.436042,1.8956,1.984082,...,1.079562,-1.169982,-0.699531,3.022168,2.621855,-0.409033,1.223905,-0.108012,1820,2017-09-16 00:00:00+00:00
3,1.800487,1.091443,0.21921,1.336827,-1.512755,-1.832489,0.741765,3.801288,2.616652,-0.698909,...,1.47306,-1.169982,-0.960814,3.367634,2.712835,-0.409033,1.223905,-0.108012,1685,2017-09-17 00:00:00+00:00
4,2.501893,1.255907,2.196522,1.906746,-0.945863,-1.144193,1.112029,3.995832,3.198384,-0.594648,...,4.514459,-1.169982,-0.844986,3.703254,2.750148,2.444793,1.223905,-0.108012,2487,2017-09-18 00:00:00+00:00


## 2) Run the RBA Model in BQML

In [46]:
################################################################################
######################### CHANGE BQ PROJECT NAME BELOW #########################
################################################################################

This model can also include a 90/10 train/test split using the "data_split_method", "data_split_eval_fraction", and "data_split_col" options; they are provided, commented out, in the CREATE MODEL statement below.

Since this data is time series, we use a sequential split with the date column as the reference point.

We suggest using a train/test split to validate model performance, but for attribution results we suggest running the model on the full dataset.

In [47]:
%%bigquery
CREATE OR REPLACE MODEL `.RBA_demo.RBA_model`  #update project name
OPTIONS (model_type='boosted_tree_regressor',
       # Optional sequential 90/10 train/test split on the date column; uncomment to enable.
       # NOTE(review): data_split_col = 'date' presumably needs `date` in the SELECT below,
       # which currently drops it via EXCEPT(date) -- confirm before enabling.
       # NOTE(review): while these stay commented out, BigQuery ML applies its default
       # data_split_method (documented as AUTO_SPLIT -- confirm); set 'NO_SPLIT' explicitly
       # to be certain every row is used for training.
       # data_split_method = 'SEQ',
       # data_split_eval_fraction = 0.1,
       # data_split_col = 'date',
        input_label_cols = ['y1'],
        enable_global_explain = True)
AS SELECT * EXCEPT(date)
    FROM `.RBA_demo.cleaned_data`  #update project name
    #ORDER BY date;

### 2.1) Print the model evaluation metrics

In [48]:
################################################################################
######################### CHANGE BQ PROJECT NAME BELOW #########################
################################################################################

Call the model evaluation metrics from the model and save to a dataframe "evaluation_metrics".

For regression models such as this boosted tree regressor, the ML.EVALUATE function returns: mean absolute error, mean squared error, mean squared log error, median absolute error, r-squared, and explained variance metrics.

In [59]:
%%bigquery evaluation_metrics
SELECT
  *
FROM
  ML.EVALUATE(
    MODEL `.RBA_demo.RBA_model`  # update project name
  )

In [60]:
# Display the evaluation metrics returned by ML.EVALUATE.
evaluation_metrics

Unnamed: 0,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,150.424538,36691.213489,0.007555,113.113008,0.526313,0.526659


### 2.2) Visualize model fit

In [51]:
################################################################################
######################### CHANGE BQ PROJECT NAME BELOW #########################
################################################################################

Select the model's predicted conversions (predicted_y1) alongside the actual conversions from the data (y1) using the ML.PREDICT function.

In [61]:
%%bigquery model_predictions
SELECT
  *
FROM
  ML.PREDICT(MODEL `.RBA_demo.RBA_model`, #update project name
    (
    SELECT
        *
    FROM
      `.RBA_demo.cleaned_data` #update project name
    ))
# ORDER BY belongs on the outer query: ordering inside the subquery passed to
# ML.PREDICT does not guarantee the order of the rows that are returned.
ORDER BY date;

Visualize the model fit by comparing predicted vs. actual data.

In [62]:
# Build the actual-vs-predicted table from the ML.PREDICT output alone.
# ML.PREDICT returns the input columns (here including `y1` and `date`) next to
# `predicted_y1`, so taking both series from the same rows keeps every
# prediction paired with its own actual value instead of relying on `df` and
# `model_predictions` happening to share the same row order.
results = (
    model_predictions
    .sort_values('date')        # chronological order
    .reset_index(drop=True)     # plain 0..n-1 index
    .rename(columns={'y1': 'actual', 'predicted_y1': 'predicted'})
    [['actual', 'predicted']]
)

In [None]:
# Plot actual vs. predicted conversions, one line per column, against the row
# index. The wide-form dataframe exposes the axes as 'index', 'value' and
# 'variable', which are relabelled so the figure can be read on its own.
fig = px.line(
    results.sort_index(),
    title='Model fit: actual vs. predicted conversions (y1)',
    labels={'index': 'observation', 'value': 'conversions (y1)', 'variable': 'series'},
)
fig.show()

## 3) Calculate contribution of each digital media tactic on conversions

To determine feature attribution, we use BQML's Global Explainability methodology. ML.GLOBAL_EXPLAIN provides explanations for the entire model by aggregating the local explanations of the evaluation data.

In [64]:
%%bigquery feature_importance
SELECT
  *
FROM
  # NOTE(review): unlike the other queries, this model path has no project prefix,
  # so it presumably resolves against the default project (magics.context.project) -- confirm.
  ML.GLOBAL_EXPLAIN(MODEL `RBA_demo.RBA_model`);

In [65]:
# Display the per-feature attribution table returned by ML.GLOBAL_EXPLAIN.
feature_importance

Unnamed: 0,feature,attribution
0,x10lag0dreturns1_0adstock1_0,178.158526
1,Is_Q2Q3,124.69241
2,x35lag5dreturns0_6adstock1_0,117.2166
3,x27lag5dreturns1_0adstock1_0,113.592776
4,x14lag0dreturns1_0adstock1_0,79.49091
5,x29lag0dreturns0_6adstock1_0,70.199125
6,x11lag0dreturns0_9adstock1_0,63.869397
7,x46lag0dreturns0_6adstock0_9,57.518184
8,x6lag0dreturns0_6adstock1_0,49.708574
9,x34lag0dreturns0_9adstock1_0,43.697725


In [66]:
# Rescale the raw attributions so that they add up to `scale_factor`
# rather than to their own total.
scale_factor = 0.81  # pulled from BQ UI
sum_feature_imp = feature_importance['attribution'].sum()

feature_importance['attribution %'] = feature_importance['attribution'].div(
    sum_feature_imp / scale_factor
)
# Show the table with the largest share first; the stored frame keeps its order.
feature_importance.sort_values('attribution %', ascending=False)

Unnamed: 0,feature,attribution,attribution %
0,x10lag0dreturns1_0adstock1_0,178.158526,0.117021
1,Is_Q2Q3,124.69241,0.081902
2,x35lag5dreturns0_6adstock1_0,117.2166,0.076992
3,x27lag5dreturns1_0adstock1_0,113.592776,0.074612
4,x14lag0dreturns1_0adstock1_0,79.49091,0.052212
5,x29lag0dreturns0_6adstock1_0,70.199125,0.046109
6,x11lag0dreturns0_9adstock1_0,63.869397,0.041952
7,x46lag0dreturns0_6adstock0_9,57.518184,0.03778
8,x6lag0dreturns0_6adstock1_0,49.708574,0.03265
9,x34lag0dreturns0_9adstock1_0,43.697725,0.028702


In [67]:
# Sanity check: the attribution shares should add up to scale_factor.
feature_importance['attribution %'].sum()

0.81

## 4) Validate Model Assumptions

### 4.1) Absence of Multicollinearity

Multicollinearity was checked and handled during the data pre-processing stage.