Copyright 2021 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

      [https://www.apache.org/licenses/LICENSE-2.0](https://www.apache.org/licenses/LICENSE-2.0)

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

This solution, including any related sample code or data, is made available 
on an “as is,” “as available,” and “with all faults” basis, solely for 
illustrative purposes, and without warranty or representation of any kind. 
This solution is experimental, unsupported and provided solely for your 
convenience. Your use of it is subject to your agreements with Google, as 
applicable, and may constitute a beta feature as defined under those 
agreements.  To the extent that you make any data available to Google in 
connection with your use of the solution, you represent and warrant that you 
have all necessary and appropriate rights, consents and permissions to permit 
Google to use and process that data.  By using any portion of this solution, 
you acknowledge, assume and accept all risks, known and unknown, associated 
with its usage, including with respect to your deployment of any portion of 
this solution in your systems, or usage in connection with your business, 
if at all.


# Setup

### Load modules

In [None]:
# authentication libraries for colab & bigquery tools
from google.colab import auth
from google.cloud import bigquery
!pip install --upgrade -q gspread # you may see a version error on the first run - the code will still run
import gspread
from gspread_dataframe import set_with_dataframe
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())
bigquery.USE_LEGACY_SQL = False

# BQ Magics
from google.cloud import bigquery
from google.cloud.bigquery import magics
magics.context.project = 'db' #list BQ project name
client = bigquery.Client(project=magics.context.project)
%load_ext google.cloud.bigquery

# data processing libraries
import numpy as np
import pandas as pd

# Show complete DataFrames in notebook output: no limit on displayed rows,
# columns, total width or per-cell text width.
# NOTE: "unlimited" column width must be spelled None; the legacy -1 sentinel
# was deprecated in pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
# Render floats with five decimal places.
pd.options.display.float_format = '{:.5f}'.format
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut, KFold, LeavePOut
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import datetime

# modeling and metrics
from scipy.optimize import least_squares
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm

# Bayesian Package
import pymc3 as pm
from pymc3 import *
print(f"Running on PyMC3 v{pm.__version__}")


# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import arviz as az

!pip install fitter
import fitter
from fitter import Fitter, get_common_distributions

# Define Parameters

In [None]:
#@title Parameters

INPUT_SHEET_NAME = 'model input data 11.02.21' #@param {type:"string"}


### Load model input data

In [None]:
google_sheets_name = INPUT_SHEET_NAME #@param

In [None]:
# Open the first worksheet of the input spreadsheet.
worksheet = gc.open(INPUT_SHEET_NAME).sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()
if not rows:
    # Fail with a clear message instead of an IndexError on the header lookup.
    raise ValueError(
        f"Sheet '{INPUT_SHEET_NAME}' is empty; expected a header row followed by data rows."
    )

# The first row holds the column names; every following row is a data record.
# Building the frame this way yields a clean 0..n-1 index and unnamed columns
# axis without the promote-first-row / drop / reset_index sequence.
header, *records = rows
df = pd.DataFrame.from_records(records, columns=header)


def _to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if parsing fails.

    Equivalent to the deprecated `pd.to_numeric(column, errors='ignore')`.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert numeric data to numeric types; non-numeric columns are left as read.
df = df.apply(_to_numeric_if_possible)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
KPI_COL = "y1" #@param {type: "string"}

In [None]:
df.sum()

In [None]:
df.describe()

In [None]:
df.drop(columns = 'date', inplace = True)

In [None]:
# Target series: the KPI column selected by KPI_COL.
y = df[KPI_COL]
# Feature frame: every column except the KPI. An explicit copy is taken so
# that `x` can be modified afterwards (e.g. adding an intercept column)
# without pandas raising SettingWithCopyWarning and without any ambiguity
# about whether `df` is affected.
x = df.loc[:, df.columns != KPI_COL].copy()

In [None]:
df['Intercept'] = 1
df.columns

In [None]:
x.columns

## Bayesian Model

In [None]:
with pm.Model() as linear_model:

    ## OPTION: Set prior distributions individually

    # This code sets flat priors by default, but it is recommended to create
    # more informative priors depending on use case.

    # Intercept
    intercept = pm.HalfFlat('Intercept')

    # Beta parameters
    # Use this loop if you want to set all coefficients to the same prior
    # distribution. It creates one coefficient per column of `x` and binds
    # each to a module-level name (beta1, beta2, ...) that the formula below
    # refers to. `globals()` is used explicitly because writes through
    # `locals()` are only reliable at module scope.
    for z in range(1, len(x.columns) + 1):
        globals()['beta%s' % z] = pm.HalfFlat('beta%s' % z)

    # Or set individual priors
    #beta1 = pm.Uniform('beta1')

    # Standard deviation
    # Can only be positive, thus has to take on a half-normal distribution.
    # `sigma=` is the current keyword; `sd=` is its deprecated alias.
    sigma = pm.HalfNormal('sigma', sigma=y.std())

    ## SET UP FORMULA TO ESTIMATE
    # Estimate of mean: intercept plus one coefficient per input column.
    # NOTE: the beta index <-> column pairing is fixed here and must match
    # the pairing used wherever predictions are rebuilt from the estimates.
    mean = (
        intercept
        + (beta1 * x['x2'])
        + (beta2 * x['x16'])
        + (beta3 * x['x17'])
        + (beta4 * x['x26'])
        + (beta5 * x['x27'])
        + (beta6 * x['x32'])
        + (beta7 * x['x34'])
        + (beta8 * x['x35'])
        + (beta9 * x['x38'])
        + (beta10 * x['x39'])
        + (beta11 * x['x45'])
    )

    ## SPECIFY LIKELIHOOD
    # The observed data are modelled as draws from a normal distribution with
    # mean `mean` and standard deviation `sigma`.
    likelihood = pm.Normal('Y_obs', mu=mean, sigma=sigma, observed=y)

    # SAMPLE FROM THE POSTERIOR DISTRIBUTION
    # Run the sampler once and expose the same draws in both formats used
    # later: `model_trace` (PyMC3 MultiTrace) and `normal_trace` (ArviZ
    # InferenceData). Sampling twice doubled the runtime and meant the
    # diagnostics and the point estimates came from different sets of draws.
    model_trace = pm.sample(
        return_inferencedata=False,
        init='adapt_diag',
        tune=1000,
        target_accept=0.9,
    )
    normal_trace = az.from_pymc3(trace=model_trace, model=linear_model)


In [None]:
normal_trace.posterior

### Results and Evaluation

#### Posterior Predictive Checks

In [None]:
map_estimate = pm.find_MAP(model=linear_model)
map_estimate

In [None]:
# Draw posterior predictive samples inside the model context, using the
# `posterior` group of `normal_trace` as the source of parameter draws.
with linear_model:
  post_pred = pm.sample_posterior_predictive(normal_trace.posterior)
# Wrap the samples in an InferenceData object and merge it into `normal_trace`
# in place, so `normal_trace` also carries the posterior predictive draws.
az.concat(normal_trace, az.from_pymc3(posterior_predictive=post_pred), inplace=True)

In [None]:
# Posterior predictive check: plot the simulated datasets against the
# observed data.
# Number of simulated datasets = draws * chains (2000 per the original note;
# the actual figure depends on the sampler settings - TODO confirm).
fig, ax = plt.subplots()
az.plot_ppc(normal_trace, ax=ax)
# Dashed red vertical line at the mean of the observed KPI values `y`.
ax.axvline(y.mean(), ls="--", color="r", label="True mean")
ax.legend(fontsize=10);

#### Trace Plot

In [None]:
az.plot_trace(normal_trace)

#### Posterior Plots

In [None]:
az.plot_posterior(normal_trace)

#### Forest Plot

In [None]:
az.plot_forest(normal_trace, r_hat = True)

#### Summary Statistics

In [None]:
pm.summary(normal_trace).round(3)

### Bayesian Evaluation Metrics and CV

#### BFMI

In [None]:
pm.bfmi(normal_trace)

In [None]:
az.plot_energy(normal_trace)

#### WAIC/LOO/ELPD

In [None]:
pm.waic(normal_trace)

In [None]:
pm.loo(normal_trace, pointwise = True)

### Error Metrics

Obtain the average parameter estimates

In [None]:
# Collect the sampled values of every variable in the trace, keyed by name.
var_dict = {variable: model_trace[variable] for variable in model_trace.varnames}

# Results into a dataframe (one column per variable, one row per draw)
var_weights = pd.DataFrame(var_dict)

# Keep only the regression weights (Intercept and betas): drop `sigma` and
# every transformed variable whose name ends in `_log__`. Selecting them by
# suffix, instead of listing beta1..beta6 by hand, keeps this correct for any
# number of coefficients in the model.
transformed_columns = [name for name in var_weights.columns if name.endswith('_log__')]
var_weights = var_weights.drop(columns=transformed_columns + ['sigma'])

# Means for all the weights
var_means = var_weights.mean(axis=0)

In [None]:
var_means

In [None]:
x['Intercept'] = 1

Create predictions using the average parameter estimates

In [None]:
# Point prediction from the posterior-mean weights: start from the intercept
# term and add one (coefficient * input column) term at a time, in the same
# order as the model formula.
_coefficient_columns = (
    ('beta1', 'x2'),
    ('beta2', 'x16'),
    ('beta3', 'x17'),
    ('beta4', 'x26'),
    ('beta5', 'x27'),
    ('beta6', 'x32'),
    ('beta7', 'x34'),
    ('beta8', 'x35'),
    ('beta9', 'x38'),
    ('beta10', 'x39'),
    ('beta11', 'x45'),
)
Y_pred = var_means['Intercept'] * x.Intercept
for _coefficient, _column in _coefficient_columns:
    Y_pred = Y_pred + (var_means[_coefficient] * x[_column])


In [None]:
residuals = Y_pred - y

#### MAE

In [None]:
mean_absolute_error(y,Y_pred)

#### MSE

In [None]:
mean_squared_error(y,Y_pred)

#### RMSE

In [None]:
rmse = np.sqrt(mean_squared_error(y,Y_pred))
rmse

#### R-Squared

In [None]:
az.r2_score(y, Y_pred)[0]

### Calculate % Effect

In [None]:
# Percentage effect of each term: the mean, over all rows, of
# (weight * input) / prediction, as a rounded percentage. One value is printed
# per term, in the order listed.
# The pairing of weights and columns follows the model formula; beta4 belongs
# to x26 (it was previously multiplied by x27, duplicating beta5's input).
_effect_terms = [
    ('Intercept', 'Intercept'),
    ('beta1', 'x2'),
    ('beta2', 'x16'),
    ('beta3', 'x17'),
    ('beta4', 'x26'),
    ('beta5', 'x27'),
    ('beta6', 'x32'),
    ('beta7', 'x34'),
    ('beta8', 'x35'),
    ('beta9', 'x38'),
    ('beta10', 'x39'),
    ('beta11', 'x45'),
]
for _weight, _column in _effect_terms:
    print(round(100 * (var_means[_weight] * (x[_column] / Y_pred)).mean()))


# Validate Linear Regression Model Assumptions

#### 1. Linearity

Visually inspect linearity between target variable and predictions

In [None]:
plt.plot(Y_pred,y,'o',alpha=0.5)
plt.show()

#### 2. Normality of Errors

Visually inspect the residuals to confirm normality

In [None]:
fig = sm.qqplot(residuals)

In [None]:
# Filled kernel density estimate of the model residuals, for a visual check of
# normality. `fill=True` replaces the deprecated `shade=True` keyword.
sns.kdeplot(residuals, label='', fill=True)
plt.xlabel('Model Residuals')
plt.ylabel('Density')
plt.title('Distribution of Residuals');

#### 3. Absence of Multicollinearity

Tested and checked during data processing stage

#### 4. Homoscedasticity


Visually inspect residuals to confirm constant variance

In [None]:
residuals.mean()

In [None]:
plt.plot(residuals,'o',alpha=0.5)
plt.show()

# Appendix

## Parameterization Help

Use the `fitter` package to auto-detect the probability distribution of your data

In [None]:
'''
sns.kdeplot(x[x.columns[3]], label = '', shade = True)
plt.xlabel(''); plt.ylabel('Density'); plt.title('Density Plot');

f = Fitter(x[x.columns[2]],
           distributions= get_common_distributions())
f.fit()
f.summary()

f.get_best(method = 'aic')
f.fitted_param['uniform']
'''