**PLEASE MAKE A COPY BEFORE CHANGING**

**Copyright** 2021 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


<b>Important</b>
This content is intended for educational and informational purposes only.

## Introduction 
<b>Purpose:</b> This colab shows an example of how to estimate each user's conversion probability from Google Analytics data. The fitted model is also used to produce a feature importance report.

**Key notes**

*   This example assumes enhanced ecommerce is implemented (we are predicting transactions).
*   It is possible to adjust the query to predict other events instead of a transaction.

**Instructions**
*   First of all: <b>MAKE A COPY</b>;
*   Fill in the query parameters in Box 1;
*   In the menu above, click Runtime > Run all;
*   Authorize your credentials;

## User Input (Training Query)

In [None]:
# Project passed as `project_id` to the BigQuery reads.
project_id = 'your-billing-project-id'#@param
# Wildcard table interpolated into every query and filtered by _TABLE_SUFFIX.
table = 'your-project-id.your-ga-dataset.ga_sessions_*'#@param
# Lookback window: sessions in this range are aggregated into the features.
lookback_start_date = '2018-08-01'#@param {type:"date"}
lookback_end_date = '2018-08-31'#@param {type:"date"}
# Conversion window: clients converting in this range get the positive label.
conversion_window_start_date = '2018-09-01'#@param {type:"date"}
conversion_window_end_date = '2018-09-30'#@param {type:"date"}
# 'transaction' selects the transaction-labelled training query; any other
# value selects the event-labelled one.
prediction_type = 'transaction'#@param['transaction', 'event']
# Hit-level eventInfo field and the value it must equal to count as a
# conversion in the event-labelled query.
event_filter_type = 'eventLabel'#@param['eventCategory', 'eventAction', 'eventLabel', ' ']
event_filter_value = 'conversion'#@param
# Share of the rows held out as the test set (passed to train_test_split).
test_size = 0.5#@param
# Non-converting clients are kept only when RAND() <= this value.
downsample_majority_class = 0.1#@param {type:"slider", min:0.1, max:1, step:0.1}

## User Input (Classification Query)

In [None]:
# Date range (matched against _TABLE_SUFFIX) of the sessions to be scored.
classification_start_date = '2019-10-01'#@param {type:"date"}
classification_end_date = '2019-10-31'#@param {type:"date"}
# Column headers of the exported CSV: the first replaces `clientid`, the
# second replaces `segment`.
index_dimension = 'ga:dimension14'#@param
value_dimension = 'ga:dimension15'#@param


## Code Section

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import auth, files
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

%matplotlib inline

In [None]:
# Function to print results
def results(X_test, Y_test, clf):
    """Print AUC, accuracy and a classification report for a fitted classifier.

    Args:
        X_test: Features accepted by ``clf.predict_proba`` and ``clf.predict``.
        Y_test: True labels aligned with ``X_test``.
        clf: Fitted classifier exposing ``predict_proba`` and ``predict``.
    """
    # Column 1 of predict_proba is used as the positive-class score.
    probs = clf.predict_proba(X_test)
    auc_ = roc_auc_score(Y_test, probs[:, 1])
    print("AUC: %.4f" % auc_)
    # Predict once and reuse the labels for both accuracy and the report.
    predictions = clf.predict(X_test)
    print("accuracy: %.4f" % accuracy_score(Y_test, predictions))
    print(classification_report(Y_test, predictions))

# Function to plot a roc curve
def plot_roc_curve(X_test, model, Y_test=None, title=None):
    """Plot the ROC curve of a fitted classifier and show its AUC in the legend.

    Args:
        X_test: Features accepted by ``model.predict_proba``.
        model: Fitted classifier (or pipeline) exposing ``predict_proba``.
        Y_test: True labels aligned with ``X_test``. When omitted, the
            notebook-level ``y_test`` is used so that existing two-argument
            calls keep working.
        title: Plot title. When omitted, it is derived from the class name of
            the estimator being evaluated.
    """
    if Y_test is None:
        # Backward compatibility: the original implementation always read the
        # global ``y_test``.
        Y_test = y_test
    if title is None:
        # For an object with ``steps`` (e.g. a pipeline) name the final step,
        # so each model gets its own title instead of a fixed AdaBoost one.
        estimator = model.steps[-1][1] if hasattr(model, 'steps') else model
        title = '%s ROC Curve' % type(estimator).__name__
    probs = model.predict_proba(X_test)
    # Column 1 of predict_proba is used as the positive-class score.
    preds = probs[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, preds)
    roc_auc = auc(fpr, tpr)
    plt.title(title)
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

# Function to plot feature importance (requires a model exposing
# `feature_importances_`, e.g. AdaBoost)
def feature_relevance(X_test, model):
    """Plot feature importances, scaled relative to the largest, as bars.

    Args:
        X_test: Object whose ``columns`` give the feature names, in the same
            order as ``model.feature_importances_``.
        model: Fitted estimator exposing ``feature_importances_``.
    """
    names = np.asarray(X_test.columns)
    feature_importance = np.asarray(model.feature_importances_, dtype=float)
    # make importances relative to max importance; skip the scaling when every
    # importance is zero to avoid a 0/0 division.
    peak = feature_importance.max()
    if peak > 0:
        feature_importance = 100.0 * (feature_importance / peak)
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure(figsize=(20, 10))
    plt.subplot(1, 2, 2)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    # Index the names array directly so the labels are a concrete sequence
    # rather than a lazy map object.
    plt.yticks(pos, names[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

In [None]:
# Authenticate the user to access BigQuery Projects
# (`auth` is the google.colab helper imported above; this cell precedes the
# BigQuery reads further down.)
auth.authenticate_user()

In [None]:
# Build the query
# Values substituted into the SQL templates via str.format. Dates lose their
# dashes because the templates compare them against _TABLE_SUFFIX.
def _without_dashes(iso_date):
    return iso_date.replace('-', '')


dc = {
    'project_id': project_id,
    'table': table,
    'lookback_start_date': _without_dashes(lookback_start_date),
    'lookback_end_date': _without_dashes(lookback_end_date),
    'conversion_window_start_date': _without_dashes(conversion_window_start_date),
    'conversion_window_end_date': _without_dashes(conversion_window_end_date),
    'prediction_type': prediction_type,
    'event_filter_type': event_filter_type,
    'event_filter_value': event_filter_value,
    'downsample_majority_class': downsample_majority_class,
    'classification_start_date': _without_dashes(classification_start_date),
    'classification_end_date': _without_dashes(classification_end_date),
}

# Training query labelled by transactions.
#   latest_session: attributes of each client's highest-visitNumber session in
#                   the lookback window.
#   session_hits:   per-client activity totals over the lookback window.
#   converted:      clients with SUM(totals.transactions) > 0 in the
#                   conversion window.
#   joined:         one row per client with features and the 0/1 label.
# The final SELECT keeps every converter and a random share
# (downsample_majority_class) of the non-converters.
# totals.totalTransactionRevenue is exported multiplied by 10^6, hence the
# division by 1000000.
# NOTE(review): values are interpolated with str.format, so they must come
# from trusted notebook inputs only.
q1 = """
WITH
  latest_session AS (
  SELECT
    * EXCEPT(rn)
  FROM (
    SELECT
      ROW_NUMBER() OVER(PARTITION BY clientid ORDER BY visitnumber DESC) AS rn,
      clientid,
      visitNumber,
      channelgrouping,
      IF(device.browser NOT IN ('Chrome', 'Safari', 'Firefox', 'Android Webview', 'Edge'), 'Others', device.browser) as browser,
      device.deviceCategory,
      IF(device.operatingSystem NOT IN('Android', 'iOS', 'Windows', 'Macintosh', 'Linux'), 'Others', device.operatingSystem ) AS operatingSystem,
      geoNetwork.region
    FROM
      `{table}`
    WHERE
      _TABLE_SUFFIX BETWEEN '{lookback_start_date}' AND '{lookback_end_date}'
      AND clientid IS NOT NULL)
  WHERE
    rn = 1 ),

session_hits as (
SELECT
    clientid,
    SUM(totals.visits) AS visits,
    SUM(totals.pageviews) AS pageviews,
    SUM(totals.hits) AS hits,
    SUM(totals.timeonsite) AS timeonsite,
    SUM(totals.bounces) AS bounces,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (5,6,7,8,9,10) THEN 1 ELSE 0 END) AS morning_visits,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (11,12,13,14,15,16) THEN 1 ELSE 0 END) AS daytime_visits,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (17,18,19,20,21,22) THEN 1 ELSE 0 END) AS evening_visits,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (23,24,0,1,2,3,4) THEN 1 ELSE 0 END) AS midnight_visits,
    SUM(totals.transactions) AS conversion,
    SUM(totals.totalTransactionRevenue) / 1000000 AS revenue
FROM
  `{table}`
WHERE
  _TABLE_SUFFIX BETWEEN '{lookback_start_date}' AND '{lookback_end_date}' AND clientid IS NOT NULL
GROUP BY 1),

converted as (
SELECT
  *
FROM (
  SELECT
    clientid,
    SUM(totals.transactions) AS y_conversions
  FROM
    `{table}`
  WHERE
    _TABLE_SUFFIX BETWEEN '{conversion_window_start_date}' AND '{conversion_window_end_date}' AND clientid IS NOT NULL
  GROUP BY 1)
WHERE
  y_conversions > 0
),

joined as(
SELECT
  sh.clientid,
  ls.channelgrouping AS last_channel,
  ls.browser,
  ls.deviceCategory,
  ls.operatingSystem,
  ls.region,
  ls.visitnumber AS current_visit,
  IFNULL(SUM(sh.visits), 0) AS total_visits,
  IFNULL(SUM(sh.pageviews), 0) AS total_pageviews,
  IFNULL(SUM(sh.hits), 0) AS total_hits,
  IFNULL(SUM(sh.timeonsite), 0) AS total_timeonsite,
  IFNULL(SUM(sh.bounces), 0) AS total_bounces,
  IFNULL(SUM(sh.morning_visits), 0) AS total_morning_visits,
  IFNULL(SUM(sh.daytime_visits), 0) AS total_daytime_visits,
  IFNULL(SUM(sh.evening_visits), 0) AS total_evening_visits,
  IFNULL(SUM(sh.midnight_visits), 0) AS total_midnight_visits,
  IFNULL(SUM(sh.conversion), 0) AS total_conversions,
  IF(IFNULL(SUM(c.y_conversions), 0) > 0, 1, 0) AS y_conversions
FROM
  session_hits sh LEFT OUTER JOIN latest_session ls
  ON sh.clientid = ls.clientid
  LEFT OUTER JOIN converted c ON sh.clientid = c.clientid
GROUP BY 1,2,3,4,5,6,7)

SELECT * FROM joined WHERE y_conversions = 0 AND RAND() <= {downsample_majority_class} UNION ALL(SELECT * FROM joined WHERE y_conversions = 1)
""".format(**dc)

# Training query labelled by events.
#   latest_session: attributes of each client's highest-visitNumber session in
#                   the lookback window.
#   session_hits:   per-client activity totals over the lookback window.
#   converted:      clients with at least one hit in the conversion window
#                   whose eventInfo.<event_filter_type> equals
#                   event_filter_value.
#   joined:         one row per client with features and the 0/1 label.
# The final SELECT keeps every converter and a random share
# (downsample_majority_class) of the non-converters.
# totals.totalTransactionRevenue is exported multiplied by 10^6, hence the
# division by 1000000.
# NOTE(review): the browser allow-list here also contains 'Samsung Internet';
# confirm whether that is intended to differ from the other queries.
# NOTE(review): values are interpolated with str.format, so they must come
# from trusted notebook inputs only.
q2 = """

WITH
  latest_session AS (
  SELECT
    * EXCEPT(rn)
  FROM (
    SELECT
      ROW_NUMBER() OVER(PARTITION BY clientid ORDER BY visitnumber DESC) AS rn,
      clientid,
      visitNumber,
      channelgrouping,
      IF(device.browser NOT IN ('Chrome', 'Safari', 'Firefox', 'Samsung Internet', 'Android Webview', 'Edge'), 'Others', device.browser) as browser,
      device.deviceCategory,
      IF(device.operatingSystem NOT IN('Android', 'iOS', 'Windows', 'Macintosh', 'Linux'), 'Others', device.operatingSystem ) AS operatingSystem,
      geoNetwork.region
    FROM
      `{table}`
    WHERE
      _TABLE_SUFFIX BETWEEN '{lookback_start_date}' AND '{lookback_end_date}'
      AND clientid IS NOT NULL)
  WHERE
    rn = 1 ),

session_hits as (
SELECT
    clientid,
    SUM(totals.visits) AS visits,
    SUM(totals.pageviews) AS pageviews,
    SUM(totals.hits) AS hits,
    SUM(totals.timeonsite) AS timeonsite,
    SUM(totals.bounces) AS bounces,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (5,6,7,8,9,10) THEN 1 ELSE 0 END) AS morning_visits,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (11,12,13,14,15,16) THEN 1 ELSE 0 END) AS daytime_visits,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (17,18,19,20,21,22) THEN 1 ELSE 0 END) AS evening_visits,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (23,24,0,1,2,3,4) THEN 1 ELSE 0 END) AS midnight_visits,
    SUM(totals.transactions) AS conversion,
    SUM(totals.totalTransactionRevenue) / 1000000 AS revenue
FROM
  `{table}`
WHERE
  _TABLE_SUFFIX BETWEEN '{lookback_start_date}' AND '{lookback_end_date}' AND clientid IS NOT NULL
GROUP BY 1),

converted as (
SELECT
  *
FROM (
  SELECT
    clientid,
    COUNT(1) AS y_conversions
  FROM
    `{table}`, UNNEST(hits) h
  WHERE
    _TABLE_SUFFIX BETWEEN '{conversion_window_start_date}' AND '{conversion_window_end_date}' AND clientid IS NOT NULL
    AND h.eventInfo.{event_filter_type} = '{event_filter_value}'
  GROUP BY 1)
WHERE
  y_conversions > 0
),

joined as(
SELECT
  sh.clientid,
  ls.channelgrouping AS last_channel,
  ls.browser,
  ls.deviceCategory,
  ls.operatingSystem,
  ls.region,
  ls.visitnumber AS current_visit,
  IFNULL(SUM(sh.visits), 0) AS total_visits,
  IFNULL(SUM(sh.pageviews), 0) AS total_pageviews,
  IFNULL(SUM(sh.hits), 0) AS total_hits,
  IFNULL(SUM(sh.timeonsite), 0) AS total_timeonsite,
  IFNULL(SUM(sh.bounces), 0) AS total_bounces,
  IFNULL(SUM(sh.morning_visits), 0) AS total_morning_visits,
  IFNULL(SUM(sh.daytime_visits), 0) AS total_daytime_visits,
  IFNULL(SUM(sh.evening_visits), 0) AS total_evening_visits,
  IFNULL(SUM(sh.midnight_visits), 0) AS total_midnight_visits,
  IFNULL(SUM(sh.conversion), 0) AS total_conversions,
  IF(IFNULL(SUM(c.y_conversions), 0) > 0, 1, 0) AS y_conversions
FROM
  session_hits sh LEFT OUTER JOIN latest_session ls
  ON sh.clientid = ls.clientid
  LEFT OUTER JOIN converted c ON sh.clientid = c.clientid
GROUP BY 1,2,3,4,5,6,7)

SELECT * FROM joined WHERE y_conversions = 0 AND RAND() <= {downsample_majority_class} UNION ALL(SELECT * FROM joined WHERE y_conversions = 1)

""".format(**dc)


# Classification (scoring) query: the same per-client features as the training
# queries, computed over the classification date range, with no label column
# and no downsampling.
#   latest_session: attributes of each client's highest-visitNumber session.
#   session_hits:   per-client activity totals.
#   joined:         one feature row per client.
# totals.totalTransactionRevenue is exported multiplied by 10^6, hence the
# division by 1000000.
# NOTE(review): values are interpolated with str.format, so they must come
# from trusted notebook inputs only.
q3 = """
WITH
  latest_session AS (
  SELECT
    * EXCEPT(rn)
  FROM (
    SELECT
      ROW_NUMBER() OVER(PARTITION BY clientid ORDER BY visitnumber DESC) AS rn,
      clientid,
      visitNumber,
      channelgrouping,
      IF(device.browser NOT IN ('Chrome', 'Safari', 'Firefox', 'Android Webview', 'Edge'), 'Others', device.browser) as browser,
      device.deviceCategory,
      IF(device.operatingSystem NOT IN('Android', 'iOS', 'Windows', 'Macintosh', 'Linux'), 'Others', device.operatingSystem ) AS operatingSystem,
      geoNetwork.region
    FROM
      `{table}`
    WHERE
      _TABLE_SUFFIX BETWEEN '{classification_start_date}' AND '{classification_end_date}'
      AND clientid IS NOT NULL)
  WHERE
    rn = 1 ),

session_hits as (
SELECT
    clientid,
    SUM(totals.visits) AS visits,
    SUM(totals.pageviews) AS pageviews,
    SUM(totals.hits) AS hits,
    SUM(totals.timeonsite) AS timeonsite,
    SUM(totals.bounces) AS bounces,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (5,6,7,8,9,10) THEN 1 ELSE 0 END) AS morning_visits,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (11,12,13,14,15,16) THEN 1 ELSE 0 END) AS daytime_visits,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (17,18,19,20,21,22) THEN 1 ELSE 0 END) AS evening_visits,
    SUM(CASE WHEN EXTRACT(HOUR FROM TIMESTAMP_SECONDS(visitStartTime) AT TIME ZONE "America/Los_Angeles") IN (23,24,0,1,2,3,4) THEN 1 ELSE 0 END) AS midnight_visits,
    SUM(totals.transactions) AS conversion,
    SUM(totals.totalTransactionRevenue) / 1000000 AS revenue
FROM
  `{table}`
WHERE
  _TABLE_SUFFIX BETWEEN '{classification_start_date}' AND '{classification_end_date}' AND clientid IS NOT NULL
GROUP BY 1),



joined as(
SELECT
  sh.clientid,
  ls.channelgrouping AS last_channel,
  ls.browser,
  ls.deviceCategory,
  ls.operatingSystem,
  ls.region,
  ls.visitnumber AS current_visit,
  IFNULL(SUM(sh.visits), 0) AS total_visits,
  IFNULL(SUM(sh.pageviews), 0) AS total_pageviews,
  IFNULL(SUM(sh.hits), 0) AS total_hits,
  IFNULL(SUM(sh.timeonsite), 0) AS total_timeonsite,
  IFNULL(SUM(sh.bounces), 0) AS total_bounces,
  IFNULL(SUM(sh.morning_visits), 0) AS total_morning_visits,
  IFNULL(SUM(sh.daytime_visits), 0) AS total_daytime_visits,
  IFNULL(SUM(sh.evening_visits), 0) AS total_evening_visits,
  IFNULL(SUM(sh.midnight_visits), 0) AS total_midnight_visits,
  IFNULL(SUM(sh.conversion), 0) AS total_conversions
FROM
  session_hits sh LEFT OUTER JOIN latest_session ls
  ON sh.clientid = ls.clientid
GROUP BY 1,2,3,4,5,6,7)

SELECT * FROM joined


""".format(**dc)
# Pick the training query: transaction-labelled for 'transaction', otherwise
# the event-labelled one.
q = q1 if prediction_type == 'transaction' else q2

In [None]:
%%time
df = pd.io.gbq.read_gbq(q, project_id=project_id, verbose=False, dialect='standard')

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
# Summarise the training sample: size, class counts and class ratio.
# The counts are computed once, and a missing class is treated as 0 instead
# of raising KeyError.
class_counts = df.y_conversions.value_counts()
converters = class_counts.get(1, 0)
non_converters = class_counts.get(0, 0)

print("Dataset has {} rows and {} columns".format(df.shape[0], df.shape[1]))
print()
print("Class distribution:")
print(class_counts)
print()
print("converters to non converters proportion:")
if non_converters:
    print(converters / non_converters)
else:
    # No non-converters in the sample, so the ratio cannot be computed.
    print("undefined (no non-converters in the sample)")

In [None]:
# Drop the label, the clientid and the unused categorical columns (Xs).
# `columns=` replaces the positional axis argument, which current pandas
# versions no longer accept.
X_all = df.drop(columns=['y_conversions', 'clientid', 'region', 'browser', 'operatingSystem'])
# Select the label to predict (Ys)
y_all = df['y_conversions']
# Get all categorical columns in a list.
text = list(X_all.select_dtypes(include=['object', 'category']).columns)
# Get all numeric columns in a list.
numbers = list(X_all.select_dtypes(include=np.number).columns)
# Convert categoricals into the proper type. Assigning whole columns (rather
# than through .loc) makes sure the new dtype replaces the old one.
if text:
    X_all[text] = X_all[text].astype('category')
# Stratified split into train, test sets
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                    test_size=test_size,
                                                    random_state=4,
                                                    stratify=y_all)

In [None]:
# Build and fit the pipeline

def _make_preprocess():
    """Return an unfitted transformer: one-hot encode `text`, scale `numbers`."""
    return make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), text),
        (StandardScaler(), numbers))


preprocess = _make_preprocess()

pipe_ada = make_pipeline(
    preprocess,
    AdaBoostClassifier(n_estimators=150, learning_rate=0.1, random_state=42)
)

# Each pipeline gets its own transformer instance, so fitting one pipeline
# cannot alter the fitted state used by the other.
pipe_reg = make_pipeline(
    _make_preprocess(),
    LogisticRegressionCV(max_iter=1000)
)

pipe_ada.fit(X_train, y_train)
# The trailing semicolon suppresses the notebook's echo of the fitted pipeline
# and, unlike a bare `;` line, is valid Python.
pipe_reg.fit(X_train, y_train);

In [None]:
# Plot the model results
# Same report for each fitted pipeline: heading, blank line, metrics, ROC plot.
_reports = (
    ('Adaboosting Results:', pipe_ada),
    ('Logistic Regression Results:', pipe_reg),
)

for _position, (_heading, _fitted) in enumerate(_reports):
    if _position:
        # Blank line between consecutive reports.
        print()
    print(_heading)
    print()
    results(X_test, y_test, _fitted)
    plot_roc_curve(X_test, _fitted)

In [None]:
# Rebuild the transformed test set as a DataFrame whose columns carry the
# feature names, for the feature importance plot.
# NOTE: this overwrites X_test with its transformed version, so the cell is
# not meant to be run twice without re-running the split.
column_transformer = pipe_ada.named_steps.columntransformer
encoder = column_transformer.transformers_[0][1]
# Newer scikit-learn exposes get_feature_names_out; fall back to the older
# get_feature_names when it is not available.
if hasattr(encoder, 'get_feature_names_out'):
    cat = list(encoder.get_feature_names_out())
else:
    cat = list(encoder.get_feature_names())
# Column order follows the transformer order: one-hot columns, then numerics.
features = cat + numbers
X_test = column_transformer.transform(X_test)
# The transformer may return a sparse matrix; densify it before building the
# DataFrame.
if hasattr(X_test, 'toarray'):
    X_test = X_test.toarray()
X_test = pd.DataFrame(X_test, columns=features)

In [None]:
# Plot the relative feature importances of the AdaBoost step of pipe_ada.
feature_relevance(X_test, pipe_ada.named_steps.adaboostclassifier)

In [None]:
%%time
df = pd.io.gbq.read_gbq(q3, project_id=project_id, verbose=False, dialect='standard')

In [None]:
# Score every client with the logistic regression pipeline. The dropped
# columns are the same non-feature columns removed before training.
# `columns=` replaces the positional axis argument, which current pandas
# versions no longer accept.
preds = pipe_reg.predict_proba(
    df.drop(columns=['clientid', 'region', 'browser', 'operatingSystem']))
# Column 1 of predict_proba is stored as the conversion probability.
df['prob'] = preds[:, 1]
df.head(10)

In [None]:
def _segment_for(probability):
    """Map a probability to 'high' (> 0.5), 'medium' (> 0.3) or 'low'."""
    if probability > 0.5:
        return 'high'
    if probability > 0.3:
        return 'medium'
    return 'low'


# Bucket every client by predicted conversion probability.
df['segment'] = df.prob.apply(_segment_for)

In [None]:
# Number of clients in each segment.
df.segment.value_counts()

In [None]:
# Keep only the client id and its segment, rename both columns to the
# configured headers, and write the result to dataset.csv without the index.
df = df[['clientid', 'segment']].rename(
    columns={'clientid': index_dimension, 'segment': value_dimension})
df.to_csv('dataset.csv', index=False)

In [None]:
# Preview the first rows of the exported table.
df.head()