In [None]:
# Copyright 2020 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#      https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [Run in Colab](https://colab.research.google.com/github/dpanigra/cdp-workshop/blob/master/CDP_Workshop_Customer_segmentation_Propensity_to_Purchase.ipynb)

 # Overview
CDP workshop - Customer segmentation using Propensity to Purchase.

Step by step solution guide with explanation is [here.](https://cloud.google.com/architecture/predicting-customer-propensity-to-buy)

References:
1. Blog post: [How to build an end-to-end propensity to purchase solution using BigQuery ML and Kubeflow Pipelines](https://medium.com/@dpani/how-to-build-an-end-to-end-propensity-to-purchase-solution-using-bigquery-ml-and-kubeflow-pipelines-cd4161f734d9)
1. GitHub code: [Comprehensive notebook](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/tree/master/retail/propensity-model/bqml)
1. Blog post: [How to make an ML model inference on KFServing from container apps (web, Spark) running on Google Cloud Kubernetes Engine?](https://medium.com/google-cloud/how-to-make-an-ml-model-inference-on-kfserving-from-container-apps-web-spark-running-on-google-c50ca849c9f0)
1. Technical reference guide (in Google Cloud Solutions section): [Predicting customer propensity to buy by using BigQuery ML and AI Platform](https://cloud.google.com/architecture/predicting-customer-propensity-to-buy)

Other patterns:
Smart analytics reference patterns (in Google Cloud Solutions section): [Overview page](https://cloud.google.com/solutions/smart-analytics/reference-patterns/overview)


# Setup environment

## *PIP install appropriate packages*

In [None]:
%pip install google-cloud-storage # for Storage Account
%pip install google-cloud # for cloud sdk
%pip install google-cloud-bigquery # for BigQuery
%pip install google-cloud-bigquery-storage # for BigQuery Storage client
# for data exploration
%pip install pandas 
%pip install matplotlib 
%pip install pandas_profiling 

# Restart kernel after installs
# import IPython
# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

Collecting google-cloud
  Downloading google_cloud-0.34.0-py2.py3-none-any.whl (1.8 kB)
Installing collected packages: google-cloud
Successfully installed google-cloud-0.34.0


## *Initialize all the variables*

In [30]:
CS_GCP_PROJECT = "dpani-sandbox" #@param {type:"string"}
CS_DATASET_LOCATION = "us-central1-b" #@param {type:"string"}
# The dataset name doubles as a versioning mechanism:
# e.g. different datasets can hold different models
#   so that the performance of the models can be compared
CS_BQ_DATASET_NAME = "cdp_ws_cs" #@param {type:"string"}

CS_PROP_INPUT_TABLE = "prop_sol_input_user_event" #@param {type:"string"}
CS_PROP_ML_MODEL_NAME = "prop_sol_ml_model" #@param {type:"string"}

# Collect every CS* setting into a single dict that can be passed to the
# bq Cell magic, and mirror each one into the environment for the shell.
import os
CS_BQ_ARGS = {
    cs_name: cs_value
    # snapshot the namespace first; CS_BQ_ARGS itself is skipped so that
    # re-running the cell does not nest the previous dict inside the new one
    for cs_name, cs_value in list(locals().items())
    if cs_name.startswith('CS') and cs_name != 'CS_BQ_ARGS'
}
os.environ.update({cs_name: str(cs_value) for cs_name, cs_value in CS_BQ_ARGS.items()})
print (CS_BQ_ARGS)

{'CS_GCP_PROJECT': 'dpani-sandbox', 'CS_DATASET_LOCATION': 'us-central1-b', 'CS_BQ_DATASET_NAME': 'cdp_ws_cs', 'CS_PROP_INPUT_TABLE': 'prop_sol_input_user_event', 'CS_PROP_ML_MODEL_NAME': 'prop_sol_ml_model'}


## *Setup your Google Cloud project*

In [2]:
!export CS_GCP_PROJECT
!echo $CS_GCP_PROJECT
# set the desired Google Cloud project
!gcloud config set project $CS_GCP_PROJECT
import os
os.environ['GOOGLE_CLOUD_PROJECT'] = CS_GCP_PROJECT
# validate that the Google Cloud project has been set properly.
# !gcloud info --format='value(config.project)'

dpani-sandbox
Updated property [core/project].


## *Authenticate with Google Cloud*

### Authenticate using ServiceAccount Key file

In [3]:
# download the ServiceAccount key and provide the path to the file below
# CS_GCP_APPLICATION_CREDENTIALS = "<Full path with the file name to the above downloaded json file>"
# CS_GCP_APPLICATION_CREDENTIALS = "/Users/dpani/Downloads/dpani-sandbox-2-3073195cd132.json"

# uncomment the code below when running in a Colab environment
# authenticate using service account
# from google.colab import files
# # Upload service account key
# keyfile_upload = files.upload()
# CS_GCP_APPLICATION_CREDENTIALS = list(keyfile_upload.keys())[0]

# import os
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = CS_GCP_APPLICATION_CREDENTIALS
# # set the account
# !echo "Setting Service Account:" $CS_GCP_APPLICATION_CREDENTIALS
# !gcloud auth activate-service-account --key-file=$CS_GCP_APPLICATION_CREDENTIALS

### Authenticate using OAuth

In [5]:
# Authenticate using OAuth.
# The login is only attempted when the google.colab module is already
# loaded in this kernel; otherwise the cell does nothing.
import sys
if 'google.colab' in sys.modules:
  from google.colab import auth as google_auth
  google_auth.authenticate_user()

## *Enable the below Google Cloud Services for the solution*

In [6]:
# enable the Google Cloud services (APIs) required by this solution
# for the currently configured gcloud project
!gcloud services enable \
    storage-component.googleapis.com \
    bigquery.googleapis.com \
    ml.googleapis.com \
    notebooks.googleapis.com

Operation "operations/acf.p2-976354649621-e28667de-f5ec-4949-b39f-c6e9d48b807f" finished successfully.


In [7]:
# validate that all desired Permission have been set properl.
!gcloud services list | grep 'storage-component.googleapis.com\|bigquery.googleapis.com\|ml.googleapis.com\|notebooks.googleapis.com'

automl.googleapis.com                   Cloud AutoML API
bigquery.googleapis.com                 BigQuery API
ml.googleapis.com                       AI Platform Training & Prediction API
notebooks.googleapis.com                Notebooks API
storage-component.googleapis.com        Cloud Storage


## *Create a BigQuery client, import the libraries, load the bigquery Cell magic*

In [8]:
# create a BQ client bound to the project chosen in CS_GCP_PROJECT;
# the cells below reuse it for their queries
from google.cloud import bigquery
bq_client = bigquery.Client(project=CS_GCP_PROJECT)
# load the bigquery Cell magic (the reload variant is the one in use;
# the plain load is kept below for reference)
# %load_ext google.cloud.bigquery
%reload_ext google.cloud.bigquery

In [9]:
# test that BQ client works by reading a few rows from a public dataset
sql = """
    SELECT name
    FROM `bigquery-public-data.usa_names.usa_1910_current`
    WHERE state = 'TX'
    LIMIT 100
"""

# Run the Standard SQL query through bq_client and load the result
# into a pandas DataFrame; the last line displays it
df = bq_client.query(sql).to_dataframe()
df

Unnamed: 0,name
0,Mary
1,Ruby
2,Annie
3,Willie
4,Ruth
...,...
95,Leona
96,Lucile
97,Lucy
98,Manuela


# Utility functions

## *Create the BigQuery dataset (DDL)*

In [11]:
# create_bq_ds
def _bq_location_from(cs_location: str) -> str:
  """Translates a Google Cloud region/zone name into a BigQuery location.

      'us-*' maps to the 'US' multi-region and 'eu'/'europe-*' to the 'EU'
      multi-region. Any other name has no multi-region equivalent, so the
      region itself is returned with a trailing single-letter zone suffix
      removed (e.g. 'asia-east1-a' -> 'asia-east1').
  """
  parts = cs_location.strip().split('-')
  multi_regions = {'us': 'US', 'eu': 'EU', 'europe': 'EU'}
  prefix = parts[0].lower()
  if prefix in multi_regions:
    return multi_regions[prefix]
  # A third, one-letter component is a zone suffix; datasets live in regions.
  if len(parts) == 3 and len(parts[2]) == 1 and parts[2].isalpha():
    parts = parts[:2]
  return '-'.join(parts)


def create_bq_ds(CS_GCP_PROJECT: str,
                 CS_BQ_DATASET_NAME: str,
                 CS_LOCATION: str
                 ):
  """The function creates a BigQuery dataset if it doesn't exist.

      The idea is to create the DataSet only one time: when the dataset is
      already present the function reports it and returns without changes.
      Args:
        CS_GCP_PROJECT:(:obj:`str`): Google Cloud project for deployment
        CS_BQ_DATASET_NAME:(:obj:`str`): Name of the dataset.
        CS_LOCATION:(:obj:`str`): Location of the Google Cloud region
          (or zone) from which the BigQuery dataset location is derived
      Raises:
        RuntimeError: if the dataset cannot be created; the original
          exception is attached as the cause.
  """
  from google.cloud import bigquery
  from google.cloud.exceptions import NotFound
  # Bind the client to the requested project instead of whatever project
  # the environment happens to default to.
  client = bigquery.Client(project=CS_GCP_PROJECT)
  dataset_id = f"{CS_GCP_PROJECT}.{CS_BQ_DATASET_NAME}"

  try:
    client.get_dataset(dataset_id)  # Make an API request.
  except NotFound:
    print('Dataset {} is not found'.format(dataset_id))
  else:
    print('Dataset {} already exists'.format(dataset_id))
    return

  try:
    # Construct a full Dataset object to send to the API.
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = _bq_location_from(CS_LOCATION)
    dataset = client.create_dataset(dataset)  # Make an API request.
    print('Created dataset {}.{} in location: {}.'.\
          format(client.project, dataset.dataset_id, dataset.location))
  except Exception as e:
    # Chain the cause so the notebook shows the underlying API error too.
    raise RuntimeError(f"Can't create the BigQuery DS {dataset_id}") from e

## *Delete a dataset in BigQuery (DDL)*

In [None]:
# delete the BigQuery dataset...!!! BE CAREFUL !!!
def delete_dataset(dataset_id):
    """Drops a BigQuery dataset together with everything stored in it.

    Not recommended for use in a production environment; it comes in
    handy during the iterative development and testing phases of the SDLC.
    !!! BE CAREFUL !!!!
    Args:
        dataset_id(:obj:`str`): The BigQuery dataset name that we want to delete
    """
    from google.cloud import bigquery

    # delete_contents asks for the dataset's contents to be removed as well;
    # not_found_ok asks for an already-deleted dataset not to be an error.
    removal_options = dict(delete_contents=True, not_found_ok=True)
    bigquery.Client().delete_dataset(dataset_id, **removal_options)  # API request
    print("Deleted dataset '{}'.".format(dataset_id))

In [None]:
## *Execute query in BigQuery (DDL + DML)*

## *Execute SQL BigQuery*


In [24]:
# execute_sql
def execute_sql(sql_query: str):
  """Executes the SQL in BigQuery and prints every row it returns.

    Args:
        sql_query:(:obj:`str`): SQL query to execute
    Raises:
        RuntimeError: if submitting or running the query fails; the
          original exception is attached as the cause.
  """
  from google.cloud import bigquery
  # One client is enough; it is built outside the try block so that
  # client-construction errors reach the caller unchanged.
  client = bigquery.Client()
  try:
    query_job = client.query(sql_query)  # Make an API request.
    results = query_job.result()  # Waits for job to complete.
    # Only announce success once the job has actually finished.
    print("Query executed.")
    for row in results:
      print (row)
  except Exception as e:
    # Chain the cause so the notebook shows the underlying API error too.
    raise RuntimeError(f"Can't execute the query {sql_query}") from e

In [None]:
# delete BigQuery table if not needed...!!! BE CAREFUL !!!
def delete_table(table_id):
  """Drops a single BigQuery table.

    Not recommended for use in a production environment; it comes in
    handy during the iterative development and testing phases of the SDLC.
    !!! BE CAREFUL !!!!
    Args:
      table_id(:obj:`str`): The BigQuery table name that we want to delete
  """
  from google.cloud import bigquery

  # The not_found_ok option is deliberately not passed here
  # (presumably a missing table then surfaces as an error -- TODO confirm).
  bigquery.Client().delete_table(table_id)  # API request
  print("Deleted table '{}'.".format(table_id))

## *Deletes an ML model* (DDL)

In [None]:
# delete the BQML model if not needed...!!! BE CAREFUL !!!
def delete_model(model_id):
  """Drops a BigQuery ML model.

    Not recommended for use in a production environment; it comes in
    handy during the iterative development and testing phases of the SDLC.
    !!! BE CAREFUL !!!!
    Args:
      model_id(:obj:`str`): The BigQuery ML model name that we want to
        delete, e.g. 'your-project.your_dataset.your_model'
  """
  from google.cloud import bigquery

  bq = bigquery.Client()
  bq.delete_model(model_id)  # API request
  print("Deleted model '{}'.".format(model_id))

# Collect and Transform phase

## *Creates the BigQuery dataset* (DDL)

In [12]:
# create the bq dataset
create_bq_ds(
    CS_GCP_PROJECT=CS_GCP_PROJECT,
    CS_BQ_DATASET_NAME=CS_BQ_DATASET_NAME,
    CS_LOCATION=CS_DATASET_LOCATION,
)

Dataset dpani-sandbox.cdp_ws_cs already exists


## *Populate input table* (DDL + DML)

In [26]:
# load curated data:
# (re)create the input table from the public Google Analytics sample,
# joining first-visit features (bounces, time on site) with a label that
# says whether the same visitor made a purchase on a non-first visit.
prop_create_curated_table_query = f"""
  # select initial features and label to feed into our model
  CREATE OR REPLACE TABLE {CS_BQ_DATASET_NAME}.{CS_PROP_INPUT_TABLE}
  OPTIONS(
  description="Google Store curated Data"
  ) AS 
  SELECT
  fullVisitorId,
  bounces,
  time_on_site,
  will_buy_on_return_visit # <--- our label
  FROM
  # features
  (SELECT
      fullVisitorId,
      IFNULL(totals.bounces, 0) AS bounces,
      IFNULL(totals.timeOnSite, 0) AS time_on_site
  FROM
      `bigquery-public-data.google_analytics_sample.*`
  WHERE
      totals.newVisits = 1
      AND date BETWEEN '20160801' AND '20170430') # train on first 9 months
  JOIN
  (SELECT
      fullvisitorid,
      IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
  FROM
      `bigquery-public-data.google_analytics_sample.*`
  GROUP BY fullvisitorid)
  USING (fullVisitorId)
  ORDER BY time_on_site DESC # order by most time spent first
"""
# show the rendered statement, then run it; execute_sql raises on failure,
# so the confirmation below is only printed when the statement succeeded
print (prop_create_curated_table_query)
execute_sql (prop_create_curated_table_query)
print(f"Table {CS_PROP_INPUT_TABLE} created.")


  # select initial features and label to feed into our model
  CREATE OR REPLACE TABLE cdp_ws_cs.prop_sol_input_user_event
  OPTIONS(
  description="Google Store curated Data"
  ) AS 
  SELECT
  fullVisitorId,
  bounces,
  time_on_site,
  will_buy_on_return_visit # <--- our label
  FROM
  # features
  (SELECT
      fullVisitorId,
      IFNULL(totals.bounces, 0) AS bounces,
      IFNULL(totals.timeOnSite, 0) AS time_on_site
  FROM
      `bigquery-public-data.google_analytics_sample.*`
  WHERE
      totals.newVisits = 1
      AND date BETWEEN '20160801' AND '20170430') # train on first 9 months
  JOIN
  (SELECT
      fullvisitorid,
      IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
  FROM
      `bigquery-public-data.google_analytics_sample.*`
  GROUP BY fullvisitorid)
  USING (fullVisitorId)
  ORDER BY time_on_site DESC # order by most time spent first

Querty executed.
Table prop_sol_input_user_event created.


In [28]:
# count the rows that landed in the curated input table
total_records_sql = '''
  SELECT count (*) AS total_records
  FROM `%s.%s`
''' % (CS_BQ_DATASET_NAME, CS_PROP_INPUT_TABLE)
df = bq_client.query(total_records_sql).to_dataframe()
# display the total number of records in the table
df

Unnamed: 0,total_records
0,550037


In [29]:
# peek at a small sample of the input table; tweak the WHERE clause
# below to pull a different slice of the data.
sample_rows_sql = '''
  SELECT *
  FROM `%s.%s`
  WHERE bounces <> 0
  LIMIT 10
''' % (CS_BQ_DATASET_NAME, CS_PROP_INPUT_TABLE)
df = bq_client.query(sample_rows_sql).to_dataframe()
df

Unnamed: 0,fullVisitorId,bounces,time_on_site,will_buy_on_return_visit
0,498553357537999820,1,1,0
1,9621622720764337033,1,0,0
2,594549172933846354,1,0,0
3,9595629889938024111,1,0,0
4,5748628212328826505,1,0,0
5,9588265300422387531,1,0,0
6,1895119008484678478,1,0,0
7,5815882738258374648,1,0,0
8,3764021740467965794,1,0,0
9,45193150849762828,1,0,0


## Create the propensity to purchase model (DDL)

In [31]:
# create the ML model: a logistic regression trained on every column of
# the input table except fullVisitorId, with will_buy_on_return_visit as label
# takes a couple of mins
# NOTE(review): the variable name mentions "clv", yet the statement below
# builds the propensity-to-purchase model -- consider renaming.
prop_sol_create_clv_model = f"""
  CREATE OR REPLACE MODEL `{CS_BQ_DATASET_NAME}.{CS_PROP_ML_MODEL_NAME}`
  OPTIONS(MODEL_TYPE = 'logistic_reg',
          labels = [ 'will_buy_on_return_visit' ]
          )
  AS
  SELECT * EXCEPT (fullVisitorId)
  FROM `{CS_BQ_DATASET_NAME}.{CS_PROP_INPUT_TABLE}`
"""
# show the rendered statement, then run it
print (prop_sol_create_clv_model)
execute_sql(prop_sol_create_clv_model)


  CREATE OR REPLACE MODEL `cdp_ws_cs.prop_sol_ml_model`
  OPTIONS(MODEL_TYPE = 'logistic_reg',
          labels = [ 'will_buy_on_return_visit' ]
          )
  AS
  SELECT * EXCEPT (fullVisitorId)
  FROM `cdp_ws_cs.prop_sol_input_user_event`

Querty executed.


## Evaluate the ml model

In [32]:
# evaluate the ml model: read its roc_auc and translate the value
# into a coarse quality label (good / fair / decent / not great / poor)
bqml_eval_query = f"""
  SELECT
  roc_auc, CASE WHEN roc_auc > .9 THEN 'good'
  WHEN roc_auc > .8 THEN 'fair' WHEN roc_auc > .7 THEN 'decent'
  WHEN roc_auc > .6 THEN 'not great' ELSE 'poor' END AS modelquality
  FROM
  ML.EVALUATE(MODEL {CS_BQ_DATASET_NAME}.{CS_PROP_ML_MODEL_NAME})
"""

# run the evaluation through bq_client and display the result
df = bq_client.query(bqml_eval_query).to_dataframe()
df

Unnamed: 0,roc_auc,modelquality
0,0.845673,fair


# Analyze phase

## Display propensity to purchase prediction

In [33]:
# predict: score the rows of the input table with the trained model and
# keep the visitor id together with the predicted label
bqml_predict_query = f"""
  SELECT
    fullVisitorId,
    predicted_will_buy_on_return_visit
  FROM ML.PREDICT(MODEL {CS_BQ_DATASET_NAME}.{CS_PROP_ML_MODEL_NAME},
  (
    SELECT
    fullVisitorId,
    bounces,
    time_on_site
    from {CS_BQ_DATASET_NAME}.{CS_PROP_INPUT_TABLE}
  ))
"""

# the query has no LIMIT, so the whole prediction result is loaded
# into the DataFrame before it is displayed
df = bq_client.query(bqml_predict_query).to_dataframe()
df

Unnamed: 0,fullVisitorId,predicted_will_buy_on_return_visit
0,2706961341001088633,1
1,6957245643416321514,1
2,3924372865099736100,1
3,5564610750564086192,1
4,4691667039083430712,1
...,...,...
550032,5966409669067320485,0
550033,2990747519442265833,0
550034,2300144084357227289,0
550035,9686552805922145081,0


# Google Cloud Resource Clean up

## Delete the BigQuery Dataset

In [None]:
# deletes the dataset
# delete_dataset(CS_BQ_DATASET_NAME)

Deleted dataset 'cdp_cs'.


## Delete the Google Cloud Project
To avoid incurring charges to your Google Cloud Platform account for the resources used in this tutorial, **delete the project**.

The easiest way to eliminate billing is to delete the project you created for the tutorial.

**Caution**: Deleting a project has the following effects:
* *Everything in the project is deleted.* If you used an existing project for this tutorial, when you delete it, you also delete any other work you've done in the project.
* <b>Custom project IDs are lost. </b>When you created this project, you might have created a custom project ID that you want to use in the future. To preserve the URLs that use the project ID, such as an <b>appspot.com</b> URL, delete selected resources inside the project instead of deleting the whole project.

If you plan to explore multiple tutorials and quickstarts, reusing projects can help you avoid exceeding project quota limits.
<br>
<ol type="1">
    <li>In the Cloud Console, go to the <b>Manage resources</b> page.</li>
    Go to the <a href="https://console.cloud.google.com/iam-admin/projects">Manage resources page</a>
    <li>In the project list, select the project that you want to delete and then click <b>Delete</b> Trash icon.</li>
    <li>In the dialog, type the project ID and then click <b>Shut down</b> to delete the project. </li>
</ol>
