In [None]:
# Copyright 2022 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 1. Exploratory Data Analysis (EDA) for Lifetime Value (LTV) Modeling

This notebook helps to:

1. check the feasibility of building an LTV model;
2. inspect the dataset fields in order to identify relevant information for features and targets (labels);
3. perform an initial exploratory data analysis to identify insights that help with building an LTV model.

The [Google Merchandise Store GA360 dataset](https://support.google.com/analytics/answer/7586738?hl=en) is used as an example.

### Requirements

* [Google Analytics dataset stored in BigQuery.](https://support.google.com/analytics/answer/3437618?hl=en)

### Install and import required modules

In [None]:
# Uncomment to install required python modules
# !sh ../utils/setup.sh

In [None]:
# Make the project-local `utils` package importable: append the absolute path
# of the parent of the current working directory to the module search path.
# This must run before the `from utils import ...` statements below.
import os
import sys
sys.path.append(os.path.abspath(os.pardir))

import pandas as pd

from gps_building_blocks.cloud.utils import bigquery as bigquery_utils

from utils import eda_ga
from utils import helpers

### Notebook custom settings

In [None]:
# Display the value of every top-level expression in a cell, not only the last
# one, so that explicit display() calls are not needed.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

### Configuration

Edit `config.yaml` to update the GCP configuration that is used across the package.

### Set parameters

In [None]:
# Load the shared settings from `config.yaml` and keep handles to its
# `source` and `destination` sections.
configs = helpers.get_configs('config.yaml')
source_configs = configs.source
dest_configs = configs.destination

# BigQuery dataset name used to store query results (if needed).
DATASET_NAME = dest_configs.dataset_name
# GCP project ID in which queries and other computation are run.
PROJECT_ID = dest_configs.project_id

In [None]:
# Number of rows to show when previewing the first or last rows of a DataFrame.
N_ROWS = 5

In [None]:
# Settings passed to `eda_ga.Analysis`: the project that runs the queries, the
# fully qualified `<project>.<dataset>` path of the source data, and a
# verbosity flag.
params = dict(
    project=PROJECT_ID,
    dataset_path=f'{source_configs.project_id}.{source_configs.dataset_name}',
    verbose=True,
)

First, we initialize `Analysis` with a BigQuery helper and the configuration parameters.

In [None]:
# BigQuery helper bound to the project in which the queries are run.
bq_utils = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)
# Analysis object used by the EDA cells below; it receives the BigQuery helper
# and the `params` dictionary.
eda = eda_ga.Analysis(bq_utils=bq_utils, params=params)

### 1. Define the business and ML problem

Before proceeding with EDA for LTV modeling, define the business problem and the questions that the LTV model needs to address. The following are some high-level questions to answer before doing EDA:
* What is the business problem you are trying to solve?
* What are the success criteria of the project?
* What target do you want to predict?
* What are the essential fields to consider as the potential features?

### 2. Extract dataset schema and field descriptions

The following cell reads the GA360 dataset schema and field descriptions ([more details](https://support.google.com/analytics/answer/3437719?hl=en#)) into a Pandas DataFrame for reference:

In [None]:
# Help-centre page that documents the GA360 BigQuery Export schema.
schema_html = 'https://support.google.com/analytics/answer/3437719?hl=en#'
# read_html returns a list with one DataFrame per HTML table on the page; keep
# the first one (presumably the schema table - verify if the page layout changes).
# NOTE: fetching the page requires network access.
df_schema = pd.read_html(schema_html)[0]
df_schema

### 3. Understand Dataset Structure

This section helps to answer the following questions:

* Is the dataset description available, and what does it say?
* How long does the dataset stretch for, i.e., what is the entire period, and how many daily tables does it have?
* How big are the daily tables?
* Are there any missing days?

If the data is stored in BigQuery, then its schema can be extracted via [INFORMATION_SCHEMA](https://cloud.google.com/bigquery/docs/information-schema-tables).

In [None]:
# Fetch dataset-level metadata. The call returns a pair that is unpacked into
# `table_options` and `description`; for their exact contents see
# `Analysis.get_ds_description` in `utils.eda_ga`.
table_options, description = eda.get_ds_description()

### Check daily tables

In [None]:
# Per-table statistics for the dataset. The cells below rely on the
# `table_id`, `is_intraday` and `last_suffix` columns of the result.
tables = eda.get_tables_stats()

### Inspect sizes of the tables

In [None]:
# Preview the first N_ROWS tables.
tables.head(N_ROWS)

In [None]:
# Preview the last N_ROWS tables. `tail` is used rather than the slice
# `tables[-N_ROWS:]` because that slice returns the whole frame when N_ROWS
# is 0 (since -0 == 0), whereas `tail(0)` correctly returns no rows.
tables.tail(N_ROWS)

### Check if there are missing tables

In [None]:
# Keep only the permanent (non-intraday) daily session tables.
# The wildcard suffix is stripped from the configured table name so that it
# can be used as a prefix filter on `table_id`.
table_name = source_configs.table_name.replace('_*', '')
mask_sessions = tables['table_id'].str.startswith(table_name)
mask_not_intraday = ~tables['is_intraday']

# Select the matching rows and order them by table id (ascending).
tables_permanent = (
    tables
    .loc[mask_sessions & mask_not_intraday]
    .sort_values(by='table_id')
)

# Summarise the date range covered by the table suffixes.
helpers.generate_date_range_stats(tables_permanent['last_suffix'])

# References

* [Google Analytics Glossary](https://support.google.com/analytics/topic/6083659?hl=en&ref_topic=3544906)

* [Interactive visualization of the Google Analytics 360 BigQuery Export schema.](https://storage.googleapis.com/e-nor/visualizations/bigquery/ga360-schema.html#section-collapsible-tree)