# Get Raw GA360 Data

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

::: {.content-hidden}
Import necessary Python modules
:::

In [2]:
import os
import sys
from datetime import datetime, timedelta

import pandas as pd
import pytz

::: {.content-hidden}
Get relative path to project root directory
:::

In [3]:
# Project root is addressed relative to the current working directory
PROJ_ROOT_DIR = os.pardir
src_dir = os.path.join(PROJ_ROOT_DIR, "src")
# Make the custom modules in src/ importable; the membership check keeps
# sys.path free of duplicate entries when this cell is re-run
if src_dir not in sys.path:
    sys.path.append(src_dir)

::: {.content-hidden}
Import custom Python modules
:::

In [4]:
%aimport bigquery_auth_helpers
from bigquery_auth_helpers import auth_to_bigquery

%aimport sql_helpers
import sql_helpers as sqlh

%aimport transform_helpers
import transform_helpers as th

## About

Retrieve GA360 tracking data from the public [sample data of Google merchandise store transactions](https://support.google.com/analytics/answer/7586738?hl=en&ref_topic=3416089&sjid=2689173516056332244-NA#zippy=%2Cin-this-article) and upload it to a BigQuery dataset table in a personal Google Cloud account.

## User Inputs

Define the following

1. start date and the number of days following this date for which raw data should be queried
2. list of nested columns in the raw GA360 tracking data that are accessible in the sample merchandise store dataset

In [5]:
#| echo: true
# 1. first day of the query window and the window length in days
date_start = datetime(year=2017, month=5, day=2)
num_days = 7

# 2. nested columns in the raw data
nested_cols = [
    "totals", "trafficSource", "device",
    "geoNetwork", "customDimensions", "hits",
]

Get end date for which raw data should be queried

In [6]:
# The window is inclusive, so the last day is (num_days - 1) days after the
# first; both ends are formatted as YYYYMMDD strings
date_start_str, date_end_str = (
    day.strftime("%Y%m%d")
    for day in (date_start, date_start + timedelta(days=num_days - 1))
)

::: {.content-hidden}
Get path to data sub-folders
:::

In [7]:
# <project root>/data and its raw/ sub-folder
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
raw_data_dir = os.path.join(PROJ_ROOT_DIR, "data", "raw")

Set datatypes for non-nested fields in raw GA360 tracking data

In [8]:
# Nullable pandas dtypes for the non-nested fields, grouped by target dtype
dtypes_dict_raw = {
    **{
        col: pd.Int64Dtype()
        for col in ("visitorId", "visitNumber", "visitId", "visitStartTime")
    },
    **{
        col: pd.StringDtype()
        for col in (
            "date",
            "fullVisitorId",
            "userId",
            "channelGrouping",
            "socialEngagementType",
        )
    },
}

Define helper functions to

1. show datatypes and number of missing values in a `DataFrame`
2. export a `DataFrame` to a local `parquet` file with `gzip` compression

In [9]:
def load(df: pd.DataFrame, fpath: str) -> None:
    """Save DataFrame to a gzip-compressed parquet file.

    Parameters
    ----------
    df : pd.DataFrame
        data to export; its index is not written to the file
    fpath : str
        destination file path; missing parent directories are created

    """
    parent_dir = os.path.dirname(fpath)
    # an empty dirname means the current working directory, which needs no
    # creation (and os.makedirs("") would raise)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_parquet(fpath, index=False, compression='gzip', engine='pyarrow')


def summarize_df(df: pd.DataFrame) -> None:
    """Display a per-column table of datatype and missing-value count."""
    dtype_frame = df.dtypes.rename("dtype").to_frame()
    missing_frame = df.isna().sum().rename("missing").to_frame()
    # both frames are indexed by the column names of df, so join on the index
    summary = dtype_frame.merge(
        missing_frame, how="left", left_index=True, right_index=True
    )
    # expose the index of column names as a regular field called "column"
    summary = summary.reset_index().rename(columns={"index": "column"})
    display(summary)

::: {.content-hidden}
## Authenticate to `BigQuery`
:::

In [10]:
# Authenticate with the project's helper; the result is handed to
# th.extract_data() for every query in this notebook.
# NOTE(review): raw_data_dir is presumably where the helper looks for
# credentials - confirm against bigquery_auth_helpers
gcp_auth_dict = auth_to_bigquery(raw_data_dir)

## Get Data

Check number of observations in the single batch of raw GA360 tracking data that covers the start and end dates defined above

In [11]:
# Cheap size check before the full download: COUNT(<col>) counts the rows in
# the queried date range whose <col> is non-NULL, so each figure is a row
# count and not a count of distinct values
query_infer = f"""
              SELECT COUNT(fullVisitorId) AS num_visitors,
                     COUNT(visitId) AS num_visits,
                     COUNT(visitNumber) AS num_visitNumbers,
                     COUNT(visitStartTime) AS num_visitStartTime
              FROM `data-to-insights.ecommerce.web_analytics`
              WHERE date BETWEEN '{date_start_str}' AND '{date_end_str}'
              """
df = th.extract_data(query_infer, gcp_auth_dict)
df

Query execution start time = 2023-06-07 11:34:18.803...done at 2023-06-07 11:34:20.860 (2.057 seconds).
Query returned 1 rows


Unnamed: 0,num_visitors,num_visits,num_visitNumbers,num_visitStartTime
0,15885,15885,15885,15885


Read single batch of raw GA360 tracking data using a simple ETL workflow

1. extract raw data covering the start and end dates defined above
2. transform raw data by changing datatypes
3. export transformed data to local disk

In [12]:
query_infer = f"""
              SELECT visitorId,
                     visitNumber,
                     visitId,
                     visitStartTime,
                     date,
                     totals,
                     trafficSource,
                     device,
                     geoNetwork,
                     customDimensions,
                     hits,
                     fullVisitorId,
                     userId,
                     channelGrouping,
                     socialEngagementType
              FROM `data-to-insights.ecommerce.web_analytics`
              WHERE date BETWEEN '{date_start_str}' AND '{date_end_str}'
              """
# extract and transform
df = (
    th.extract_data(query_infer, gcp_auth_dict)
    .pipe(th.set_datatypes, dtypes_dict_raw)
)
# load
# Take a timezone-aware UTC "now" and convert it to US/Eastern. Stamping
# tzinfo=UTC onto the naive local time from datetime.now() would mislabel
# the local wall-clock time as UTC on any machine not running in UTC.
curr_dtime_str = (
    datetime.now(pytz.utc)
    .astimezone(pytz.timezone('US/Eastern'))
    .strftime('%Y%m%d_%H%M%S')
)
# file name records the queried date range and the export timestamp
df.pipe(
    load,
    os.path.join(
        raw_data_dir,
        "ga_data",
        (
            f"d2i_ecwa__{date_start_str}__{date_end_str}__"
            f"{curr_dtime_str}.parquet.gzip"
        ),
    )
)
summarize_df(df)
df

Query execution start time = 2023-06-07 11:34:20.881...done at 2023-06-07 11:36:05.939 (105.057 seconds).
Query returned 15,885 rows


Unnamed: 0,column,dtype,missing
0,visitorId,Int64,15885
1,visitNumber,Int64,0
2,visitId,Int64,0
3,visitStartTime,Int64,0
4,date,string[python],0
5,totals,object,0
6,trafficSource,object,0
7,device,object,0
8,geoNetwork,object,0
9,customDimensions,object,0


Unnamed: 0,visitorId,visitNumber,visitId,visitStartTime,date,totals,trafficSource,device,geoNetwork,customDimensions,hits,fullVisitorId,userId,channelGrouping,socialEngagementType
0,,1,1494259348,1494259348,20170508,"{'visits': 1, 'hits': 5, 'pageviews': 5, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Sou...","[{'index': 4, 'value': 'South America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 9, 'minut...",1381334533431476543,,Direct,Not Socially Engaged
1,,1,1493732807,1493732807,20170502,"{'visits': 1, 'hits': 5, 'pageviews': 5, 'time...","{'referralPath': '/yt/about/pl/', 'campaign': ...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'North...","[{'index': 4, 'value': 'EMEA'}]","[{'hitNumber': 1, 'time': 0, 'hour': 6, 'minut...",921832832516971325,,Social,Not Socially Engaged
2,,5,1494092601,1494092601,20170506,"{'visits': 1, 'hits': 6, 'pageviews': 6, 'time...","{'referralPath': '/yt/about/it/', 'campaign': ...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'South...","[{'index': 4, 'value': 'EMEA'}]","[{'hitNumber': 1, 'time': 0, 'hour': 10, 'minu...",272300032244777177,,Social,Not Socially Engaged
3,,1,1494102223,1494102223,20170506,"{'visits': 1, 'hits': 6, 'pageviews': 3, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Asia', 'subContinent': 'Western...",[],"[{'hitNumber': 1, 'time': 0, 'hour': 13, 'minu...",2794802987057453006,,Organic Search,Not Socially Engaged
4,,1,1494000921,1494000921,20170505,"{'visits': 1, 'hits': 8, 'pageviews': 8, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'South...","[{'index': 4, 'value': 'EMEA'}]","[{'hitNumber': 1, 'time': 0, 'hour': 9, 'minut...",4943049632128085598,,Organic Search,Not Socially Engaged
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15880,,1,1494223664,1494223664,20170507,"{'visits': 1, 'hits': 4, 'pageviews': 4, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Safari', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 23, 'minu...",9906885653874179178,,Organic Search,Not Socially Engaged
15881,,5,1493745097,1493745097,20170502,"{'visits': 1, 'hits': 4, 'pageviews': 4, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'South...","[{'index': 4, 'value': 'EMEA'}]","[{'hitNumber': 1, 'time': 0, 'hour': 10, 'minu...",7830248036973856928,,Direct,Not Socially Engaged
15882,,1,1493924365,1493924365,20170504,"{'visits': 1, 'hits': 4, 'pageviews': 4, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Asia', 'subContinent': 'Western...","[{'index': 4, 'value': 'EMEA'}]","[{'hitNumber': 1, 'time': 0, 'hour': 11, 'minu...",4453799518789966861,,Organic Search,Not Socially Engaged
15883,,1,1493869923,1493869923,20170503,"{'visits': 1, 'hits': 4, 'pageviews': 4, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 20, 'minu...",7527379548406837364,,Organic Search,Not Socially Engaged
