# Upload Raw GA360 Data to BigQuery Table

In [1]:
%load_ext autoreload
%autoreload 2

::: {.content-hidden}
Import necessary Python modules
:::

In [2]:
import os
import sys
from datetime import datetime, timedelta
from glob import glob
from typing import Any, Dict

import pandas as pd
import pytz
from google.cloud import bigquery
from google.oauth2 import service_account

::: {.content-hidden}
Get relative path to project root directory
:::

In [3]:
# Project root is addressed relative to the working directory as its parent ("..");
# <root>/src is appended to the import path so the helper modules can be imported.
PROJ_ROOT_DIR = os.pardir
src_dir = os.path.join(PROJ_ROOT_DIR, "src")
sys.path.append(src_dir)

::: {.content-hidden}
Import custom Python modules
:::

In [4]:
%aimport bigquery_auth_helpers
from bigquery_auth_helpers import auth_to_bigquery

%aimport sql_helpers
import sql_helpers as sqlh

%aimport transform_helpers
import transform_helpers as th

Show datatypes and number of missing values for all columns in a `DataFrame`

In [5]:
def summarize_df(df: pd.DataFrame) -> None:
    """Display a per-column overview of `df`: its dtype and number of missing values.

    The overview has one row per column of `df` and is rendered with
    `display`; nothing is returned.
    """
    column_dtypes = df.dtypes.rename("dtype").to_frame()
    missing_counts = df.isna().sum().rename("missing")
    # index-on-index left join: one row per entry of `df.dtypes`
    overview = column_dtypes.join(missing_counts, how="left")
    display(overview.reset_index().rename(columns={"index": "column"}))

## About

## User Inputs

Define the following

1. start date and the total number of consecutive days (counting the start date itself) for which raw data should be queried
2. list of nested columns in the raw GA360 tracking data that are accessible in the sample merchandise store dataset
3. GCP
   - dataset id
   - table id

In [6]:
#| echo: true
# 1. first day to query and total number of days to cover (first day included)
date_start = datetime(year=2016, month=11, day=29)
num_days = 2

# 2. nested columns in the raw data
nested_cols = [
    "totals", "trafficSource", "device",
    "geoNetwork", "customDimensions", "hits",
]

# 3. GCP resources (BigQuery dataset and table that receive the upload)
gbq_dataset_id = "mydemo2asdf"
gbq_table_id = "ecwa"

Get end date for which raw data should be queried

In [7]:
# Both dates are rendered as YYYYMMDD; the end date lies num_days - 1 days after the start
date_start_str = f"{date_start:%Y%m%d}"
date_end_str = f"{date_start + timedelta(days=num_days - 1):%Y%m%d}"

::: {.content-hidden}
Get path to data sub-folders
:::

In [8]:
# Relative paths to <root>/data and <root>/data/raw
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
raw_data_dir = os.path.join(PROJ_ROOT_DIR, "data", "raw")

Set datatypes for non-nested fields in raw GA360 tracking data

In [9]:
# Column name -> pandas nullable dtype for the top-level (non-nested) fields
dtypes_dict_raw = dict(
    # nullable integers
    visitorId=pd.Int64Dtype(),
    visitNumber=pd.Int64Dtype(),
    visitId=pd.Int64Dtype(),
    visitStartTime=pd.Int64Dtype(),
    # strings
    date=pd.StringDtype(),
    fullVisitorId=pd.StringDtype(),
    userId=pd.StringDtype(),
    channelGrouping=pd.StringDtype(),
    socialEngagementType=pd.StringDtype(),
)

::: {.content-hidden}
Load Google Cloud authentication credentials for use with the native BigQuery Python client
:::

In [10]:
# GCP project id is read from the environment (raises KeyError if GCP_PROJECT_ID is unset)
gcp_proj_id = os.environ["GCP_PROJECT_ID"]

# The service-account key is the first *.json file found in the raw data folder.
# Fail with an explicit message instead of a bare IndexError when no key file is present.
gcp_creds_candidates = glob(os.path.join(raw_data_dir, "*.json"))
if not gcp_creds_candidates:
    raise FileNotFoundError(
        f"No service account key file (*.json) found in {raw_data_dir}"
    )
gcp_creds_fpath = gcp_creds_candidates[0]
gcp_creds = service_account.Credentials.from_service_account_file(
    gcp_creds_fpath
)

::: {.content-hidden}
Get fully resolved name of the BigQuery table
:::

In [11]:
# Fully resolved table name: <project>.<dataset>.<table>
gbq_table_fully_resolved = ".".join((gcp_proj_id, gbq_dataset_id, gbq_table_id))

::: {.content-hidden}
Get paths to all raw data files that were previously exported
:::

In [12]:
# All exported parquet batches whose filename starts with d2i_<table id>, sorted by path
raw_data_files = glob(f"{raw_data_dir}/ga_data/d2i_{gbq_table_id}*.parquet.gzip")
raw_data_files.sort()

::: {.content-hidden}
Create authenticated native BigQuery Python client
:::

In [13]:
# BigQuery client bound to the configured project, authenticated with the service-account credentials
client = bigquery.Client(credentials=gcp_creds, project=gcp_proj_id)

## Read Data from Local Disk

Load batch(es) of raw data

In [None]:
%%time
# Read every batch into memory first, then report the row count of each file
dfs_local = list(map(pd.read_parquet, raw_data_files))
for local_fpath, df_local in zip(raw_data_files, dfs_local):
    fname = os.path.basename(local_fpath)
    print(f"Found data at {fname} with " f"{len(df_local):,} rows")

Show datatypes of columns in `DataFrame` for single batch of data

In [None]:
# Only the first loaded batch is inspected here (raises IndexError if no batch was loaded)
summarize_df(dfs_local[0])

## Upload Data to BigQuery Table

### Create Empty BigQuery Table

In [None]:
# Create the (schema-less) destination table if it does not exist yet.
# create_table() must be called exactly once: a second call for the same table
# conflicts with the table that was just created, which would report a freshly
# created table as "existing" and skip the "Created table" message.
table = bigquery.Table(gbq_table_fully_resolved)
try:
    table = client.create_table(table)
    print(f"Created table {table.project}.{table.dataset_id}.{table.table_id}")
except Exception as e:
    if "Already Exists" in str(e):
        # expected when the cell is re-run against an existing table
        print(f"Found existing table {gbq_table_fully_resolved}")
    else:
        # any other failure is reported without stopping the notebook
        print(str(e))

### Define BigQuery Table Schema

In [None]:
job_config = bigquery.LoadJobConfig(schema=[
    bigquery.SchemaField("visitorId", "INTEGER", mode='NULLABLE'),
    bigquery.SchemaField("visitNumber", "INTEGER", mode='NULLABLE'),
    bigquery.SchemaField("visitId", "INTEGER", mode='NULLABLE'),
    bigquery.SchemaField("visitStartTime", "INTEGER", mode='NULLABLE'),
    bigquery.SchemaField("date", "STRING", mode='NULLABLE'),
    bigquery.SchemaField(
        "totals",
        "RECORD",
        mode="NULLABLE",
        fields=[
            bigquery.SchemaField("visits", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("hits", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("pageviews", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("timeOnSite", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("bounces", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("transactions", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("transactionRevenue", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("newVisits", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("screenviews", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("UniqueScreenViews", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("timeOnScreen", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("totalTransactionRevenue", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("sessionQualityDim", "INTEGER", mode="NULLABLE"),
        ],
    ),
    bigquery.SchemaField(
        "trafficSource",
        "RECORD",
        mode="NULLABLE",
        fields=[
            bigquery.SchemaField("referralPath", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("campaign", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("source", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("medium", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("keyword", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("adContent", "STRING", mode="NULLABLE"),
            bigquery.SchemaField(
                "adwordsClickInfo",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("campaignId", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adGroupId", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("creativeId", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("criteriaId", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("page", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("slot", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("criteriaParameters", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("gclId", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("customerId", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adNetworkType", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField(
                        "targetingCriteria",
                        "RECORD",
                        mode="NULLABLE",
                        fields=[bigquery.SchemaField("boomUserlistId", "INTEGER", mode="NULLABLE")],
                    ),
                    bigquery.SchemaField("isVideoId", "BOOLEAN", mode="NULLABLE"),
                ],
            ),
            bigquery.SchemaField("isTrueDirect", "BOOLEAN", mode="NULLABLE"),
            bigquery.SchemaField("campaignCode", "STRING", mode="NULLABLE"),
        ],
    ),
    bigquery.SchemaField(
        "device",
        "RECORD",
        mode="NULLABLE",
        fields=[
            bigquery.SchemaField("browser", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("browserVersion", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("browserSize", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("operatingSystem", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("operatingSystemVersion", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("isMobile", "BOOLEAN", mode="NULLABLE"),
            bigquery.SchemaField("mobileDeviceBranding", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("mobileDeviceModel", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("mobileInputSelector", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("mobileDeviceInfo", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("mobileDeviceMarketingName", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("flashVersion", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("javaEnabled", "BOOLEAN", mode="NULLABLE"),
            bigquery.SchemaField("language", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("screenColors", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("screenResolution", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("deviceCategory", "STRING", mode="NULLABLE"),
        ],
    ),
    bigquery.SchemaField(
        "geoNetwork",
        "RECORD",
        mode="NULLABLE",
        fields=[
            bigquery.SchemaField("continent", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("subContinent", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("country", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("region", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("metro", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("city", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("cityId", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("networkDomain", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("latitude", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("longitude", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("networkLocation", "STRING", mode="NULLABLE"),
        ],
    ),
    bigquery.SchemaField(
        "customDimensions",
        "RECORD",
        mode="REPEATED",
        fields=[
            bigquery.SchemaField("index", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("value", "STRING", mode="NULLABLE"),
        ],
    ),
    # hits
    bigquery.SchemaField(
        "hits",
        "RECORD",
        mode="REPEATED",
        fields=[
            bigquery.SchemaField("hitNumber", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("time", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("hour", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("minute", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("isSecure", "BOOLEAN", mode="NULLABLE"),
            bigquery.SchemaField("isInteraction", "BOOLEAN", mode="NULLABLE"),
            bigquery.SchemaField("isEntrance", "BOOLEAN", mode="NULLABLE"),
            bigquery.SchemaField("isExit", "BOOLEAN", mode="NULLABLE"),
            bigquery.SchemaField("referer", "STRING", mode="NULLABLE"),
            # page (NESTED)
            bigquery.SchemaField(
                "page",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("pagePath", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("hostname", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("pageTitle", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("searchKeyword", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("searchCategory", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("pagePathLevel1", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("pagePathLevel2", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("pagePathLevel3", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("pagePathLevel4", "STRING", mode="NULLABLE"),
                ],
            ),
            # transaction (NESTED)
            bigquery.SchemaField(
                "transaction",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("transactionId", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("transactionRevenue", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("transactionTax", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("transactionShipping", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("affiliation", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("currencyCode", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("localTransactionRevenue", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("localTransactionTax", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("localTransactionShipping", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("transactionCoupon", "STRING", mode="NULLABLE"),
                ],
            ),
            # item (NESTED)
            bigquery.SchemaField(
                "item",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("transactionId", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("productName", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("productCategory", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("productSku", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("itemQuantity", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("itemRevenue", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("currencyCode", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("localItemRevenue", "INTEGER", mode="NULLABLE"),
                ],
            ),
            # contentInfo (NESTED)
            bigquery.SchemaField(
                "contentInfo",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("contentDescription", "STRING", mode="NULLABLE"),
                ],
            ),
            # appInfo (NESTED)
            bigquery.SchemaField(
                "appInfo",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("name", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("version", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("id", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("installerId", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("appInstallerId", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("appName", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("appVersion", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("appId", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("screenName", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("landingScreenName", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("exitScreenName", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("screenDepth", "STRING", mode="NULLABLE"),
                ],
            ),
            # exceptionInfo (NESTED)
            bigquery.SchemaField(
                "exceptionInfo",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("description", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("isFatal", "BOOLEAN", mode="NULLABLE"),
                    bigquery.SchemaField("exceptions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("fatalExceptions", "INTEGER", mode="NULLABLE"),
                ],
            ),
            # eventInfo (NESTED)
            bigquery.SchemaField(
                "eventInfo",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("eventCategory", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("eventAction", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("eventLabel", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("eventValue", "INTEGER", mode="NULLABLE"),
                ],
            ),
            # product (NESTED)
            bigquery.SchemaField(
                "product",
                "RECORD",
                mode="REPEATED",
                fields=[
                    bigquery.SchemaField("productSKU", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("v2ProductName", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("v2ProductCategory", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("productVariant", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("productBrand", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("productRevenue", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("localProductRevenue", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("productPrice", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("localProductPrice", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("productQuantity", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("productRefundAmount", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("localProductRefundAmount", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("isImpression", "BOOLEAN", mode="NULLABLE"),
                    bigquery.SchemaField("isClick", "BOOLEAN", mode="NULLABLE"),
                    bigquery.SchemaField(
                        "customDimensions",
                        "RECORD",
                        mode="REPEATED",
                        fields=[
                            bigquery.SchemaField("index", "INTEGER", mode="NULLABLE"),
                            bigquery.SchemaField("value", "STRING", mode="NULLABLE"),
                        ],
                    ),
                    bigquery.SchemaField(
                        "customMetrics",
                        "RECORD",
                        mode="REPEATED",
                        fields=[
                            bigquery.SchemaField("index", "INTEGER", mode="NULLABLE"),
                            bigquery.SchemaField("value", "INTEGER", mode="NULLABLE"),
                        ],
                    ),
                    bigquery.SchemaField("productListName", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("productListPosition", "INTEGER", mode="NULLABLE"),
                ],
            ),
            # promotion (NESTED)
            bigquery.SchemaField(
                "promotion",
                "RECORD",
                mode="REPEATED",
                fields=[
                    bigquery.SchemaField("promoId", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("promoName", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("promoCreative", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("promoPosition", "STRING", mode="NULLABLE"),
                ],
            ),
            # promotionActionInfo (NESTED)
            bigquery.SchemaField(
                "promotionActionInfo",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("promoIsView", "BOOLEAN", mode="NULLABLE"),
                    bigquery.SchemaField("promoIsClick", "BOOLEAN", mode="NULLABLE"),
                ],
            ),
            # refund (NESTED)
            bigquery.SchemaField(
                "refund",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("refundAmount", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("localRefundAmount", "INTEGER", mode="NULLABLE"),
                ],
            ),
            # eCommerceAction (NESTED)
            bigquery.SchemaField(
                "eCommerceAction",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("action_type", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("step", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("option", "STRING", mode="NULLABLE"),
                ],
            ),
            # experiment (NESTED)
            bigquery.SchemaField(
                "experiment",
                "RECORD",
                mode="REPEATED",
                fields=[
                    bigquery.SchemaField("experimentId", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("experimentVariant", "STRING", mode="NULLABLE"),
                ],
            ),
            # publisher (NESTED)
            bigquery.SchemaField(
                "publisher",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("dfpClicks", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpMatchedQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpMeasurableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpRevenueCpm", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpRevenueCpc", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpViewableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpPagesViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpClicks", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpMatchedQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpMeasurableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpRevenueCpm", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpRevenueCpc", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpViewableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpPagesViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpClicks", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpMatchedQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpMeasurableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpRevenueCpm", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpRevenueCpc", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpViewableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpPagesViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxClicks", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxMatchedQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxMeasurableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxRevenue", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxViewableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxPagesViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsUnitsViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsUnitsMatched", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("viewableAdsViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("measurableAdsViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsPagesViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsClicked", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsRevenue", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpAdGroup", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("dfpAdUnits", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("dfpNetworkId", "STRING", mode="NULLABLE"),
                ],
            ),
            # customVariables (NESTED)
            bigquery.SchemaField(
                "customVariables",
                "RECORD",
                mode="REPEATED",
                fields=[
                    bigquery.SchemaField("index", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("customVarName", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("customVarValue", "STRING", mode="NULLABLE"),
                ],
            ),
            # customDimensions (NESTED)
            bigquery.SchemaField(
                "customDimensions",
                "RECORD",
                mode="REPEATED",
                fields=[
                    bigquery.SchemaField("index", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("value", "STRING", mode="NULLABLE"),
                ],
            ),
            # customMetrics (NESTED)
            bigquery.SchemaField(
                "customMetrics",
                "RECORD",
                mode="REPEATED",
                fields=[
                    bigquery.SchemaField("index", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("value", "INTEGER", mode="NULLABLE"),
                ],
            ),
            bigquery.SchemaField("type", "STRING", mode="NULLABLE"),
            # social (NESTED)
            bigquery.SchemaField(
                "social",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("socialInteractionNetwork", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("socialInteractionAction", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("socialInteractions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("socialInteractionTarget", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("socialNetwork", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("uniqueSocialInteractions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("hasSocialSourceReferral", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("socialInteractionNetworkAction", "STRING", mode="NULLABLE"),
                ],
            ),
            # latencyTracking (NESTED)
            bigquery.SchemaField(
                "latencyTracking",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("pageLoadSample", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("pageLoadTime", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("pageDownloadTime", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("redirectionTime", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("speedMetricsSample", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("domainLookupTime", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("serverConnectionTime", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("serverResponseTime", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("domLatencyMetricsSample", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("domInteractiveTime", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("domContentLoadedTime", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("userTimingValue", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("userTimingSample", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("userTimingVariable", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("userTimingCategory", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("userTimingLabel", "STRING", mode="NULLABLE"),
                ],
            ),
            # sourcePropertyInfo (NESTED)
            bigquery.SchemaField(
                "sourcePropertyInfo",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("sourcePropertyDisplayName", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("sourcePropertyTrackingId", "STRING", mode="NULLABLE"),
                ],
            ),
            # contentGroup (NESTED)
            bigquery.SchemaField(
                "contentGroup",
                "RECORD",
                mode="NULLABLE",
                fields=[
                    bigquery.SchemaField("contentGroup1", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("contentGroup2", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("contentGroup3", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("contentGroup4", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("contentGroup5", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("previousContentGroup1", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("previousContentGroup2", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("previousContentGroup3", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("previousContentGroup4", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("previousContentGroup5", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("contentGroupUniqueViews1", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("contentGroupUniqueViews2", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("contentGroupUniqueViews3", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("contentGroupUniqueViews4", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("contentGroupUniqueViews5", "INTEGER", mode="NULLABLE"),
                ],
            ),
            bigquery.SchemaField("dataSource", "STRING", mode="NULLABLE"),
            # publisher_infos (NESTED)
            bigquery.SchemaField(
                "publisher_infos",
                "RECORD",
                mode="REPEATED",
                fields=[
                    bigquery.SchemaField("dfpClicks", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpMatchedQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpMeasurableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpRevenueCpm", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpRevenueCpc", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpViewableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpPagesViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpClicks", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpMatchedQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpMeasurableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpRevenueCpm", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpRevenueCpc", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpViewableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsenseBackfillDfpPagesViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpClicks", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpMatchedQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpMeasurableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpRevenueCpm", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpRevenueCpc", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpViewableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxBackfillDfpPagesViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxClicks", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxMatchedQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxMeasurableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxQueries", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxRevenue", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxViewableImpressions", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adxPagesViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsUnitsViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsUnitsMatched", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("viewableAdsViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("measurableAdsViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsPagesViewed", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsClicked", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("adsRevenue", "INTEGER", mode="NULLABLE"),
                    bigquery.SchemaField("dfpAdGroup", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("dfpAdUnits", "STRING", mode="NULLABLE"),
                    bigquery.SchemaField("dfpNetworkId", "STRING", mode="NULLABLE"),
                ],
            ),
        ],
    ),
    bigquery.SchemaField("fullVisitorId", "STRING", mode='NULLABLE'),
    bigquery.SchemaField("userId", "STRING", mode='NULLABLE'),
    bigquery.SchemaField("channelGrouping", "STRING", mode='NULLABLE'),
    bigquery.SchemaField("socialEngagementType", "STRING", mode='NULLABLE'),
])
job_config.write_disposition = 'WRITE_APPEND'

Get the list of top-level column names from the schema defined above for the load job

In [None]:
# Top-level column names, in the order they are declared in the load job's schema
cols_to_use = [schema_field.name for schema_field in job_config.schema]

### Appending Batch(es) of Data to BigQuery Table

Append `DataFrame` for single file of data to newly-created table in BigQuery dataset

In [None]:
%%time
# Start a load job that sends one locally-loaded batch (restricted to the
# schema's columns) to the destination table in the dataset
job = client.load_table_from_dataframe(
    dataframe=dfs_local[4][cols_to_use],
    destination=".".join([gbq_dataset_id, gbq_table_id]),
    job_config=job_config,
)
# Wait for the load job to finish; the returned value is not needed
_ = job.result()

Count number of rows in BigQuery table

In [None]:
# Fetch the table's metadata, then report its row and column counts
table = client.get_table(gbq_table_fully_resolved)
table_summary = (
    f"Found {table.num_rows:,} rows and {len(table.schema):,} columns in "
    f"table {table.dataset_id}.{table.table_id}"
)
print(table_summary)

## Read Data From Newly-Created BigQuery Table

Authenticate to `BigQuery` using Google Cloud Service Account, in order to query BigQuery table using `pandas` (which uses the `pandas_gbq` Python library)

In [None]:
# Authenticate with the helper imported from bigquery_auth_helpers; the returned
# object is what the query cells below pass to th.extract_data().
# NOTE(review): presumably the service account credentials are read from
# raw_data_dir — confirm against bigquery_auth_helpers.auth_to_bigquery
gcp_auth_dict = auth_to_bigquery(raw_data_dir)

### Basic Query

Read a single batch of raw GA360 tracking data covering the start and end dates defined above, change datatypes and count duplicated rows (if any)

In [None]:
%%time
# Select every top-level column of the uploaded table for the requested date range
query = f"""
        SELECT visitorId,
               visitNumber,
               visitId,
               visitStartTime,
               date,
               totals,
               trafficSource,
               device,
               geoNetwork,
               customDimensions,
               hits,
               fullVisitorId,
               userId,
               channelGrouping,
               socialEngagementType
        FROM `{gbq_table_fully_resolved}`
        -- LIMIT 4000
        WHERE date BETWEEN '{date_start_str}' AND '{date_end_str}'
        """
# Apply the dtypes in dtypes_dict_raw via th.set_datatypes, then map every
# nested column to the pandas string dtype
df_query = (
    th.extract_data(query, gcp_auth_dict)
    .pipe(th.set_datatypes, dtypes_dict_raw)
    .pipe(th.set_datatypes, dict.fromkeys(nested_cols, pd.StringDtype()))
)
# Total rows, rows that are unique across the non-nested columns, and the
# number of distinct visitors
print(
    df_query.shape[0],
    df_query.drop_duplicates(
        subset=[col for col in cols_to_use if col not in nested_cols]
    ).shape[0],
    df_query["fullVisitorId"].nunique(),
)
summarize_df(df_query)

### Other Queries

Run some queries from this [Google Cloud Skills Boost (Qwiklabs) lab](https://www.cloudskillsboost.google/focuses/1794?parent=catalog).

Task 1: conversion rate (percentage of distinct visitors with a recorded transaction)

In [None]:
%%time
# Conversion rate: distinct visitors whose totals.transactions is not NULL, as
# a percentage of all distinct visitors in the table.
# SAFE_DIVIDE is used instead of the `/` operator so that a table with no
# visitors yields a NULL conversion rate rather than a "division by zero"
# query error.
query = f"""
        WITH visitors AS(
            SELECT COUNT(DISTINCT fullVisitorId) AS total_visitors
            FROM `{gbq_table_fully_resolved}`
            -- WHERE date BETWEEN '{date_start_str}' AND '{date_end_str}'
        ),
        purchasers AS (
            SELECT COUNT(DISTINCT fullVisitorId) AS total_purchasers
            FROM `{gbq_table_fully_resolved}`
            -- WHERE date BETWEEN '{date_start_str}' AND '{date_end_str}'
            WHERE totals.transactions IS NOT NULL
        )
        SELECT total_visitors,
               total_purchasers,
               100 * SAFE_DIVIDE(total_purchasers, total_visitors) AS conversion_rate
        FROM visitors, purchasers
        """
df_query = th.extract_data(query, gcp_auth_dict)
df_query

Task 2: top five products by revenue

In [None]:
%%time
# Top five products by revenue: flatten each row's repeated `hits` and, within
# every hit, its repeated `product` records; then, per product name and
# category, sum the quantity and the revenue (rounded to 2 decimals).
# NOTE(review): the division by 1,000,000 presumably undoes the 10^6 scaling
# of monetary values in the GA360 export — confirm against the export schema
query = f"""
        WITH t1 AS (
            SELECT hits
            FROM `{gbq_table_fully_resolved}`
        )
        SELECT p.v2ProductName,
               p.v2ProductCategory,
               SUM(p.productQuantity) AS units_sold,
               ROUND(SUM(p.localProductRevenue/1000000),2) AS revenue
        FROM t1,
        UNNEST(hits) AS h,
        UNNEST(h.product) AS p
        GROUP BY 1, 2
        ORDER BY revenue DESC
        LIMIT 5
        """
df_query = th.extract_data(query, gcp_auth_dict)
df_query

Task 3: number of visitors who did, and did not, make a purchase on a return visit

In [None]:
%%time
# Per visitor, set will_buy_on_return_visit to 1 when at least one of their
# rows has totals.transactions > 0 and totals.newVisits IS NULL, otherwise 0;
# then count the distinct visitors for each value of that flag.
query = f"""
        # visitors who bought on a return visit (could have bought on first as well
        WITH all_visitor_stats AS (
            SELECT fullvisitorid,
                   IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
            FROM `{gbq_table_fully_resolved}`
            GROUP BY fullvisitorid
        )
        SELECT will_buy_on_return_visit,
               COUNT(DISTINCT fullvisitorid) AS total_visitors
        FROM all_visitor_stats
        GROUP BY will_buy_on_return_visit
        """
df_query = th.extract_data(query, gcp_auth_dict)
df_query

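Tasks 4 and 5: bounces and time on site from new-visit sessions (`totals.newVisits = 1`), joined to the per-visitor flag indicating a purchase on a return visit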

In [None]:
%%time
# t1: bounces and time on site (NULL replaced by 0) for rows flagged as a new visit
# t2: per-visitor flag for a purchase made on a row that is not a new visit
# The two are joined on the visitor id and sorted by time on site, longest first
query = f"""
        WITH t1 AS (
            SELECT fullVisitorId,
                   IFNULL(totals.bounces, 0) AS bounces,
                   IFNULL(totals.timeOnSite, 0) AS time_on_site
            FROM `{gbq_table_fully_resolved}`
            WHERE totals.newVisits = 1
        ),
        t2 AS (
            SELECT fullvisitorid,
                   IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
            FROM `{gbq_table_fully_resolved}`
            GROUP BY fullvisitorid
        )
        SELECT *
        FROM t1 JOIN t2 USING (fullVisitorId)
        ORDER BY time_on_site DESC
        """
df_query = th.extract_data(query, gcp_auth_dict)
# Check for repeated visitors: total rows, rows left after de-duplicating on
# the visitor id, and the number of distinct visitor ids
print(
    df_query.shape[0],
    df_query.drop_duplicates(subset=["fullVisitorId"]).shape[0],
    df_query["fullVisitorId"].nunique(),
)
df_query