In [0]:
%pip install google-cloud-bigquery --quiet
%pip install pandas-gbq --quiet
%pip install google-auth --quiet

In [0]:
import pyspark.sql.functions as f
import pyspark.sql.utils
from pyspark.sql.types import *
from pyspark.sql.window import Window

import re
import pandas as pd
import requests as req
import base64
import json
import logging
import json

from google.oauth2 import service_account

import os
from operator import and_, or_
from functools import reduce

# Import helper functions from py file
from helpers import *

##### Get Events Schema from Registry API

In [0]:
# Bearer token sent to the schema registry API by event_schema(); left blank in the notebook source
access_token = ""

In [0]:
# function to get schema for a given event
def event_schema(event_name, timeout=30):
    """Fetch the `/sgtm` schema document of one event from the schema registry API.

    Args:
        event_name: Event name inserted into the request path (e.g. "page_view").
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the timeout expires.
    """
    url = f"https://schema-registry-api....../events/{event_name}/sgtm"
    # access_token is read from the notebook's global scope at call time.
    headers = {"Authorization": 'Bearer ' + access_token}
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = req.get(url=url, headers=headers, timeout=timeout)
    # Fail here with the HTTP status rather than later on a missing '$defs' key.
    response.raise_for_status()
    return response.json()

In [0]:
# Events whose rows are validated, in processing order
event_list = [
    "add_to_cart",
    "add_to_wishlist",
    "begin_checkout",
    "join_group",
    "leave_group",
    "load_content",
    "login",
    "page_view",
    "purchase",
    "remove_from_cart",
    "remove_from_wishlist",
    "screen_view",
    "search",
    "select_content",
    "select_item",
    "select_promotion",
    "sign_up",
    "video_complete",
    "video_pause",
    "video_progress",
    "video_start",
    "view_cart",
    "view_content",
    "view_item",
    "view_item_list",
    "view_promotion",
]

In [0]:
# Creating a dict to store all events and their required fields
all_required_dict = {}
for event in event_list:
    schema = event_schema(event)
    definitions = schema['$defs']
    # Start from the required fields of the shared config schema (copied so the
    # fetched schema document is not mutated by the appends below).
    required_fields = list(definitions['/events/config/client/config_schema.json']['required'])
    try:
        # Item fields are added only when the event's own schema requires exactly ['items'].
        if definitions[f'/events/{event}/client/{event}_schema.json']['required'] == ['items']:
            required_fields.extend(definitions['/modules/item.json']['required'])
    except KeyError:
        # The event schema (or the item module) has no such entry: nothing to add.
        # Only the missing-key case is tolerated; any other error now surfaces.
        pass
    required_fields.append('legacy_event_name')  # Not required in the schema api yet but it should be
    all_required_dict[event] = required_fields

#all_required_dict

In [0]:
# Build, per event, the SQL CASE expression that reports the first required field found NULL
required_case_statements = {}
for event_key, required in all_required_dict.items():
    # One WHEN branch per required field, in the order the fields were collected
    when_clauses = [
        f" WHEN {field} IS NULL THEN '{field} is null' "
        for field in required
    ]
    # Combine all cases into one statement
    required_case_statements[event_key] = (
        "CASE" + "".join(when_clauses) + "ELSE 'Valid' END AS RequiredFieldsTest"
    )

##### Check Last GA4 Events Data from BigQuery

In [0]:
# Read the base64-encoded service account JSON from the Databricks secret scope and decode it
secret_scope = "team-cope"
secret_name = "b64_.....acking"
b64 = dbutils.secrets.get(scope=secret_scope, key=secret_name)
creds = base64.b64decode(b64)  # raw JSON bytes
key = json.loads(creds)  # service account info as a dict

In [0]:
# Configure GCS / BigQuery access: https://docs.databricks.com/en/connect/storage/gcs.html

# Service account fields used by the settings below
project_id = key["project_id"]
privateKeyId = key["private_key_id"]
privateKey = key["private_key"]
clientEmail = key["client_email"]

# Spark session settings
for conf_name, conf_value in (
    ("credentials", b64),
    ("parentProject", project_id),
    ("project", project_id),
):
    spark.conf.set(conf_name, conf_value)

# Hadoop settings for the gs:// filesystem
hadoop_conf = sc._jsc.hadoopConfiguration()
for conf_name, conf_value in (
    ("fs.gs.auth.service.account.private.key.id", privateKeyId),
    ("fs.gs.auth.service.account.private.key", privateKey),
    ("fs.gs.auth.service.account.email", clientEmail),
):
    hadoop_conf.set(conf_name, conf_value)

temporaryGcsBucket = 'tracking-eu-spark-bigquery-temp'  # GCS bucket for temporary data storage
materializationDataset = "spark_staging_EU"

# Credentials object built from the decoded service account info
service_account_b64 = key
credentials = service_account.Credentials.from_service_account_info(service_account_b64)

In [0]:
# Get dates from widgets. If they are not set, process data for yesterday.
# datetime/timedelta are not imported explicitly elsewhere in this notebook
# (presumably they arrive via `from helpers import *`), so import them here.
from datetime import datetime, timedelta

# Read the clock once so every "yesterday" below refers to the same instant.
yesterday = datetime.today() - timedelta(days=1)

try:
    start = datetime.strptime(dbutils.widgets.get("startDate"), "%Y-%m-%d")
    end = datetime.strptime(dbutils.widgets.get("endDate"), "%Y-%m-%d")
except Exception as widget_error:
    # Any failure to read or parse the widgets falls back to yesterday (as before),
    # but the reason is reported and KeyboardInterrupt/SystemExit are no longer swallowed.
    print(f"startDate/endDate widgets not usable ({widget_error!r}); defaulting to yesterday")
    start = yesterday
    end = yesterday

end_utc_adjusted = end + timedelta(days=1)

# Prepare date filters for GA4 query
startDate = start.strftime("%Y-%m-%d")
endDate = end.strftime("%Y-%m-%d")
startDateBQ = start.strftime("%Y%m%d")

if yesterday.date() == end.date():
    # Range ends yesterday: stop at that day's table and restrict to hours before 22.
    endDateBQ = end.strftime("%Y%m%d")
    time_boundary = "EXTRACT(hour FROM TIMESTAMP_MICROS(event_timestamp))<22"
else:
    # Any other end date: include the following day's table and apply no hour filter.
    endDateBQ = end_utc_adjusted.strftime("%Y%m%d")
    time_boundary = "1=1"


dates = [startDate, endDate]
#dates_regx = generate_json_list(startDate, endDate)
print(dates)
print(startDate, endDate)
print(startDateBQ, endDateBQ)

In [0]:
# Assign manually
#startDateBQ = 20240707
#endDateBQ = 20240707

In [0]:
# Check the last available date of data in the GA4 dataset using ga4_check from helpers.py.
# NOTE(review): ga4_check is not defined in this notebook; what each argument controls
# (and what happens when the check fails) must be confirmed against helpers.py.
ga4_check(
    credentials=credentials,
    check_dataset=4......63,
    end_date=endDateBQ,
    anon=True,
    maxTries=3,
)

In [0]:
# Schema of the DataFrame that collects invalid events.
# Column order is kept exactly as listed; every column is nullable.
_GA4_COLUMNS = [
    ('event_name', StringType),
    ('dt', StringType),
    ('event_timestamp', LongType),
    ('event_previous_timestamp', LongType),
    ('event_server_timestamp_offset', LongType),
    ('event_value_in_usd', DoubleType),
    ('event_bundle_sequence_id', LongType),
    ('value', DoubleType),
    ('event_trigger_timestamp', StringType),
    ('legacy_event_name', StringType),
    ('parent_component_name', StringType),
    ('component_name', StringType),
    ('content_group', StringType),
    ('content_group_detail', StringType),
    ('market', StringType),
    ('browser_tab_id', StringType),
    ('deeplink', StringType),
    ('page_title', StringType),
    ('page_location', StringType),
    ('session_engaged', StringType),
    ('navigation_target_group', StringType),
    ('shop_language', StringType),
    ('currency', StringType),
    ('initial_flow_id', StringType),
    ('ignore_referrer', StringType),
    ('method', StringType),
    ('firebase_screen', StringType),
    ('firebase_screen_class', StringType),
    ('ga_session_id', LongType),
    ('ga_session_number', LongType),
    ('engagement_time_msec', LongType),
    ('firebase_conversion', LongType),
    ('batch_ordering_id', LongType),
    ('batch_page_id', LongType),
    ('engaged_session_event', LongType),
    ('client_id', StringType),
    ('device_consent_id', StringType),
    ('user_id', StringType),
    ('user_pseudo_id', StringType),
    ('is_active_user', BooleanType),
    ('analytics_storage', StringType),
    ('ads_storage', StringType),
    ('uses_transient_token', StringType),
    ('category', StringType),
    ('mobile_brand_name', StringType),
    ('mobile_model_name', StringType),
    ('mobile_marketing_name', StringType),
    ('operating_system', StringType),
    ('language', StringType),
    ('is_limited_ad_tracking', StringType),
    ('browser', StringType),
    ('browser_version', StringType),
    ('hostname', StringType),
    ('platform', StringType),
    ('id', StringType),
    ('version', StringType),
    ('install_store', StringType),
    ('firebase_app_id', StringType),
    ('install_source', StringType),
    ('city', StringType),
    ('country', StringType),
    ('continent', StringType),
    ('region', StringType),
    ('sub_continent', StringType),
    ('stream_id', StringType),
    ('total_item_quantity', LongType),
    ('purchase_revenue_in_usd', DoubleType),
    ('purchase_revenue', DoubleType),
    ('refund_value_in_usd', DoubleType),
    ('refund_value', DoubleType),
    ('shipping_value_in_usd', DoubleType),
    ('shipping_value', DoubleType),
    ('tax_value_in_usd', DoubleType),
    ('tax_value', DoubleType),
    ('unique_items', LongType),
    ('transaction_id', StringType),
    ('item_id', StringType),
    ('item_name', StringType),
    ('item_brand', StringType),
    ('item_variant', StringType),
    ('item_category', StringType),
    ('price_in_usd', DoubleType),
    ('price', DoubleType),
    ('quantity', LongType),
    ('item_revenue_in_usd', DoubleType),
    ('item_revenue', DoubleType),
    ('item_refund', DoubleType),
    ('coupon', StringType),
    ('affiliation', StringType),
    ('item_flags', StringType),
    ('size_availability', StringType),
    ('last_dt', LongType),
    ('FieldsValueTest', StringType),
    ('RequiredFieldsTest', StringType),
    ('PseudonymizationTest', StringType),
]

GA4_schema = StructType(
    [StructField(column_name, column_type(), True) for column_name, column_type in _GA4_COLUMNS]
)

In [0]:
# Iterate over each event and collect its invalid rows from BigQuery.
# Starts from an empty frame with GA4_schema and appends one query result per event.
ga4_invalid_events = spark.createDataFrame(data = [], schema = GA4_schema)

for event in event_list:
  # The query uses LEFT JOIN UNNEST(items) rather than a comma (cross) join: a cross
  # join with an empty array yields no row, so events without items would never
  # reach the validation below. With the LEFT JOIN such events are kept and their
  # item columns are NULL.
  ga4_parsed_query = f'''

CREATE TEMP FUNCTION IsDate(x STRING) AS 
(x IS NULL OR SAFE_CAST(x AS DATE) IS NOT NULL);  

CREATE TEMP FUNCTION IsString(x STRING) AS 
(x IS NULL OR SAFE_CAST(x AS STRING) IS NOT NULL);  

CREATE TEMP FUNCTION IsInteger(x INT64) AS 
(x IS NULL OR SAFE_CAST(x AS INT64) IS NOT NULL); 

CREATE TEMP FUNCTION IsFloat(x FLOAT64) AS 
(x IS NULL OR SAFE_CAST(x AS FLOAT64) IS NOT NULL);  

WITH RAW_EVENTS AS (
SELECT DISTINCT
         event_name
        ,event_date AS dt
        ,(SELECT value.double_value FROM UNNEST(event_params) WHERE key = 'value') AS value
        ,(SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'event_trigger_timestamp') AS event_trigger_timestamp
        ......
        ,items.coupon
        ,items.affiliation
        ,(SELECT value.string_value from UNNEST(item_params) WHERE key = 'item_flags') AS item_flags
        ,(SELECT value.string_value from UNNEST(item_params) WHERE key = 'size_availability') AS size_availability
        ,0 AS last_dt
        ,CASE 
            WHEN platform IS NOT NULL AND platform NOT IN ('WEB','IOS','ANDROID')
              THEN 'platform is invalid'
            WHEN device.category IS NOT NULL AND device.category NOT IN ('smart tv', 'tablet', 'mobile', 'desktop') 
              THEN 'device category is invalid'
            WHEN (SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'market') IS NOT NULL 
                  AND LENGTH((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'market')) <> 2 
              THEN 'market is invalid'
            ELSE 'Valid'
          END AS FieldsValueTest
  FROM `project.datasetname.events_*` LEFT JOIN UNNEST(items) AS items
  WHERE 1=1
  AND _table_suffix between '{startDateBQ}' and '{endDateBQ}'
  AND event_name = '{event}'
  )

,FINAL AS (
  SELECT *
         ,{required_case_statements[event]}
         ,CASE 
              WHEN user_id IS NOT NULL 
                AND user_id NOT LIKE 'pa.%' 
                AND user_id != 'PSEUDONYMIZATION_NOT_ALLOWED_01' 
                  THEN 'user_id Not Pseudonimized'
              ELSE 'Valid'
          END AS PseudonymizationTest
  FROM RAW_EVENTS
)

SELECT *
FROM FINAL 
WHERE FieldsValueTest <> 'Valid' OR RequiredFieldsTest <> 'Valid' OR PseudonymizationTest <> 'Valid'
'''

  ga4_parsed = (
          spark.read.format("bigquery")
          .option("query", ga4_parsed_query)
          # Use the dataset configured in the credentials cell instead of repeating the literal.
          .option("materializationDataset", materializationDataset)
          .option("arrowCompressionCodec","ZSTD")
          .load()
      )

  # unionAll matches columns by position, not by name: the query's column order
  # has to stay aligned with GA4_schema.
  ga4_invalid_events = ga4_invalid_events.unionAll(ga4_parsed)

In [0]:
#print(ga4_parsed_query)
#ga4_invalid_events.display()

##### Write final data to BQ

In [0]:
# Timestamp of this run: written to the `last_executed` column and used by the deduplication query
now = datetime.now()
print(now)

In [0]:
# Append this run's invalid rows to the target BigQuery table
dataset = 'projectname.dashboard.GA4_invalid_events'
write_proposition = 'append'

# Stamp every row with the run timestamp before writing
stamped_events = ga4_invalid_events.withColumn('last_executed', f.lit(now))
(
    stamped_events.write
    .format("bigquery")
    .mode(write_proposition)
    .option("temporaryGcsBucket", temporaryGcsBucket)
    .option("table", dataset)
    .save()
)

In [0]:
# Re-flag the most recent day: clear every last_dt = 1 row, then set last_dt = 1 on rows with the maximum dt
lastday_query = f"""
                    UPDATE {dataset} 
                    SET last_dt = 0 
                    WHERE last_dt = 1 ;

                    UPDATE {dataset}
                    SET last_dt = 1
                    WHERE dt = (SELECT MAX(dt) FROM {dataset})
                 """

# NOTE(review): `bigquery` is not imported explicitly in this notebook; presumably it
# comes from `from helpers import *` - confirm.
with bigquery.Client(credentials=credentials) as bq_client:
    update_job = bq_client.query(lastday_query)
    try:
        # Block until both statements have finished
        update_job.result()
    except Exception as update_error:
        print('Error on query data')
        print(update_error)
        raise
    else:
        print("Updated Successfully.")

In [0]:
# Duplication Check on the target BQ table
# Delete rows of earlier loads for the processed date range, keeping only the rows
# stamped with this run's `now`. The table is `dataset` (the one written to above);
# the previous name `dataset_c` is not defined anywhere in this notebook.
deduplication_query = f""" DELETE FROM {dataset} WHERE dt between '{startDateBQ}' and '{endDateBQ}' and  last_executed <> '{now}'   """

# NOTE(review): `bigquery` is not imported explicitly in this notebook; presumably it
# comes from `from helpers import *` - confirm.
with bigquery.Client(credentials=credentials) as bq_client:
    job=bq_client.query(deduplication_query)
    try:
        # Block until the DELETE has finished
        job.result()
        print("Deleted the previous load Successfully.")
    except Exception  as e:  
        print('Error on deduplicating data')
        print(e)
        raise     