In [0]:
# Create a dict in a separate .py file called validations.py and define all the filter rules for each event, as shown below
# Validation rules for the `add_to_cart` event.
#   event_name              -> value compared against `event_name` in the query's WHERE clause
#   every other key         -> a SQL CASE expression spliced into the SELECT list of the
#                              validation query; it evaluates to 'Valid' or to a short
#                              description of the first check that failed
# The IsString / IsInteger / IsFloat helpers are expected to be declared by the
# query that embeds these expressions.
add_to_cart = {
    "event_name": 'add_to_cart',

    # Fields that must be present
    "RequiredFields": '''         
                    CASE WHEN content_group IS NULL THEN 'content_group is null'
                        WHEN price IS NULL THEN 'price is null'
                        ELSE 'Valid'
                    END AS RequiredFieldsTest
                ''',

    # Fields that must hold string values
    "StringFields": '''
                    CASE WHEN IsString(content_group) = FALSE THEN 'content_group is not string'
                        WHEN IsString(initial_flow_id) = FALSE THEN 'initial_flow_id is not string'
                        ELSE 'Valid'
                    END AS StringFieldsTest
                ''',

    # Fields that must hold integer values
    "IntegerFields": '''         
                CASE WHEN IsInteger(ga_session_id) = FALSE THEN 'ga_session_id is not integer'
                    ELSE 'Valid'
                END AS IntegerFieldsTest
                ''',

    # Fields that must hold floating-point values
    "FloatFields": '''
                CASE WHEN IsFloat(price) = FALSE THEN 'price is not float'
                ELSE 'Valid'
                END AS FloatFieldsTest
                ''',

    # user_id must be empty, carry the 'pa.' prefix, or be the explicit opt-out marker
    "PseudonymizationFields": '''
                CASE 
                    WHEN user_id IS NOT NULL 
                        AND user_id NOT LIKE 'pa.%' 
                        AND user_id != 'PSEUDONYMIZATION_NOT_ALLOWED_01' 
                    THEN 'user_id Not Pseudonimized'
                    ELSE 'Valid'
                END AS PseudonymizationTest
                ''',
}

# Validation rules for the `add_to_wishlist` event.
# The rule set must be a dict with the same keys as the other events: the
# validation loop reads event['event_name'], event['RequiredFields'], etc.
# (A brace-wrapped bare string is a *set* literal and fails on that lookup.)
# TODO: replace the pass-through expressions below with the real checks for
# this event; until then every row passes the event-specific tests.
add_to_wishlist = {
    "event_name": 'add_to_wishlist',
    "RequiredFields": "'Valid' AS RequiredFieldsTest",
    "StringFields": "'Valid' AS StringFieldsTest",
    "IntegerFields": "'Valid' AS IntegerFieldsTest",
    "FloatFields": "'Valid' AS FloatFieldsTest",
    "PseudonymizationFields": "'Valid' AS PseudonymizationTest",
}



# Rule sets to validate, in processing order; the validation loop iterates
# over this list and runs one query per entry.
event_list = [
    add_to_cart,
    add_to_wishlist,
]


# Import this py file in main notebook
from validations import *

In [0]:
# Install the Google client libraries into the notebook's Python environment.
# `--quiet` suppresses pip's progress output; as the cell output notes, a
# Python restart may be needed before updated packages are picked up.
%pip install google-cloud-bigquery --quiet
%pip install pandas-gbq --quiet
%pip install google-auth --quiet

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import base64
import json
import logging
import os
import re
from datetime import datetime, timedelta
from functools import reduce
from operator import and_, or_

import pandas as pd
import pyspark.sql.functions as f
import pyspark.sql.utils
import requests as req
from google.cloud import bigquery
from google.oauth2 import service_account
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Import helper functions from py file
from helpers import *

In [0]:
# Get Service Account Credentials from Databricks Scope
# The secret value is base64 text; it is decoded and then parsed as JSON, so
# `key` ends up as a dict (later cells read "project_id", "private_key", ...).
# `b64` itself is kept because it is handed to spark.conf as-is further down.
b64 = dbutils.secrets.get(scope="team-tracking-scope", key="b64_databricks_eu@team-tracking") 
# Decoded bytes of the secret
creds = base64.b64decode(b64)
# Parsed service-account key
key = json.loads(creds)

In [0]:
# Configure Spark and Hadoop with the service-account credentials.
# Reference: https://docs.databricks.com/en/connect/storage/gcs.html

# Individual fields of the parsed service-account key
project_id = key["project_id"]
privateKeyId = key["private_key_id"]
privateKey = key["private_key"]
clientEmail = key["client_email"]

# Session-level Spark settings (presumably consumed by the BigQuery connector —
# TODO confirm): the base64 key plus the project used for billing and reads.
for conf_name, conf_value in (
    ("credentials", b64),
    ("parentProject", project_id),
    ("project", project_id),
):
    spark.conf.set(conf_name, conf_value)

# Hadoop-level settings for the gs:// filesystem
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.gs.auth.service.account.private.key.id", privateKeyId)
hadoop_conf.set("fs.gs.auth.service.account.private.key", privateKey)
hadoop_conf.set("fs.gs.auth.service.account.email", clientEmail)

# GCS bucket for temporary data storage (passed as `temporaryGcsBucket` on write)
temporaryGcsBucket = 'tracking-eu-spark-bigquery-temp'
# Dataset name matching the `materializationDataset` option used when reading query results
materializationDataset = "spark_staging_EU"

# Despite the name, this is the parsed key dict, not the base64 text
service_account_b64 = key
credentials = service_account.Credentials.from_service_account_info(service_account_b64)

In [0]:
# Resolve the processing window.
# Dates come from the `startDate` / `endDate` widgets (format YYYY-MM-DD).
# If they cannot be read or parsed, the window falls back to yesterday.

# Take "yesterday" once so the fallback window and the comparison below can
# never disagree when the cell happens to run across midnight.
yesterday = datetime.today() - timedelta(days=1)

try:
    start = datetime.strptime(dbutils.widgets.get("startDate"), "%Y-%m-%d")
    end = datetime.strptime(dbutils.widgets.get("endDate"), "%Y-%m-%d")
except Exception:
    # Best-effort fallback is kept, but KeyboardInterrupt / SystemExit are no
    # longer swallowed as they were with a bare `except:`.
    start = yesterday
    end = yesterday

end_utc_adjusted = end + timedelta(days=1)

# Prepare date filters for GA4 query
startDate = start.strftime("%Y-%m-%d")
endDate = end.strftime("%Y-%m-%d")
startDateBQ = start.strftime("%Y%m%d")

if yesterday.date() == end.date():
    # Window ends yesterday: stop at that day's table and cap the hour at 22
    endDateBQ = end.strftime("%Y%m%d")
    time_boundary = "EXTRACT(hour FROM TIMESTAMP_MICROS(event_timestamp))<22"
else:
    # Older window: include the following day's table and apply no hour cap
    endDateBQ = end_utc_adjusted.strftime("%Y%m%d")
    time_boundary = "1=1"

dates = [startDate, endDate]
print(dates)
print(startDate, endDate)
print(startDateBQ, endDateBQ)

['2024-07-18', '2024-07-18']
2024-07-18 2024-07-18
20240718 20240718


In [0]:
# Schema of the DataFrame that collects invalid GA4 events.
# Column order matters: rows are appended to this frame by position.
# The last six columns hold the outcome of each validation test.
_STR, _LONG, _DBL, _BOOL = StringType, LongType, DoubleType, BooleanType

_GA4_COLUMNS = (
    # --- event ---
    ('event_name', _STR),
    ('dt', _STR),
    ('event_timestamp', _LONG),
    ('event_previous_timestamp', _LONG),
    ('event_server_timestamp_offset', _LONG),
    ('event_value_in_usd', _DBL),
    ('event_bundle_sequence_id', _LONG),
    # --- event parameters ---
    ('value', _DBL),
    ('event_trigger_timestamp', _STR),
    ('legacy_event_name', _STR),
    ('parent_component_name', _STR),
    ('component_name', _STR),
    ('content_group', _STR),
    ('content_group_detail', _STR),
    ('market', _STR),
    ('browser_tab_id', _STR),
    ('deeplink', _STR),
    ('page_title', _STR),
    ('page_location', _STR),
    ('session_engaged', _STR),
    ('navigation_target_group', _STR),
    ('shop_language', _STR),
    ('currency', _STR),
    ('initial_flow_id', _STR),
    ('ignore_referrer', _STR),
    ('method', _STR),
    ('firebase_screen', _STR),
    ('firebase_screen_class', _STR),
    ('ga_session_id', _LONG),
    ('ga_session_number', _LONG),
    ('engagement_time_msec', _LONG),
    ('firebase_conversion', _LONG),
    ('batch_ordering_id', _LONG),
    ('batch_page_id', _LONG),
    ('engaged_session_event', _LONG),
    # --- user / consent ---
    ('client_id', _STR),
    ('device_consent_id', _STR),
    ('user_id', _STR),
    ('user_pseudo_id', _STR),
    ('is_active_user', _BOOL),
    ('analytics_storage', _STR),
    ('ads_storage', _STR),
    ('uses_transient_token', _STR),
    # --- device / app ---
    ('category', _STR),
    ('mobile_brand_name', _STR),
    ('mobile_model_name', _STR),
    ('mobile_marketing_name', _STR),
    ('operating_system', _STR),
    ('language', _STR),
    ('is_limited_ad_tracking', _STR),
    ('browser', _STR),
    ('browser_version', _STR),
    ('hostname', _STR),
    ('platform', _STR),
    ('id', _STR),
    ('version', _STR),
    ('install_store', _STR),
    ('firebase_app_id', _STR),
    ('install_source', _STR),
    # --- geo / stream ---
    ('city', _STR),
    ('country', _STR),
    ('continent', _STR),
    ('region', _STR),
    ('sub_continent', _STR),
    ('stream_id', _STR),
    # --- ecommerce ---
    ('total_item_quantity', _LONG),
    ('purchase_revenue_in_usd', _DBL),
    ('purchase_revenue', _DBL),
    ('refund_value_in_usd', _DBL),
    ('refund_value', _DBL),
    ('shipping_value_in_usd', _DBL),
    ('shipping_value', _DBL),
    ('tax_value_in_usd', _DBL),
    ('tax_value', _DBL),
    ('unique_items', _LONG),
    ('transaction_id', _STR),
    # --- items ---
    ('item_id', _STR),
    ('item_name', _STR),
    ('item_brand', _STR),
    ('item_variant', _STR),
    ('item_category', _STR),
    ('price_in_usd', _DBL),
    ('price', _DBL),
    ('quantity', _LONG),
    ('item_revenue_in_usd', _DBL),
    ('item_revenue', _DBL),
    ('item_refund', _DBL),
    ('coupon', _STR),
    ('affiliation', _STR),
    ('item_flags', _STR),
    ('size_availability', _STR),
    # --- bookkeeping and validation results ---
    ('last_dt', _LONG),
    ('FieldsValueTest', _STR),
    ('RequiredFieldsTest', _STR),
    ('StringFieldsTest', _STR),
    ('IntegerFieldsTest', _STR),
    ('FloatFieldsTest', _STR),
    ('PseudonymizationTest', _STR),
)

# One nullable field per entry, each with its own data-type instance
GA4_schema = StructType(
    [StructField(column_name, column_type()) for column_name, column_type in _GA4_COLUMNS]
)

In [0]:
# Run the validation query once per configured event and accumulate every row
# that fails at least one test.
ga4_unvalid_events = spark.createDataFrame(data = [], schema = GA4_schema)

# Result columns produced by the query: the generic FieldsValueTest from the
# CTE plus one column per rule expression of the event.
test_columns = (
    'FieldsValueTest',
    'RequiredFieldsTest',
    'StringFieldsTest',
    'IntegerFieldsTest',
    'FloatFieldsTest',
    'PseudonymizationTest',
)

# A row is invalid as soon as any test column is something other than 'Valid'.
# The condition does not depend on the event, so it is assembled once.
has_failed_test = None
for test_column in test_columns:
    not_valid = ~f.col(test_column).isin('Valid')
    has_failed_test = not_valid if has_failed_test is None else (has_failed_test | not_valid)

for event in event_list:
  # The temp functions referenced by the rule expressions are declared first;
  # the CTE selects the event's rows and the outer SELECT appends the rules.
  ga4_parsed_query = f'''

CREATE TEMP FUNCTION IsDate(x STRING) AS 
(x IS NULL OR SAFE_CAST(x AS DATE) IS NOT NULL);  

CREATE TEMP FUNCTION IsString(x STRING) AS 
(x IS NULL OR SAFE_CAST(x AS STRING) IS NOT NULL);  

CREATE TEMP FUNCTION IsInteger(x INT64) AS 
(x IS NULL OR SAFE_CAST(x AS INT64) IS NOT NULL); 

CREATE TEMP FUNCTION IsFloat(x FLOAT64) AS 
(x IS NULL OR SAFE_CAST(x AS FLOAT64) IS NOT NULL);  

WITH CTE AS (
SELECT DISTINCT
         event_name
        ,event_date AS dt
        ,(SELECT value.double_value FROM UNNEST(event_params) WHERE key = 'value') AS value
        ,0 AS last_dt
        ,CASE 
            WHEN platform IS NOT NULL AND platform NOT IN ('WEB','IOS','ANDROID')
              THEN 'platform is invalid'
            WHEN device.category IS NOT NULL AND device.category NOT IN ('smart tv', 'tablet', 'mobile', 'desktop') 
              THEN 'device category is invalid'
            WHEN LENGTH((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'market')) > 2 
              THEN 'market is longer than 2 letter'
            ELSE 'Valid'
          END AS FieldsValueTest
  FROM `fs-raw-data.analytics_419243363.p_events_*` , UNNEST(items) as items
  WHERE _table_suffix between '{startDateBQ}' and '{endDateBQ}'
  AND event_name = '{event['event_name']}'
  )

  SELECT *
         ,{event['RequiredFields']}
         ,{event['StringFields']}
         ,{event['IntegerFields']}
         ,{event['FloatFields']}
         ,{event['PseudonymizationFields']}
  FROM CTE
'''

  # Execute the query through the BigQuery connector
  bq_reader = spark.read.format("bigquery")
  bq_reader = bq_reader.option("query", ga4_parsed_query)
  bq_reader = bq_reader.option("materializationDataset", "spark_staging_EU")
  ga4_parsed = bq_reader.load()

  # Keep only the invalid records and append them (by column position)
  temp_df = ga4_parsed.filter(has_failed_test)
  ga4_unvalid_events = ga4_unvalid_events.unionAll(temp_df)

In [0]:
#ga4_unvalid_events.display()

##### Write final data to BQ

In [0]:
# Timestamp of this run; written to the `last_executed` column on export
now = datetime.now()
print(now)

In [0]:
# Append the collected invalid events to the BigQuery results table.
# `dataset` holds the fully qualified table id and is reused by the
# follow-up UPDATE statements.
dataset = 'project_name.datasetname.GA4_unvalid_events'
write_proposition = 'append'

# Stamp every row with the time of this run
stamped_events = ga4_unvalid_events.withColumn('last_executed', f.lit(now))

bq_writer = (
    stamped_events.write
    .format("bigquery")
    .mode(write_proposition)
    .option("temporaryGcsBucket", temporaryGcsBucket)
    .option("table", dataset)
)
bq_writer.save()

In [0]:
# Flag the most recent day in the results table:
#   1. reset `last_dt` to 0 on every row,
#   2. set `last_dt` to 1 on the rows whose `dt` equals the latest `dt`.
# Both statements are sent as a single multi-statement query.
lastday_query = f"""
                    UPDATE {dataset} 
                    SET last_dt = 0 
                    WHERE 1=1 ;

                    UPDATE {dataset}
                    SET last_dt = 1
                    WHERE dt = (SELECT MAX(dt) FROM {dataset})
                 """

with bigquery.Client(credentials=credentials) as bq_client:
    job = bq_client.query(lastday_query)
    try:
        # Block until the job finishes so failures surface in this cell
        job.result()
        print("Updated Successfully.")
    except Exception as e:
        # Report the failure in the cell output, then re-raise it unchanged
        print('Error on query data')
        print(e)
        raise