## Task 2: Writing Dataflow batch pipelines

In this task, you use Apache Beam and Dataflow to run a batch processing pipeline to accomplish the same job as in the previous task. Read the data from Cloud Storage, parse it, and write it to BigQuery using a schema that is optimized for analytics.

Using Apache Beam, create a pipeline to migrate the clickstream data to BigQuery in accordance with the schema you created earlier. Program the pipeline in a Jupyter Notebook.

Once you have tested the pipeline, run it on Google Cloud Dataflow.



In [None]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigquery import BigQueryDisposition
import json
from datetime import datetime

import os

from dotenv import load_dotenv
load_dotenv('../.env')

In [None]:
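# Alternative (currently disabled): import parse_visit from the local dsllib
# package instead of defining it inline in this notebook.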
# import sys
# lib_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'dsllib'))
# if lib_path not in sys.path:
#     print(f"Appending path {lib_path}")
#     sys.path.append(lib_path)

# from dsllib.visits import parse_visit

In [None]:
# Sanity check of the local Google Cloud authentication setup: ask google.auth
# for its default credentials and print the project it resolved.
# NOTE(review): `credentials` is not referenced anywhere else in the visible
# notebook; the cell appears to exist only for the printed project.
import google.auth
credentials, project = google.auth.default()
print(project)

In [None]:
# Pipeline configuration comes from environment variables (the notebook loads
# ../.env in an earlier cell). A variable that is not set ends up as None.
PROJECT_ID, REGION, BUCKET, DATASET, TABLE = map(
    os.environ.get,
    ("GOOGLE_CLOUD_PROJECT", "REGION", "BUCKET", "DATASET", "TABLE"),
)
# Bare expression so the notebook cell displays the resolved project ID.
PROJECT_ID

In [None]:
def parse_visit(element: str):
    """Parse one JSON-encoded visit into a row dictionary.

    The visit's "geolocation" value ("lat,lon") is rewritten as a WKT point
    ("POINT(lon lat)"), and the earliest and latest event timestamps become
    "visit_start_time" and "visit_end_time". Every event is flattened into
    a dictionary with "event_type", "event_timestamp" and three detail
    sub-dictionaries ("page_view", "add_cart", "purchase"); only the one
    matching the event type is filled, the other two stay empty.

    Keys that are absent or explicitly null for "events", "event" or
    "details" are treated as empty rather than failing the whole visit.
    A visit without a usable geolocation or with an unparseable event
    timestamp is still rejected.

    Args:
        element (str): A JSON string containing visit data.
    Returns:
        dict: The parsed row, or None if the element could not be processed
        (the error and the offending element are printed).
    """

    # Imported inside the function even though the notebook imports them at
    # the top. NOTE(review): presumably so the function stays self-contained
    # when Beam ships it to remote workers (save_main_session is not enabled
    # in this notebook) - keep these local imports.
    import json
    from datetime import datetime

    try:
        visit_data = json.loads(element)

        session_id = visit_data.get("session_id")
        user_id = visit_data.get("user_id")
        device_type = visit_data.get("device_type")

        # Parse geolocation data: "lat,lon" -> WKT "POINT(lon lat)".
        geo_str = visit_data.get("geolocation")
        if not geo_str:
            raise ValueError("missing geolocation")
        # Strip each part so "37.7, -122.4" does not leak a space into the WKT.
        lat, lon = (part.strip() for part in geo_str.split(","))
        geolocation = f"POINT({lon} {lat})"

        user_agent = visit_data.get("user_agent")

        visit_start_time = None
        visit_end_time = None

        formatted_events = []
        # "or" (instead of a .get default) also covers an explicit JSON null.
        for event_data in visit_data.get("events") or []:
            event = event_data.get("event") or {}
            event_type = event.get("event_type")

            timestamp_str = event.get("timestamp")
            # datetime.fromisoformat only accepts a trailing "Z" from
            # Python 3.11 on; normalise it so older interpreters agree.
            if isinstance(timestamp_str, str) and timestamp_str.endswith("Z"):
                timestamp_str = timestamp_str[:-1] + "+00:00"
            timestamp = datetime.fromisoformat(timestamp_str)

            # Track the visit's time span across all of its events.
            if visit_start_time is None or timestamp < visit_start_time:
                visit_start_time = timestamp
            if visit_end_time is None or timestamp > visit_end_time:
                visit_end_time = timestamp

            details = event.get("details") or {}
            page_view_details = {}
            add_cart_details = {}
            purchase_details = {}

            if event_type == "page_view":
                page_view_details = {
                    "page_url": details.get("page_url"),
                    "referrer_url": details.get("referrer_url"),
                }
            elif event_type == "add_item_to_cart":
                add_cart_details = {
                    "product_id": details.get("product_id"),
                    "product_name": details.get("product_name"),
                    "category": details.get("category"),
                    "price": details.get("price"),
                    "quantity": details.get("quantity"),
                }
            elif event_type == "purchase":
                purchase_details = {
                    "order_id": details.get("order_id"),
                    "amount": details.get("amount"),
                    "currency": details.get("currency"),
                    "items": details.get("items"),
                }

            formatted_events.append(
                {
                    "event_type": event_type,
                    "event_timestamp": timestamp.isoformat(),
                    "page_view": page_view_details,
                    "add_cart": add_cart_details,
                    "purchase": purchase_details,
                }
            )

        return {
            "session_id": session_id,
            "user_id": user_id,
            "device_type": device_type,
            "geolocation": geolocation,
            "user_agent": user_agent,
            "visit_start_time": visit_start_time.isoformat() if visit_start_time else None,
            "visit_end_time": visit_end_time.isoformat() if visit_end_time else None,
            "events": formatted_events,
        }
    except Exception as e:
        # Deliberately broad: one malformed line must not abort the whole
        # batch. Report it and signal the failure with None.
        print(f"Error processing element: {e}")
        print(f"Problematic element: {element}")
        return None

In [None]:
#ts=datetime.now().isoformat()

def run_pipeline(input_path, output_table, runner, project_id, region):
    """Read visit JSONL files, parse them, and append the rows to BigQuery.

    The temp, staging and BigQuery scratch locations are built from the
    notebook-level BUCKET variable plus a per-run timestamp, so BUCKET must
    be set before calling this function.

    Args:
        input_path: File pattern handed to ReadFromText (for example a
            gs:// glob).
        output_table: Destination table spec handed to WriteToBigQuery.
        runner: Beam runner name, e.g. "DirectRunner" or "DataflowRunner".
        project_id: Google Cloud project set in the pipeline options.
        region: Region set in the pipeline options.
    """
    # The "t" separator is lowercase on purpose: the timestamp is embedded in
    # the job name, and Dataflow job names may not contain uppercase letters.
    ts = datetime.now().strftime("%Y-%m-%dt%H%M%S")

    beam_options = PipelineOptions(
        flags=[],  # ignore the notebook kernel's own command-line arguments
        runner=runner,
        project=project_id,
        region=region,
        temp_location=f"gs://{BUCKET}/tmp-{ts}/",
        staging_location=f"gs://{BUCKET}/staging-{ts}",
        job_name=f"load-events-pipeline-{ts}",
    )

    # Leaving the "with" block runs the pipeline.
    with beam.Pipeline(options=beam_options) as pipeline:
        (
            pipeline
            | "ReadFromGCS" >> beam.io.ReadFromText(input_path) # type: ignore
            | "ParseVisit" >> beam.Map(parse_visit)
            # parse_visit returns None for lines it cannot parse; drop those
            # here so they never reach the BigQuery sink as rows.
            | "DropFailedParses" >> beam.Filter(lambda row: row is not None)
            | "WriteToBQ" >> beam.io.WriteToBigQuery(
                table=output_table,
                # Scratch location in the bucket handed to the BigQuery sink.
                custom_gcs_temp_location=f"gs://{BUCKET}/bqtmp-{ts}",
                # This table should be created ahead of time. If it doesn't
                # exist, don't create it.
                create_disposition=BigQueryDisposition.CREATE_NEVER,
                # This pipeline only appends data. Don't overwrite.
                write_disposition=BigQueryDisposition.WRITE_APPEND,
                # No explicit `method` is passed, so the sink uses its default.
            )
        )

In [None]:
#import apache_beam.runners.interactive.interactive_beam as ib

In [None]:
# Build a small local test pipeline on the InteractiveRunner: read one day of
# sample visits from disk and parse every line with parse_visit. Nothing is
# executed here; later cells attach sinks to json_coll and call pipe.run().
pipe = beam.Pipeline(runner='InteractiveRunner')
text_coll = pipe | "ReadFromLocal" >> beam.io.ReadFromText("../challenge-clickstream/data/visits-2024-07-01.jsonl")
json_coll = (
    text_coll
    | "ParseVisit" >> beam.Map(parse_visit)
    # parse_visit returns None for lines it cannot parse; drop those so the
    # sinks attached to json_coll only ever see real rows.
    | "DropFailedParses" >> beam.Filter(lambda row: row is not None)
)

In [None]:
# Attach a text-file sink (output prefix "out/testme") to the parsed rows so
# the result can be inspected locally. This only extends the pipeline graph;
# the run call below is commented out, so nothing is written in this cell.
local_out_coll = json_coll | "WriteToFile" >> beam.io.WriteToText("out/testme")
#pipe.run()

In [None]:
# Per-run timestamp; the BigQuery test-write cell below uses it to build a
# unique GCS temp path. The compact strftime form replaced the isoformat()
# variant that is left commented out.
#ts=datetime.now().isoformat()
ts=datetime.now().strftime("%Y-%m-%dT%H%M%S")

In [None]:
# Name of the "<TABLE>_test" table in DATASET, printed for reference.
# NOTE(review): the BigQuery test-write cell that follows passes table,
# dataset and project separately and does not read this variable.
#output_table = f"{PROJECT_ID}:{DATASET}.{TABLE}"
output_table = f"{DATASET}.{TABLE}_test"

print(f"Output table: {output_table}")

In [None]:
# Attach a BigQuery sink for the "<TABLE>_test" table to the parsed rows of
# the local test pipeline, then execute that pipeline.
bq_result = json_coll | "WriteToBQ" >> beam.io.WriteToBigQuery(
                table=f"{TABLE}_test",
                dataset=DATASET,
                project=PROJECT_ID,
                #schema=bq_schema,
                custom_gcs_temp_location=f"gs://{BUCKET}/tmp-{ts}",
                #This table should be created ahead of time. If it doesn't exist, don't create it.
                create_disposition=BigQueryDisposition.CREATE_NEVER,
                #This pipeline only appends data. Don't overwrite.
                write_disposition=BigQueryDisposition.WRITE_APPEND,
                #method=beam.io.WriteToBigQuery.Method.STREAMING_INSERTS
                #method=beam.io.WriteToBigQuery.Method.STORAGE_WRITE_API
                )

# Runs everything attached to `pipe`, which includes the "WriteToFile" sink
# added in an earlier cell as well as the BigQuery write above.
pipe.run()

In [None]:
#pipe.run()

# vals = ib.collect(json_coll)
# vals

In [None]:
# Launch the full batch load on Dataflow: every visits-*.jsonl file under the
# data/ prefix goes into the DATASET.TABLE table.
# NOTE(review): the input bucket is hard-coded here rather than derived from
# BUCKET - confirm that this is intended.
input_path = "gs://jfdemo3-dsl/data/visits-*.jsonl"
output_table = f"{DATASET}.{TABLE}"
project_id = PROJECT_ID
# run_pipeline computes its own timestamp; this only refreshes the
# notebook-level `ts`.
ts = datetime.now().strftime("%Y-%m-%dT%H%M%S")
# Use "DirectRunner" instead to execute locally.
runner = "DataflowRunner"
run_pipeline(
    input_path=input_path,
    output_table=output_table,
    runner=runner,
    project_id=project_id,
    region=REGION,
)