## Task 2: Writing Dataflow batch pipelines

In this task, you use Apache Beam and Dataflow to run a batch processing pipeline to accomplish the same job as in the previous task. Read the data from Cloud Storage, parse it, and write it to BigQuery using a schema that is optimized for analytics.

Using Apache Beam, create a pipeline to migrate the clickstream data to BigQuery in accordance with the schema you created earlier. Program the pipeline in a Jupyter Notebook.

Once you have tested the pipeline locally, run it on Google Cloud Dataflow.

In [None]:
# Install/upgrade the Apache Beam SDK with its GCP extras (the cells below use
# the GCS and BigQuery connectors). The explicit %pip magic does not depend on
# IPython's automagic, which is only applied to single-line cells, and the
# quotes keep the shell from treating the brackets as a glob pattern.
%pip install --upgrade "apache-beam[gcp]"

In [None]:
# Shut down the running notebook kernel from inside the notebook.
# NOTE(review): presumably this is here so the packages installed by the
# previous cell are importable in a fresh kernel - confirm.
import IPython
from IPython.display import display

# Application.instance() returns the already-running IPython application;
# its `kernel` attribute is the kernel executing this cell.
app = IPython.Application.instance()
# The positional True is assumed to be the kernel's `restart` flag (restart
# after shutdown rather than stay down) - TODO confirm against ipykernel.
app.kernel.do_shutdown(True)

In [19]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions
from google.cloud import bigquery
import json
import os
from datetime import datetime

# Google Cloud target: project, region, and the BigQuery dataset/table to load.
PROJECT_ID = "jellyfish-training-demo-6"
REGION = "us-central1"
DATASET_ID = "dsl_project"
TABLE_ID = "website-visits"

# Scratch space for the pipeline, in a bucket named after the project.
TEMP_LOCATION = f"gs://{PROJECT_ID}/temp"

# Suffix the job name with the current local time (second resolution) so that
# each evaluation of this cell yields a different job name.
timestamp = f"{datetime.now():%Y%m%d%H%M%S}"
JOB_NAME = "storage-to-bq-" + timestamp

In [None]:
# BigQuery API client bound to PROJECT_ID. The same client is constructed
# again in the cell that defines create_table(), which reads this global.
client = bigquery.Client(project=PROJECT_ID)

In [None]:

# Cloud Storage location of the input. The "*" makes this a glob pattern, so
# every object matching visits-*.jsonl in the bucket is read.
input_path = "gs://guillermo-lake/visits-*.jsonl"

# BigQuery table schema as a Python dict in BigQuery's JSON schema layout
# ({"fields": [{"name", "type", optional "mode", optional nested "fields"}]}).
# It is passed as-is to WriteToBigQuery and converted field by field with
# SchemaField.from_api_repr() in create_table().
# Fields without an explicit "mode" rely on the consumer's default mode.
SCHEMA = {
    "fields": [
        # Session-level scalar columns.
        {"name": "session_id", "type": "STRING"},
        {"name": "user_id", "type": "STRING"},
        {"name": "device_type", "type": "STRING"},
        {"name": "geolocation", "type": "STRING"},
        {"name": "user_agent", "type": "STRING"},
        # Repeated record: one array element per entry of the session's events.
        {
            "name": "events",
            "type": "RECORD",
            "mode": "REPEATED",
            "fields": [
                # Each array element wraps a single nested "event" record.
                {
                    "name": "event",
                    "type": "RECORD",
                    "fields": [
                        {"name": "event_type", "type": "STRING"},
                        {"name": "timestamp", "type": "TIMESTAMP"},
                        # All event-specific attributes share one "details" record.
                        # NOTE(review): presumably only the subset relevant to a
                        # given event_type is populated - confirm against the data.
                        {
                            "name": "details",
                            "type": "RECORD",
                            "fields": [
                                {"name": "page_url", "type": "STRING"},
                                {"name": "referrer_url", "type": "STRING"},
                                {"name": "product_id", "type": "STRING"},
                                {"name": "product_name", "type": "STRING"},
                                {"name": "category", "type": "STRING"},
                                {"name": "price", "type": "FLOAT"},
                                {"name": "quantity", "type": "INTEGER"},
                                {"name": "order_id", "type": "STRING"},
                                {"name": "amount", "type": "FLOAT"},
                                {"name": "currency", "type": "STRING"},
                                # Repeated line items nested inside "details".
                                {
                                    "name": "items",
                                    "type": "RECORD",
                                    "mode": "REPEATED",
                                    "fields": [
                                        {"name": "product_id", "type": "STRING"},
                                        {"name": "product_name", "type": "STRING"},
                                        {"name": "category", "type": "STRING"},
                                        {"name": "price", "type": "FLOAT"},
                                        {"name": "quantity", "type": "INTEGER"},
                                    ],
                                },
                            ],
                        },
                    ],
                }
            ],
        },
    ]
}

class ParseJsonlFn(beam.DoFn):
    """Parse one line of a JSONL file into a Python object.

    Lines that are not valid JSON are logged and dropped, so a single
    malformed record does not fail the whole pipeline.
    """

    def process(self, element):
        """Yield the decoded JSON value for ``element``.

        Args:
            element: One line of text from the input file.

        Yields:
            The object produced by ``json.loads``; nothing if the line is not
            valid JSON.
        """
        # Imported locally so the DoFn does not depend on notebook-level
        # globals when it is serialized and executed on remote workers.
        import json
        import logging

        try:
            record = json.loads(element)
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass. Other exception
            # types (e.g. TypeError for a non-text element) indicate a
            # pipeline bug and are allowed to propagate.
            logging.warning("Error parsing record: %s", e)
            return
        yield record

# BigQuery API client bound to PROJECT_ID; create_table() below reads this
# module-level name.
client = bigquery.Client(project=PROJECT_ID)

def create_table():
    """Create the BigQuery table described by ``SCHEMA`` if it does not exist.

    Targets ``DATASET_ID.TABLE_ID`` in the project of the module-level
    ``client``. An already-existing table is left untouched. Any failure is
    reported on stdout instead of being raised, so the notebook can continue.
    """
    # Build the reference explicitly; Client.dataset() is deprecated in
    # google-cloud-bigquery in favour of DatasetReference.
    table_ref = bigquery.DatasetReference(client.project, DATASET_ID).table(TABLE_ID)
    schema = [
        bigquery.SchemaField.from_api_repr(field) for field in SCHEMA["fields"]
    ]
    table = bigquery.Table(table_ref, schema=schema)

    try:
        # exists_ok=True turns "table already exists" into a no-op instead of
        # an error that would be misreported as a creation failure.
        client.create_table(table, exists_ok=True)
    except Exception as e:
        # Notebook-level boundary: report the problem and carry on.
        print(f"Table creation failed: {e}")
    else:
        print(f"Table {TABLE_ID} is available (created if it did not exist).")

def run_pipeline(input_path):
    """Run the JSONL-to-BigQuery Beam pipeline locally with the DirectRunner.

    Reads text lines from ``input_path``, parses each line with
    ``ParseJsonlFn`` and writes the resulting records to
    ``PROJECT_ID:DATASET_ID.TABLE_ID`` using ``SCHEMA``. The table is created
    if needed and its existing contents are replaced (WRITE_TRUNCATE).

    Args:
        input_path: File path or glob pattern handed to ``ReadFromText``.
    """
    # Flags are passed explicitly so nothing is read from the process's own
    # command line. TEMP_LOCATION is the single source for the scratch path.
    options = PipelineOptions(flags=[
        "--project", PROJECT_ID,
        "--runner", "DirectRunner",  # Runs locally
        "--temp_location", TEMP_LOCATION,
    ])

    table_spec = f"{PROJECT_ID}:{DATASET_ID}.{TABLE_ID}"

    # Leaving the with-block runs the pipeline.
    with beam.Pipeline(options=options) as pipeline:
        (
            pipeline
            | "Read JSONL File" >> beam.io.ReadFromText(input_path)
            | "Parse JSON" >> beam.ParDo(ParseJsonlFn())
            | "Write to BigQuery" >> beam.io.WriteToBigQuery(
                table=table_spec,
                schema=SCHEMA,  # Python dict in BigQuery JSON schema layout
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                custom_gcs_temp_location=TEMP_LOCATION,
            )
        )

# Run table creation
#create_table()



In [None]:

# Run the pipeline locally (run_pipeline uses the DirectRunner) over every file
# matching input_path; the target BigQuery table's contents are replaced.
run_pipeline(input_path)



In [None]:
# Define the pipeline options for Dataflow and run the same pipeline there.

# flags=[] stops Beam from parsing the notebook kernel's own command line
# (sys.argv) as pipeline options; everything is set explicitly below.
options = PipelineOptions(flags=[])
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT_ID
google_cloud_options.job_name = JOB_NAME
google_cloud_options.staging_location = f'gs://{PROJECT_ID}/staging'
# TEMP_LOCATION is the single source for the scratch path.
google_cloud_options.temp_location = TEMP_LOCATION
google_cloud_options.region = REGION
options.view_as(beam.options.pipeline_options.StandardOptions).runner = 'DataflowRunner'

# Leaving the with-block submits the job to Dataflow.
with beam.Pipeline(options=options) as pipeline:
    (
        pipeline
        | "Read JSONL File" >> beam.io.ReadFromText(input_path)
        | "Parse JSON" >> beam.ParDo(ParseJsonlFn())
        | "Write to BigQuery" >> beam.io.WriteToBigQuery(
            table=f"{PROJECT_ID}:{DATASET_ID}.{TABLE_ID}",
            schema=SCHEMA,  # Python dict in BigQuery JSON schema layout
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        )
    )
    



