In [None]:
# Step 1: Project Setup and Configuration
# This cell sets up your shell environment with the necessary project configurations and enables the required Google Cloud APIs.

# Set your Google Cloud Project ID.
# A PROJECT_ID that is already exported in this shell takes precedence;
# otherwise the tutorial's default project is used.
export PROJECT_ID="${PROJECT_ID:-jellyfish-training-demo-6}"
gcloud config set project "${PROJECT_ID}"

# Enable necessary APIs: Cloud Run, Pub/Sub, BigQuery, Artifact Registry
# and Cloud Build. Each one is used by a later step of this guide.
gcloud services enable \
  run.googleapis.com \
  pubsub.googleapis.com \
  bigquery.googleapis.com \
  artifactregistry.googleapis.com \
  cloudbuild.googleapis.com


In [None]:

# Step 2: Create an Artifact Registry Repository
# Create a location for your container images in Artifact Registry.
# `create` fails when the repository already exists, so look it up first;
# this keeps the cell safe to re-run.
gcloud artifacts repositories describe cloud-run-source-repo \
    --location=us-central1 >/dev/null 2>&1 ||
gcloud artifacts repositories create cloud-run-source-repo \
    --repository-format=docker \
    --location=us-central1 \
    --description="Docker repository for Cloud Run source"

In [None]:
# Step 3: Write the Python Application Code
# This cell contains the updated Python code for our Flask web server. 
# It now transforms the incoming JSON to match the nested BigQuery schema before insertion.

# Save this code in a file named main.py

# main.py
import base64
import json
import os
from datetime import datetime
from flask import Flask, request
from google.cloud import bigquery

# WSGI application object; the container starts it through Gunicorn as `main:app`.
app = Flask(__name__)
# A single BigQuery client, created at import time and reused by every request.
client = bigquery.Client()

# Destination table, configurable through environment variables.
# When PROJECT_ID is unset or empty, fall back to the project the client
# resolved for itself; otherwise TABLE_ID would silently become
# "None.<dataset>.<table>" and every insert would fail.
PROJECT_ID = os.environ.get("PROJECT_ID") or client.project
BIGQUERY_DATASET = os.environ.get("BIGQUERY_DATASET", "dsl_project")
BIGQUERY_TABLE = os.environ.get("BIGQUERY_TABLE", "web_visits")
TABLE_ID = f"{PROJECT_ID}.{BIGQUERY_DATASET}.{BIGQUERY_TABLE}"

def transform_event(event_item):
    """Transform a single source event into the nested BigQuery event record.

    Args:
        event_item: A dict shaped like
            ``{"event": {"event_type": ..., "timestamp": ..., "details": {...}}}``.
            An ``event`` or ``details`` value that is missing, JSON ``null``
            or not an object is treated as an empty object.

    Returns:
        A dict with ``event_type``, ``event_timestamp`` and the three
        type-specific sub-records ``page_view``, ``add_cart`` and
        ``purchase``. Only the sub-record matching ``event_type`` is filled
        in; the others stay ``None``.
    """
    # `.get(key, {})` only covers a *missing* key. An explicit JSON null comes
    # back as None and would make the `.get` calls below raise, so normalise
    # anything that is not a dict to an empty one.
    event_data = event_item.get("event")
    if not isinstance(event_data, dict):
        event_data = {}
    details = event_data.get("details")
    if not isinstance(details, dict):
        details = {}
    event_type = event_data.get("event_type")

    transformed = {
        "event_type": event_type,
        "event_timestamp": event_data.get("timestamp"),
        "page_view": None,
        "add_cart": None,
        "purchase": None,
    }

    if event_type == "page_view":
        transformed["page_view"] = {
            "page_url": details.get("page_url"),
            "referrer_url": details.get("referrer_url"),
        }
    elif event_type == "add_item_to_cart":
        transformed["add_cart"] = {
            "product_id": details.get("product_id"),
            "product_name": details.get("product_name"),
            "category": details.get("category"),
            "price": details.get("price"),
            "quantity": details.get("quantity"),
        }
    # Add logic for 'purchase' event type if it exists in your data
    # elif event_type == "purchase":
    #     transformed["purchase"] = { ... }

    return transformed

def _build_row(data):
    """Build one BigQuery row from a decoded visit record.

    Args:
        data: The object parsed from a single JSON line of the message.

    Returns:
        The row dict, or ``None`` when the record carries no events.

    Raises:
        ValueError: If the record, its ``events`` value or one of the events
            has the wrong JSON type, or a timestamp is not ISO-8601.
        TypeError: If a timestamp is present but is not a string.
    """
    if not isinstance(data, dict):
        raise ValueError("expected a JSON object per line")

    events = data.get("events") or []
    if not isinstance(events, list):
        raise ValueError("'events' must be a JSON array")
    if not events:
        return None

    # Visit start/end are the earliest/latest event timestamps. Events whose
    # timestamp is missing or null are left out of that calculation.
    date_timestamps = []
    for item in events:
        if not isinstance(item, dict):
            raise ValueError("each entry of 'events' must be a JSON object")
        event = item.get("event")
        ts = event.get("timestamp") if isinstance(event, dict) else None
        if ts is not None:
            date_timestamps.append(datetime.fromisoformat(ts))

    return {
        "session_id": data.get("session_id"),
        "user_id": data.get("user_id"),
        "device_type": data.get("device_type"),
        "geolocation": data.get("geolocation"),
        "user_agent": data.get("user_agent"),
        "visit_start_time": min(date_timestamps).isoformat() if date_timestamps else None,
        "visit_end_time": max(date_timestamps).isoformat() if date_timestamps else None,
        # Transform events to match the nested BigQuery schema.
        "events": [transform_event(e) for e in events],
    }


@app.route("/", methods=["POST"])
def index():
    """Receive a Pub/Sub push message and stream its visits into BigQuery.

    The request body is expected to be a Pub/Sub push envelope whose
    ``message.data`` field is base64-encoded, UTF-8, newline-delimited JSON
    with one visit record per line.

    Malformed payloads and malformed lines are logged and skipped instead of
    raised. An unhandled exception would surface as a 500, and Pub/Sub keeps
    retrying push deliveries that are not answered with a success status, so
    a single bad message would be redelivered over and over.

    Returns:
        A ``(body, status)`` tuple:
        400 when the body is not a push envelope,
        200 when the message yields no rows to insert,
        204 when all rows were inserted,
        500 when BigQuery reported insert errors.
    """
    # silent=True: a body that is not JSON becomes None and is rejected below
    # with our own message rather than with Flask's default error response.
    envelope = request.get_json(silent=True)
    if not isinstance(envelope, dict) or not isinstance(envelope.get("message"), dict):
        msg = "invalid Pub/Sub message format"
        print(f"error: {msg}")
        return f"Bad Request: {msg}", 400

    message = envelope["message"]
    rows_to_insert = []

    if "data" in message:
        try:
            data_str = base64.b64decode(message["data"]).decode("utf-8").strip()
        except (ValueError, TypeError) as e:
            # Bad base64 (binascii.Error) and bad UTF-8 (UnicodeDecodeError)
            # are both ValueError subclasses; TypeError covers non-string data.
            print(f"Error decoding message data: {e}")
            data_str = ""

        for line in data_str.splitlines():
            if not line.strip():
                # Blank separator lines are not errors; skip them quietly.
                continue
            try:
                row = _build_row(json.loads(line))
            except (json.JSONDecodeError, ValueError, TypeError, AttributeError) as e:
                print(f"Error processing line: {e} - Line: '{line}'")
                continue
            if row is not None:
                rows_to_insert.append(row)

    if not rows_to_insert:
        print("No rows to insert.")
        return "Success: No data to process", 200

    errors = client.insert_rows_json(TABLE_ID, rows_to_insert)
    if not errors:
        print(f"Successfully inserted {len(rows_to_insert)} rows into {TABLE_ID}")
        return "Success", 204
    else:
        print(f"Encountered errors while inserting rows: {errors}")
        return f"Error inserting data: {errors}", 500

if __name__ == "__main__":
    # Local development entry point only; the container image starts the app
    # through Gunicorn (`main:app`) and never reaches this branch.
    PORT = int(os.environ.get("PORT", 8080))
    # Debug mode enables Werkzeug's interactive debugger, which lets anyone
    # who can reach the port run code in this process. Because the server
    # binds to all interfaces, keep it off unless FLASK_DEBUG asks for it.
    DEBUG = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    app.run(host="0.0.0.0", port=PORT, debug=DEBUG)

In [None]:
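# Step 4: Declare the Python Dependencies
# Save these lines in a file named requirements.txt, next to main.py.

# requirements.txt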
Flask==2.3.2
gunicorn==20.1.0
google-cloud-bigquery==3.11.2

In [None]:
# Step 5: Create a Dockerfile
# This Dockerfile defines the environment for our application. It copies the code, installs dependencies, 
# and specifies the command to start the Gunicorn server, which is a production-grade WSGI server.

# Dockerfile
# Use the official lightweight Python image.
FROM python:3.9-slim

# Write stdout/stderr unbuffered so the application's print() output reaches
# the container logs immediately.
ENV PYTHONUNBUFFERED=1

# Set the working directory
WORKDIR /app

# Install production dependencies first. This layer is rebuilt only when
# requirements.txt changes, not on every edit to the application code.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy local code to the container image.
COPY . .

# Run the web service on container startup.
# Gunicorn is used for production deployment: one worker process with eight
# threads, listening on the port given in $PORT. --timeout 0 disables
# Gunicorn's own worker timeout.
CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app

In [None]:
# Step 6: Build and Push the Container Image
# This command uses Cloud Build to build our Docker image and push it to the Artifact Registry repository (cloud-run-source-repo) we created in Step 2.

# Build the image with Cloud Build from the current directory and push it
# to the Artifact Registry repository under the tag below.
IMAGE_URI="us-central1-docker.pkg.dev/${PROJECT_ID}/cloud-run-source-repo/clickstream-push-bq"
gcloud builds submit --tag "${IMAGE_URI}"


In [None]:
# Step 7: Deploy the Container to Cloud Run
# Now we deploy our container image to Cloud Run. We configure it to be private (--no-allow-unauthenticated) 
# and pass our BigQuery table details as environment variables.

# Deploy the image as a private Cloud Run service and hand the BigQuery
# destination to the application through environment variables.
SERVICE_NAME="clickstream-push-bq"
REGION="us-central1"
IMAGE_URI="${REGION}-docker.pkg.dev/${PROJECT_ID}/cloud-run-source-repo/${SERVICE_NAME}"
ENV_VARS="PROJECT_ID=${PROJECT_ID},BIGQUERY_DATASET=dsl_project,BIGQUERY_TABLE=web_visits"

gcloud run deploy "${SERVICE_NAME}" \
  --image "${IMAGE_URI}" \
  --region "${REGION}" \
  --no-allow-unauthenticated \
  --set-env-vars="${ENV_VARS}"


In [None]:
# Step 8: Create a Service Account and Grant Permissions
# We'll create a dedicated service account for our Pub/Sub subscription to use when invoking the Cloud Run service.

# Create a service account for the Pub/Sub subscription.
# `create` fails when the account already exists, so look it up first;
# this keeps the cell safe to re-run.
INVOKER_SA="pubsub-invoker-sa@${PROJECT_ID}.iam.gserviceaccount.com"
gcloud iam service-accounts describe "${INVOKER_SA}" >/dev/null 2>&1 ||
gcloud iam service-accounts create pubsub-invoker-sa \
  --display-name="Pub/Sub to Cloud Run Invoker"

# Allow the new service account to invoke the Cloud Run service
gcloud run services add-iam-policy-binding clickstream-push-bq \
  --member="serviceAccount:${INVOKER_SA}" \
  --role="roles/run.invoker" \
  --region=us-central1

# Grant the Cloud Run service's identity the permission to write to BigQuery.
# This uses the Compute Engine default service account, whose address is
# built from the project number.
PROJECT_NUMBER="$(gcloud projects describe "${PROJECT_ID}" --format='value(projectNumber)')"
gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
  --member="serviceAccount:${PROJECT_NUMBER}-compute@developer.gserviceaccount.com" \
  --role="roles/bigquery.dataEditor"


In [None]:
# Step 9: Create the Pub/Sub Push Subscription
# Finally, create the push subscription to connect the Pub/Sub topic to our Cloud Run service.

# Get the Cloud Run service URL
SERVICE_URL="$(gcloud run services describe clickstream-push-bq --platform managed --region us-central1 --format 'value(status.url)')"

# Create the push subscription, but only once the URL is known: with an
# empty SERVICE_URL the command would run with no push endpoint at all.
# NOTE: assumes the dsl-project-clickstream topic already exists; it is not
# created in the cells shown here.
if [ -z "${SERVICE_URL}" ]; then
  echo "Could not resolve the Cloud Run URL for clickstream-push-bq; deploy the service (Step 7) first." >&2
else
  gcloud pubsub subscriptions create dsl-clickstream-push-sub \
    --topic dsl-project-clickstream \
    --push-endpoint="${SERVICE_URL}" \
    --push-auth-service-account="pubsub-invoker-sa@${PROJECT_ID}.iam.gserviceaccount.com"
fi


In [None]:
# Step 10: Test the Pipeline
# Publish a sample message. The test query is updated to UNNEST the events array to verify the nested data.

# Publish a sample JSONL message to the topic.
# The payload is one visit with two page_view events and one add_item_to_cart event.
gcloud pubsub topics publish dsl-project-clickstream --message \
  '{"session_id": "SID-2689", "user_id": "UID-9529", "device_type": "desktop", "geolocation": "44.199851,-171.907106", "user_agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1", "events": [{"event": {"event_type": "page_view", "timestamp": "2025-06-01T09:00:00", "details": {"page_url": "https://example.com/home", "referrer_url": null}}}, {"event": {"event_type": "page_view", "timestamp": "2025-06-01T09:01:00", "details": {"page_url": "https://example.com/products", "referrer_url": "https://example.com/home"}}}, {"event": {"event_type": "add_item_to_cart", "timestamp": "2025-06-01T09:02:00", "details": {"product_id": "SFT-004", "product_name": "Project Manager Plus", "category": "software", "price": 299.99, "quantity": 1}}}]}'

# Wait a few moments for processing, then query BigQuery to see the results.
# UNNEST(events) yields one output row per event of the session.
sleep 10
bq query --use_legacy_sql=false \
"SELECT
  session_id,
  visit_start_time,
  e.event_type,
  e.add_cart.product_name,
  e.page_view.page_url
FROM
  \`$PROJECT_ID.dsl_project.web_visits\`,
  UNNEST(events) AS e
WHERE
  session_id = 'SID-2689'"
