# Demo job

This demo notebook is executed using Databricks Workflows as defined in resources/watchtower.demo.job.yml.

In [2]:
import logging
import json
from pyspark.sql import SparkSession

In [None]:
# Set up structured logging

class JSONFormatter(logging.Formatter):
    """Structured JSON formatter for logging.

    Every record is rendered as a single JSON object with the keys
    ``timestamp``, ``level``, ``message``, ``logger`` and ``line``. When the
    record carries exception or stack information, the keys ``exception``
    and ``stack`` are added so tracebacks are not silently dropped.
    """

    def format(self, record: logging.LogRecord) -> str:
        """Format a log record as a JSON string.

        Args:
            record: The log record to serialize.

        Returns:
            The JSON document describing the record.
        """
        log_record = {
            'timestamp': self.formatTime(record, self.datefmt),
            'level': record.levelname,
            'message': record.getMessage(),
            'logger': record.name,
            'line': record.lineno,
        }

        # Calls such as logger.error(..., exc_info=True) attach the active
        # exception to the record. Without this branch the traceback would
        # never reach the output. The first tuple element is None when
        # exc_info was requested while no exception was being handled.
        exc_info = record.exc_info
        if exc_info and exc_info[0] is not None:
            log_record['exception'] = self.formatException(exc_info)

        # Populated when the caller passes stack_info=True.
        if record.stack_info:
            log_record['stack'] = self.formatStack(record.stack_info)

        return json.dumps(log_record)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create a stream (console) handler
# and set the formatter for the handler
handler = logging.StreamHandler()
handler.set_name("json-console")
formatter = JSONFormatter(datefmt='%Y-%m-%dT%H:%M:%S') # ISO-8601 format
handler.setFormatter(formatter)

# logging.getLogger() returns the same logger object every time this cell
# runs, so re-running the cell would otherwise stack up handlers and emit
# every message multiple times. Remove the handler left by a previous run
# (matched by name) before attaching the fresh one.
for stale_handler in [h for h in logger.handlers if h.get_name() == handler.get_name()]:
    logger.removeHandler(stale_handler)
logger.addHandler(handler)

In [None]:
# Prefer the logger object over print() for emitting messages.
logger.debug("This is a debug message")
logger.info("This is an info message")
logger.warning("This is a warning message")

# Log a handled exception; logger.exception() logs at ERROR level and
# attaches the active exception information to the record.
try:
    raise RuntimeError("This is a runtime error")
except RuntimeError:
    logger.exception("This is an error message")


In [0]:
# Deliberately request a small amount of executor memory
# so that the job demonstrates spill to disk.
session_builder = SparkSession.builder.config("spark.executor.memory", "512m")
spark = session_builder.getOrCreate()

# FIXME: It is a best practice to remove .show() and display()
#        from code before it is deployed to production.
preview_df = spark.range(10)
preview_df.show()

In [None]:
# FIXME: we should not log PII or sensitive data.
# NOTE(review): the hard-coded email and phone number below look like
# deliberate placeholders that demonstrate the anti-pattern — confirm
# before removing or masking them.
logger.info("user email: john.doe@example.com")
logger.info("phone: 5015551234")

In [None]:
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()

# Walk from the dbutils entry point to the notebook context,
# then read the notebook path from that context.
notebook_context = w.dbutils.notebook.entry_point.getDbutils().notebook().getContext()
notebook_path = notebook_context.notebookPath().get()

logger.info(f"Notebook path: {notebook_path}")

In [None]:
from pyspark.sql.streaming import StreamingQueryListener
from pyspark.sql.streaming.listener import QueryStartedEvent, QueryProgressEvent, QueryTerminatedEvent


class SparkStreamingLogger(StreamingQueryListener):
    """Streaming query listener that reports query events through ``logger``."""

    def onQueryStarted(self, event: QueryStartedEvent):
        """Log the name and id of the query that started."""
        logger.info(f"Query started: {event.name} ({event.id})")

    def onQueryProgress(self, event: QueryProgressEvent):
        """Log the JSON form of the reported progress."""
        logger.info(f"Query progress: {event.progress.json}")

    def onQueryTerminated(self, event: QueryTerminatedEvent):
        """Log the termination at ERROR level if an exception is attached, else INFO."""
        if not event.exception:
            logger.info(f"Query terminated: {event.id}")
            return

        logger.error(f"Query terminated: {event.id} ({event.exception})")

In [None]:
# Attach the logging listener to the session's streaming queries.
streaming_logger = SparkStreamingLogger()
spark.streams.addListener(streaming_logger)