## Read raw Azure Storage logs from Event Hubs and persist them to a Delta table

In [0]:
# Notebook parameters (text widgets) and their default values: the target
# catalog, schema and table, plus the streaming checkpoint location.
_WIDGET_DEFAULTS = {
    'catalog': "slog",
    'schema': "default",
    'table': "azure_raw_storage_log",
    'checkpoint': "abfss://slog@stsezsandbox07.dfs.core.windows.net/checkpoints/azure_raw_storage_log",
}

# Dicts preserve insertion order, so widgets are created in the order listed.
for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
    dbutils.widgets.text(_widget_name, _widget_default)

In [0]:
# Three-part target table name (catalog.schema.table) assembled from the
# widget values.
full_name_space = ".".join(
    dbutils.widgets.get(part) for part in ("catalog", "schema", "table")
)

# Checkpoint location for the streaming write, taken from its widget.
checkpoint = dbutils.widgets.get("checkpoint")

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType, MapType, ArrayType
from pyspark.sql.functions import col, from_json, unbase64, explode, variant_get

# Schema of the JSON payload carrying the storage logs received through
# Event Hubs. The payload is one object with a "records" array; the nested
# struct types are built bottom-up so each level can be read on its own.
# Every field is declared nullable.

# Element of identity.authorization[].principals
_principal_schema = StructType([
    StructField("id", StringType(), True),
    StructField("type", StringType(), True),
])

# Element of identity.authorization
_authorization_schema = StructType([
    StructField("action", StringType(), True),
    StructField("roleAssignmentId", StringType(), True),
    StructField("roleDefinitionId", StringType(), True),
    StructField("principals", ArrayType(_principal_schema), True),
    StructField("denyAssignmentId", StringType(), True),
    StructField("type", StringType(), True),
    StructField("result", StringType(), True),
    StructField("reason", StringType(), True),
])

# identity.requester
_requester_schema = StructType([
    StructField("objectId", StringType(), True),
    StructField("tenantId", StringType(), True),
])

# identity
_identity_schema = StructType([
    StructField("type", StringType(), True),
    StructField("tokenHash", StringType(), True),
    StructField("authorization", ArrayType(_authorization_schema), True),
    StructField("requester", _requester_schema, True),
])

# One log entry, i.e. one element of the top-level "records" array
_record_schema = StructType([
    StructField("time", StringType(), True),
    StructField("resourceId", StringType(), True),
    StructField("category", StringType(), True),
    StructField("operationName", StringType(), True),
    StructField("operationVersion", StringType(), True),
    StructField("schemaVersion", StringType(), True),
    StructField("statusCode", LongType(), True),
    StructField("statusText", StringType(), True),
    StructField("durationMs", LongType(), True),
    StructField("callerIpAddress", StringType(), True),
    StructField("correlationId", StringType(), True),
    StructField("identity", _identity_schema, True),
    StructField("location", StringType(), True),
    StructField("properties", MapType(StringType(), StringType()), True),
    StructField("uri", StringType(), True),
    StructField("protocol", StringType(), True),
    StructField("resourceType", StringType(), True),
])

# Full message envelope passed to from_json()
eventhub_logs_schema = StructType([
    StructField("records", ArrayType(_record_schema), True),
])


In [0]:
# Connection settings. Sensitive values come from the "slog-scope" Databricks
# secret scope instead of being hard-coded in the notebook.
# NOTE(review): EH_NAMESPACE is used verbatim as the bootstrap host, so the
# secret presumably holds the fully qualified namespace host name - confirm.
EH_NAMESPACE = dbutils.secrets.get("slog-scope", "eh-namespace")
EH_NAME = "slog"
CONN_STRING = dbutils.secrets.get("slog-scope", "eh-slog")

# Kafka endpoint address: the namespace host on port 9093.
_bootstrap_servers = f"{EH_NAMESPACE}:9093"

# SASL PLAIN login: the user name is the literal "$ConnectionString" and the
# password is the connection string read from the secret scope.
_jaas_config = f'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{CONN_STRING}";'

# Options for the Spark Kafka source.
kafka_options = {
    "kafka.bootstrap.servers": _bootstrap_servers,
    "subscribe": EH_NAME,  # the topic subscribed to is EH_NAME
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.jaas.config": _jaas_config,
    "startingOffsets": "earliest",
    "failOnDataLoss": "false",
}

# Open a streaming read from Event Hubs through the Kafka source.
df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_options)
    .load()
)

# The message body arrives in the binary "value" column: cast it to a string,
# parse it as JSON with the storage-log schema and keep only the "records"
# array.
_payload_json = col("value").cast("string")
parsed_df = (
    df.select(from_json(_payload_json, eventhub_logs_schema).alias("data"))
    .select("data.records")
)

# Produce one row per element of the records array, then promote the fields
# of each record to top-level columns.
_record_rows = parsed_df.select(explode(col("records")).alias("record"))
exploded_df = _record_rows.select("record.*")

# Describe the sink: append rows in Delta format, tracking progress at the
# configured checkpoint location. The availableNow trigger is used here, so
# the query processes the data that is currently available and then stops.
_delta_writer = (
    exploded_df.writeStream
    .format("delta")
    .outputMode("append")  # "update" would be the choice for aggregation queries
    .option("checkpointLocation", checkpoint)
    .trigger(availableNow=True)
)

# toTable() starts the streaming query and returns its handle.
query = _delta_writer.toTable(full_name_space)

# Block this cell until the query has terminated.
query.awaitTermination()