In [0]:
from pyspark.sql.functions import *
import re

In [0]:
# Routes
# Landing directory that the Auto Loader stream loads from.
input_path = "/Volumes/workspace/default/data_stage/"
# Dedicated checkpoint directory inside the `checkpoints` volume.
# NOTE(review): not referenced by the cells visible in this notebook — confirm where it is used.
checkpoint = "/Volumes/workspace/default/checkpoints/bronze/data_prod_cp"
# Schema-tracking directory. It is rooted in the `schema_tracking` volume because that is
# the volume created by the CREATE VOLUME statements in this notebook; the previous path
# pointed at a volume named `schema`, which is not created here.
schema_location = "/Volumes/workspace/default/schema_tracking/bronze/data_prod_schema"

In [0]:
%sql
-- Volumes: one for landed files, one for stream checkpoints,
-- and one for Auto Loader schema tracking.
CREATE VOLUME IF NOT EXISTS workspace.default.data_stage
  COMMENT 'landing_zone';

CREATE VOLUME IF NOT EXISTS workspace.default.checkpoints
  COMMENT 'Checkpoint storage';

CREATE VOLUME IF NOT EXISTS workspace.default.schema_tracking
  COMMENT 'Schema tracking for Auto Loader';

-- Bronze Delta table. No columns are declared in this statement.
CREATE TABLE IF NOT EXISTS workspace.default.bronze_pmx_prod
  USING DELTA;

#### Landing to Bronze

In [0]:
# Landing -> Bronze: read the landed CSV files as a cloudFiles (Auto Loader) stream,
# normalize the column names, add date/ingestion columns and append to the bronze table.


def _clean_column_name(raw_name):
    """Return a normalized version of a raw column name.

    The text from the first "(" to the last ")" on the line (together with any
    whitespace directly before it) is removed, the result is stripped, and every
    remaining run of whitespace is replaced by a single underscore.
    """
    without_parens = re.sub(r"\s*\(.*\)", "", raw_name).strip()
    return re.sub(r"\s+", "_", without_parens)


# NOTE(review): the schema and checkpoint locations below are the volume roots, not the
# `schema_location` / `checkpoint` values defined in the config cell. They are kept
# unchanged here because moving them would discard the stream's recorded progress;
# confirm which locations are intended.
df_raw_stream = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.inferColumnTypes", "true")  # infer column types
    .option("cloudFiles.schemaLocation", "/Volumes/workspace/default/schema_tracking")
    .option("header", "true")
    .option("cloudFiles.schemaEvolutionMode", "addNewColumns")  # add new columns to the schema
    .load(input_path)
)

raw_cols = df_raw_stream.columns  # column names as read from the landing files

# Rename every column using the normalization helper above.
df_rename_col = df_raw_stream.select(
    [col(c).alias(_clean_column_name(c)) for c in raw_cols]
)

# Parse FECHA with the "yyyy/MM" pattern, stamp the ingestion time and derive
# year/month from the parsed date.
df_transformed = (
    df_rename_col
    .withColumn("FECHA", to_date(col("FECHA"), "yyyy/MM"))
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("year", year(col("FECHA")))
    .withColumn("month", month(col("FECHA")))
)

# Append to the bronze Delta table.
# - availableNow=True processes the data that is currently available and then stops;
#   it replaces the deprecated once=True trigger.
# - toTable is the documented PySpark DataStreamWriter method for starting a stream
#   that writes to a table.
# NOTE(review): the table name relies on the session's current catalog, whereas the
# setup cell creates workspace.default.bronze_pmx_prod — confirm they are the same table.
query = (
    df_transformed.writeStream
    .format("delta")
    .option("checkpointLocation", "/Volumes/workspace/default/checkpoints")
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
    .outputMode("append")
    .toTable("default.bronze_pmx_prod")
)

# Block until the run has finished so that later cells read a fully written table
# and any failure of the stream is raised in this cell.
query.awaitTermination()

#### Bronze to Silver

In [0]:
# Open the bronze table as a streaming source and render it in the notebook.
bronze_stream_reader = spark.readStream
df_bronze = bronze_stream_reader.table("default.bronze_pmx_prod")

display(df_bronze)