In [0]:
from pyspark.sql.functions import *
import re

In [0]:
# Every path below lives under the same Unity Catalog schema root.
VOLUME_ROOT = "/Volumes/workspace/default"

# Folder the readers in this notebook load the source CSV files from.
input_path = f"{VOLUME_ROOT}/data_stage/"

# NOTE(review): the streaming cell passes literal paths for its checkpoint and
# schema location instead of the two variables below, and the volume-creation
# cell creates `checkpoints` and `schema_tracking` but no `schema` volume.
# Confirm which locations are intended before wiring these in.
checkpoint = f"{VOLUME_ROOT}/checkpoints/bronze/data_prod_cp"
schema_location = f"{VOLUME_ROOT}/schema/bronze/data_prod_schema"

In [0]:
%sql
-- Volume used for streaming checkpoint storage.
CREATE VOLUME IF NOT EXISTS workspace.default.checkpoints
  COMMENT 'Checkpoint storage';

-- Volume used for Auto Loader schema tracking.
CREATE VOLUME IF NOT EXISTS workspace.default.schema_tracking
  COMMENT 'Schema tracking for Auto Loader';

-- Bronze Delta table, declared here without any column definitions.
CREATE TABLE IF NOT EXISTS workspace.default.bronze_pmx_prod
  USING DELTA;

In [0]:
%sql
-- List volumes. No catalog/schema is named, so the result depends on the
-- session's current catalog and schema.
SHOW VOLUMES

In [0]:
%sql
-- Inspect the bronze table's metadata. The name is fully qualified so it
-- always resolves to the table created as workspace.default.bronze_pmx_prod,
-- regardless of the session's current catalog.
DESCRIBE EXTENDED workspace.default.bronze_pmx_prod


In [0]:
# Auto Loader ("cloudFiles") settings for the incremental CSV read.
# NOTE(review): the schema location is a literal volume root rather than the
# `schema_location` variable from the config cell; confirm which is intended.
autoloader_options = {
    "cloudFiles.format": "csv",
    "cloudFiles.inferColumnTypes": "true",
    "cloudFiles.schemaLocation": "/Volumes/workspace/default/schema_tracking",
    "header": "true",
    "cloudFiles.schemaEvolutionMode": "addNewColumns",
}

# Streaming DataFrame over the files found under `input_path`.
df_raw_stream = (
    spark.readStream
    .format("cloudFiles")
    .options(**autoloader_options)
    .load(input_path)
)

# Batch read of the same folder, used only to obtain the CSV header names;
# the resulting DataFrame itself is discarded.
header_reader = spark.read.format("csv").option("header", "true")
sample_cols = header_reader.load(input_path).columns


def _strip_parenthetical(name):
    """Remove the "(...)" part of a header (and whitespace before it), then trim."""
    return re.sub(r"\s*\(.*\)", "", name).strip()


# Keep only the columns listed in `sample_cols`, each aliased to its cleaned name.
df_rename_col = df_raw_stream.select(
    *(col(name).alias(_strip_parenthetical(name)) for name in sample_cols)
)

# FECHA is parsed with the "yyyy/MM" pattern into a date column.
fecha_as_date = to_date(col("FECHA"), "yyyy/MM")

# Order matters: FECHA is replaced first, so `year` and `month` are derived
# from the parsed date rather than from the raw value.
df_transformed = (
    df_rename_col
    .withColumn("FECHA", fecha_as_date)
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("year", year("FECHA"))
    .withColumn("month", month("FECHA"))
)

# Append the transformed stream to the bronze Delta table and keep the
# resulting StreamingQuery handle in `query`.
#
# NOTE(review): the checkpoint path is intentionally left as the literal the
# notebook already uses (not `checkpoint` from the config cell). Stream
# progress is tracked in the checkpoint, so pointing at a different folder
# would presumably re-ingest files that were already loaded; confirm before
# changing it.
query = (
    df_transformed.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/Volumes/workspace/default/checkpoints")
    .option("mergeSchema", "true")
    # availableNow processes all data available at start and then stops;
    # it replaces the deprecated once=True trigger.
    .trigger(availableNow=True)
    # toTable is the documented DataStreamWriter method that starts the query
    # against a table. The name is fully qualified to match the CREATE TABLE
    # statement for workspace.default.bronze_pmx_prod.
    .toTable("workspace.default.bronze_pmx_prod")
)