In [0]:
from pyspark.sql.functions import *
import re

In [0]:
# Storage paths for the landing-to-bronze load.
# Landing directory that the streaming read loads files from.
input_path = "/Volumes/workspace/default/data_stage/"
# Directory intended for Auto Loader schema tracking of the bronze table.
schema_location = "/Volumes/workspace/default/schema/bronze/data_prod_schema"
# Directory intended for the bronze stream's checkpoint.
checkpoint = "/Volumes/workspace/default/checkpoints/bronze/data_prod_cp"

#### Landing to Bronze

In [0]:
# Read the landing files as a stream with Auto Loader (the "cloudFiles" source).
df_raw_stream = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("header", "true")
    # Infer real column types; without this, CSV columns are inferred as strings.
    .option("cloudFiles.inferColumnTypes", "true")
    # Track the inferred schema under the dedicated path from the config cell
    # instead of a separate hard-coded directory.
    # NOTE(review): a stream that already ran against
    # "/Volumes/workspace/default/schema_tracking" will re-infer its schema
    # here unless that directory's contents are moved over first.
    .option("cloudFiles.schemaLocation", schema_location)
    # New source columns are added to the tracked schema; the stream stops when
    # one is detected and picks it up on the next run.
    .option("cloudFiles.schemaEvolutionMode", "addNewColumns")
    .load(input_path)
)


# Normalize column names
def normalize_cols(df_raw_stream, cols=None):
    """Select `cols` from the frame and rename them to tidy identifiers.

    For every name, a parenthesised part (together with the whitespace in
    front of it) is removed, surrounding whitespace is stripped, and each
    remaining run of whitespace becomes a single underscore, e.g.
    "Unit Price (USD)" -> "Unit_Price".

    Args:
        df_raw_stream: DataFrame whose columns are selected and renamed.
        cols: Names of the columns to keep, in output order. Defaults to
            every column of `df_raw_stream`.

    Returns:
        A new DataFrame holding only `cols`, under their normalized names.

    Note:
        Distinct raw names can collapse to the same normalized name
        (e.g. "Price (USD)" and "Price (EUR)"); this is not checked here.
    """
    if cols is None:
        cols = df_raw_stream.columns

    def _clean(name):
        # The parenthesis pattern is greedy: it removes everything from the
        # first "(" to the last ")" in the name.
        without_parens = re.sub(r"\s*\(.*\)", "", name).strip()
        return re.sub(r"\s+", "_", without_parens)

    # Iterate the `cols` argument (not a notebook-level global) and resolve each
    # column against the frame itself.
    return df_raw_stream.select(
        [df_raw_stream[name].alias(_clean(name)) for name in cols]
    )

# Stamp every row with an ingestion timestamp
def add_ingestion_date(df_rename_col):
    """Return the frame with an `ingestion_date` column set to `current_timestamp()`."""
    ingestion_ts = current_timestamp()
    stamped = df_rename_col.withColumn("ingestion_date", ingestion_ts)
    return stamped


# Build the bronze frame: take the raw header names, tidy them,
# then add the ingestion timestamp column.
raw_cols = df_raw_stream.columns
df_rename = normalize_cols(df_raw_stream, raw_cols)
df_transformed = add_ingestion_date(df_rename)

# Append the transformed stream to the bronze Delta table.
query = (
    df_transformed.writeStream
    .format("delta")
    # Use the dedicated checkpoint path from the config cell rather than the
    # shared parent "checkpoints" directory.
    # NOTE(review): a stream that already ran with
    # "/Volumes/workspace/default/checkpoints" starts from scratch on a new
    # checkpoint and would re-ingest old files; move the existing checkpoint
    # contents here first if that applies.
    .option("checkpointLocation", checkpoint)
    .option("mergeSchema", "true")  # let new source columns be added to the table
    .outputMode("append")
    # Process everything currently available, then stop. `availableNow`
    # replaces the deprecated `once=True` trigger.
    .trigger(availableNow=True)
    # `toTable` starts the query and returns the StreamingQuery handle.
    .toTable("pmx_etl.bronze_pmx_prod")
)