In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import col
from time import time
import sys

In [0]:
# Make the shared helper modules importable from this notebook.
# NOTE(review): this is a user-specific workspace path; consider moving it to
# shared configuration so the notebook does not depend on one user's folder.
utils_path = "/Workspace/Users/faraaz_khan@epam.com/PEI/utils"
# Guard the append so re-running the cell does not add duplicate entries.
if utils_path not in sys.path:
    sys.path.append(utils_path)

In [0]:
from common_util import normalize_columns

In [0]:
# Wall-clock timestamp (seconds since the epoch) taken before the ingestion
# work starts; used at the end of the notebook to report the elapsed run time.
start_time = time()

In [0]:
# Directory that the streaming reader loads the order files from.
source_path = "/Volumes/fz_catalog/landing/orders/"
# Checkpoint directory passed to the streaming writer for the bronze orders table.
checkpoint_path = "/Volumes/fz_catalog/landing/_checkpoints/orders_bronze/"

In [0]:

# Explicit schema for the incoming order files. Field names keep the source's
# original spacing and casing, every field is declared nullable, and both date
# columns are read as plain strings.
orders_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("Row ID", IntegerType()),
            ("Order ID", StringType()),
            ("Order Date", StringType()),
            ("Ship Date", StringType()),
            ("Ship Mode", StringType()),
            ("Customer ID", StringType()),
            ("Product ID", StringType()),
            ("Quantity", IntegerType()),
            ("Price", DoubleType()),
            ("Discount", DoubleType()),
            ("Profit", DoubleType()),
        )
    ]
)


# Reader options: parse the files as JSON and allow a single JSON record to
# span multiple lines.
autoloader_options = {
    "cloudFiles.format": "json",
    "multiLine": "true",
}

# Streaming read of the landing directory through the "cloudFiles" source,
# using the explicit schema instead of schema inference.
df = (
    spark.readStream.format("cloudFiles")
    .options(**autoloader_options)
    .schema(orders_schema)
    .load(source_path)
)


# Pass the raw stream through the shared helper from common_util
# (presumably standardises the column names - confirm against the helper).
df_renamed = normalize_columns(df)

# Audit columns: the name of the source file each row was read from and that
# file's last-modification time, both taken from the `_metadata` column.
# NOTE(review): `_metadata` is resolved on the output of normalize_columns;
# confirm the helper does not drop access to it.
df_with_audit = (
    df_renamed
    .withColumn("input_file_name", col("_metadata.file_name"))
    .withColumn("file_modification_time", col("_metadata.file_modification_time"))
)



# Append the stream to the bronze Delta table. `availableNow=True` processes
# everything currently available and then stops the query.
orders_query = (
    df_with_audit.writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .toTable("fz_catalog.bronze.orders")
)

# toTable() starts the query in the background and returns immediately.
# Block here until it finishes so that the following cells measure the real
# run time and any streaming failure is raised in this cell instead of being
# silently lost.
orders_query.awaitTermination()

In [0]:
# Wall-clock timestamp taken once the preceding cells have returned.
# NOTE(review): this only reflects the ingestion duration if the streaming
# write cell blocks until the query has finished.
end_time = time()

In [0]:
# End the notebook run, passing the elapsed time (end_time - start_time, in
# seconds) as the exit value.
dbutils.notebook.exit(f"Run Time: {end_time - start_time} seconds")