In [0]:
"""
Load Transaction Data into Bronze Layer in Databricks

1. Load the transaction Parquet file from the specified DBFS path into a DataFrame.
2. Convert the `transaction_date` column to a timestamp.
3. Add an `ingestion_timestamp` column to the DataFrame.
4. Append the DataFrame to the Delta table named `bronze_transactions` in the `globalretail_bronze` database.
5. Move the processed Parquet file to an archive folder with a unique timestamped name.

This process ensures that the transaction data is loaded efficiently and incrementally into the bronze layer, with data lineage and auditability preserved.
"""

In [0]:
"""
Reads a Parquet file from the specified DBFS path into a Spark DataFrame.

Parameters:
- filePath: str
    The path to the Parquet file in Databricks File System (DBFS).

Returns:
- df: pyspark.sql.DataFrame
    The resulting Spark DataFrame containing the Parquet data.
"""

# DBFS location of the raw transaction Parquet file; the archive cell further down
# moves the file found at this same path.
filePath = "dbfs:/FileStore/GlobalRetail/bronze_layer/transaction/transaction_snappy.parquet"

# Load the file through the generic reader with the format named explicitly.
df = spark.read.format("parquet").load(filePath)

# Render the loaded DataFrame in the notebook output.
display(df)

In [0]:
"""
Converts the 'transaction_date' column in the DataFrame 'df' to a timestamp type using to_timestamp,
and displays the resulting DataFrame.

- 'col': Refers to the column in the DataFrame to be transformed.
- 'to_timestamp': Converts a string or numeric column to timestamp type.

Returns:
- new_df: DataFrame with 'transaction_date' as a timestamp column.
"""

from pyspark.sql.functions import to_timestamp, col
# Build the converted column expression first, then put it back under the same
# column name so 'transaction_date' is replaced rather than duplicated.
transaction_date_ts = to_timestamp(col("transaction_date"))
new_df = df.withColumn("transaction_date", transaction_date_ts)

# Print the schema so the resulting column type can be checked, then render the data.
new_df.printSchema()
display(new_df)

In [0]:
"""
Adds an 'ingestion_timestamp' column with the current timestamp to the DataFrame 'new_df'
and displays the resulting DataFrame.

Note: new_df.withColumn returns a new DataFrame; the original 'new_df' remains unchanged.
"""
from pyspark.sql.functions import current_timestamp
# Column expression for the current timestamp, attached as 'ingestion_timestamp'.
# withColumn returns a new DataFrame, so 'new_df' itself is left as it was.
ingested_at = current_timestamp()
final_df = new_df.withColumn("ingestion_timestamp", ingested_at)

# Render the DataFrame that will be written to the bronze table.
display(final_df)

In [0]:
"""
Appends the DataFrame 'final_df' to the 'bronze_transactions' Delta table in the 'globalretail_bronze' database.
Uses Delta format for ACID transactions, scalable metadata handling, and unified streaming/batch data processing.
"""
# Make 'globalretail_bronze' the current database so the unqualified table name
# used below resolves inside it.
spark.sql("use globalretail_bronze")

# Configure a Delta writer in append mode, then write final_df to the table.
delta_appender = final_df.write.format("delta").mode("append")
delta_appender.saveAsTable("bronze_transactions")

In [0]:
"""
Queries the 'bronze_transactions' Delta table and returns the first 100 rows.
"""

# Render every row of the query result (at most 100 because of the LIMIT clause).
# DataFrame.show() with no arguments prints only the first 20 rows and truncates
# long values, so the 100-row limit was never actually visible; display() is used
# instead, matching how the other cells in this notebook present DataFrames.
display(spark.sql("select * from bronze_transactions limit 100"))

In [0]:
"""
Moves the transaction Parquet file to an archive folder in DBFS with a unique timestamped name.

Archiving the ingested file ensures data lineage, auditability, and recovery by preserving the original raw data.
This prevents accidental reprocessing, supports compliance, and enables troubleshooting or re-ingestion if needed.

Prints the archive file path for logging.
"""
import datetime
archive_folder = "dbfs:/FileStore/GlobalRetail/bronze_layer/transaction/archive/"

# Timestamp suffix in YYYYMMDDHHMMSS form, taken from datetime.datetime.now().
# "%S" is the standard two-digit seconds directive; lowercase "%s" is not a
# documented Python format code and its output depends on the platform C library.
archive_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

# NOTE(review): the archived object is named "_<timestamp>" - the original file
# name and the ".parquet" extension are not kept. Confirm this naming is intended.
archive_filepath = archive_folder + "_" + archive_timestamp

# Move the ingested file to the archive path, then print the destination for logging.
dbutils.fs.mv(filePath, archive_filepath)
print(archive_filepath)