In [0]:
"""
Load Customer Data into Bronze Layer in Databricks

1. Create a new notebook named `Bronze_layer_Customer_load`.
2. Load the customer CSV file from the specified path into a DataFrame.
3. Add an `ingestion_timestamp` column to the DataFrame.
4. Save the DataFrame as a Delta table named `bronze_customer` in the `globalretail_bronze` database.
5. Move the processed CSV file to an archive folder.
"""

In [0]:
"""
Reads a CSV file from the specified DBFS path into a Spark DataFrame.

Parameters:
- filePath: str
    The path to the CSV file in Databricks File System (DBFS).
- header: bool, default=True
    Indicates whether the first line of the file contains column names.
- inferSchema: bool, default=True
    If True, automatically infers the data types of each column.

Returns:
- df: pyspark.sql.DataFrame
    The resulting Spark DataFrame containing the CSV data.
"""

# Location of the raw customer extract in DBFS; the archive step later moves this same path.
filePath = "dbfs:/FileStore/GlobalRetail/bronze_layer/customer_data/customer.csv"

# Configure the reader once: the first row supplies column names and Spark
# infers each column's type from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(filePath)

# Print a text preview of the loaded rows.
df.show()

In [0]:
"""
Adds an 'ingestion_timestamp' column with the current timestamp to the DataFrame 'df'
and displays the resulting DataFrame.

Note: df.withColumn returns a new DataFrame; the original 'df' remains unchanged.
"""
from pyspark.sql.functions import current_timestamp

# Build the column expression first, then attach it. withColumn returns a new
# DataFrame, so 'df' itself is left untouched.
ingestion_ts = current_timestamp()
df_new = df.withColumn("ingestion_timestamp", ingestion_ts)

# Render the enriched DataFrame with the notebook's display helper.
display(df_new)

In [0]:
"""
Appends the DataFrame 'df_new' to the 'bronze_customer' Delta table in the 'globalretail_bronze' schema.
Uses Delta format for ACID transactions, scalable metadata handling, and unified streaming/batch data processing.
"""
# Switch the session to the bronze schema so the unqualified table name below resolves there.
spark.sql("use globalretail_bronze")

# Configure a Delta writer in append mode (existing rows are kept), then
# persist the batch to the 'bronze_customer' table.
delta_writer = df_new.write.format("delta").mode("append")
delta_writer.saveAsTable("bronze_customer")

In [0]:
# Sanity check: query up to 100 rows from the table that was just written and
# print them. NOTE(review): show() prints only its default number of rows
# (20 per the PySpark docs), so fewer than 100 rows are displayed.
bronze_preview = spark.sql("select * from bronze_customer limit 100")
bronze_preview.show()

In [0]:
"""
Moves the customer data file to an archive folder in DBFS with a unique timestamped name.

Archiving the ingested file ensures data lineage, auditability, and recovery by preserving the original raw data.
This prevents accidental reprocessing, supports compliance, and enables troubleshooting or re-ingestion if needed.

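Note: no exception handling is performed here; if the move fails (for example, because the source file does not exist), the error raised by dbutils.fs.mv propagates and the cell fails.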
Prints the archive file path for logging.
"""
import datetime

# Destination folder for files that have already been ingested.
archive_folder = "dbfs:/FileStore/GlobalRetail/bronze_layer/customer_data/archive/"

# Timestamp suffix: year, month, day, hour, minute, second.
# The seconds directive must be upper-case '%S'. Lower-case '%s' is not a
# documented Python strftime code; its output depends on the platform's C
# library (epoch seconds on glibc), so it does not give the intended
# two-digit seconds field.
archive_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
archive_filepath = archive_folder + '_' + archive_timestamp

# Move (not copy) the ingested file out of the landing path.
dbutils.fs.mv(filePath, archive_filepath)

# Emit the destination path so the run output records where the file went.
print(archive_filepath)