In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType
from pyspark.sql.functions import current_timestamp, lit, col
import re

# CSV file that the bronze ingestion reads.
SOURCE_FILE_PATH = '/Volumes/workspace/retail/sales/online_retail_II.csv'
# Table that main() writes the ingested rows to.
TARGET_TABLE = "workspace.retail.bronze_sales"

def normalize_column_name(name):
    """Convert a column header to lower snake_case.

    Whitespace and the characters ``,;{}()=`` are removed from both ends of
    the name, and every interior run of them becomes a single underscore
    (Delta/Parquet reject these characters in column names). An underscore
    is then inserted at each lower-case-or-digit to upper-case boundary,
    repeated underscores are collapsed, and the result is lower-cased.

    Args:
        name: Raw column name, e.g. ``"Customer ID"`` or ``"StockCode"``.

    Returns:
        The normalized name, e.g. ``"customer_id"`` or ``"stock_code"``.
    """
    separators = r'[\s,;{}()=]+'
    # Trim first so padding such as " Price " or a closing ")" does not turn
    # into a stray leading/trailing underscore.
    name = re.sub(r'\A' + separators + r'|' + separators + r'\Z', '', name)
    name = re.sub(separators, '_', name)
    # Split camelCase words: "StockCode" -> "Stock_Code".
    name = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', name)
    name = re.sub(r'_+', '_', name)
    return name.lower()


def transform(file_path, schema, *, env="dev"):
    """Read a headered CSV file into a DataFrame with audit columns added.

    The file is parsed with the supplied schema, three columns are appended
    (``ingestion_time``, ``file_path`` and ``env``), and every column name is
    passed through ``normalize_column_name``.

    Relies on a ``spark`` session being available as a global; it is not
    defined in this file.

    Args:
        file_path: Location of the CSV file to load.
        schema: ``StructType`` applied to the file instead of inferring one.
        env: Value stored in the ``env`` column. Defaults to ``"dev"``.

    Returns:
        DataFrame holding the source columns plus the three added columns,
        all renamed by ``normalize_column_name``.
    """
    raw_df = (
        spark.read
        .format("csv")
        .option("header", "true")
        .schema(schema)
        # Load the path the caller passed in, not the module-level constant.
        .load(file_path)
        .withColumn("ingestion_time", current_timestamp())
        # Record which input file each row came from.
        .withColumn("file_path", col("_metadata.file_path"))
        .withColumn("env", lit(env))
    )

    normalized_names = [normalize_column_name(c) for c in raw_df.columns]
    return raw_df.toDF(*normalized_names)


def main():
    """Ingest the source CSV and overwrite the target Delta table with it.

    Declares the expected CSV schema, runs ``transform`` on
    ``SOURCE_FILE_PATH`` and saves the result to ``TARGET_TABLE`` in
    overwrite mode, replacing the table schema as well.
    """
    # (column name, Spark type) for each CSV column, in file order.
    column_specs = [
        ("Invoice", StringType()),
        ("StockCode", StringType()),
        ("Description", StringType()),
        ("Quantity", IntegerType()),
        ("InvoiceDate", TimestampType()),
        ("Price", DoubleType()),
        ("Customer ID", DoubleType()),
        ("Country", StringType()),
    ]
    # Every column is declared nullable.
    source_schema = StructType(
        [StructField(column, dtype, True) for column, dtype in column_specs]
    )

    bronze = transform(SOURCE_FILE_PATH, source_schema)

    writer = (
        bronze.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    writer.saveAsTable(TARGET_TABLE)


# Run the ingestion only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()



























