In [0]:
%sql
-- Pre-check cell to inspect the csv file. Do not include it later in the job.
/*
select * 
from read_files(
  '/Volumes/job_catalog/default/dataset/sales/',
  format => 'csv'
  );
*/

In [0]:
from pyspark.sql.functions import from_json, col, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

#
# Ingest data from sales.csv file and create the bronze_sales table
#

# Step 1: Read the csv file and create a Spark DataFrame

# Location that is handed to the CSV reader further down.
file_path = "/Volumes/job_catalog/default/dataset/sales/"

# Schema used to parse the JSON string stored in the "product" column.
# Fields added through StructType.add() are nullable by default, which is
# the same default StructField uses.
# NOTE(review): "price" and "qty" are parsed as integers — confirm the source
# JSON never carries decimal values for them.
product_schema = (
    StructType()
    .add("curr", StringType())
    .add("id", StringType())
    .add("name", StringType())
    .add("price", IntegerType())
    .add("qty", IntegerType())
    .add("unit", StringType())
)

# CSV reader configuration: the first row is the header, and the double quote
# acts as both the quote and the escape character.
# NOTE(review): presumably this is what lets the JSON text in the "product"
# column survive CSV parsing — confirm against a sample file.
csv_reader = spark.read.format("csv").options(header="true", quote='"', escape='"')
sales_raw = csv_reader.load(file_path)

# Add two audit columns: the load timestamp and the path of the source file
# (taken from the reader's _metadata column).
df = (
    sales_raw
    .withColumn("processing_time", current_timestamp())
    .withColumn("file_name", col("_metadata.file_path"))
)

# Parse the JSON string in "product" into a struct, replacing the column in place.
parsed_product = from_json(col("product"), product_schema)
df = df.withColumn("product", parsed_product)

# Fix the column order so the table matches the layout shown in the demo video.
column_order = [
    "customer_id",
    "customer_name",
    "product_name",
    "order_date",
    "product_category",
    "product",
    "total_price",
    "processing_time",
    "file_name",
]
df = df.select(*column_order)
#display(df)

# Step 2: Create the delta table
# The write uses "overwrite" mode and the Delta format, and saves the
# DataFrame under the table name below.
target_table = "job_catalog.default.bronze_sales"
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable(target_table)


In [0]:
# Post-check cell. Do not include it later in the job
#spark.read.table("job_catalog.default.bronze_sales").display()
# Note: spark.read.table() reads a table from the catalog and returns it as a DataFrame.