In [0]:
# Mounting container
# Mounts the `retail` blob container of the hierarchstorage25 account at
# /mnt/retail_project so later cells can use plain DBFS paths.
#
# dbutils.fs.mount raises if the mount point is already in use, which would make
# "Run All" fail on every run after the first; the guard below skips the call
# when the mount already exists.
retail_mount_point = "/mnt/retail_project"
retail_storage_host = "hierarchstorage25.blob.core.windows.net"

already_mounted = any(
    mount.mountPoint == retail_mount_point for mount in dbutils.fs.mounts()
)

if not already_mounted:
    dbutils.fs.mount(
        source=f"wasbs://retail@{retail_storage_host}",
        mount_point=retail_mount_point,
        extra_configs={
            # The account key is read from a Databricks secret scope instead of
            # being written into the notebook.
            # NOTE(review): the scope and key names are placeholders — replace
            # them with the secret scope configured for this workspace.
            f"fs.azure.account.key.{retail_storage_host}": dbutils.secrets.get(
                scope="retail-project", key="storage-account-key"
            )
        },
    )


True

In [0]:
# List what is present under the Bronze retail folder of the mounted container.
bronze_retail_dir = '/mnt/retail_project/bronze/retail/'
dbutils.fs.ls(bronze_retail_dir)

[FileInfo(path='dbfs:/mnt/retail_project/bronze/retail/customers/', name='customers/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/retail_project/bronze/retail/products/', name='products/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/retail_project/bronze/retail/stores/', name='stores/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/retail_project/bronze/retail/transactions/', name='transactions/', size=0, modificationTime=0)]

In [0]:
# Read raw data from Bronze layer
bronze_base = '/mnt/retail_project/bronze/retail'


def read_bronze_parquet(relative_path):
    """Load one Bronze folder as a Spark DataFrame.

    Parquet files carry their own schema and have no header row, so the
    CSV-only `header` / `inferSchema` reader options are not passed here;
    the Parquet source does not use them.

    Args:
        relative_path: Folder path below `bronze_base`, ending with '/'.

    Returns:
        The DataFrame produced by reading that folder as Parquet.
    """
    return spark.read.format('parquet').load(f'{bronze_base}/{relative_path}')


df_transactions = read_bronze_parquet('transactions/')
df_products = read_bronze_parquet('products/')
df_stores = read_bronze_parquet('stores/')
# The customers files sit in a nested sub-folder of the customers directory.
df_cus = read_bronze_parquet('customers/eli-shen08/adf-dev/refs/heads/main/data/')
display(df_cus)

customer_id,first_name,last_name,email,phone,city,registration_date
101,Ravi,Yadav,user101@example.com,9887654321,Delhi,2023-09-14
102,Nina,Joshi,user102@example.com,9876543210,Mumbai,2024-01-21
103,Sonal,Sharma,user103@example.com,9865432109,Bangalore,2023-07-10
104,Karan,Patel,user104@example.com,9854321098,Hyderabad,2024-02-05
105,Riya,Singh,user105@example.com,9843210987,Chennai,2023-06-28
106,Ajay,Mishra,user106@example.com,9832109876,Pune,2024-03-10
107,Priya,Kapoor,user107@example.com,9821098765,Ahmedabad,2023-05-12
108,Rahul,Verma,user108@example.com,9810987654,Kolkata,2023-08-19
109,Pooja,Mehta,user109@example.com,9809876543,Delhi,2024-04-01
110,Deepak,Nair,user110@example.com,9798765432,Mumbai,2023-10-14


In [0]:
# Data Cleaning
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Print the schema of every Bronze frame.
# printSchema() writes to stdout and returns None, so it is called as a statement
# rather than collected into a tuple (which made the cell echo a tuple of Nones).
for bronze_df in (df_transactions, df_products, df_stores, df_cus):
    bronze_df.printSchema()

root
 |-- transaction_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- transaction_date: date (nullable = true)

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: decimal(10,2) (nullable = true)

root
 |-- store_id: integer (nullable = true)
 |-- store_name: string (nullable = true)
 |-- location: string (nullable = true)

root
 |-- customer_id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- city: string (nullable = true)
 |-- registration_date: string (nullable = true)



(None, None, None, None)

In [0]:
# Preview df_transactions (read from the Bronze transactions folder earlier in the notebook).
display(df_transactions)

transaction_id,customer_id,product_id,store_id,quantity,transaction_date
1,127,8,4,4,2025-03-31
2,105,3,5,4,2024-11-12
3,116,2,3,2,2025-05-01
4,120,8,1,1,2024-11-02
5,105,5,1,2,2025-03-17
6,110,7,5,3,2025-01-04
7,110,7,5,2,2025-01-01
8,126,7,2,5,2025-06-08
9,123,1,2,3,2024-10-08
10,124,2,5,2,2024-08-27


In [0]:
# Silver-layer shaping: cast each frame's columns to their intended types and
# de-duplicate customers. NULL checks are intentionally skipped because the
# data set was hand-made by the notebook author.

# --- Customers --------------------------------------------------------------
# Keep the id and contact columns, one row per customer_id, then add a
# space-separated full name.
# NOTE(review): `city` and `registration_date` are not selected here, so they do
# not reach the Silver table — confirm that is intended.
df_cus = (
    df_cus
    .select(
        col('customer_id').cast(IntegerType()),
        col('first_name'),
        col('last_name'),
        col('email'),
        col('phone'),
    )
    .dropDuplicates(["customer_id"])
    .withColumn('FullName', concat_ws(" ", col('first_name'), col('last_name')))
)

# --- Transactions -----------------------------------------------------------
# All id/quantity columns become integers; the transaction date becomes a date.
df_transactions = df_transactions.select(
    *[
        col(int_column).cast(IntegerType())
        for int_column in ("transaction_id", "customer_id", "product_id", "store_id", "quantity")
    ],
    col("transaction_date").cast(DateType()),
)

# Calendar parts derived from the transaction date (date_format returns strings).
df_transactions = (
    df_transactions
    .withColumn('month', date_format(col('transaction_date'), 'MM'))
    .withColumn('year', date_format(col('transaction_date'), 'yyyy'))
    .withColumn('day', date_format(col('transaction_date'), 'dd'))
)

# --- Products ---------------------------------------------------------------
# NOTE(review): the Bronze schema shows price as decimal(10,2); it is cast to
# double here — confirm floating point is acceptable for prices downstream.
df_products = df_products.select(
    col("product_id").cast(IntegerType()),
    col("product_name"),
    col("category"),
    col("price").cast(DoubleType()),
)

# --- Stores -----------------------------------------------------------------
df_stores = df_stores.select(
    col("store_id").cast(IntegerType()),
    col("store_name"),
    col("location"),
)

# Show the shaped transactions frame, including the new date-part columns.
display(df_transactions)

transaction_id,customer_id,product_id,store_id,quantity,transaction_date,month,year,day
1,127,8,4,4,2025-03-31,3,2025,31
2,105,3,5,4,2024-11-12,11,2024,12
3,116,2,3,2,2025-05-01,5,2025,1
4,120,8,1,1,2024-11-02,11,2024,2
5,105,5,1,2,2025-03-17,3,2025,17
6,110,7,5,3,2025-01-04,1,2025,4
7,110,7,5,2,2025-01-01,1,2025,1
8,126,7,2,5,2025-06-08,6,2025,8
9,123,1,2,3,2024-10-08,10,2024,8
10,124,2,5,2,2024-08-27,8,2024,27


In [0]:
# Persist the cleaned frames to the Silver layer as Delta tables, replacing
# whatever was written by a previous run.
silver_base = '/mnt/retail_project/silver/retail/'

# silver_base already ends with '/', so joining with another '/' produced paths
# like '.../retail//customers'. Strip the trailing slash before building targets.
silver_root = silver_base.rstrip('/')

silver_tables = (
    ('customers', df_cus),
    ('products', df_products),
    ('stores', df_stores),
    ('transactions', df_transactions),
)

for table_name, table_df in silver_tables:
    table_df.write.format('delta').mode('overwrite').save(f'{silver_root}/{table_name}')