In [None]:
# Define paths
# ADLS Gen2 URIs have the form
#   abfss://<container>@<storage-account>.dfs.core.windows.net/<path>
# i.e. the container (file system) comes BEFORE the "@" and the storage
# account AFTER it. Swapping the two yields an unresolvable host name.
storage_account_name = "<YOUR_STORAGE_ACCOUNT_NAME>"
container_name = "<YOUR_WORKSPACE_NAME>"

# Shared root of every medallion layer used by this notebook.
lake_root = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/synapseExample/"

source_path = f"{lake_root}00_source/"
bronze_path = f"{lake_root}01_bronze/"
silver_path = f"{lake_root}02_silver/"
gold_path = f"{lake_root}03_gold/"

In [None]:
# Read the three raw CSV extracts from the source folder.
# Every file has a header row and lets Spark infer the column types.
csv_read_options = {"header": True, "inferSchema": True}
customers_df = spark.read.csv(f"{source_path}customers.csv", **csv_read_options)
products_df = spark.read.csv(f"{source_path}products.csv", **csv_read_options)
orders_df = spark.read.csv(f"{source_path}orders.csv", **csv_read_options)

In [None]:
# Persist each raw frame to its own bronze sub-folder as Parquet,
# replacing whatever a previous run left there.
for entity, frame in (
    ("customers", customers_df),
    ("products", products_df),
    ("orders", orders_df),
):
    frame.write.parquet(f"{bronze_path}{entity}/", mode="overwrite")

In [None]:
# Demo: register the customers frame as a temp view and query it with Spark SQL
customers_df.createOrReplaceTempView("Customers")
customers_by_email_sql = "SELECT * FROM Customers where Email like 'ant%'"
result = spark.sql(customers_by_email_sql)
result.show(n=10)  # print the first 10 matching rows

In [None]:
# Transform from Bronze to Silver
from pyspark.sql import functions as F

# Read each entity's Parquet folder back from the Bronze layer
customers_bronze_df, orders_bronze_df, products_bronze_df = (
    spark.read.parquet(f"{bronze_path}{entity}/")
    for entity in ("customers", "orders", "products")
)


# Step 1: Clean and Standardize Customers Data
# Keep only rows whose Email contains "@", collapse duplicates on CustomerID,
# and strip every non-digit character from PhoneNumber.
#
# The email filter runs BEFORE dropDuplicates on purpose: dropDuplicates keeps
# an arbitrary row per key, so deduplicating first could retain the row with
# the invalid email and then lose the customer entirely even though a valid
# duplicate existed.
customers_silver_df = (
    customers_bronze_df
    .filter(F.col("Email").contains("@"))  # Basic email validation
    .dropDuplicates(["CustomerID"])
    .withColumn("PhoneNumber", F.regexp_replace("PhoneNumber", r"[^\d]", ""))  # Remove non-numeric characters
)

# Step 2: Clean and Standardize Orders Data
# Remove any records with null OrderID, CustomerID or ProductID, require a
# positive Quantity and TotalAmount, collapse duplicates on OrderID, and cast
# OrderDate to a date.
#
# Validation runs BEFORE dropDuplicates: dropDuplicates keeps an arbitrary row
# per key, so deduplicating first could keep an invalid copy of an order and
# discard the valid one.
orders_silver_df = (
    orders_bronze_df
    .filter(
        F.col("OrderID").isNotNull()  # previously unchecked: a null-OrderID row could slip through
        & F.col("CustomerID").isNotNull()
        & F.col("ProductID").isNotNull()
    )
    .filter((F.col("Quantity") > 0) & (F.col("TotalAmount") > 0))  # Ensure valid values
    .dropDuplicates(["OrderID"])
    .withColumn("OrderDate", F.to_date("OrderDate"))  # Standardize date format
)

# Step 3: Clean and Standardize Products Data
# Keep only products with a positive Price and a non-negative Stock, then
# collapse duplicates on ProductID.
#
# Validation runs BEFORE dropDuplicates: dropDuplicates keeps an arbitrary row
# per key, so deduplicating first could keep an invalid copy of a product and
# discard the valid one.
products_silver_df = (
    products_bronze_df
    .filter(F.col("Price") > 0)  # Ensure positive price
    .filter(F.col("Stock") >= 0)  # Stock should not be negative
    .dropDuplicates(["ProductID"])
)

# Persist the cleaned frames to the Silver layer as Parquet (overwriting prior runs)
for entity, frame in (
    ("customers", customers_silver_df),
    ("orders", orders_silver_df),
    ("products", products_silver_df),
):
    frame.write.parquet(f"{silver_path}{entity}/", mode="overwrite")

In [None]:
# Transform to Gold Layer

# Read the three cleaned entities back from the Silver layer
customers_silver_df, orders_silver_df, products_silver_df = [
    spark.read.parquet(f"{silver_path}{entity}/")
    for entity in ("customers", "orders", "products")
]

# Step 1: Enrich Orders with Customer and Product Information
# Two successive left joins: first attach customer columns on CustomerID,
# then product columns on ProductID. Left joins keep every order row even
# when no matching customer/product exists.
orders_with_customers_df = orders_silver_df.join(customers_silver_df, on="CustomerID", how="left")
orders_enriched_df = orders_with_customers_df.join(products_silver_df, on="ProductID", how="left")

# Step 2: Create an Aggregated Customer Orders Summary
# One row per (CustomerID, CustomerName, Email, Country) carrying the number
# of orders, the summed quantity and the summed amount.
customer_orders_grouped = orders_enriched_df.groupBy("CustomerID", "CustomerName", "Email", "Country")
customer_summary_df = customer_orders_grouped.agg(
    F.count("OrderID").alias("TotalOrders"),
    F.sum("Quantity").alias("TotalQuantityPurchased"),
    F.sum("TotalAmount").alias("TotalAmountSpent"),
)

# Step 3: Create a Product Sales Summary
# One row per (ProductID, ProductName, Category) with units sold and revenue.
product_metrics = [
    F.sum("Quantity").alias("TotalQuantitySold"),
    F.sum("TotalAmount").alias("TotalRevenue"),
]
product_summary_df = (
    orders_enriched_df
    .groupBy("ProductID", "ProductName", "Category")
    .agg(*product_metrics)
)

# Step 4: Create a Daily Sales Summary
# One row per OrderDate with the order count, units sold and revenue for that day.
orders_by_day = orders_enriched_df.groupBy("OrderDate")
daily_sales_summary_df = orders_by_day.agg(
    F.count("OrderID").alias("TotalOrders"),
    F.sum("Quantity").alias("TotalQuantitySold"),
    F.sum("TotalAmount").alias("TotalRevenue"),
)

# Step 5: Write Data to Gold Layer
# Each summary goes to its own Parquet folder, replacing any earlier output.
gold_outputs = {
    "customer_summary": customer_summary_df,
    "product_summary": product_summary_df,
    "daily_sales_summary": daily_sales_summary_df,
}
for folder, summary_df in gold_outputs.items():
    summary_df.write.parquet(f"{gold_path}{folder}/", mode="overwrite")
