In [0]:
from pyspark.sql.functions import coalesce, lit, year, col, sum

In [0]:
# Load the three source tables from the `fz_catalog.silver` schema.
# `spark.table(name)` is shorthand for `spark.read.table(name)`.
df_orders = spark.table('fz_catalog.silver.orders')
df_products = spark.table('fz_catalog.silver.products')
df_customers = spark.table('fz_catalog.silver.customers')

Create an aggregate table that shows total profit by:

- Year
- Product category
- Product sub-category
- Customer


In [0]:
# Enrich each order with product and customer data.
# Left joins keep every order row, even when no matching product or customer exists.
orders_with_products = df_orders.join(df_products, "product_id", "left")

# NOTE(review): none of the columns selected below appears to be customer-specific;
# confirm whether this join is still needed, and that `customer_id` is unique in the
# customers table (duplicate keys would repeat order rows and inflate summed profit).
orders_enriched = orders_with_products.join(df_customers, "customer_id", "left")

# Keep only the columns needed downstream; a missing product category or
# sub-category (e.g. an order with no product match) is labelled "Unknown".
df_joined = orders_enriched.select(
    col("customer_id"),
    col("order_date"),
    coalesce(col("category"), lit("Unknown")).alias("category"),
    coalesce(col("sub_category"), lit("Unknown")).alias("sub_category"),
    col("profit"),
)

In [0]:
# Total profit per (order year, category, sub-category, customer).
# Note: `sum` here is pyspark.sql.functions.sum (imported at the top), not the builtin.
order_year = year("order_date").alias("Year")
total_profit = sum("profit").alias("Total_Profit")

df_gold = (
    df_joined
        .groupBy(order_year, "category", "sub_category", "customer_id")
        .agg(total_profit)
)

In [0]:
# Persist the aggregate as a Delta table, replacing any existing contents of the target.
df_gold.write.saveAsTable(
    "fz_catalog.gold.full_order_info",
    format="delta",
    mode="overwrite",
)