In [12]:
# --- Read the staged source tables ---
# Both tables are queried by name through Spark SQL; per the original notes
# they are the tables written by Dataflow Gen2 into the Lakehouse.
df_orders = spark.sql("SELECT * FROM RawOrders_Staging")
df_products = spark.sql("SELECT * FROM RawOrderProducts_Staging")

# Sanity check: report the row count of each table, in load order.
# Each count() triggers a Spark job over the corresponding table.
for label, frame in (
    ("Raw Orders Count:", df_orders),
    ("Raw Products Count:", df_products),
):
    print(label, frame.count())

# Preview the first five order rows.
display(df_orders.limit(5))

StatementMeta(, 77a04ff0-3a3e-4346-88a3-11fece60260a, 14, Finished, Available, Finished)

Raw Orders Count: 3421083
Raw Products Count: 32434489


SynapseWidget(Synapse.DataFrame, 1d6a7f0e-0733-4711-a5ad-2155377575fe)

In [16]:
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType, DoubleType

# Build the order/product fact DataFrame from the two staged tables.
#
# The tables are re-read here so this cell can be re-run on its own:
# df_orders / df_products are always rebuilt from staging rather than from
# whatever an earlier run left in the session.
df_orders = spark.sql("SELECT * FROM RawOrders_Staging")
df_products = spark.sql("SELECT * FROM RawOrderProducts_Staging")

# --- 1. Explicitly cast the join key (and product_id) ---
# order_id is cast to integer on both sides so the join compares like with like.
# A value that cannot be cast becomes NULL and is dropped by the inner join below.
df_orders = df_orders.withColumn("order_id", col("order_id").cast(IntegerType()))
df_products = (
    df_products
    .withColumn("order_id", col("order_id").cast(IntegerType()))
    .withColumn("product_id", col("product_id").cast(IntegerType()))
)

# --- 2. Join order lines to their order header ---
fact_orders_joined_df = df_products.join(df_orders, on="order_id", how="inner")

# --- 3. Final projection: type casting and NULL handling ---
# Columns stored as integers in the fact table, in output order.
integer_columns = [
    "order_id",
    "product_id",
    "add_to_cart_order",
    "reordered",
    "user_id",
    "order_number",
    "order_dow",
    "order_hour_of_day",
]

# Cast to double BEFORE replacing NULLs. The previous version replaced NULLs
# first and cast afterwards, so any non-NULL value that could not be parsed as
# a double (e.g. an empty string) slipped past the NULL check and was turned
# back into NULL by the cast.
# NOTE(review): NULL here presumably marks an order with no prior order;
# filling with 0 makes it indistinguishable from a genuine 0 - confirm intended.
days_since_prior = col("days_since_prior_order").cast(DoubleType())

final_fact_df = fact_orders_joined_df.select(
    # Explicit alias keeps the original column name on each cast expression.
    *[col(name).cast(IntegerType()).alias(name) for name in integer_columns],
    when(days_since_prior.isNull(), 0.0)
        .otherwise(days_since_prior)
        .alias("days_since_prior_order"),
)

print("Final Fact Schema (with correct data types and NULL fix):")
final_fact_df.printSchema()
display(final_fact_df.limit(10))

StatementMeta(, 77a04ff0-3a3e-4346-88a3-11fece60260a, 18, Finished, Available, Finished)

Final Fact Schema (with correct data types and NULL fix):
root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- add_to_cart_order: integer (nullable = true)
 |-- reordered: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)



SynapseWidget(Synapse.DataFrame, bb59facf-4bb8-4921-8ad0-5a3f2ba14af0)

In [17]:
# Final load: persist final_fact_df as the Delta table "FactOrders".
# mode("overwrite") replaces the existing table contents on every run, and
# overwriteSchema=true allows the stored schema to be replaced along with them.
fact_writer = (
    final_fact_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
fact_writer.saveAsTable("FactOrders")

StatementMeta(, 77a04ff0-3a3e-4346-88a3-11fece60260a, 19, Finished, Available, Finished)