In [2]:
# Spark setup
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, min, datediff, avg, countDistinct, when

# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Read the fact table and the four dimension tables from the Parquet warehouse.
# The names on the left are bound in the same order as the paths listed below.
fact, dim_shipping, dim_client, dim_location, dim_date = [
    spark.read.parquet(table_path)
    for table_path in (
        "/workspaces/inpost_analytics/warehouse/FactShippingEvent",
        "/workspaces/inpost_analytics/warehouse/DimShipping",
        "/workspaces/inpost_analytics/warehouse/DimClient",
        "/workspaces/inpost_analytics/warehouse/DimLocation",
        "/workspaces/inpost_analytics/warehouse/DimDate",
    )
]

# Enrich each fact row with its shipping, client, date and collection-location
# attributes. Every join is a LEFT join, so fact rows without a matching
# dimension row are kept (their dimension columns are null).

# Rename columns on the dimension side first so the join keys line up with the
# fact table and the dimension's own "date" column gets a distinct name.
_date_lookup = dim_date.withColumnRenamed("date", "event_date_dim")
_collection_lookup = dim_location.withColumnRenamed(
    "location_id", "collection_location_id"
)

enriched = (
    fact
    .join(dim_shipping, on="shipping_id", how="left")
    .join(dim_client, on="client_id", how="left")
    .join(_date_lookup, on="date_id", how="left")
    .join(_collection_lookup, on="collection_location_id", how="left")
    # NOTE(review): "pays" / "codeAgence" presumably come from DimLocation —
    # confirm against the dimension schema.
    .withColumnRenamed("pays", "collection_country")
    .withColumnRenamed("codeAgence", "collection_agency")
)

# Keep only the three event codes used for lifecycle milestones.
filtered = enriched.filter(col("event_code").isin("PEC", "TRN", "LIV"))

# Map each (event_code, event_sub_code) combination to a milestone label:
#   PEC + REL/APM -> SENT
#   TRN + REL/APM -> DELIVERED_TO_POINT
#   LIV (any sub-code) -> PICKED_UP
# There is no otherwise() branch, so rows matching none of the rules
# (e.g. PEC or TRN with a different sub-code) get a null event_type.
_event_code = col("event_code")
_sub_code_matches = col("event_sub_code").isin("REL", "APM")

_event_type = (
    when((_event_code == "PEC") & _sub_code_matches, "SENT")
    .when((_event_code == "TRN") & _sub_code_matches, "DELIVERED_TO_POINT")
    .when(_event_code == "LIV", "PICKED_UP")
)
labeled = filtered.withColumn("event_type", _event_type)

# Collapse the labeled events to one row per shipping_id, keeping the earliest
# event_date seen for each milestone.
def _first_event_date(milestone):
    """Return an aggregate giving the earliest event_date labeled *milestone*.

    Rows with a different (or null) event_type contribute null, which the
    aggregate ignores; the result is null when no row matches.
    """
    # NOTE: `min` here is pyspark.sql.functions.min (imported at the top of
    # the cell), not the Python builtin.
    return min(when(col("event_type") == milestone, col("event_date")))


pivoted = labeled.groupBy("shipping_id").agg(
    _first_event_date("SENT").alias("sent_time"),
    _first_event_date("DELIVERED_TO_POINT").alias("delivered_time"),
    _first_event_date("PICKED_UP").alias("picked_time"),
)

# Append three day-count columns to the per-shipment milestones:
#   delivery_duration  = delivered_time - sent_time
#   lifecycle_duration = picked_time    - sent_time
#   pickup_duration    = picked_time    - delivered_time
# datediff(end, start) is null whenever either milestone is missing.
with_durations = pivoted.select(
    "*",
    datediff(col("delivered_time"), col("sent_time")).alias("delivery_duration"),
    datediff(col("picked_time"), col("sent_time")).alias("lifecycle_duration"),
    datediff(col("picked_time"), col("delivered_time")).alias("pickup_duration"),
)

# Reduce everything to a single summary row: the mean of each duration column
# (null durations are skipped by avg) plus the number of distinct shipments.
_average_columns = [
    avg(duration_column).alias(metric_name)
    for duration_column, metric_name in (
        ("delivery_duration", "avg_delivery_days"),
        ("lifecycle_duration", "avg_lifecycle_days"),
        ("pickup_duration", "avg_pickup_days"),
    )
]
_package_count = countDistinct("shipping_id").alias("total_packages")

metrics = with_durations.agg(*_average_columns, _package_count)

# Print the one-row result table to the notebook output.
metrics.show()


+------------------+------------------+-----------------+--------------+
| avg_delivery_days|avg_lifecycle_days|  avg_pickup_days|total_packages|
+------------------+------------------+-----------------+--------------+
|3.5640852541228663| 4.669826791021831|1.097261039686976|         46019|
+------------------+------------------+-----------------+--------------+

