In [0]:
from pyspark.sql import functions as F

PIPELINE_NAME = "gold_aggregation_pipeline"

# Control-table row(s) for this pipeline; the row carries the watermark
# column `last_processed_date` that bounds the incremental read below.
control_df = spark.table("gold_db.pipeline_control") \
    .filter(F.col("pipeline_name") == PIPELINE_NAME)

# first() returns None on an empty result, so a missing control row can be
# reported with a clear message instead of a bare IndexError from collect()[0].
control_row = control_df.select("last_processed_date").first()
if control_row is None:
    raise ValueError(
        f"No row found in gold_db.pipeline_control for pipeline_name='{PIPELINE_NAME}'"
    )

last_processed_date = control_row[0]
if last_processed_date is None:
    # A NULL watermark makes `transaction_date > NULL` evaluate to NULL for
    # every row, so the run would silently select nothing. Fail loudly instead.
    raise ValueError(
        f"last_processed_date is NULL in gold_db.pipeline_control for pipeline_name='{PIPELINE_NAME}'"
    )

# Incremental slice: only rows strictly newer than the stored watermark.
# NOTE(review): with a strict `>`, rows that arrive later for the watermark
# date itself are not picked up by subsequent runs -- confirm this is intended.
silver_incremental_df = spark.table("silver_db.transactions_clean") \
    .filter(F.col("transaction_date") > F.lit(last_processed_date))

In [0]:
# Roll the incremental slice up to one row per (transaction_date, currency_code).
_txn_status = F.col("transaction_status")
_is_success = _txn_status == "SUCCESS"
_is_failed = _txn_status == "FAILED"

# Aggregates are listed in the order they should appear as output columns.
_daily_aggregates = [
    F.count("*").alias("total_transactions"),
    # Status counts: each row contributes 1 when the predicate holds, else 0.
    F.sum(F.when(_is_success, 1).otherwise(0)).alias("successful_transactions"),
    F.sum(F.when(_is_failed, 1).otherwise(0)).alias("failed_transactions"),
    # total_amount sums every row's amount regardless of status.
    F.sum("amount").alias("total_amount"),
    # success_amount only adds the amount of SUCCESS rows; other rows add 0.
    F.sum(F.when(_is_success, F.col("amount")).otherwise(0)).alias("success_amount"),
]

daily_metrics_df = (
    silver_incremental_df
    .groupBy("transaction_date", "currency_code")
    .agg(*_daily_aggregates)
)

In [0]:
# Roll the incremental slice up to one row per (transaction_date, partner_id).
_partner_is_success = F.col("transaction_status") == "SUCCESS"

# Aggregates are listed in the order they should appear as output columns.
_partner_aggregates = [
    F.count("*").alias("total_transactions"),
    # Each SUCCESS row contributes 1, every other row contributes 0.
    F.sum(F.when(_partner_is_success, 1).otherwise(0)).alias("successful_transactions"),
    # Sum of amount over all rows in the group, regardless of status.
    F.sum("amount").alias("total_amount"),
]

partner_summary_df = (
    silver_incremental_df
    .groupBy("transaction_date", "partner_id")
    .agg(*_partner_aggregates)
)

In [0]:
from delta.tables import DeltaTable

# Upsert the daily rollup into the gold table, keyed on
# (transaction_date, currency_code): matched rows are overwritten with the
# source values, unmatched source rows are inserted.
daily_table = DeltaTable.forName(spark, "gold_db.daily_transaction_metrics")

# currency_code uses the null-safe operator (<=>). With a plain "=", a NULL
# key never equals itself, so re-merging the same window would insert a second
# row for the NULL-currency group instead of updating the existing one.
# transaction_date keeps "=": silver_incremental_df is filtered with a ">"
# comparison on that column, which NULL values never satisfy.
daily_merge_condition = """
    t.transaction_date = s.transaction_date
    AND t.currency_code <=> s.currency_code
    """

(
    daily_table.alias("t")
    .merge(daily_metrics_df.alias("s"), daily_merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

In [0]:
# Upsert the per-partner rollup into the gold table, keyed on
# (transaction_date, partner_id): matched rows are overwritten with the
# source values, unmatched source rows are inserted.
partner_table = DeltaTable.forName(spark, "gold_db.partner_daily_summary")

# partner_id uses the null-safe operator (<=>). With a plain "=", a NULL key
# never equals itself, so re-merging the same window would insert a second row
# for the NULL-partner group instead of updating the existing one.
# transaction_date keeps "=": silver_incremental_df is filtered with a ">"
# comparison on that column, which NULL values never satisfy.
partner_merge_condition = """
    t.transaction_date = s.transaction_date
    AND t.partner_id <=> s.partner_id
    """

(
    partner_table.alias("t")
    .merge(partner_summary_df.alias("s"), partner_merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

In [0]:
# Advance the pipeline watermark to the newest transaction_date in the
# incremental slice.
# NOTE(review): this evaluates silver_incremental_df again after the merges;
# confirm the result cannot include rows the merges above did not see.
max_txn_date = silver_incremental_df \
    .select(F.max("transaction_date")).collect()[0][0]

if max_txn_date is None:
    # F.max over an empty slice yields None. Formatting that into the UPDATE
    # would produce DATE('None') and overwrite the stored watermark with an
    # invalid value, so the control row is left untouched when nothing is new.
    print(f"No new transactions for '{PIPELINE_NAME}'; last_processed_date left unchanged.")
else:
    spark.sql(f"""
UPDATE gold_db.pipeline_control
SET
    last_processed_date = DATE('{max_txn_date}'),
    updated_at = current_timestamp()
WHERE pipeline_name = '{PIPELINE_NAME}'
""")