In [0]:
# ---- Source (silver layer) ----
catalog = "workspace"
source_schema = "silver"
source_object = "silver_bookings"

# Fully qualified name of the source fact table: <catalog>.<schema>.<object>
fact_table = ".".join((catalog, source_schema, source_object))

# ---- Incremental load controls ----
# Column used to detect changed rows
cdc_column = "modifiedDate"

# Leave empty for a normal incremental run; a non-empty value is used
# as the last-load value instead of the one read from the target table
backdated_refresh = ""

# ---- Target (gold layer) ----
target_schema = "gold"
target_object = "FactBookings"

# Columns that together identify one row of the fact table (merge key)
fact_key_cols = [
    "DimPassengersKey",
    "DimFlightsKey",
    "DimAirportsKey",
    "booking_date",
]

In [0]:
# One entry per dimension joined to the fact table.
# Each pair below is (dimension table name, column shared by fact and dimension);
# the table name doubles as the SQL alias.
dimensions = [
    {
        "table": f"{catalog}.{target_schema}.{dim_name}",
        "alias": dim_name,
        "join_keys": [(shared_key, shared_key)],  # (fact_col, dim_col)
    }
    for dim_name, shared_key in (
        ("DimPassengers", "passenger_id"),
        ("DimFlights", "flight_id"),
        ("DimAirports", "airport_id"),
    )
]

# Fact-table columns carried into the result alongside the surrogate keys
fact_columns = ["amount", "booking_date", "modifiedDate"]

## **Last Load Date**

In [0]:
# Value used when the target has nothing to resume from (forces a full load)
initial_last_load = "1900-01-01 00:00:00"

# Fully qualified target table, built from the config so the existence check
# and the max() query always look at the same table
target_table = f"{catalog}.{target_schema}.{target_object}"

if backdated_refresh:
    # Backdated refresh requested: reprocess from the supplied value
    last_load = backdated_refresh
elif spark.catalog.tableExists(target_table):
    # Incremental run: resume from the highest CDC value already in the target
    last_load = spark.sql(f"SELECT max({cdc_column}) FROM {target_table}").collect()[0][0]

    # max() yields NULL when the table is empty (or the CDC column is all NULL);
    # without this fallback the downstream filter would match no rows
    if last_load is None:
        last_load = initial_last_load
else:
    # Target table does not exist yet: load everything
    last_load = initial_last_load

# Show the resolved last-load value
last_load

## **Dynamic Fact Query [Bring Keys]**

In [0]:
def generate_fact_query_incremental(fact_table, dimensions, fact_columns, cdc_column, processing_date):
    """Build the SQL that selects incremental fact rows with their dimension keys.

    The fact table is aliased as ``f`` and LEFT JOINed to every dimension, so
    fact rows without a matching dimension row are kept (with a NULL key).

    Args:
        fact_table: Fully qualified name of the source fact table.
        dimensions: List of dicts, each with
            ``"table"`` (fully qualified dimension table),
            ``"alias"`` (SQL alias for that table) and
            ``"join_keys"`` (list of ``(fact_col, dim_col)`` pairs).
        fact_columns: Fact-table columns to select as-is.
        cdc_column: Fact-table column used for incremental filtering.
        processing_date: Lower bound for ``cdc_column``; it is wrapped in
            ``DATE('...')`` and compared with ``>=``.

    Returns:
        The SQL query as a string.
    """
    fact_alias = "f"

    # Columns taken straight from the fact table
    select_cols = [f"{fact_alias}.{col}" for col in fact_columns]

    # One LEFT JOIN (and one surrogate-key column) per dimension
    join_clauses = []
    for dim in dimensions:
        table_full = dim["table"]
        alias = dim["alias"]

        # Surrogate key column is named "<TableName>Key", where TableName is
        # the last part of the fully qualified dimension table name
        table_name = table_full.split(".")[-1]
        select_cols.append(f"{alias}.{table_name}Key")

        on_conditions = [
            f"{fact_alias}.{fk} = {alias}.{dk}" for fk, dk in dim["join_keys"]
        ]
        join_clauses.append(
            f"LEFT JOIN {table_full} AS {alias} ON " + " AND ".join(on_conditions)
        )

    select_clause = ",    \n".join(select_cols)
    joins = "\n".join(join_clauses)

    # Filter on the caller-supplied date rather than on notebook-global state
    where_clause = f"{fact_alias}.{cdc_column} >= DATE('{processing_date}')"

    query = f"""
        SELECT
            {select_clause}
        FROM
            {fact_table} AS {fact_alias}
        {joins}
        WHERE
            {where_clause}
    """.strip()
    return query

In [0]:
# Render the incremental fact query, bounded below by the last load value
query = generate_fact_query_incremental(
    fact_table=fact_table,
    dimensions=dimensions,
    fact_columns=fact_columns,
    cdc_column=cdc_column,
    processing_date=last_load,
)
print(query)

## **DF_FACT**

In [0]:
# Run the generated query; the result holds the selected fact columns
# plus one surrogate-key column per dimension
df_fact = spark.sql(query)


In [0]:
# Render the query result in the notebook for inspection
df_fact.display()

In [0]:
# Row count per combination of the three dimension surrogate keys
(
    df_fact
    .groupBy("DimPassengersKey", "DimFlightsKey", "DimAirportsKey")
    .count()
    .display()
)

## **Upsert**

In [0]:
# Merge condition: source and target rows match when every fact key column is equal.
# NOTE(review): plain "=" never matches NULLs, and the LEFT JOINs can produce
# NULL surrogate keys - confirm whether such rows should be matched.
fact_key_cols_str = " AND ".join(
    f"src.{key_col} = tgt.{key_col}" for key_col in fact_key_cols
)
fact_key_cols_str

In [0]:
from delta.tables import DeltaTable

In [0]:
if not spark.catalog.tableExists(f"{catalog}.{target_schema}.{target_object}"):
    # Target does not exist yet: create it from the current batch
    (
        df_fact.write
        .format("delta")
        .mode("append")
        .saveAsTable(f"{catalog}.{target_schema}.{target_object}")
    )
else:
    # Target exists: merge the batch into it on the fact key columns
    dlt_obj = DeltaTable.forName(spark, f"{catalog}.{target_schema}.{target_object}")
    (
        dlt_obj.alias("tgt")
        .merge(df_fact.alias("src"), fact_key_cols_str)
        # Only overwrite a matched row when the source is at least as recent
        .whenMatchedUpdateAll(condition=f"src.{cdc_column} >= tgt.{cdc_column}")
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
%sql
-- Preview the gold fact table
SELECT *
FROM workspace.gold.factbookings