#### Aggregate Report Dimensions 

##### Data ingestion strategy:
<mark style="background: #88D5FF;">**MERGE**</mark> (upsert on WorkspaceId; the target table is created on the first run)

##### Related pipeline(s):

**Ext_Load_PBI_Workspace_Datasets_E2E**

##### Source:

**1 table** in FUAM_Ext_Lakehouse, named by the **gold_table_name** variable

##### Target:

**1 Delta table** in FUAM_Ext_Lakehouse 
- **agg_gold_table_name** variable value

In [None]:
## Parameters
# When True, the aggregated DataFrame is displayed in the notebook before the upsert runs.
display_data = True

print("Successfully configured all parameters for this run.")

In [None]:
## Variables
# Lakehouse (database) name used to qualify table names in this notebook
lakehouse_name = "FUAM_Ext_Lakehouse"
# Source table that is read and grouped by WorkspaceId
gold_table_name = "workspace_datasets"
# Target table that receives the upserted workspace rows
agg_gold_table_name = "usage_workspaces"

print("Successfully configured all variables for this run.")

In [None]:
from delta.tables import DeltaTable # type: ignore
from pyspark.sql import functions as F # type: ignore
from pyspark.sql import DataFrame, SparkSession # type: ignore

print("Successfully imported all packages for this notebook.")

In [None]:
#
# Create the Spark session
#
app_name = "AggregateReportDimensions"

# getOrCreate() hands back the already-running session when there is one,
# otherwise it builds a new session under the application name above.
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

print(f"Spark session {app_name} has been created successfully.")

In [None]:
def upsert_table(df: DataFrame, table_name: str, primary_key: str, lakehouse_name: str = None) -> int:
    """
    Upsert (merge) the input DataFrame into a Delta Lake table.

    When the target table already exists, the DataFrame is merged into it on
    ``primary_key``: matching target rows are updated with every source column
    and source rows without a match are inserted. Target rows that have no
    counterpart in the DataFrame are left as they are. When the target table
    does not exist yet, it is created from the DataFrame in Delta format.

    The function uses the notebook-level ``spark`` session.

    Note:
        The match condition is a plain SQL equality, so source rows whose
        ``primary_key`` is NULL never match and are inserted on every run.

    Args:
        df (DataFrame): The input PySpark DataFrame to be upserted.
        table_name (str): The target Delta table name.
        primary_key (str): Column used as the primary key for matching rows.
        lakehouse_name (str, optional): Name of the lakehouse database. When
            omitted, ``table_name`` is used unqualified.

    Returns:
        int: Number of rows in the input DataFrame (not the number of rows
        actually inserted or updated in the target).
    """
    # Fully qualified table name
    qualified_table_name = f"{lakehouse_name}.{table_name}" if lakehouse_name else table_name

    # Count rows in source DataFrame
    row_count = df.count()

    # First load: nothing to merge into, so create the table from the DataFrame.
    # NOTE(review): the existence check goes through the JVM catalog handle rather
    # than spark.catalog; presumably so the "db.table" form is resolved on every
    # runtime version - confirm before switching to the public API.
    if not spark._jsparkSession.catalog().tableExists(qualified_table_name):  # type: ignore
        df.write.format("delta").saveAsTable(qualified_table_name)
        return row_count

    # The SQL MERGE needs a name for the source, so expose the DataFrame as a
    # temp view only for the duration of the statement.
    temp_view_name = "temp_upsert_view"
    df.createOrReplaceTempView(temp_view_name)
    try:
        merge_sql = f"""
        MERGE INTO {qualified_table_name} AS target
        USING {temp_view_name} AS source
        ON target.{primary_key} = source.{primary_key}
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
        """
        spark.sql(merge_sql)  # type: ignore
    finally:
        # Always unregister the view so it does not linger in the session,
        # even when the merge fails.
        spark.catalog.dropTempView(temp_view_name)

    return row_count

print("The function upsert_table has been created successfully.")

In [None]:
# Load the source table. The name is qualified with the lakehouse, the same way
# upsert_table qualifies its target, so the read does not depend on which
# lakehouse happens to be the session default.
df = spark.read.table(
    f"{lakehouse_name}.{gold_table_name}" if lakehouse_name else gold_table_name
)

# Workspace-level columns to keep; every other column of the source is dropped
# by the aggregation below.
workspace_columns = [
    "WorkspaceId", "WorkspaceName", "WorkspaceDescription", "HasWorkspaceLevelSettings", "State", "Type",
    "CapacityId", "IsOnDedicatedCapacity", "IsReadOnly",
    "fuam_modified_at", "fuam_deleted"
]

# Collapse to one row per WorkspaceId, taking the first value of each other column.
# NOTE: F.first without an explicit ordering is non-deterministic; if the rows of
# one workspace carry differing values, which value is kept is not guaranteed.
flattened_df = df.groupBy("WorkspaceId").agg(
    *[
        F.first(column_name).alias(column_name)
        for column_name in workspace_columns
        if column_name != "WorkspaceId"
    ]
)

print(f"The flatten data frame from the {gold_table_name} table has been created successfully.")

In [None]:
# Optional preview: pass the aggregated DataFrame to the notebook's display()
# only when the display_data parameter is enabled.
if display_data:
    display(flattened_df)

In [None]:
#
# Upsert the flattened DataFrame into the Microsoft Fabric Lakehouse, keyed on WorkspaceId:
#   - target rows with a matching WorkspaceId are updated
#   - source rows with no matching WorkspaceId are inserted
#   - target rows with no matching source row are left untouched
#
rows_processed = upsert_table(
    df=flattened_df,
    table_name=agg_gold_table_name,
    primary_key="WorkspaceId",
    lakehouse_name=lakehouse_name,
)

print(f"Upsert process completed successfully into table {agg_gold_table_name} w/ {rows_processed} rows processed.")

In [None]:
#
# Stop the Spark session
# NOTE: frees up limited F2 SKU capacity resources
# Keep this as the last cell: the `spark` handle cannot run further work once stopped.
#
spark.stop()

print("Spark session has been stopped successfully.")