#### Report Usage Dimensions

##### Data ingestion strategy:
<mark style="background: #88D5FF;">**REPLACE**</mark>

##### Related pipeline:

**Ext_Load_PBI_Report_Usage_E2E**

##### Source:

**Files** in the FUAM_Ext_Lakehouse folder specified by the **bronze_file_location** variable

##### Target:

**1 Delta table** in FUAM_Ext_Lakehouse 
- **gold_table_name** variable value


In [None]:
## Parameters
# Toggle for the optional display(...) preview cells further down in this notebook.
display_data = True

# Label used only in the log / early-exit messages for the bronze data being processed.
usage_table_name = "Reports"
# Folder holding the raw JSON extract; also the source folder of the history copy at the end.
bronze_file_location = "Files/raw/report_usage/dimensions/usage_reports/"
# Name of the Delta table this notebook writes to.
gold_table_name = "usage_reports"
# Dataset id: upper-cased to build the bronze file name and stored in the UsageDatasetId column.
usage_dataset_id = "28678a20-198b-4fa5-8cb2-d211f273af85"

print("Successfully configured all paramaters for this run.")

In [None]:
import datetime
from delta.tables import * # type: ignore
from delta.exceptions import ConcurrentAppendException # type: ignore
from notebookutils import mssparkutils # type: ignore
from pyspark.sql.functions import col, explode, to_date, date_format, lit, upper # type: ignore
from pyspark.sql import SparkSession # type: ignore
import re
import time

# Reaching this line means every import in this cell resolved without raising.
print("Successfully imported all packages for this notebook.")

In [None]:
# Obtain the Spark session for this notebook and configure partition overwrites.
app_name = "TransferReportDimensions"

# getOrCreate() hands back the already-active session when there is one
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

# Session-wide setting picked up by the overwrite write later in this notebook
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

print(f"Spark session {app_name} has been created successfully.")

In [None]:
def gold_table_exists(gold_table_name: str, spark) -> bool:
    """
    Report whether a table is registered under FUAM_Ext_Lakehouse.

    The lookup is delegated to the catalog of the JVM session object that
    sits behind the given Spark session.

    Args:
        gold_table_name (str): Table name to look up.
        spark (SparkSession): Session whose JVM catalog is queried.

    Returns:
        bool: Whatever the catalog's tableExists call reports for that name.
    """
    lakehouse_name = 'FUAM_Ext_Lakehouse'
    jvm_catalog = spark._jsparkSession.catalog()
    return jvm_catalog.tableExists(lakehouse_name, gold_table_name)

print("The function 'gold_table_exists' has been created successfully.") 

In [None]:
# Load the BRONZE-layer report usage extract for the configured dataset.
# The file name is the upper-cased dataset id with a .json extension.
raw_location = bronze_file_location + usage_dataset_id.upper() + ".json"

# The JSON is pretty-printed, so it has to be read in 'multiline' mode
json_reader = spark.read.option("multiline", "true")
bronze_df = json_reader.json(raw_location)

print(f"Bronze data from {raw_location} has been read successfully.")

In [None]:
# Optional preview of the raw bronze dataframe, controlled by the display_data parameter.
if display_data:
    display(bronze_df)

In [None]:
# Flatten the nested payload one level at a time: results -> tables -> rows.
# Each step turns the elements of one array column into separate rows.
exploded_results = bronze_df.select(
    explode(col("results")).alias("result")
)
exploded_tables = exploded_results.select(
    explode(col("result.tables")).alias("table")
)
exploded_rows = exploded_tables.select(
    explode(col("table.rows")).alias("row")
)

print(f"Bronze data from {bronze_file_location} has been extracted and transformed.")

In [None]:
# Guard for the following cells, which expand row.* and therefore need at least one row.
# When there is nothing to process, leave the notebook early with a success message.
num_exploded_rows = exploded_rows.count()
if num_exploded_rows > 0:
    print(f"The number of rows in the bronze layer {usage_table_name} is {num_exploded_rows}.")
else:
    mssparkutils.notebook.exit(f"Nothing to do for bronze layer {usage_table_name}. Notebook completed early with success.")

In [None]:
# Optional preview of the exploded rows, controlled by the display_data parameter.
if display_data:
    display(exploded_rows)

In [None]:
# Build the silver dataframe: tag every row with the dataset id (added to enable
# the append processing logic) and expand the row struct into top-level columns.
silver_df = exploded_rows.select(
    lit(usage_dataset_id).alias("UsageDatasetId"),
    col("row.*"),
)

# Columns whose name contains a "[...]" part are renamed to the bracketed text only,
# with the characters that Delta Lake in Microsoft Fabric does not accept removed.
bracket_pattern = re.compile(r"\[(.*?)\]")
invalid_chars_pattern = re.compile(r"[ ,;{}()\n\t=]")
for original_name in silver_df.columns:
    bracketed = bracket_pattern.search(original_name)
    if bracketed is None:
        # No bracketed part: keep the column name untouched
        continue
    clean_name = invalid_chars_pattern.sub("", bracketed.group(1))
    silver_df = silver_df.withColumnRenamed(original_name, clean_name)

# Standardize the report identifier column name: ReportGuid becomes ReportId,
# unless a ReportId column is already there.
renamed_columns = silver_df.columns
if "ReportGuid" in renamed_columns and "ReportId" not in renamed_columns:
    silver_df = silver_df.withColumnRenamed("ReportGuid", "ReportId")

# Upper-case the values of the selected ID columns that are present
id_columns = ('CapacityId', 'WorkspaceId', 'ReportId', 'UsageDatasetId')
for id_column in [name for name in silver_df.columns if name in id_columns]:
    silver_df = silver_df.withColumn(id_column, upper(silver_df[id_column]))

print(f"Silver dataframe has been created successfully with {silver_df.count()} rows.")

In [None]:
# Optional preview of the silver dataframe, controlled by the display_data parameter.
if display_data:
    display(silver_df)

In [None]:
# Persist silver_df as the gold Delta table, partitioned by UsageDatasetId.
#   "selective" REPLACE: the write uses overwrite mode; with the dynamic partition
#   overwrite setting applied to the session earlier in this notebook, presumably only
#   the UsageDatasetId partitions present in silver_df are replaced -- TODO confirm.
#   ToDo: check on ("overwriteSchema", "true")
gold_writer = (
    silver_df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .format("delta")
    .partitionBy("UsageDatasetId")
)
gold_writer.saveAsTable(gold_table_name)

print(f"Gold table {gold_table_name} has been updated successfully.")

In [None]:
# Archive the bronze files under a dated history folder.
# Any "*/" wildcard segment is dropped from the bronze location, and the
# Files/raw/ prefix is swapped for Files/history/ to get the archive root.
raw_path = bronze_file_location.replace("*/", "")
history_path = raw_path.replace("Files/raw/", "Files/history/")

# The copy lands in a YYYY/MM/DD/ sub-folder named after the current date
date_folder = datetime.datetime.now().strftime('%Y/%m/%d')
mssparkutils.fs.cp(raw_path, f"{history_path}{date_folder}/", True) # type: ignore

print(f"History data copied to {history_path} successfully.")