In [None]:
## Parameters
# Switch for the optional display(...) preview cells further down the notebook.
display_data = True

# Display name of the source usage table (not referenced by the other cells visible here).
usage_table_name = "Report views"
# Folder holding the raw (bronze) JSON export; it is read from and later copied to history.
bronze_file_location = "Files/raw/report_usage/facts/report_views/"
# Name the resulting Delta table is saved under.
gold_table_name = "report_views"
# Dataset ID: its upper-cased form names the bronze file (<ID>.json) and is
# written to every row as the UsageDatasetId column.
usage_dataset_id = "28678a20-198b-4fa5-8cb2-d211f273af85"

print("Successfully configured all parameters for this run.")

In [None]:
## Import all packages used in this notebook
import datetime
from pyspark.sql.functions import col, explode, to_date, date_format, lit, upper # type: ignore
import pyspark.sql.functions as f # type: ignore
from pyspark.sql import SparkSession # type: ignore
import re
import time

print("Successfully imported all packages for this notebook.")

In [None]:
#
# Obtain the Spark session for this notebook
#
app_name = "CreateFactTable"

# getOrCreate() returns the session that is already running when there is one,
# otherwise it builds a new one under the given application name.
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

print(f"Spark session {app_name} has been created successfully.")

In [None]:
#
# Helper that checks whether the gold layer table already exists
#
def gold_table_exists(gold_table_name: str, spark) -> bool:
    """
    Report whether a table is registered under 'FUAM_Ext_Lakehouse'.

    The lookup is delegated to the JVM-side catalog reached through the
    session's internal `_jsparkSession` handle.

    Args:
        gold_table_name (str): Table name to look up.
        spark (SparkSession): Active Spark session used for the lookup.

    Returns:
        bool: True when the catalog reports the table, False otherwise.
    """
    jvm_catalog = spark._jsparkSession.catalog()
    return jvm_catalog.tableExists('FUAM_Ext_Lakehouse', gold_table_name)

print("The function 'gold_table_exists' has been created successfully.")

In [None]:
#
# Load the report usage export from the BRONZE layer
# The JSON file is pretty-printed, so it must be read in 'multiline' mode
#
raw_location = bronze_file_location + usage_dataset_id.upper() + ".json"
bronze_df = (
    spark.read
    .option("multiline", "true")
    .json(raw_location)
)

print(f"Bronze data from {raw_location} has been read successfully.")

In [None]:
# Optional preview of the bronze dataframe; runs only when display_data is enabled
if display_data:
    display(bronze_df)

In [None]:
#
# Flatten the nested payload one level at a time: results -> tables -> rows
#
# Each step turns the elements of one array into separate rows
exploded_results = bronze_df.select(f.explode(f.col("results")).alias("result"))
exploded_tables = exploded_results.select(f.explode(f.col("result.tables")).alias("table"))
exploded_rows = exploded_tables.select(f.explode(f.col("table.rows")).alias("row"))

print(f"Bronze data from {raw_location} has been extracted and transformed.")

In [None]:
# Optional preview of the exploded rows; runs only when display_data is enabled
if display_data:
    display(exploded_rows)

In [None]:
#
# Build the silver layer dataframe
# Dataset IDs in the Lakehouse are expected to be uppercase
#
usage_dataset_id = usage_dataset_id.upper()

# Expand the row struct into columns and tag every row with the dataset ID
# (the ID column is added to enable the append processing logic)
silver_df = exploded_rows.select(
    lit(usage_dataset_id).alias("UsageDatasetId"),
    col("row.*"),
)

# Columns whose name contains "[Inner]" are renamed to just "Inner"
bracketed_name = re.compile(r"\[(.*?)\]")
for original_name in silver_df.columns:
    found = bracketed_name.search(original_name)
    if found is None:
        continue
    silver_df = silver_df.withColumnRenamed(original_name, found.group(1))

# Upper-case whichever of the selected ID columns are present
id_columns = ['CapacityId', 'WorkspaceId', 'ReportId']
for id_column in [name for name in silver_df.columns if name in id_columns]:
    silver_df = silver_df.withColumn(id_column, upper(silver_df[id_column]))

# Standardize the time column: "Timestamp" becomes "CreationTime" only when
# no "CreationTime" column exists yet
has_creation_time = "CreationTime" in silver_df.columns
if not has_creation_time and "Timestamp" in silver_df.columns:
    silver_df = silver_df.withColumnRenamed("Timestamp", "CreationTime")

# Derive date and time parts from the CreationTime field;
# CreationDate is parsed from its first 10 characters
creation_time = col("CreationTime")
silver_df = (
    silver_df
    .withColumn("CreationDate", to_date(creation_time.substr(1, 10), "yyyy-MM-dd"))
    .withColumn("CreationDateKey", date_format(creation_time, "yyyyMMdd"))
    .withColumn("CreationHour", date_format(creation_time, "H"))
    .withColumn("CreationMinute", date_format(creation_time, "mm"))
)

print(f"Silver dataframe has been created successfully with {silver_df.count()} rows.")

In [None]:
# Optional preview of the silver dataframe; runs only when display_data is enabled
if display_data:
    display(silver_df)

In [None]:
#
# Persist silver_df as a Delta table in the Fabric lakehouse,
# partitioned by UsageDatasetId and ReportId
#
(
    silver_df.write
    .format("delta")
    .partitionBy("UsageDatasetId", "ReportId")
    .mode("errorifexists")  # the save fails if the table is already there
    .saveAsTable(gold_table_name)
)

print(f"Gold table {gold_table_name} has been created successfully.")

In [None]:
#
# Archive the bronze files into a dated history folder
#
# Drop any "*/" wildcard segment so the path names a concrete folder
raw_path = bronze_file_location.replace("*/", "")
history_path = raw_path.replace("Files/raw/", "Files/history/")

# The copy goes into a yyyy/mm/dd subfolder of the history folder, dated by the
# clock of the machine running this cell. The destination is built once so the
# copy and the log message always refer to the same folder.
history_target = f"{history_path}{datetime.datetime.now().strftime('%Y/%m/%d')}/"

# Third argument True requests a recursive copy of the folder
mssparkutils.fs.cp(raw_path, history_target, True) # type: ignore

print(f"History data copied to {history_target} successfully.")

In [None]:
#
# Stop the Spark session
# NOTE: frees up limited F2 SKU capacity resources
# This is the final step of the notebook: cells that use `spark` should not be
# re-run after this one without obtaining a session again.
#
spark.stop()

print("Spark session has been stopped successfully.")