#### Workspace Datasets 

##### Data ingestion strategy:
<mark style="background: #88D5FF;">**REPLACE**</mark>

##### Related pipeline:

**Ext_Load_PBI_Workspace_Datasets_E2E**

##### Source:

**Files** from the FUAM_Ext_Lakehouse folder defined by the **bronze_file_location** variable

##### Target:

**1 Delta table** in FUAM_Ext_Lakehouse 
- The table name is taken from the **gold_table_name** variable


In [None]:
import requests
from pyspark.sql.functions import col, lit, udf, explode, to_date, json_tuple, from_json, schema_of_json, get_json_object
from pyspark.sql.types import StringType, json
from pyspark.sql import SparkSession
import json
from delta.tables import *
import pyspark.sql.functions as f
from pyspark.sql.types import *
import datetime

print("Successfully imported all packages for this notebook.")

In [None]:
# ---------------------------------------------------------------
# Spark session
# ---------------------------------------------------------------
app_name = "TransferWorkspaceDatasets"

# getOrCreate() hands back the already-running session when there is one
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

# Needed for automatic schema evolution in merge
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

print(f"Spark session {app_name} has been created successfully.")

In [None]:
## Parameters
# When True, the intermediate DataFrames are rendered with display()
display_data = True

## Variables
# Folder holding the raw (bronze) JSON files
bronze_file_location = "Files/raw/workspace_datasets/"
# Staging (silver) table, qualified with its lakehouse name
silver_table_name = "FUAM_Ext_Staging_Lakehouse.workspace_datasets_silver"
# Target (gold) table and its path-style name under Tables/
gold_table_name = "workspace_datasets"
gold_table_name_with_prefix = "Tables/" + gold_table_name

print("Successfully configured all paramaters for this run.")

In [None]:
# Empty the silver table (when it exists) before this run appends to it
if not spark.catalog.tableExists(silver_table_name):
    print(f"Silver table {silver_table_name} does not exist.")
else:
    del_query = f"DELETE FROM {silver_table_name}"
    spark.sql(del_query)
    print(f"Silver table {silver_table_name} has been cleaned successfully.")

In [None]:
# Get Bronze data (multiline JSON files in the bronze folder)
bronze_df = spark.read.option("multiline", "true").json(bronze_file_location)

# Explode json subset structures:
#   1) one row per entry of the top-level "value" array (a workspace)
#   2) one row per entry of that workspace's "datasets" array
workspace_df = bronze_df.select(explode("value").alias("workspace"))
exploded_df = workspace_df.select(
    "workspace.*",  # select all top-level workspace fields
    explode("workspace.datasets").alias("dataset"),  # explode datasets
)

# Top-level workspace fields, excluding the nested dataset structures.
# The loop variable is deliberately not called `f`, which is the alias of
# pyspark.sql.functions in this notebook.
workspace_fields = [
    schema_field.name
    for schema_field in exploded_df.schema
    if schema_field.name not in ("dataset", "datasets")
]

# Dataset fields whose name collides with a workspace field (e.g. 'id', 'name')
# are renamed to dataset<FieldName> (e.g. 'datasetId'); others keep their name.
dataset_fields = []
for field in exploded_df.schema["dataset"].dataType.fields:
    field_name = field.name
    if field_name in workspace_fields:
        new_name = f"dataset{field_name[0].upper()}{field_name[1:]}"
    else:
        new_name = field_name
    dataset_fields.append(col(f"dataset.{field_name}").alias(new_name))

# Extract json objects to tabular form
extracted_df = exploded_df.select(
    *[col(field_name) for field_name in workspace_fields],  # original workspace fields
    *dataset_fields,                                        # (prefixed) dataset fields
)

# Convert key(s) to upper case
for key_column in ("id", "capacityId", "datasetId"):
    extracted_df = extracted_df.withColumn(key_column, f.upper(f.col(key_column)))

# Generate empty description column in case it is not available.
# The column must be named exactly "description" because the silver
# projection selects it by that name.
if "description" not in extracted_df.columns:
    print("Created an empty description column")
    extracted_df = extracted_df.withColumn("description", lit(""))

print(f"Bronze data from {bronze_file_location} has been extracted and transformed.")

In [None]:
# Optional preview of the flattened workspace/dataset rows (toggled by display_data)
if display_data:
    display(extracted_df)

In [None]:
# Select columns required for the silver layer table and rename them to the
# PascalCase names used by the silver/gold tables.
# Source column names must match the extracted columns exactly (no stray
# whitespace), otherwise Spark cannot resolve them.
silver_df = extracted_df.select(
    col("capacityId").alias("CapacityId"),
    col("id").alias("WorkspaceId"),
    col("description").alias("WorkspaceDescription"),
    col("hasWorkspaceLevelSettings").alias("HasWorkspaceLevelSettings"),
    col("isOnDedicatedCapacity").alias("IsOnDedicatedCapacity"),
    col("isReadOnly").alias("IsReadOnly"),
    col("name").alias("WorkspaceName"),
    col("state").alias("State"),
    col("type").alias("Type"),
    col("datasetId").alias("DatasetId"),
    col("datasetName").alias("DatasetName"),
    col("configuredBy").alias("DatasetConfiguredBy"),
    col("isRefreshable").alias("IsDatasetRefreshable"),
    col("createdDate").alias("DatasetCreatedDate"),
)

print("Silver layer table columns have been extracted successfully.")

In [None]:
# Optional preview of the silver projection (toggled by display_data)
if display_data:
    display(silver_df)

In [None]:
# Append the prepared silver_df to the silver Delta table
(
    silver_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(silver_table_name)
)

print(f"Silver layer table {silver_table_name} has been created successfully.")

In [None]:

# This function maps and merges the silver data to gold dynamically
def write_silver_to_gold(silver_table_name, gold_table_name, ids):
    """Load the silver table into the gold Delta table.

    Every silver row is extended with ``fuam_modified_at`` (current timestamp)
    and ``fuam_deleted`` (False) before it is written.

    * If the gold table does not exist, the silver rows are saved as a new
      Delta table.
    * Otherwise the silver rows (alias ``s``) are merged into gold (alias
      ``t``) on the key columns: matched rows are updated, unmatched source
      rows are inserted, and gold rows missing from the source are flagged
      with ``fuam_deleted = True``. If that merge fails (e.g. a gold table
      from an older version without the ``fuam_*`` columns), a plain
      update/insert merge is executed instead.

    Args:
        silver_table_name: Name of the source (silver) table.
        gold_table_name: Name of the target (gold) table.
        ids: Column names that form the merge key. Only names that exist in
            the gold or silver columns are used.

    Raises:
        ValueError: If none of ``ids`` is a column of the gold or silver table,
            because a merge condition cannot be built.
    """
    query = "SELECT *, current_timestamp() AS fuam_modified_at, False as fuam_deleted  FROM " + silver_table_name
    silver_df = spark.sql(query)

    if not spark.catalog.tableExists(gold_table_name):
        # Gold does not exist yet -> INSERT to gold
        print("Gold table will be created.")
        silver_df.write.mode("append").option("mergeSchema", "true").format("delta").saveAsTable(gold_table_name)
        return

    # Gold exists -> MERGE to gold
    print("Gold table exists and will be merged.")
    gold_table = DeltaTable.forName(spark, gold_table_name)

    # Keep the caller's key order so the generated condition is deterministic
    known_columns = set(gold_table.toDF().columns) | set(silver_df.columns)
    key_columns = [key for key in ids if key in known_columns]
    if not key_columns:
        raise ValueError(
            f"None of the id columns {list(ids)} exist in '{gold_table_name}' or '{silver_table_name}'."
        )
    merge_id_stmt = " and ".join(f"t.{key} = s.{key}" for key in key_columns)

    def build_upsert():
        # Merge silver (s = source) to gold (t = target): update + insert
        return (
            gold_table.alias("t")
            .merge(silver_df.alias("s"), merge_id_stmt)
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
        )

    try:
        # Additionally soft-delete gold rows that are no longer in the source
        build_upsert().whenNotMatchedBySourceUpdate(
            condition="t.fuam_deleted == False or t.fuam_deleted IS NULL",
            set={"fuam_deleted": "True", "fuam_modified_at": "current_timestamp()"},
        ).execute()
    except Exception as err:
        # In case the tables already exist, but the fuam columns are not
        # existent because of an old version, merge without
        # whenNotMatchedBySourceUpdate. The reason is printed rather than
        # silently swallowed.
        print(f"Merge with soft-delete handling failed ({type(err).__name__}: {err}). Retrying without it.")
        build_upsert().execute()

print("The function 'write_silver_to_gold' has been created successfully.")

In [None]:
# Merge the silver workspace datasets into the gold table,
# keyed on workspace id + dataset id
write_silver_to_gold(
    silver_table_name=silver_table_name,
    gold_table_name=gold_table_name,
    ids=['WorkspaceId', 'DatasetId'],
)

print(f"Gold layer table {gold_table_name} has been created successfully.")

In [None]:
# Copy the processed bronze files into a dated folder under Files/history/
run_date_folder = datetime.datetime.now().strftime('%Y/%m/%d')
history_file_location = bronze_file_location.replace("Files/raw/", "Files/history/") + run_date_folder + "/"

files = mssparkutils.fs.ls(bronze_file_location) # type: ignore
for file in files:
    # Only plain files are archived; subdirectories are skipped, just in case
    if file.isDir:
        continue
    dest_path = history_file_location + file.name
    mssparkutils.fs.cp(file.path, dest_path, True) # type: ignore

print(f"Bronze layer raw files have been copied successfully to {history_file_location}.")