#### User Dimension

##### Data ingestion strategy:
<mark style="background: #88D5FF;">**MERGE (upsert)**</mark>

##### Related pipeline(s):
 
**Ext_Load_PBI_Report_Usage_E2E**

##### Source:

**Microsoft Graph API** `/v1.0/users` endpoint, called directly from this notebook (no bronze files are read)

##### Target:

**1 Delta table** in FUAM_Ext_Lakehouse 
- **gold_table_name** variable value


In [None]:
## Parameters
# When True, the users DataFrame is shown in the notebook output before it is written.
display_data = True
# Lakehouse that holds the target Delta table.
lakehouse_name = "FUAM_Ext_Lakehouse"
# Name of the target table the users are upserted into.
gold_table_name = "users"

print("Successfully configured all parameters for this run.")

In [None]:
import json
from notebookutils import mssparkutils # type: ignore
from pyspark.sql import DataFrame, SparkSession # type: ignore
from pyspark.sql.types import StructType, StructField, StringType # type: ignore
import requests

print("Successfully imported all packages for this notebook.")

In [None]:
#
# Create the Spark session
#
app_name = "TransferUserDimension"

# getOrCreate() hands back the already-running session when there is one,
# otherwise it starts a new session under the given application name
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

# Session-level setting: use "dynamic" partition overwrite mode for writes
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

print(f"Spark session {app_name} has been created successfully.")

In [None]:
def upsert_table(df: DataFrame, table_name: str, primary_key: str, lakehouse_name: str = None) -> int:
    """
    Performs an upsert (merge) of the input DataFrame into a Delta Lake table.

    When the target table already exists, rows whose ``primary_key`` value
    matches a source row are updated, and source rows without a match are
    inserted. Target rows that have no match in ``df`` are not modified.
    When the target table does not exist, it is created from ``df``.

    Uses the notebook-level ``spark`` session.

    Args:
        df (DataFrame): The input PySpark DataFrame to be upserted.
        table_name (str): The target Delta table name.
        primary_key (str): Column used as the primary key for matching rows.
        lakehouse_name (str, optional): Name of the lakehouse database. When
            given, it is prefixed to ``table_name`` as ``lakehouse.table``.

    Returns:
        int: Number of rows in the input DataFrame (not the number of rows
        actually updated or inserted).
    """
    # Fully qualified table name
    qualified_table_name = f"{lakehouse_name}.{table_name}" if lakehouse_name else table_name

    # Count rows in source DataFrame
    row_count = df.count()

    # First load: there is nothing to merge into, so create the table directly.
    # NOTE: the JVM catalog is queried on purpose; it accepts the qualified
    # "lakehouse.table" name as a single argument.
    if not spark._jsparkSession.catalog().tableExists(qualified_table_name):
        df.write.format("delta").saveAsTable(qualified_table_name)
        return row_count

    # The MERGE statement can only reference the DataFrame through a view, so
    # register one just for the duration of the statement.
    temp_view_name = "temp_upsert_view"
    df.createOrReplaceTempView(temp_view_name)
    try:
        merge_sql = f"""
        MERGE INTO {qualified_table_name} AS target
        USING {temp_view_name} AS source
        ON target.{primary_key} = source.{primary_key}
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
        """
        spark.sql(merge_sql)
    finally:
        # Do not leave the helper view behind in the session, even if the merge fails
        spark.catalog.dropTempView(temp_view_name)

    return row_count

print("The function upsert_table has been created successfully.")

In [None]:
#
# Get required secrets from the key vault
#
vault_uri = "https://kv-fabric-dev-eastus2.vault.azure.net/"

# Fetch the three credentials used for the token request in one pass.
# The order of the secret names must match the order of the target variables.
TENANT_ID, CLIENT_ID, CLIENT_SECRET = (
    mssparkutils.credentials.getSecret(vault_uri, secret_name)
    for secret_name in ("TENANT-ID", "CLIENT-ID", "CLIENT-SECRET-KEY")
)

# Report completion only; the secret values themselves are never echoed here
print("Secrets retrieved successfully (not displayed for security reasons).")

In [None]:
#
# Verify that key vault items cannot be viewed in clear text
#
# NOTE(review): this cell deliberately prints the secret values and relies on
# the notebook runtime masking them in the cell output — the masking itself is
# not visible in this file. Confirm it also holds wherever this output is
# captured (e.g. run logs), or remove this cell before scheduled runs.
print(f"The value of the tenant ID is {TENANT_ID}")
print(f"The value of the client ID is {CLIENT_ID}")
print(f"The value of the client secret is {CLIENT_SECRET}")

In [None]:
#
# Request an access token for Microsoft Graph from the Microsoft identity
# platform using the client-credentials grant
#

# Auth config
scope = "https://graph.microsoft.com/.default"
token_url = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token"

# Get access token
token_data = {
    "client_id": CLIENT_ID,
    "client_secret": CLIENT_SECRET,
    "grant_type": "client_credentials",
    "scope": scope
}
# Bound the wait so a stalled connection cannot hang the notebook session
token_response = requests.post(token_url, data=token_data, timeout=30)

# Fail here with the status and response body instead of with a bare
# KeyError on "access_token" further down
if not token_response.ok:
    raise RuntimeError(
        f"Token request failed with HTTP {token_response.status_code}: {token_response.text}"
    )
access_token = token_response.json()["access_token"]

print("Access token retrieved successfully!")

In [None]:
#
# Call Graph API to get users
# NOTE: DEVIATION FROM FUAM STANDARD ARCHITECTURE
# We are doing this in a Notebook because Data Factory in Microsoft Fabric doesn't currently support a Web API or Web page connector in data pipelines.
# REF: https://learn.microsoft.com/en-us/fabric/data-factory/connector-web-overview
#

headers = {"Authorization": f"Bearer {access_token}"}
users_url = "https://graph.microsoft.com/v1.0/users?$select=id,displayName,userPrincipalName,mail,givenName,surname,officeLocation"

users = []
next_link = users_url

# Follow the paging links until the response no longer carries "@odata.nextLink"
while next_link:
    # Bound the wait so a stalled connection cannot hang the notebook session
    response = requests.get(next_link, headers=headers, timeout=60)

    # An error response has neither "value" nor "@odata.nextLink", so without
    # this check the loop would end quietly with an empty or partial user list
    # that would then be upserted as if it were complete
    if not response.ok:
        raise RuntimeError(
            f"Graph users request failed with HTTP {response.status_code}: {response.text}"
        )

    data = response.json()
    users.extend(data.get("value", []))
    next_link = data.get("@odata.nextLink")

# Load into Spark DataFrame
# (getOrCreate() returns the session that is already running, if any)
spark = SparkSession.builder.getOrCreate()
# Explicit all-string, nullable schema: one field per attribute requested in $select
schema = StructType([
    StructField("id", StringType(), True),
    StructField("displayName", StringType(), True),
    StructField("userPrincipalName", StringType(), True),
    StructField("mail", StringType(), True),
    StructField("givenName", StringType(), True),
    StructField("surname", StringType(), True),
    StructField("officeLocation", StringType(), True)
])
users_df = spark.createDataFrame(users, schema=schema)

# Standardize column names: "id" becomes "UserId", every other column gets
# its first letter upper-cased (e.g. "displayName" -> "DisplayName")
renamed_cols = [
    "UserId" if col_name == "id" else col_name[:1].upper() + col_name[1:]
    for col_name in users_df.columns
]

# Apply the renamed columns to the DataFrame
users_df = users_df.toDF(*renamed_cols)

print("Successfully created spark dataframe for 'users' table sourced from Graph API.")

In [None]:
# Optional preview: show the users DataFrame in the notebook output when
# the display_data parameter is enabled
if display_data:
    display(users_df)

In [None]:
# Upsert the users DataFrame into the Microsoft Fabric Lakehouse
# 	🔄 Update rows where UserId matches
# 	➕ Insert new rows not already present
# 	✅ Leave unmatched rows untouched
rows_processed = upsert_table(users_df, table_name=gold_table_name, primary_key="UserId", lakehouse_name=lakehouse_name)

# rows_processed is the size of the source DataFrame, not the number of rows changed
print(f"Upsert process completed successfully into table {gold_table_name} w/ {rows_processed} rows processed.")

In [None]:
#
# Stop the Spark session
# NOTE: frees up limited F2 SKU capacity resources
# NOTE: any cell run after this one can no longer use `spark`
#
spark.stop()

print("Spark session has been stopped successfully.")