In [None]:
## Parameters
# table_name   - the table this notebook reads, deduplicates and rewrites.
# display_data - when True, the deduplicated rows are rendered before the rewrite.
table_name = "report_views"
display_data = True

print(f" Table name for deduplification is set at {table_name}.")

In [None]:
## Import all packages used in this notebook
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql import SparkSession # type: ignore

In [None]:
#
# Spark session setup
#
app_name = "DedupeFactTable"

# getOrCreate() returns the session that is already running when there is one;
# otherwise it starts a new session under the application name above.
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

print(f"Spark session {app_name} has been created successfully.")

In [None]:
#
# Remove duplicates from the fact table
#

# Columns that together identify one logical row, keyed by table name.
# Any table without its own entry falls back to DEFAULT_DEDUPE_KEYS.
DEDUPE_KEYS_BY_TABLE = {
    "report_views": ["ReportId", "CreationTime", "UserId", "OriginalConsumptionMethod"],
}
DEFAULT_DEDUPE_KEYS = ["ReportId", "CreationTime", "UserId", "Client"]

# Load the table
df = spark.read.table(table_name)

dedupe_keys = DEDUPE_KEYS_BY_TABLE.get(table_name, DEFAULT_DEDUPE_KEYS)

# Keep exactly one row per key combination.
# The previous implementation numbered rows in a window that was partitioned by
# these keys and ordered by CreationTime. Because CreationTime is itself one of
# the partition keys, that ordering could never distinguish rows, so the row
# that survived was arbitrary. dropDuplicates() expresses the same
# "one row per key" rule directly, without the helper column and filter.
# NOTE(review): if duplicates can differ in non-key columns and a specific row
# must win, a real tie-breaker column is needed - confirm with the data owner.
deduped_df = df.dropDuplicates(dedupe_keys)

# Each count() is a Spark action and triggers its own job.
print(f"{table_name} before deduping contains {df.count()} rows.")
print(f"{table_name} after deduping contains {deduped_df.count()} rows.")

In [None]:
# Optional preview: render the deduplicated rows only when the display_data
# parameter is switched on.
if display_data:
    display(deduped_df)

In [None]:
#
# Replace the contents of the original table with the deduplicated rows
#
# The writer is configured first and then pointed at the same table name the
# data was read from: Delta format, overwrite mode, mergeSchema enabled, and
# partitioned by CreationDate and ReportId.
table_writer = (
    deduped_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .partitionBy("CreationDate", "ReportId")
)
table_writer.saveAsTable(table_name)

print(f"{table_name} has been successfully rewritten without duplicates.")

In [None]:
#
# Why OPTIMIZE runs after the rewrite
#
# When Power BI connects to a Fabric Lakehouse in Import mode through the SQL
# Analytics endpoint, it may query a snapshot of the Delta table that has not
# yet caught up with the latest physical data update. This is particularly
# true when:
#   - the Lakehouse is written to from notebooks or pipelines;
#   - the updates are made via overwrite or non-transactional file-level
#     operations;
#   - Power BI's import query pulls from a Delta table snapshot and the
#     _delta_log has not fully committed or compacted.
#
# Recommendation:
#   1. Force a newer snapshot by running the OPTIMIZE command after the
#      Lakehouse update step, so that a clean version is committed.
#
optimize_statement = f"OPTIMIZE {table_name}"
spark.sql(optimize_statement)

print(f"Optimizing table {table_name} has been completed successfully")