In [0]:
# Notebook parameters: each widget is declared with an empty default and
# its current value is read straight back into a Python variable.

# Catalog that holds the source (and Silver) tables.
dbutils.widgets.text("catalog_name", "")
catalog_name = dbutils.widgets.get("catalog_name")

# Schema that holds the source table.
dbutils.widgets.text("schema_name", "")
schema_name = dbutils.widgets.get("schema_name")

In [0]:
from pyspark.sql.functions import *

In [0]:
# Preview the source table addressed by the catalog/schema widgets.
source_table_name = f"{catalog_name}.{schema_name}.medical_history_data"
df = spark.read.table(source_table_name)
display(df)

In [0]:
# Load the source table addressed by the catalog/schema widgets.
df = spark.read.table(f"{catalog_name}.{schema_name}.medical_history_data")

# Normalise the flag to "Y" / "N". There is no otherwise() branch, so any
# value outside the two lists (including NULL) ends up as NULL.
treatment_flag = (
    when(col("ongoing_treatment").isin("Yes", "Y"), "Y")
    .when(col("ongoing_treatment").isin("No", "N"), "N")
)

# Apply the normalised flag, cast diagnosis_date to a date, and drop the
# _rescued_data column.
bronze_df = (
    df.withColumn("ongoing_treatment", treatment_flag)
    .withColumn("diagnosis_date", to_date(col("diagnosis_date")))
    .drop("_rescued_data")
)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window
from delta.tables import DeltaTable



# Incremental load of the cleaned Bronze frame into the Silver Delta table.
#
# Reads:  bronze_df, catalog_name, spark (defined in earlier cells / runtime).
# Writes: {catalog_name}.silver.medical_history_data
#   - table exists  -> MERGE on patient_id (update matched, insert unmatched)
#   - table missing -> create it from the deduplicated Bronze frame

# Single definition of the target so the lookup, merge and create paths
# can never point at different tables.
silver_table_name = f"{catalog_name}.silver.medical_history_data"

# Non-key columns copied from Bronze. Both the update and the insert
# mappings are derived from this list so they stay in sync.
merge_key = "patient_id"
payload_columns = [
    "condition",
    "diagnosis_date",
    "doctor_name",
    "hospital_name",
    "ongoing_treatment",
    "source_file",
]

# Check for the table explicitly instead of wrapping the lookup in a bare
# `except:`. Only a genuinely missing table selects the create path; any
# other failure (permissions, non-Delta object, interrupt) now surfaces.
if spark.catalog.tableExists(silver_table_name):
    silver_table = DeltaTable.forName(spark, silver_table_name)
else:
    silver_table = None

# Define window to keep latest record per `patient_id`, where "latest" means
# the greatest `source_file` value.
# NOTE(review): rows of one patient that share the same source_file have no
# tie-breaker here, so which of them gets row_num == 1 is not defined.
window_spec = Window.partitionBy("patient_id").orderBy(col("source_file").desc())

# Deduplicate latest records: one row per patient_id, which also guarantees
# the MERGE source has at most one row per target match.
bronze_dedup_df = (
    bronze_df.withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .drop("row_num")
)

# Merge new data into Silver Table
if silver_table is not None:
    update_map = {name: col(f"bronze.{name}") for name in payload_columns}
    # Inserts carry the key as well as every payload column.
    insert_map = {merge_key: col(f"bronze.{merge_key}"), **update_map}

    (
        silver_table.alias("silver")
        .merge(
            bronze_dedup_df.alias("bronze"),
            "silver.patient_id = bronze.patient_id",
        )
        .whenMatchedUpdate(set=update_map)
        .whenNotMatchedInsert(values=insert_map)
        .execute()
    )
else:
    # First run: the Silver table does not exist yet, so create it.
    bronze_dedup_df.write.format("delta").saveAsTable(silver_table_name)

print("Incremental load to Silver Layer completed successfully!")

In [0]:
# Show the Silver table that the merge cell writes to. The catalog comes from
# the `catalog_name` widget rather than a hard-coded name, so the preview
# always matches the environment the notebook was run against.
display(spark.read.table(f"{catalog_name}.silver.medical_history_data"))