In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# ADLS access configuration.
# Generic shape of the setting applied below:
#   key:   "fs.azure.account.key.<<Storageaccount_name>>.dfs.core.windows.net"
#   value: "<<Storage_Account_access_key>>"
# The value is fetched through dbutils.secrets rather than written in the notebook,
# and it is passed straight to spark.conf.set so it is never held in a variable.
account_key_setting = "fs.azure.account.key.storageaccounthealthcare.dfs.core.windows.net"
spark.conf.set(
    account_key_setting,
    dbutils.secrets.get(key="storage-connection", scope="healthcarekeyvaultscope"),
)

# Locations of the patient_flow data in the silver (output) and bronze (input) containers.
silver_path = "abfss://silver@storageaccounthealthcare.dfs.core.windows.net/patient_flow"
bronze_path = "abfss://bronze@storageaccounthealthcare.dfs.core.windows.net/patient_flow"

# Open the bronze location as a streaming Delta source.
delta_stream_reader = spark.readStream.format("delta")
bronze_df = delta_stream_reader.load(bronze_path)

# Schema used to decode the JSON payload.
# Each entry is (field name, Spark type class); every field keeps StructField's
# default nullability. Both time fields are declared as strings at this stage.
payload_fields = [
    ("patient_id", StringType),
    ("gender", StringType),
    ("age", IntegerType),
    ("department", StringType),
    ("admission_time", StringType),
    ("discharge_time", StringType),
    ("bed_id", IntegerType),
    ("hospital_id", IntegerType),
]
schema = StructType(
    [StructField(field_name, field_type()) for field_name, field_type in payload_fields]
)

# Decode the JSON text held in raw_json into a struct, then flatten that struct
# so each schema field becomes its own top-level column.
with_payload_df = bronze_df.withColumn("data", from_json(col("raw_json"), schema))
parsed_df = with_payload_df.select("data.*")

# Convert the string-typed admission and discharge times to timestamps.
clean_df = (
    parsed_df
    .withColumn("admission_time", to_timestamp("admission_time"))
    .withColumn("discharge_time", to_timestamp("discharge_time"))
)

# Repair admission_time: a value that is missing or lies in the future is
# overwritten with the current timestamp; every other value is kept as-is.
admission_ts = col("admission_time")
admission_is_invalid = admission_ts.isNull() | (admission_ts > current_timestamp())
clean_df = clean_df.withColumn(
    "admission_time",
    when(admission_is_invalid, current_timestamp()).otherwise(admission_ts),
)

# Handle invalid age.
# An age counts as invalid when it is above 100 or negative (the negative case
# was previously let through unchanged). Invalid ages are replaced with a random
# integer in 1..90: rand() is in [0, 1), so rand()*90+1 is in [1, 91) before floor.
# A NULL age makes the condition NULL, so it falls to otherwise() and stays NULL.
age_value = col("age")
age_is_invalid = (age_value > 100) | (age_value < 0)
clean_df = clean_df.withColumn(
    "age",
    when(age_is_invalid, floor(rand() * 90 + 1).cast("int")).otherwise(age_value),
)

# Schema evolution guard: make sure every expected silver column exists.
# A missing column is back-filled with NULLs cast to its expected type. A bare
# lit(None) would create a NullType column, which Delta does not support for
# streaming DataFrames, so the explicit cast is required for the write to succeed.
# The types mirror the parsing schema, with both time columns as timestamps
# because they are converted with to_timestamp before this point.
expected_col_types = {
    "patient_id": "string",
    "gender": "string",
    "age": "int",
    "department": "string",
    "admission_time": "timestamp",
    "discharge_time": "timestamp",
    "bed_id": "int",
    "hospital_id": "int",
}
# Kept as a plain list of names, in the same order as before.
expected_cols = list(expected_col_types)

for col_name, col_type in expected_col_types.items():
    if col_name not in clean_df.columns:
        clean_df = clean_df.withColumn(col_name, lit(None).cast(col_type))

# Start the streaming write of the cleaned records into the silver Delta location.
# Rows are appended, schema merging is enabled, and the checkpoint is kept in a
# sibling location named after the silver path with a "_checkpoint" suffix.
silver_checkpoint = silver_path + "_checkpoint"
silver_writer = (
    clean_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", silver_checkpoint)
    .option("mergeSchema","true")
)
# Left as the final expression so the notebook shows the returned StreamingQuery.
silver_writer.start(silver_path)


<pyspark.sql.connect.streaming.query.StreamingQuery at 0x7f8f7be7d5e0>

In [0]:
# Batch-read the silver Delta location and render the result in the notebook.
silver_snapshot_df = spark.read.format('delta').load(silver_path)
display(silver_snapshot_df)

patient_id,gender,age,department,admission_time,discharge_time,bed_id,hospital_id
0fb2fa94-0015-4f51-8afe-7841af3552a5,Female,48,Surgery,2026-01-01T17:24:58.229227Z,2026-01-01T18:24:58.229227Z,446,4
2488dec3-a16f-487f-8d5c-24f08e51fb84,Female,64,Cardiology,2026-01-01T02:24:59.234817Z,2026-01-02T06:24:59.234817Z,206,5
5a5fdc0c-98d4-48db-9a0d-44227db67d34,Male,18,Oncology,2025-12-31T08:25:00.236683Z,2026-01-02T07:25:00.236683Z,455,1
437345c0-3edf-424d-9b51-4dce1debb9f4,Female,47,ICU,2025-12-31T15:25:01.2426Z,2026-01-02T02:25:01.2426Z,137,6
b0cf19ba-b2a1-40a9-af31-d583de9f1eba,Female,18,ICU,2026-01-01T01:25:02.249402Z,2026-01-03T12:25:02.249402Z,270,1
8b7daa6c-b11e-4545-8099-ada36ce91671,Male,52,Pediatrics,2025-12-30T10:25:40.419884Z,2025-12-31T18:25:40.419884Z,392,6
d4953453-ba8e-40bf-ba46-b602d0a05528,Male,98,Surgery,2025-12-30T06:25:41.421032Z,2025-12-30T10:25:41.421032Z,451,6
77bb1a20-c918-4598-bbf2-59fed3da51ab,Male,28,Cardiology,2026-01-01T15:25:42.427989Z,2026-01-02T23:25:42.427989Z,433,4
a6e4d084-5ca7-4c03-88f9-4bca97177fca,Female,42,Maternity,2025-12-30T22:24:23.064377Z,2026-01-01T21:24:23.064377Z,483,5
fb2af98e-92f7-4f60-b792-e019e17c5c5e,Male,19,Emergency,2026-01-02T03:46:00.74Z,2026-01-04T06:24:24.069824Z,250,7
