# In [0]:
# DLT pipeline for Silver layer transformations

import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

#Silver Layer: Claims Data

@dlt.table(
  name="silver_claims",
  comment="Cleaned and validated claims data with enforced quality.",
  table_properties={"quality": "silver"},
)
@dlt.expect("valid_claim_id", "claim_id IS NOT NULL")
@dlt.expect_or_drop("valid_member_id", "member_id IS NOT NULL")
@dlt.expect_or_drop("valid_provider_id", "provider_id IS NOT NULL")
@dlt.expect("positive_claim_amount", "claim_amount > 0")
def silver_claims():
  """Build the silver claims dataset from the bronze claims batch.

  Every row is stamped with the current date in ``ingestion_date`` and the
  result is reduced to one row per ``claim_id``.

  The ``expect_or_drop`` rules on ``member_id`` and ``provider_id`` are
  declared as drop rules, whereas the ``claim_id`` and ``claim_amount``
  rules are plain ``expect`` declarations.
  """
  bronze_claims = dlt.read("medisure_jen.bronze.claims_batch")

  # Record the date on which this row was processed by the pipeline.
  stamped_claims = bronze_claims.withColumn("ingestion_date", current_date())

  # One row per claim identifier.
  unique_claims = stamped_claims.dropDuplicates(["claim_id"])
  return unique_claims

#Silver Layer: Members Data

@dlt.table(
  name="silver_members",
  comment="Cleaned member data with PII protection.",
  table_properties={"quality": "silver"},
)
@dlt.expect_or_drop("valid_member_id", "member_id IS NOT NULL")
@dlt.expect("valid_email_format", "email LIKE '%@%'")  # Only checks that an '@' is present
def silver_members():
  """Build the silver members dataset from the bronze members table.

  The function itself only deduplicates on ``member_id``; no masking or
  other column transformation is applied here.
  """
  bronze_members = dlt.read("medisure_jen.bronze.members")

  # One row per member identifier.
  unique_members = bronze_members.dropDuplicates(["member_id"])
  return unique_members

#Silver Layer: Providers Data

@dlt.table(
  name="silver_providers",
  comment="Parsed and flattened provider data from nested JSON.",
  table_properties={
    "quality": "silver"
  }
)
@dlt.expect_or_drop("valid_provider_id", "provider_id IS NOT NULL")
def silver_providers():
  """Flatten bronze providers to one row per (provider_id, specialty).

  Keeps ``provider_id``, ``provider_name`` and ``address`` and expands the
  ``specialties`` column into a single ``specialty`` column.

  Assumes ``specialties`` is an array column - TODO confirm against the
  bronze schema.
  """
  raw_providers = dlt.read("medisure_jen.bronze.providers")

  return (
    raw_providers
      # explode_outer (rather than explode) keeps providers whose
      # specialties value is NULL or an empty array; they come through as a
      # single row with specialty = NULL instead of disappearing from the
      # table altogether.
      .select("provider_id", "provider_name", "address",
              explode_outer("specialties").alias("specialty"))
      # The same specialty may be listed more than once for a provider.
      .dropDuplicates(["provider_id", "specialty"])
  )

#Silver Layer: Diagnosis Reference Data

@dlt.table(
  name="silver_diagnosis",
  comment="Validated diagnosis code reference data.",
  table_properties={"quality": "silver"},
)
@dlt.expect_or_drop("valid_diagnosis_code", "diagnosis_code IS NOT NULL")
@dlt.expect_or_drop("valid_diagnosis_description", "diagnosis_description IS NOT NULL")
def silver_diagnosis():
  """Build the silver diagnosis reference from the bronze diagnosis table.

  The result is reduced to one row per ``diagnosis_code``.
  """
  bronze_diagnosis = dlt.read("medisure_jen.bronze.diagnosis_ref")

  # One row per diagnosis code.
  unique_diagnosis = bronze_diagnosis.dropDuplicates(["diagnosis_code"])
  return unique_diagnosis

#Silver Layer: Enriched Claims

@dlt.table(
  name="silver_claims_enriched",
  comment="Claims data enriched with member, provider, and diagnosis information.",
  table_properties={
    "quality": "silver"
  }
)
@dlt.expect("valid_claim_amount", "claim_amount BETWEEN 1 AND 1000000") #validating claim amount
def silver_claims_enriched():
  """Return claims that reference a known member and provider, with the
  diagnosis description attached as ``primary_diagnosis``.

  Claims, members and diagnosis codes are read from the silver datasets
  defined in this pipeline rather than from bronze, so the enrichment works
  on rows that are already deduplicated by ``claim_id``, ``member_id`` and
  ``diagnosis_code`` respectively. Joining the raw tables would let
  duplicate claims through and could multiply claim rows whenever a
  diagnosis code appears more than once in the reference data.

  Output columns: claim_id, member_id, provider_id, diagnosis_code,
  claim_amount, claim_date, procedure_code, primary_diagnosis.
  """
  claims = dlt.read("silver_claims")
  members = dlt.read("silver_members")
  # silver_providers holds one row per (provider, specialty); the existence
  # check only needs provider_id, so it reads the un-exploded provider source.
  providers = dlt.read("medisure_jen.bronze.providers").select("provider_id")
  # Only the join key and the description are needed from the reference
  # table; projecting them keeps unrelated columns out of the join result.
  diagnosis = dlt.read("silver_diagnosis").select("diagnosis_code", "diagnosis_description")

  return (
    claims
      # Validate foreign key relationships
      .join(members, "member_id", "left_semi")  # Keep claims with valid members
      .join(providers, "provider_id", "left_semi")  # Keep claims with valid providers
      # Left join: claims without a matching code keep a NULL description.
      .join(diagnosis, "diagnosis_code", "left")
      .select("claim_id", "member_id", "provider_id", "diagnosis_code",
              "claim_amount", "claim_date", "procedure_code",
              col("diagnosis_description").alias("primary_diagnosis"))
  )