In [0]:
%python
# DLT pipeline for Silver layer transformations
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Silver Layer: Claims Data Batch
@dlt.table(
    name="silver_claims_batch",
    comment="Cleaned and validated claims data from batch sources.",
    table_properties={"quality": "silver"},
)
@dlt.expect("valid_claim_id", "claim_id IS NOT NULL")
@dlt.expect_or_drop("valid_member_id", "member_id IS NOT NULL")
@dlt.expect_or_drop("valid_provider_id", "provider_id IS NOT NULL")
@dlt.expect("positive_claim_amount", "claim_amount > 0")
def silver_claims_batch():
    """Build the silver batch-claims table from the bronze batch claims.

    Renames the bronze columns to snake_case silver names, stamps each row
    with the current date as ``ingestion_date`` and keeps one row per
    ``claim_id``.

    Returns:
        The projected, de-duplicated claims DataFrame.
    """
    # (bronze column, silver column) pairs, listed in output column order.
    column_renames = [
        ("ClaimID", "claim_id"),
        ("MemberID", "member_id"),
        ("ProviderID", "provider_id"),
        ("ClaimDate", "claim_date"),
        ("ServiceDate", "service_date"),
        ("Amount", "claim_amount"),
        ("Status", "status"),
        ("ICD10Codes", "diagnosis_code"),
        ("CPTCodes", "procedure_code"),
        ("ClaimType", "claim_type"),
        ("SubmissionChannel", "channel"),
        ("Notes", "notes"),
        ("IngestTimestamp", "event_timestamp"),
    ]
    projected = dlt.read("medisure_jen.bronze.claims_batch").select(
        *[col(source).alias(target) for source, target in column_renames]
    )
    stamped = projected.withColumn("ingestion_date", current_date())
    return stamped.dropDuplicates(["claim_id"])

# Silver Layer: Claims Data (Streaming)
@dlt.table(
    name="silver_claims_stream",
    comment="Cleaned and validated claims data from streaming sources.",
    table_properties={"quality": "silver"},
)
@dlt.expect("valid_claim_id", "claim_id IS NOT NULL")
@dlt.expect_or_drop("valid_member_id", "member_id IS NOT NULL")
@dlt.expect_or_drop("valid_provider_id", "provider_id IS NOT NULL")
@dlt.expect("positive_claim_amount", "claim_amount > 0")
def silver_claims_stream():
    """Build the silver streaming-claims table from the bronze claims stream.

    Renames the bronze columns to snake_case silver names, adds the columns
    that the batch claims table has but this projection does not produce as
    typed NULLs, stamps ``ingestion_date`` with the current date and keeps
    one row per ``claim_id``.

    Returns:
        The projected, de-duplicated streaming claims DataFrame.
    """
    # (bronze column, silver column) pairs, listed in output column order.
    column_renames = [
        ("ClaimID", "claim_id"),
        ("MemberID", "member_id"),
        ("ProviderID", "provider_id"),
        ("ICD10Codes", "diagnosis_code"),
        ("ClaimDate", "claim_date"),
        ("Amount", "claim_amount"),
        ("Status", "status"),
        ("CPTCodes", "procedure_code"),
        ("EventTimestamp", "event_timestamp"),
    ]
    # (column, type) pairs filled with NULL so the schema lines up with the
    # batch claims table.
    null_columns = [
        ("service_date", "date"),
        ("claim_type", "string"),
        ("channel", "string"),
        ("notes", "string"),
    ]

    claims = dlt.read_stream("medisure_jen.bronze.claims_stream").select(
        *[col(source).alias(target) for source, target in column_renames]
    )
    for column_name, type_name in null_columns:
        claims = claims.withColumn(column_name, lit(None).cast(type_name))
    claims = claims.withColumn("ingestion_date", current_date())

    # NOTE(review): dropDuplicates on a streaming read with no watermark
    # defined here; confirm the de-duplication state growth is acceptable.
    return claims.dropDuplicates(["claim_id"])

# Silver Layer: Combined Claims Data
@dlt.table(
    name="silver_claims_combined",
    comment="Combined claims data from batch and streaming sources.",
    table_properties={"quality": "silver"},
)
def silver_claims_combined():
    """Union the silver batch and streaming claims tables by column name.

    Columns are matched by name rather than position; a column present on
    only one side is allowed and filled with NULL on the other.

    Returns:
        A DataFrame holding the rows of both silver claims tables.
    """
    return dlt.read("silver_claims_batch").unionByName(
        dlt.read("silver_claims_stream"),
        allowMissingColumns=True,
    )

# Silver Layer: Members Data
@dlt.table(
  name="silver_members",
  comment="Cleaned member data with PII protection.",
  table_properties={"quality": "silver"}
)
@dlt.expect_or_drop("valid_member_id", "member_id IS NOT NULL")
@dlt.expect("valid_email_format", "email LIKE '%@%'")  # Basic email validation
def silver_members():
  """Build the silver members table from the bronze members table.

  Renames the bronze columns to snake_case silver names and removes rows
  that repeat the same (member_id, member_name) pair.

  Returns:
      The projected, de-duplicated members DataFrame.
  """
  # NOTE(review): the table comment advertises PII protection, but name and
  # email are passed through unmasked here; confirm masking happens elsewhere.
  return (
    dlt.read("medisure_jen.bronze.members").select(
        col("MemberID").alias("member_id"),
        col("Name").alias("member_name"),
        col("DOB").alias("birth_date"),
        col("Gender").alias("gender"),
        # The source column is Region; expose it as "region" so the silver
        # column name describes the attribute it actually holds.
        col("Region").alias("region"),
        col("PlanType").alias("plan"),
        col("EffectiveDate").alias("effective_date"),
        col("Email").alias("email"),
        col("IsActive").alias("status"),
        col("LastUpdated").alias("modified_date")
    ).dropDuplicates(["member_id", "member_name"])  # keys are the silver aliases
  )
# Silver Layer: Provider Locations
@dlt.table(
    name="silver_provider_locations",
    comment="Provider location details.",
    table_properties={"quality": "silver"},
)
@dlt.expect_or_drop("valid_provider_id", "ProviderID IS NOT NULL")
def silver_provider_locations():
    """Flatten the providers' Locations collection into one row per location.

    Each element of ``Locations`` is exploded into its own row and its
    ``Address``, ``City`` and ``State`` fields are lifted to top-level
    columns. The provider id is emitted twice: under its bronze name
    ``ProviderID`` and under the silver alias ``provider_id``.

    Returns:
        A DataFrame with one row per distinct
        (ProviderID, Address, City, State) combination.
    """
    per_location = dlt.read("medisure_jen.bronze.providers").withColumn(
        "location", explode(col("Locations"))
    )
    flattened = per_location.select(
        "ProviderID",
        "Name",
        col("location.Address").alias("Address"),
        col("location.City").alias("City"),
        col("location.State").alias("State"),
        col("ProviderID").alias("provider_id"),
    )
    return flattened.dropDuplicates(["ProviderID", "Address", "City", "State"])

# Silver Layer: Provider Specialties
@dlt.table(
  name="silver_provider_specialties",
  comment="Provider specialty information.",
  table_properties={"quality": "silver"}
)
@dlt.expect_or_drop("valid_provider_id", "ProviderID IS NOT NULL")
def silver_provider_specialties():
    """Produce one row per (provider, specialty) from the bronze providers.

    ``Specialties`` is joined into a single comma-separated string, list
    punctuation (square brackets and single quotes) is stripped, and the
    string is split on commas and exploded. Each specialty is trimmed, and
    duplicate or empty specialties are discarded.

    Returns:
        A DataFrame with ProviderID, Name, specialty and provider_id columns.
    """
    raw_providers = dlt.read("medisure_jen.bronze.providers")

    # Strip "[", "]" and "'" in one pass. The earlier pattern only removed an
    # opening bracket when it was preceded by "> ", so a stringified list left
    # a leading "[" on the first specialty.
    specialties_str = regexp_replace(
        concat_ws(",", col("Specialties")), r"[\[\]']", ""
    )

    return (
        raw_providers
        .withColumn("specialty", explode(split(specialties_str, ",")))
        .select(
            "ProviderID",
            "Name",
            trim(col("specialty")).alias("specialty"),
            col("ProviderID").alias("provider_id")
        )
        .dropDuplicates(["ProviderID", "specialty"])
        # Splitting can yield empty tokens (e.g. an empty Specialties value).
        .filter(col("specialty") != "")
    )
@dlt.table(
    name="silver_providers",
    comment="Parsed and flattened provider data with separated location components.",
    table_properties={"quality": "silver"},
)
@dlt.expect_or_drop("valid_provider_id", "provider_id IS NOT NULL")
def silver_providers():
    """Build the silver providers table with one row per provider id.

    Only the provider id, name and TIN are projected from the bronze
    providers table.

    Returns:
        A DataFrame with provider_id, provider_name and tin columns.
    """
    # NOTE(review): the table comment mentions location components, but no
    # location columns are emitted by this function.
    core_columns = [
        col("ProviderID").alias("provider_id"),
        col("Name").alias("provider_name"),
        col("TIN").alias("tin"),
    ]
    providers = dlt.read("medisure_jen.bronze.providers").select(*core_columns)
    return providers.dropDuplicates(["provider_id"])
# Silver Layer: Diagnosis Reference Data
@dlt.table(
    name="silver_diagnosis",
    comment="Validated diagnosis code reference data.",
    table_properties={"quality": "silver"},
)
@dlt.expect_or_drop("valid_diagnosis_code", "diagnosis_code IS NOT NULL")
@dlt.expect_or_drop("valid_diagnosis_description", "diagnosis_description IS NOT NULL")
def silver_diagnosis():
    """Build the silver diagnosis reference table from the bronze reference.

    Renames ``Code`` and ``Description`` to their silver names and keeps one
    row per diagnosis code.

    Returns:
        A DataFrame with diagnosis_code and diagnosis_description columns.
    """
    reference = dlt.read("medisure_jen.bronze.diagnosis_ref")
    renamed = reference.select(
        col("`Code`").alias("diagnosis_code"),
        col("`Description`").alias("diagnosis_description"),
    )
    # De-duplicate on the silver alias, not the bronze column name.
    return renamed.dropDuplicates(["diagnosis_code"])

# Silver Layer: Enriched Claims
@dlt.table(
  name="silver_claims_enriched",
  comment="Claims data enriched with member, provider, and diagnosis information.",
  table_properties={"quality": "silver"}
)
@dlt.expect("valid_claim_amount", "claim_amount BETWEEN 1 AND 1000000") #validating claim amount
def silver_claims_enriched():
  """Filter claims to known members/providers and attach diagnosis text.

  Claims are kept only when their member_id exists in silver_members and
  their provider_id exists in silver_provider_locations (semi joins, so no
  columns are added and claims are not multiplied). The diagnosis
  reference is then left-joined to supply the description.

  Returns:
      A DataFrame with claim_id, member_id, provider_id, diagnosis_code,
      claim_amount, claim_date, procedure_code and primary_diagnosis.
  """
  # Read the combined batch + streaming claims table; it is the only silver
  # table in this pipeline file that holds the full claims set.
  claims = dlt.read("silver_claims_combined")
  members = dlt.read("silver_members")
  providers_loc = dlt.read("silver_provider_locations")
  diagnosis = dlt.read("silver_diagnosis")

  # Foreign-key checks: semi joins keep only claims with a matching row.
  with_known_member = claims.join(
      members, claims.member_id == members.member_id, "left_semi"
  )
  with_known_provider = with_known_member.join(
      providers_loc, claims.provider_id == providers_loc.provider_id, "left_semi"
  )

  # Left join so claims without a matching reference row are still kept.
  with_diagnosis = with_known_provider.join(
      diagnosis, claims.diagnosis_code == diagnosis.diagnosis_code, "left"
  )

  # NOTE(review): diagnosis_code is taken from the reference side of the left
  # join, so it is NULL for claims whose code has no reference row; confirm
  # that is intended rather than keeping the claim's own code.
  return with_diagnosis.select(
      "claim_id", "member_id", "provider_id", diagnosis.diagnosis_code,
      "claim_amount", "claim_date", "procedure_code",
      col("diagnosis_description").alias("primary_diagnosis")
  )