In [0]:
%sql
-- Ensure the silver_layer schema exists under `breweries-pipeline-data-catalog`.
-- IF NOT EXISTS makes this statement safe to re-run on every execution of the notebook.
CREATE SCHEMA IF NOT EXISTS `breweries-pipeline-data-catalog`.silver_layer;

In [0]:
from pyspark.sql import functions as F

# Load the bronze breweries table as a DataFrame; this is the input to the
# silver transformation in the cells below.
df_bronze = spark.table(
    "`breweries-pipeline-data-catalog`.bronze_layer.breweries_bronze"
)

# Uncomment to preview a few bronze rows while developing:
# display(df_bronze.limit(5))

In [0]:
# Exploratory check for choosing a partition column: number of distinct countries in bronze.
#display(df_bronze.select("country").distinct().count())

In [0]:
# Build the silver DataFrame from bronze in three steps:
#   1. project only the columns kept in silver (id is cast to string),
#   2. discard rows whose state is null,
#   3. keep a single row per id.
silver_columns = [
    F.col("id").cast("string"),
    "name",
    "brewery_type",
    "city",
    "state",
    "country",
    "ingestion_timestamp",
]
has_state = F.col("state").isNotNull()

# NOTE(review): no ordering is specified before dropDuplicates, so when several
# rows share an id this code does not control which one is kept.
df_silver = (
    df_bronze
    .select(*silver_columns)
    .where(has_state)
    .dropDuplicates(["id"])
)

In [0]:
# Data-quality gate for df_silver. Raises Exception when:
#   - the DataFrame has no rows, or
#   - any column contains at least one null value.
#
# Both checks are answered by ONE aggregation job. The previous version ran a
# separate count() plus one filter().count() per column, and every one of those
# actions recomputed the un-cached read/filter/dedupe lineage of df_silver.
dq_columns = df_silver.columns
dq_stats = df_silver.agg(
    # Position 0: total number of rows.
    F.count(F.lit(1)),
    # Positions 1..n: number of nulls per column, in df_silver.columns order.
    # count() skips nulls, so when(isNull, 1) counts exactly the null rows and
    # yields 0 (never null) for a clean column.
    *[F.count(F.when(F.col(c).isNull(), 1)) for c in dq_columns],
).first()

# Results are read by position rather than by alias so that no generated
# column name can collide with a real column of df_silver.
row_count, null_counts = dq_stats[0], dq_stats[1:]

if row_count == 0:
    raise Exception("Data Quality Check Failed: Silver Table is empty")

null_columns = [c for c, n_nulls in zip(dq_columns, null_counts) if n_nulls > 0]
if null_columns:
    raise Exception(f"Data Quality Check Failed: Null values found in columns: {null_columns}")

# Show a sample of the transformed data
#display(df_silver.limit(5))

In [0]:
# Persist df_silver as a Delta table partitioned by state. Mode "overwrite"
# replaces whatever the target table currently holds.
silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("state")
)
silver_writer.saveAsTable("`breweries-pipeline-data-catalog`.silver_layer.breweries_silver")