In [0]:
%sql
-- Create the pipeline catalog if it does not already exist.
-- The name contains hyphens, so it is quoted with backticks.
CREATE CATALOG IF NOT EXISTS `breweries-pipeline-data-catalog`;
-- Create the bronze_layer schema inside that catalog if it does not already exist.
CREATE SCHEMA IF NOT EXISTS `breweries-pipeline-data-catalog`.bronze_layer;

In [0]:
import requests
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    DoubleType,
    IntegerType
)
import pyspark.sql.functions as F

# Get data from API
# The list endpoint is paginated (50 records per page by default), so a single
# un-parameterised GET would only ingest the first page. Walk the pages until
# the API returns an empty list, accumulating every record in `data`.
api_url = "https://api.openbrewerydb.org/v1/breweries"
PAGE_SIZE = 200              # largest per_page value the API documents
REQUEST_TIMEOUT = (5, 30)    # (connect, read) seconds; avoids hanging forever
MAX_PAGES = 1000             # safety cap so a misbehaving API cannot loop forever

data = []
for page in range(1, MAX_PAGES + 1):
    resp = requests.get(
        api_url,
        params={"page": page, "per_page": PAGE_SIZE},
        timeout=REQUEST_TIMEOUT,
    )
    # Raises an HTTPError if the HTTP request returned an unsuccessful status code
    resp.raise_for_status()
    batch = resp.json()
    if not batch:
        # An empty page means every record has been consumed.
        break
    data.extend(batch)
else:
    # The loop ran out of pages without ever seeing an empty one.
    raise RuntimeError(
        f"Pagination did not terminate after {MAX_PAGES} pages from {api_url}"
    )

# Define schema based on API fields.
# Every field is ingested as a string; each pair below is (field name, nullable).
schema = StructType([
    StructField(field_name, StringType(), is_nullable)
    for field_name, is_nullable in (
        ("id", False),
        ("name", False),
        ("brewery_type", False),
        ("address_1", True),
        ("address_2", True),
        ("address_3", True),
        ("city", False),
        ("state_province", True),
        ("postal_code", True),
        ("country", False),
        ("longitude", True),
        ("latitude", True),
        ("phone", True),
        ("website_url", True),
        ("state", False),
        ("street", True),
    )
])

# Build the bronze DataFrame from the API records using the explicit schema,
# and append an ingestion timestamp column in the same expression.
df_bronze = (
    spark.createDataFrame(data, schema=schema)
    .withColumn("ingestion_timestamp", F.current_timestamp())
)

In [0]:
# Data quality (DQ) checks on the bronze DataFrame.
# Fails the notebook if the DataFrame is empty or if any column declared
# NOT NULL in the schema contains a null value.
#
# The total row count and every per-column null count are computed in ONE
# aggregation (a single Spark job) instead of one count() job per column.
# NOTE(review): the checked columns are exactly those the schema declares
# non-nullable; Spark may assume such columns never hold nulls when optimising
# `isNull`, and nulls may already be rejected when the DataFrame is created.
# Confirm this check can actually fire in the target runtime.
not_null_columns = [field.name for field in df_bronze.schema.fields if not field.nullable]

dq_exprs = [F.count(F.lit(1)).alias("row_count")] + [
    # count() ignores NULLs, so counting a value emitted only for null cells
    # yields the number of nulls in that column (0 on an empty DataFrame).
    F.count(F.when(F.col(c).isNull(), 1)).alias(f"null_count_{i}")
    for i, c in enumerate(not_null_columns)
]
# A global aggregation always returns exactly one row, even for empty input.
dq_stats = df_bronze.agg(*dq_exprs).first()
row_count, null_counts = dq_stats[0], dq_stats[1:]

if row_count == 0:
    raise Exception("Data Quality Check Failed: Bronze Table is empty")

null_not_allowed = [c for c, n in zip(not_null_columns, null_counts) if n > 0]
if null_not_allowed:
    raise Exception(f"Data Quality Check Failed: Null values found in NOT NULL columns: {null_not_allowed}")

# Show a small sample of the validated bronze data
display(df_bronze.limit(5))

In [0]:
# Persist the bronze DataFrame as a managed Delta table, replacing any
# existing contents of the target table.
(
    df_bronze.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("`breweries-pipeline-data-catalog`.bronze_layer.breweries_bronze")
)