### Bronze Layer
Bronze Layer: Persist the raw data from the API in its native format or any format you find suitable.

In [0]:
import requests 
import logging
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, count, when, current_timestamp, lit

In [0]:
def fetch_breweries_from_api(
    params: dict = None,
    per_page: int = 200,
    sort: str = None,
    version: str = "v1",
    retries: int = 3,
    timeout: float = 30.0
) -> list:
    """
    Fetches paginated brewery data from the Open Brewery DB API with retry logic.

    Pages are requested sequentially, starting at page 1, until the API
    returns an empty payload. If a page still fails after all attempts, the
    records collected so far are returned (best effort) and an error is logged.

    Parameters
    ----------
    params : dict, optional
        Filters for the API request (e.g., by_city, by_state, etc). Default is None.
        The dictionary is copied, never mutated.
    per_page : int, optional
        Number of items per page (maximum 200). Default is 200.
    sort : str, optional
        Fields for sorting (e.g., 'name,desc'). Default is None.
    version : str, optional
        API version to use. Default is "v1".
    retries : int, optional
        Number of attempts per page. Values below 1 are treated as 1 so every
        page is requested at least once. Default is 3.
    timeout : float, optional
        Seconds to wait for each HTTP request before it counts as a failed
        attempt. Default is 30.0.

    Returns
    -------
    list
        List of dictionaries containing brewery data from the API. May be
        partial (or empty) if a page could not be fetched.
    """
    url = f"https://api.openbrewerydb.org/{version}/breweries"

    # Everything except the page number is identical for every request.
    base_query = dict(params) if params else {}
    if sort:
        base_query["sort"] = sort

    # Guarantee at least one attempt; with retries <= 0 the page would
    # otherwise never be requested and `data` would be undefined.
    max_attempts = max(1, retries)

    all_data = []
    page = 1
    while True:
        query = {**base_query, "page": page, "per_page": per_page}
        data = None
        for attempt in range(1, max_attempts + 1):
            try:
                # Without a timeout a stalled connection blocks forever.
                response = requests.get(url, params=query, timeout=timeout)
                response.raise_for_status()
                data = response.json()
                break
            except requests.RequestException as exc:
                logging.warning(
                    "Request for page %d failed (attempt %d/%d): %s",
                    page, attempt, max_attempts, exc
                )
        else:
            # Every attempt failed: keep the best-effort contract, but make
            # the truncation visible instead of returning silently.
            logging.error(
                "Giving up on page %d after %d attempts; returning %d records fetched so far.",
                page, max_attempts, len(all_data)
            )
            return all_data

        # An empty page marks the end of the result set.
        if not data:
            break
        all_data.extend(data)
        page += 1
    return all_data

In [0]:
def get_brewery_schema():
    """
    Builds the Spark schema used for the breweries DataFrame.

    Every column is a string. The identifying and location columns
    (id, name, brewery_type, city, state_province, postal_code, country)
    are declared non-nullable; all remaining columns accept nulls.

    Returns
    -------
    StructType
        Schema with one StringType field per brewery attribute, in a fixed order.
    """
    # (column name, nullable) pairs, listed in the column order of the schema.
    column_specs = (
        ("id", False),
        ("name", False),
        ("brewery_type", False),
        ("address_1", True),
        ("address_2", True),
        ("address_3", True),
        ("city", False),
        ("state_province", False),
        ("postal_code", False),
        ("country", False),
        ("longitude", True),
        ("latitude", True),
        ("phone", True),
        ("website_url", True),
        ("state", True),
        ("street", True),
    )
    fields = [
        StructField(column_name, StringType(), nullable)
        for column_name, nullable in column_specs
    ]
    return StructType(fields)

def create_brewery_df(data):
    """
    Builds a Spark DataFrame from raw brewery records.

    The schema returned by get_brewery_schema() is applied explicitly, so
    column names, types and nullability are not inferred from the data.

    Parameters
    ----------
    data : list of dict
        Brewery records, one dictionary per brewery.

    Returns
    -------
    pyspark.sql.DataFrame
        DataFrame holding the records under the brewery schema.
    """
    # `spark` is not defined in this notebook; presumably the session that the
    # Databricks runtime injects as a global - confirm when running elsewhere.
    return spark.createDataFrame(data, schema=get_brewery_schema())

In [0]:
def validate_required_fields(
    df: 'pyspark.sql.DataFrame', 
    required_fields: list[str]
) -> None:
    """
    Checks that the required columns exist in a Spark DataFrame and hold no nulls.

    The check runs in two stages: first the column names are compared against
    df.columns, then a single aggregation counts the nulls of every required
    column. Progress and per-column results are printed along the way.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        The DataFrame to validate.
    required_fields : list of str
        Names of the columns that must be present and fully populated.

    Raises
    ------
    ValueError
        If a required column is absent, or if any required column contains
        at least one null value.
    """
    # Stage 1: structural check - stop early if a column is absent.
    absent = []
    for name in required_fields:
        if name not in df.columns:
            absent.append(name)
    if absent:
        raise ValueError(f"Important columns are missing: {absent}")
    print("All required columns are present")

    # Stage 2: count nulls for every required column in one pass over the data.
    print("\nCheck null values on fields:")
    null_count_exprs = [
        count(when(col(name).isNull(), name)).alias(name)
        for name in required_fields
    ]
    counts_row = df.select(null_count_exprs).collect()[0]

    fields_with_nulls = []
    for name in required_fields:
        nulls_found = counts_row[name]
        if nulls_found > 0:
            print(f" {name}: {nulls_found} null values")
            fields_with_nulls.append(name)
        else:
            print(f"{name}: haven't null values")

    if fields_with_nulls:
        raise ValueError("There are null values in required fields")

    print("\nQuality Success! All required columns are present and haven't null values")

In [0]:
def write_table(
    df: 'pyspark.sql.DataFrame', 
    catalog: str, 
    schema: str, 
    table: str, 
    mode: str = "append",
    catalog_location: str = None
) -> None:
    """
    Write a Spark DataFrame to a Delta table in Databricks.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        The DataFrame to be written.
    catalog : str
        The catalog name (e.g., 'hive_metastore').
    schema : str
        The schema/database name.
    table : str
        The table name.
    mode : str, optional
        Write mode: 'append' (default) or 'overwrite'.
    catalog_location : str, optional
        Optional location for the catalog (not used in this function).

    Raises
    ------
    ValueError
        If the mode is not 'append' or 'overwrite'.

    Notes
    -----
    - In 'append' mode, adds an 'execution_date' column (current date) and
      partitions by it; new columns are merged into the table schema.
    - In 'overwrite' mode, replaces the table contents and its schema; no
      'execution_date' column is added.
    """
    # Reject unsupported modes before doing any work.
    if mode not in ("append", "overwrite"):
        raise ValueError("Mode of write table doesn't support. You must use 'append' or 'overwrite'.")

    table_full = f"{catalog}.{schema}.{table}"
    if mode == "append":
        # Partition by calendar date, not by the raw timestamp: a timestamp
        # partition column yields a new microsecond-precision partition on
        # every run instead of one partition per day.
        # NOTE(review): a table previously created with a TIMESTAMP
        # 'execution_date' column needs a one-off migration before this
        # DATE column can be appended to it.
        df = df.withColumn("execution_date", current_timestamp().cast("date"))
        df.write \
            .format("delta") \
            .mode("append") \
            .partitionBy("execution_date") \
            .option("overwriteSchema", "false") \
            .option("mergeSchema", "true") \
            .saveAsTable(table_full)
    else:
        df.write \
            .format("delta") \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(table_full)

In [0]:
def process_api() -> None:
    """
    Fetches brewery data from the Open Brewery DB API, validates required fields, and writes the data to Delta Lake.

    This function performs the following steps:
        1. Fetches brewery data using the fetch_breweries_from_api function
           and aborts if no records were returned.
        2. Creates a Spark DataFrame with the expected schema.
        3. Validates the presence and non-nullity of required fields.
        4. Writes the validated DataFrame to the Delta Lake table
           ab_inbev_lake.bronze.breweries_api_data in append mode.

    Raises
    ------
    ValueError
        If required columns are missing or contain null values.
    Exception
        If the API returned no records, or for any other error during the
        process. Every error is logged and then re-raised to the caller.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s"
    )
    required_fields = [
        'id', 'name', 'brewery_type', 'city', 
        'state_province', 'postal_code', 'country'
    ]
    try:
        logging.info("Step 1: Fetching brewery data from API...")
        data = fetch_breweries_from_api()
        logging.info("Step 1 complete: Fetched %d records.", len(data))

        # Check emptiness on the fetched list rather than on the DataFrame:
        # it needs no Spark job and avoids df.rdd, which is not available on
        # every compute type.
        if not data:
            logging.error("Step 1 error: API returned no records. No data to process or write.")
            raise Exception("DataFrame is empty. No data to process or write.")

        logging.info("Step 2: Creating Spark DataFrame with expected schema...")
        df_breweries = create_brewery_df(data)
        logging.info("Step 2 complete: DataFrame created.")
        df_breweries.printSchema()

        logging.info("Step 3: Validating required fields for presence and non-nullity...")
        validate_required_fields(df_breweries, required_fields)
        logging.info("Step 3 complete: Required fields validated.")

        logging.info("Step 4: Writing DataFrame to Delta Lake table...")
        write_table(
            df_breweries, 
            "ab_inbev_lake", 
            "bronze", 
            "breweries_api_data", 
            mode="append"
        )
        logging.info("Step 4 complete: Data written to Delta Lake table.")

    except Exception as e:
        # Log for the notebook/job output, then propagate so the run is marked failed.
        logging.error("Error in process_api: %s", e)
        raise

In [0]:
# Entry point of the notebook: run the bronze ingestion (fetch, validate, write).
process_api()