In [10]:
import pyspark
from pyspark.sql import SparkSession
import logging
from pyspark.sql import functions as func
from configs import configs
from functions import functions as F
from dotenv import load_dotenv
import os

# Populate the process environment from a .env file (python-dotenv).
load_dotenv()

# Connection settings for the object store; each is None when the variable is unset.
HOST_ADDRESS = os.environ.get('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY')

def configure_spark():
    """Build (or reuse) the SparkSession used by the ingestion job.

    The session targets the S3A endpoint at ``HOST_ADDRESS`` on port 9000 with
    the access/secret keys read from the environment, points at the Hive
    metastore URI, and enables the Delta Lake SQL extension and catalog.
    """
    # Options are applied in insertion order, one ``.config`` call per entry.
    settings = {
        "spark.hadoop.fs.s3a.endpoint": f"http://{HOST_ADDRESS}:9000",
        "spark.hadoop.fs.s3a.access.key": MINIO_ACCESS_KEY,
        "spark.hadoop.fs.s3a.secret.key": MINIO_SECRET_KEY,
        "spark.hadoop.fs.s3a.path.style.access": True,
        "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
        "hive.metastore.uris": "thrift://metastore:9083",
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    }

    builder = SparkSession.builder.appName("ELT Incremental Landing to Bronze AdventureWorks")
    for option, setting in settings.items():
        builder = builder.config(option, setting)

    return builder.getOrCreate()

def ingest_data():
    """Incrementally append Landing Zone Parquet data to Bronze Delta tables.

    For every table listed in ``configs.tables_postgres_adventureworks`` this
    reads the highest ``modifieddate`` already stored in the Bronze Delta
    table, loads the matching Parquet data from the landing zone, keeps only
    rows with a newer ``modifieddate``, and appends them to the Delta table
    partitioned by ``month_key``.

    The function reads a module-level ``spark`` session, so one must exist
    before it is called. A failure on one table is logged with its traceback
    and processing continues with the next table.
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    logging.info("Starting Delta ingestion...")

    input_prefix_layer_name = configs.prefix_layer_name['0']  # landing layer prefix
    storage_input = configs.lake_path['landing_adventure_works']  # path to landing zone

    output_prefix_layer_name = configs.prefix_layer_name['1']  # bronze layer prefix
    storage_output = configs.lake_path['bronze']  # path to bronze zone

    # Only the mapping's values (source table identifiers) are needed here.
    for table in configs.tables_postgres_adventureworks.values():
        table_name = F.convert_table_name(table)

        delta_path = f'{storage_output}{output_prefix_layer_name}{table_name}'  # Delta table in bronze layer
        input_path = f'{storage_input}{input_prefix_layer_name}{table_name}'  # Parquet files in landing zone

        try:
            # Watermark: the newest modifieddate already present in Bronze.
            try:
                max_modified_date_delta = spark.read.format("delta") \
                    .load(delta_path) \
                    .select(func.max("modifieddate").alias("max_modifieddate")) \
                    .collect()[0]["max_modifieddate"]
            except Exception as e:
                # Best effort: an unreadable/missing Delta table is treated as a
                # first load. Log it so an unexpected failure (which would lead
                # to a full re-append) is visible instead of silently swallowed.
                logging.warning(
                    f"Could not read max modifieddate from {delta_path}; "
                    f"loading all rows for table {table_name}: {e}"
                )
                max_modified_date_delta = None

            df_input_data = spark.read \
                .format("parquet") \
                .load(input_path)

            # Explicit None check: an absent watermark means "take everything".
            if max_modified_date_delta is not None:
                df_input_data = df_input_data.filter(func.col("modifieddate") > max_modified_date_delta)

            input_data_count = df_input_data.count()
            logging.info(f"Number of rows processed for table {table_name}: {input_data_count}")

            if input_data_count == 0:
                logging.info(f"No new data to process for table {table_name}.")
                continue

            # Add update date metadata and month_key column
            df_with_update_date = F.add_metadata(df_input_data)
            df_with_month_key = F.add_month_key(df_with_update_date, 'modifieddate')

            # Write the new data to Delta Lake in Bronze layer
            df_with_month_key.write.format("delta").mode("append").partitionBy('month_key').save(delta_path)

            # NOTE(review): this count re-evaluates the DataFrame after the write;
            # if the F helpers never change the row count, input_data_count could
            # be reused to avoid the extra scan - confirm before changing.
            num_rows_written = df_with_month_key.count()
            logging.info(f"Table {table_name} successfully processed and saved to Delta Lake in Bronze layer: {delta_path}. {num_rows_written} rows written.")

        except Exception as e:
            # logging.exception keeps the traceback, which str(e) alone loses.
            logging.exception(f"Error processing table {table_name}: {str(e)}")

    logging.info("Delta ingestion completed!")

if __name__ == "__main__":
    # `spark` must be bound at module level under this exact name:
    # ingest_data() reads it as a global instead of taking it as an argument.
    spark = configure_spark()
    ingest_data()


2024-08-20 21:50:39,564 - INFO - Starting Delta ingestion...
2024-08-20 21:50:39,785 - INFO - Number of rows processed for table sales_countryregioncurrency: 0
2024-08-20 21:50:39,785 - INFO - No new data to process for table sales_countryregioncurrency.
2024-08-20 21:50:40,134 - INFO - Number of rows processed for table sales_creditcard: 0
2024-08-20 21:50:40,135 - INFO - No new data to process for table sales_creditcard.
2024-08-20 21:50:40,346 - INFO - Number of rows processed for table sales_currency: 0
2024-08-20 21:50:40,347 - INFO - No new data to process for table sales_currency.
2024-08-20 21:50:40,576 - INFO - Number of rows processed for table humanresources_department: 0
2024-08-20 21:50:40,577 - INFO - No new data to process for table humanresources_department.
2024-08-20 21:50:40,789 - INFO - Number of rows processed for table humanresources_employee: 0
2024-08-20 21:50:40,790 - INFO - No new data to process for table humanresources_employee.
2024-08-20 21:50:41,125 - INF