# In [None]:
import pyspark
from pyspark.sql import SparkSession
import logging
from datetime import datetime 

from pyspark.sql import functions as func
from pyspark.sql.functions import lit

from configs import configs
from functions import functions as F


def configure_spark():
    """Build (or reuse) the SparkSession for the silver-to-gold refinement job.

    The session is configured with the S3A filesystem settings (endpoint,
    credentials, path-style access), the Hive metastore URI, and the Delta
    Lake SQL extension and catalog.

    Returns:
        The SparkSession produced by ``SparkSession.builder.getOrCreate()``.
    """
    # NOTE(review): the S3A access/secret keys are hard-coded below; consider
    # sourcing them from the environment or a secret store.
    settings = (
        ("spark.hadoop.fs.s3a.endpoint", "http://minio:9000"),
        ("spark.hadoop.fs.s3a.access.key", "chapolin"),
        ("spark.hadoop.fs.s3a.secret.key", "mudar@123"),
        ("spark.hadoop.fs.s3a.path.style.access", True),
        ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
        (
            "spark.hadoop.fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
        ),
        ("hive.metastore.uris", "thrift://metastore:9083"),
        ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
        (
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        ),
    )

    builder = SparkSession.builder.appName(
        "Refinement Incremental Silver to Gold AdventureWorks"
    )
    # Apply the options in declaration order, exactly as a chained call would.
    for option_key, option_value in settings:
        builder = builder.config(option_key, option_value)

    return builder.getOrCreate()

def ingest_data():
    """Incrementally refine the configured tables into the gold Delta layer.

    For every ``table_name -> query`` entry in ``configs.tables_gold``:

    1. Read the highest ``modifieddate`` already stored in the table's gold
       Delta location.
    2. Run ``query`` restricted to rows with a newer ``modifieddate`` (or
       unrestricted when the gold table holds no ``modifieddate`` yet).
    3. Pass the result through ``F.add_metadata`` and append it to the gold
       Delta location, partitioned by ``month_key``.

    A failure on one table is logged (with traceback) and does not stop the
    remaining tables from being processed.
    """

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    logging.info("Starting Refinement...")

    spark = configure_spark()

    # Table-name prefix and base path used to build each gold Delta location.
    output_name = configs.prefix_layer_name['3']
    hdfs_output = configs.lake_path['gold']

    for table_name, query in configs.tables_gold.items():
        target_path = f'{hdfs_output}{output_name}{table_name}'
        try:
            # Highest modification date already present in the gold table.
            max_modified_date_delta = spark.read.format("delta") \
                .load(target_path) \
                .select(func.max("modifieddate").alias("max_modifieddate")) \
                .collect()[0]["max_modifieddate"]

            if max_modified_date_delta is None:
                # Empty gold table (or no non-null modifieddate): comparing
                # against the literal 'None' would never match any row, so
                # take the full result of the query instead.
                query_with_filter = f"SELECT * FROM ({query}) AS subquery"
            else:
                # Restrict the configured query to rows newer than the
                # last load.
                query_with_filter = f"""
                SELECT * FROM ({query}) AS subquery
                WHERE modifieddate > '{max_modified_date_delta}'
                """

            df_new_data = spark.sql(query_with_filter)

            # Enrich with the metadata columns provided by the project helper.
            df_with_update_date = F.add_metadata(df_new_data)

            # Append the new rows to the gold Delta table.
            df_with_update_date.write.format("delta").mode("append").partitionBy('month_key').save(target_path)

            # NOTE: count() runs after the write and re-evaluates the query.
            num_rows_written = df_with_update_date.count()
            logging.info(f"Table {table_name} successfully processed and saved to Delta Lake: {target_path}. {num_rows_written} rows written.")

        except Exception as e:
            # Best-effort per table: record the failure with its traceback
            # and continue with the next table.
            logging.exception(f"Error processing table {table_name}: {str(e)}")

    logging.info("Refinement completed!")

if __name__ == "__main__":
    # ingest_data() obtains its own session through configure_spark(), so a
    # separate session set-up call (whose result would be discarded) is not
    # needed here.
    ingest_data()



# 2024-08-21 10:00:18,255 - INFO - Starting Refinement...
