# refinement_silver_to_gold_adventureworks

In [1]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pyspark
from pyspark.sql import SparkSession
import logging
from datetime import datetime

from configs import configs
from functions import functions as F

from dotenv import load_dotenv
import os

## Load environment variables

In [3]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Connection settings consumed later when the Spark session is configured.
HOST_ADDRESS = os.getenv('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY')

# Fail fast: an unset variable would otherwise become None and only surface
# later as an unusable endpoint URL or credential.
_missing_env_vars = [
    name
    for name, value in (
        ('HOST_ADDRESS', HOST_ADDRESS),
        ('MINIO_ACCESS_KEY', MINIO_ACCESS_KEY),
        ('MINIO_SECRET_KEY', MINIO_SECRET_KEY),
    )
    if not value
]
if _missing_env_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env_vars)}"
    )

## Table-processing function

In [4]:
def process_table(spark, query_input, output_path):
    """Run a Spark SQL query and write its result as a partitioned Delta table.

    The query result is passed through ``F.add_metadata`` and then written to
    ``output_path`` in Delta format, overwriting existing data and partitioning
    by ``month_key``.

    Failures are logged (with traceback) instead of being raised, so a caller
    looping over several tables can keep going; the return value reports the
    outcome of this call.

    Args:
        spark: Session object whose ``sql`` method executes the query.
        query_input: SQL text to execute.
        output_path: Destination path of the Delta table.

    Returns:
        bool: True when the table was written, False when any step raised.
    """
    try:
        df_input_data = spark.sql(query_input)
        df_with_update_date = F.add_metadata(df_input_data)
        (
            df_with_update_date.write
            .format("delta")
            .mode("overwrite")
            .partitionBy('month_key')
            .save(output_path)
        )
    except Exception as e:
        # logging.exception keeps the traceback, which logging.error(str(e)) discarded.
        logging.exception(f"Error processing query '{query_input}': {e}")
        return False

    logging.info(f"query '{query_input}' successfully processed and saved to {output_path}")
    return True

## Spark Session

In [5]:
if __name__ == "__main__":
    # Build (or reuse) the Spark session. The S3A settings point Hadoop's
    # filesystem client at the endpoint on HOST_ADDRESS:9000 using the keys
    # read from the environment; the last two settings register the Delta
    # SQL extension and catalog.
    spark = (
        SparkSession.builder
        .appName("refinement_silver_to_gold_adventureworks")
        .config("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
        .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
        .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
        .config("spark.hadoop.fs.s3a.path.style.access", True)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
        .config("hive.metastore.uris", "thrift://metastore:9083")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .getOrCreate()
    )

## Logging configuration

In [6]:
# Configure the root logger: INFO and above, prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

logging.info("Starting refinement to gold...")

2024-08-21 10:13:59,638 - INFO - Starting refinement to gold...


## Path configs

In [7]:
# Input side: location and table-name prefix of the silver layer.
input_path = configs.lake_path['silver']
input_prefix_layer_name = configs.prefix_layer_name['2']  # silver layer

# Output side: location and table-name prefix of the gold layer.
output_path = configs.lake_path['gold']
output_prefix_layer_name = configs.prefix_layer_name['3']  # gold layer

## Process

In [8]:
# Build and run the gold-layer query for every table declared in configs.tables_gold.
source_table_name = None  # most recent key picked up; used in the error message below
try:
    # Only the keys are iterated: the query text is resolved through F.get_query,
    # so unpacking the values from .items() was redundant.
    for source_table_name in configs.tables_gold:
        table_name = F.convert_table_name(source_table_name)

        query_input = F.get_query(table_name, input_path, input_prefix_layer_name, configs.tables_gold)

        # Destination = gold lake path + gold prefix + converted table name.
        storage_output = f'{output_path}{output_prefix_layer_name}{table_name}'

        process_table(spark, query_input, storage_output)

    logging.info("Refinement to gold completed!")

except Exception as e:
    # Name the table being handled and keep the traceback for diagnosis.
    logging.exception(f"Error processing table '{source_table_name}': {e}")

2024-08-21 10:14:10,182 - INFO - query '
SELECT 
    departmentid as id_departamento, 
    name as nome_departamento, 
    groupname as nome_grupo, 
    modifieddate, 
    last_update, 
    month_key 
FROM 
    delta.`s3a://silver/adventure_works/silver_humanresources_department`
    ' sucessfully processed and saved to s3a://gold/adventure_works/gold_humanresources_department
2024-08-21 10:14:12,184 - INFO - query '
SELECT 
    groupname as nome_grupo, 
    modifieddate,
    last_update, 
    month_key, 
    count(*) as qtd
FROM 
    delta.`s3a://silver/adventure_works/silver_humanresources_department`
group by 
	groupname,
	modifieddate,
    last_update, 
    month_key 
    ' sucessfully processed and saved to s3a://gold/adventure_works/gold_humanresources_groupname_qtd
2024-08-21 10:14:12,186 - INFO - Refinement to gold completed!


In [9]:
# DataFrame.show() prints the rows and returns None, so keep the DataFrame in
# `df` and display it in a separate call.
df = spark.read.format("delta").load('s3a://gold/adventure_works/gold_humanresources_department')
df.show(truncate=False)

+---------------+--------------------------+------------------------------------+-------------------+--------------------------+---------+
|id_departamento|nome_departamento         |nome_grupo                          |modifieddate       |last_update               |month_key|
+---------------+--------------------------+------------------------------------+-------------------+--------------------------+---------+
|1              |Engineering               |Research and Development            |2008-04-30 00:00:00|2024-08-21 10:14:07.204462|200804   |
|2              |Tool Design               |Research and Development            |2008-04-30 00:00:00|2024-08-21 10:14:07.204462|200804   |
|3              |Sales                     |Sales and Marketing                 |2008-04-30 00:00:00|2024-08-21 10:14:07.204462|200804   |
|4              |Marketing                 |Sales and Marketing                 |2008-04-30 00:00:00|2024-08-21 10:14:07.204462|200804   |
|5              |Purchasing