In [1]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from configs import configs

def fetch_selected_apis(table_indexes, tables=None, timeout=30):
    """Call the endpoints selected by position and collect their JSON replies.

    Args:
        table_indexes: Iterable of zero-based positions into the endpoint
            mapping, following the mapping's iteration order.
        tables: Optional mapping of ``key -> URL``. When omitted,
            ``configs.tables_landing`` is used.
        timeout: Seconds to wait for each HTTP request; forwarded to
            ``requests.get`` so a stalled endpoint cannot block forever.

    Returns:
        dict: For every in-range position, ``key -> decoded JSON body``.
        The value is ``None`` when the request failed, returned a status
        other than 200, or the body was not valid JSON. Out-of-range
        positions are reported on stdout and left out of the result.
    """
    if tables is None:
        tables = configs.tables_landing
    tables_to_process = list(tables.items())
    responses = {}

    for i in table_indexes:
        # Negative positions are rejected as well: list indexing would
        # otherwise wrap around and silently pick an entry from the end.
        if not 0 <= i < len(tables_to_process):
            print(f"Índice {i} fora do alcance das tabelas.")
            continue

        key, url = tables_to_process[i]
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                responses[key] = response.json()
            else:
                print(f"Erro: Código de status {response.status_code} para a URL {url}")
                responses[key] = None  # None marks a failed call
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            print(f"Erro na requisição para {url}: {e}")
            responses[key] = None

    return responses

# Example usage: call the selected landing endpoints and show what each one returned.
# The positions are zero-based; note that position 6 is not in the list.
table_indexes = [0, 1, 2, 3, 4, 5, 7, 8, 9]  # Pass the indexes of the tables you want to process
api_responses = fetch_selected_apis(table_indexes)
print("Respostas das APIs:", api_responses)

Respostas das APIs: {'1': 'Encaminhado com sucesso ao bucket!!', '2': 'Encaminhado com sucesso ao bucket!!', '3': 'Encaminhado com sucesso ao bucket!!', '4': 'Encaminhado com sucesso ao bucket!!', '5': 'Encaminhado com sucesso ao bucket!!', '6': 'Encaminhado com sucesso ao bucket!!', '8': 'Encaminhado com sucesso ao bucket!!', '9': 'Encaminhado com sucesso ao bucket!!', '10': 'Encaminhado com sucesso ao bucket!!'}


In [3]:
import pyspark
from pyspark.sql import SparkSession
import logging
from datetime import datetime
from pyspark.sql.functions import lit, coalesce, col, current_date
from pyspark.sql.functions import year, month, lpad, concat
from configs import configs
from functions import functions as F
from dotenv import load_dotenv
import os

# Read the MinIO connection settings from the environment.
load_dotenv()
HOST_ADDRESS = os.environ.get('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY')

# Spark session wired to MinIO through the S3A connector, with Delta Lake enabled.
spark = (
    SparkSession.builder
    .appName("el_landing_to_bronze_isp_performance")
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("hive.metastore.uris", "thrift://metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.memory.fraction", "0.8")
    .config("spark.sql.shuffle.partitions", "50")
    .getOrCreate()
)

# Delta Lake retention settings: keep logs and deleted files for one day.
# NOTE(review): Delta documents these as table properties named
# `delta.logRetentionDuration` / `delta.deletedFileRetentionDuration`;
# confirm that the `spark.delta.*` session keys below actually take effect.
spark.conf.set("spark.delta.logRetentionDuration", "interval 1 day")
spark.conf.set("spark.delta.deletedFileRetentionDuration", "interval 1 day")

# Turn off Delta's retention-duration check (process_table runs VACUUM ... RETAIN 0 HOURS).
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("Starting conversions from Minio to Minio Delta...")

# Source (landing) and destination (bronze) locations and name prefixes.
input_prefix_layer_name = configs.prefix_layer_name['0']
table_input_name = configs.lake_path['landing']
output_prefix_layer_name = configs.prefix_layer_name['1']
storage_output = configs.lake_path['bronze']

def process_table(table):
    """Copy one landing parquet table into a month-partitioned bronze Delta table.

    The table is read from the landing path, passed through
    ``F.add_metadata``, given a ``month_key`` column (year followed by the
    zero-padded month), written in overwrite mode partitioned by
    ``month_key``, and then vacuumed with zero retention.

    Args:
        table: Table identifier; normalised with ``F.convert_table_name``.

    Returns:
        None. Any exception raised while reading, writing or vacuuming is
        logged and not re-raised.
    """
    table_name = F.convert_table_name(table)

    try:
        landing_path = f'{table_input_name}{input_prefix_layer_name}{table_name}'
        # Destination (bronze) table path
        delta_table_path = f'{storage_output}{output_prefix_layer_name}{table_name}'

        landing_df = spark.read.format("parquet").load(landing_path)
        enriched_df = F.add_metadata(landing_df)

        # Partition by the opening date when the table has one, otherwise by today's date.
        if 'data_abertura' in enriched_df.columns:
            reference_date = col('data_abertura').cast('date')
        else:
            reference_date = current_date()

        partitioned_df = enriched_df.withColumn(
            'month_key',
            concat(year(reference_date), lpad(month(reference_date), 2, '0')),
        )

        delta_writer = (
            partitioned_df.write
            .format("delta")
            .mode("overwrite")
            .option("mergeSchema", "true")
            .partitionBy('month_key')
        )
        delta_writer.save(delta_table_path)
        logging.info(f'Table {table_name} successfully processed and saved to Minio: {storage_output}{output_prefix_layer_name}{table_name}')

        # Drop old table versions right away.
        spark.sql(f"VACUUM delta.`{delta_table_path}` RETAIN 0 HOURS")
        logging.info(f"Old versions of Delta table '{table_name}' have been removed (VACUUM).")

    except Exception as exc:
        logging.error(f'Error processing table {table_name}: {str(exc)}')

# Zero-based positions (within configs.tables_api_isp_performance) of the tables to process.
table_indexes = [0, 1, 2, 3, 4, 5, 7, 8, 9]

# Materialise the table list once instead of rebuilding it on every iteration.
available_tables = list(configs.tables_api_isp_performance.values())

# Process only the tables at the requested positions.
for index in table_indexes:
    # Negative positions would wrap around silently, so they count as out of range.
    if 0 <= index < len(available_tables):
        table_name = available_tables[index]
        process_table(table_name)
    else:
        logging.error(f"Índice {index} fora do alcance das tabelas.")

logging.info("Conversion from parquet to Delta completed successfully!")

2025-01-16 21:34:51,789 - INFO - Starting conversions from Minio to Minio Delta...
2025-01-16 21:35:04,132 - INFO - Table dim_filial successfully processed and saved to Minio: s3a://bronze/isp_performance/bronze_dim_filial
2025-01-16 21:35:11,006 - INFO - Old versions of Delta table 'dim_filial' have been removed (VACUUM).
2025-01-16 21:35:15,057 - INFO - Table dim_colaboradores successfully processed and saved to Minio: s3a://bronze/isp_performance/bronze_dim_colaboradores
2025-01-16 21:35:19,463 - INFO - Old versions of Delta table 'dim_colaboradores' have been removed (VACUUM).
2025-01-16 21:35:23,009 - INFO - Table dim_assunto successfully processed and saved to Minio: s3a://bronze/isp_performance/bronze_dim_assunto
2025-01-16 21:35:27,152 - INFO - Old versions of Delta table 'dim_assunto' have been removed (VACUUM).
2025-01-16 21:35:30,580 - INFO - Table dim_setor successfully processed and saved to Minio: s3a://bronze/isp_performance/bronze_dim_setor
2025-01-16 21:35:34,609 - INF

In [4]:
import pyspark
from pyspark.sql import SparkSession
import logging
from datetime import datetime
from configs import configs
from functions import functions as F
from dotenv import load_dotenv
import os

# Read the MinIO connection settings from the environment.
load_dotenv()
HOST_ADDRESS = os.environ.get('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY')

# Zero-based positions of the configs.tables_silver entries to process
# (position 6 is not in the list).
indices_para_processar = [0, 1, 2, 3, 4, 5, 7, 8, 9]

def process_table(spark, query_input, output_path):
    """Run a SQL query and overwrite a Delta table with its result.

    Args:
        spark: Session used to run the query and the VACUUM statement.
        query_input: SQL text whose result is written out.
        output_path: Destination path of the Delta table.

    Returns:
        None. Any exception is logged together with the query text and is
        not re-raised.
    """
    try:
        enriched_df = F.add_metadata(spark.sql(query_input))

        delta_writer = (
            enriched_df.write
            .format("delta")
            .option("mergeSchema", "true")
            .mode("overwrite")
        )
        delta_writer.save(output_path)

        logging.info(f"query '{query_input}' successfully processed and saved to {output_path}")

        # Drop old table versions right away.
        spark.sql(f"VACUUM delta.`{output_path}` RETAIN 0 HOURS")
        logging.info("Old versions of Delta table have been removed (VACUUM).")

    except Exception as exc:
        logging.error(f"Error processing query '{query_input}': {str(exc)}")

# Build the Spark session unconditionally: every statement that follows uses
# `spark`, so guarding only its creation behind `__name__ == "__main__"` would
# leave the rest of the cell failing with NameError whenever the guard is false.
# NOTE(review): if a session already exists in this kernel, getOrCreate()
# returns it, so appName and the memory settings below may not be re-applied — confirm.
spark = (
    SparkSession.builder
    .appName("process_bronze_to_silver_isp_performance")
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("hive.metastore.uris", "thrift://metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.memory.fraction", "0.8")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.sql.parquet.int96RebaseModeInWrite", "LEGACY")
    .getOrCreate()
)

# Delta Lake retention settings: keep logs and deleted files for one day.
# NOTE(review): Delta documents these as table properties named
# `delta.logRetentionDuration` / `delta.deletedFileRetentionDuration`;
# confirm that the `spark.delta.*` session keys below actually take effect.
spark.conf.set("spark.delta.logRetentionDuration", "interval 1 day")
spark.conf.set("spark.delta.deletedFileRetentionDuration", "interval 1 day")

# Turn off Delta's retention-duration check (process_table runs VACUUM ... RETAIN 0 HOURS).
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("Starting processing from bronze to silver...")

input_prefix_layer_name = configs.prefix_layer_name['1']  # bronze layer
input_path = configs.lake_path['bronze']
output_prefix_layer_name = configs.prefix_layer_name['2']  # silver layer
output_path = configs.lake_path['silver']

try:
    # Resolve the table names once; the positions in indices_para_processar
    # refer to this order.
    silver_table_names = list(configs.tables_silver.keys())

    for idx in indices_para_processar:
        # Report a bad position and move on instead of letting IndexError
        # abort every remaining table; negative positions would wrap around.
        if not 0 <= idx < len(silver_table_names):
            logging.error(f"Índice {idx} fora do alcance das tabelas.")
            continue

        table_name = F.convert_table_name(silver_table_names[idx])

        query_input = F.get_query(table_name, input_path, input_prefix_layer_name, configs.tables_silver)

        storage_output = f'{output_path}{output_prefix_layer_name}{table_name}'

        process_table(spark, query_input, storage_output)

    logging.info("Process to silver completed!")

except Exception as e:
    logging.error(f'Error processing table: {str(e)}')

2025-01-16 21:36:09,973 - INFO - Starting processing from bronze to silver...
2025-01-16 21:36:13,677 - INFO - query '
SELECT 
    id,
    fantasia,
    last_update,
    month_key
FROM 
    delta.`s3a://bronze/isp_performance/bronze_dim_filial`
    ' successfully processed and saved to s3a://silver/isp_performance/silver_dim_filial
2025-01-16 21:36:17,704 - INFO - Old versions of Delta table have been removed (VACUUM).
2025-01-16 21:36:21,947 - INFO - query '
SELECT 
     id,
     upper(funcionario) funcionario,
     last_update,
     month_key
FROM 
    delta.`s3a://bronze/isp_performance/bronze_dim_colaboradores`
    ' successfully processed and saved to s3a://silver/isp_performance/silver_dim_colaboradores
2025-01-16 21:36:25,885 - INFO - Old versions of Delta table have been removed (VACUUM).
2025-01-16 21:36:29,992 - INFO - query '
SELECT 
    id,
    assunto,
    last_update,
    month_key
FROM 
    delta.`s3a://bronze/isp_performance/bronze_dim_assunto`
    ' successfully proces

In [5]:
import pyspark
from pyspark.sql import SparkSession
import logging
from datetime import datetime
from configs import configs
from functions import functions as F
from dotenv import load_dotenv
import os

# Read the MinIO connection settings from the environment.
load_dotenv()
HOST_ADDRESS = os.environ.get('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY')
   
def process_table(spark, query_input, output_path):
    """Run a SQL query and overwrite a Delta table with its result.

    Args:
        spark: Session used to run the query and the VACUUM statement.
        query_input: SQL text whose result is written out.
        output_path: Destination path of the Delta table.

    Returns:
        None. Any exception is logged together with the query text and is
        not re-raised.
    """
    try:
        df_input_data = spark.sql(query_input)
        df_with_update_date = F.add_metadata(df_input_data)
        (
            df_with_update_date.write
            .format("delta")
            .option("mergeSchema", "true")
            .mode("overwrite")
            .save(output_path)
        )
        logging.info(f"query '{query_input}' successfully processed and saved to {output_path}")

        # Drop old table versions right away.
        spark.sql(f"VACUUM delta.`{output_path}` RETAIN 0 HOURS")
        # Identify the table by output_path, which is a parameter, instead of
        # reading a module-level name that may not exist when this is called.
        logging.info(f"Old versions of Delta table at '{output_path}' have been removed (VACUUM).")

    except Exception as e:
        logging.error(f"Error processing query '{query_input}': {str(e)}")
    
# Build the Spark session unconditionally: every statement that follows uses
# `spark`, so guarding only its creation behind `__name__ == "__main__"` would
# leave the rest of the cell failing with NameError whenever the guard is false.
# NOTE(review): if a session already exists in this kernel, getOrCreate()
# returns it, so appName and the memory settings below may not be re-applied — confirm.
spark = (
    SparkSession.builder
    .appName("process_bronze_to_gold_isp_performance")
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("hive.metastore.uris", "thrift://metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.memory.fraction", "0.8")
    .config("spark.sql.shuffle.partitions", "50")
    .getOrCreate()
)

# Delta Lake retention settings: keep logs and deleted files for one day.
# NOTE(review): Delta documents these as table properties named
# `delta.logRetentionDuration` / `delta.deletedFileRetentionDuration`;
# confirm that the `spark.delta.*` session keys below actually take effect.
spark.conf.set("spark.delta.logRetentionDuration", "interval 1 day")
spark.conf.set("spark.delta.deletedFileRetentionDuration", "interval 1 day")

# Turn off Delta's retention-duration check (process_table runs VACUUM ... RETAIN 0 HOURS).
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# The input read below is the silver layer, although the message says "bronze".
logging.info("Starting processing from bronze to gold...")

input_prefix_layer_name = configs.prefix_layer_name['2']  # silver layer (input)
input_path = configs.lake_path['silver']
output_prefix_layer_name = configs.prefix_layer_name['3']  # gold layer (output)
output_path = configs.lake_path['gold']
   
try:
    # Only the table keys are needed; each query is resolved through F.get_query.
    for gold_table_key in configs.tables_gold.keys():
        table_name = F.convert_table_name(gold_table_key)
        query_input = F.get_query(table_name, input_path, input_prefix_layer_name, configs.tables_gold)
        storage_output = f'{output_path}{output_prefix_layer_name}{table_name}'

        process_table(spark, query_input, storage_output)

    logging.info("Process to gold completed!")

except Exception as exc:
    logging.error(f'Error processing table: {str(exc)}')

2025-01-16 21:37:18,400 - INFO - Starting processing from bronze to gold...
2025-01-16 21:37:45,555 - INFO - Old versions of Delta table 'performance_ordem_servico' have been removed (VACUUM).
2025-01-16 21:37:45,556 - INFO - query '
WITH BASE_PERFORMANCE AS (
    SELECT
        t1.ano_abertura,
        t1.ano_mes_abertura,
        t1.data_abertura,
        t1.dia_do_mes_abertura,
        t1.hora_abertura,
        t1.ano_fechamento,
        t1.ano_mes_fechamento,
        t1.data_fechamento,
        t1.dia_do_mes_fechamento,
        t1.hora_fechamento,
        t1.data_agenda,
        t1.data_hora_assumido,
        t1.data_hora_execucao,
        t1.id_filial,
        t2.fantasia AS filial,
        t1.id_setor,
        t4.setor AS setor,
        t6.id AS id_relator,
        t6.login AS relator,
        t1.id_tecnico,
        t5.funcionario AS tecnico,
        t1.id_assunto,
        t3.assunto AS assunto,
        t1.id_su_diagnostico,
        t1.id AS ordem_servico_id,
        t1.id_status