# el_api_to_landing_isp_performance

In [1]:
import pyspark
from pyspark.sql import SparkSession
import logging
from dotenv import load_dotenv
import os
import requests
import pandas as pd
from pyspark.sql import types as T
from functions import functions as F

## Import Environment

In [2]:
 # Carregar variáveis de ambiente
load_dotenv()

# Configurações de conexão
HOST_ADDRESS = os.getenv('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY')

# Configurações da API
URL_TOKEN = "http://keycloak.nexusitconsulting.com.br:8080/realms/ISPerformace/protocol/openid-connect/token"
API_URL = "http://api.nexusitconsulting.com.br:3000/api/v1/ixc/ordem-servico/aberto"

# Credenciais para obter o token
payload = {
    "grant_type": "password",
    "client_id": "api",
    "username": "superset@superset.com",
    "password": "123",
    "client_secret": "aJUbEhLzRJ4WWCOr0FjwoJ9Bkv9VUJUV"
}

## Spark Session

In [3]:
# Iniciar a sessão do Spark
spark = SparkSession.builder \
    .appName("el_api_to_landing") \
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000") \
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY) \
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY) \
    .config("spark.hadoop.fs.s3a.path.style.access", True) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()

## Log configs

In [4]:
# Configurar o log
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("Starting ingestions from API to Minio landing...")

2024-10-21 03:29:02,871 - INFO - Starting ingestions from API to Minio landing...


## Função para obter o token de autenticação

In [5]:
# Função para obter o token de autenticação
def get_token(url_token, payload):
    try:
        response = requests.post(url_token, data=payload, headers={"Content-Type": "application/x-www-form-urlencoded"})
        response.raise_for_status()
        token = response.json().get("access_token")
        logging.info("Token successfully obtained.")
        return token
    except Exception as e:
        logging.error(f"Error obtaining token: {str(e)}")
        return None

## Função para buscar dados da API e transformá-los em DataFrame Spark

In [6]:
# Função para buscar dados da API e transformá-los em DataFrame Spark
def fetch_data_from_api(api_url, token):
    try:
        headers = {"Authorization": f"Bearer {token}"}
        response = requests.get(api_url, headers=headers)
        response.raise_for_status()
        data = response.json()
        return data
    except Exception as e:
        logging.error(f"Error fetching data from API: {str(e)}")
        return None

## Função para converter a resposta JSON em DataFrame Spark

In [7]:
# Função para converter a resposta JSON em DataFrame Spark
def convert_api_data_to_spark_df(api_data):
    try:
        if isinstance(api_data, list):
            pdf = pd.DataFrame(api_data)
            logging.info(f"Fetched data contains {pdf.shape[0]} rows and {pdf.shape[1]} columns.")
            logging.info(f"Data sample: {pdf.head()}")  # Log do exemplo de dados
        else:
            raise ValueError("Formato inesperado para api_data")

        # Definindo o esquema da tabela com base nos campos retornados pela API
        schema = T.StructType([
            T.StructField("mensagemResposta", T.StringType(), True),
            T.StructField("dataHoraAnalise", T.StringType(), True),
            T.StructField("dataHoraEncaminhado", T.StringType(), True),
            T.StructField("dataHoraAssumido", T.StringType(), True),
            T.StructField("dataHoraExecucao", T.StringType(), True),
            T.StructField("idContratoKit", T.StringType(), True),
            T.StructField("preview", T.StringType(), True),
            T.StructField("dataAgendaFinal", T.StringType(), True),
            T.StructField("id", T.StringType(), True),
            T.StructField("tipo", T.StringType(), True),
            T.StructField("idFilial", T.StringType(), True),
            T.StructField("idWflTarefa", T.StringType(), True),
            T.StructField("statusSla", T.StringType(), True),
            T.StructField("dataAbertura", T.StringType(), True),
            T.StructField("melhorHorarioAgenda", T.StringType(), True),
            T.StructField("liberado", T.StringType(), True),
            T.StructField("status", T.StringType(), True),
            T.StructField("idCliente", T.StringType(), True),
            T.StructField("idAssunto", T.StringType(), True),
            T.StructField("setor", T.StringType(), True),
            T.StructField("idCidade", T.StringType(), True),
            T.StructField("idTecnico", T.StringType(), True),
            T.StructField("prioridade", T.StringType(), True),
            T.StructField("mensagem", T.StringType(), True),
            T.StructField("protocolo", T.StringType(), True),
            T.StructField("endereco", T.StringType(), True),
            T.StructField("complemento", T.StringType(), True),
            T.StructField("idCondominio", T.StringType(), True),
            T.StructField("bloco", T.StringType(), True),
            T.StructField("apartamento", T.StringType(), True),
            T.StructField("latitude", T.StringType(), True),
            T.StructField("bairro", T.StringType(), True),
            T.StructField("longitude", T.StringType(), True),
            T.StructField("referencia", T.StringType(), True),
            T.StructField("impresso", T.StringType(), True),
            T.StructField("dataInicio", T.StringType(), True),
            T.StructField("dataAgenda", T.StringType(), True),
            T.StructField("dataFinal", T.StringType(), True),
            T.StructField("dataFechamento", T.StringType(), True),
            T.StructField("idWflParamOs", T.StringType(), True),
            T.StructField("valorTotalComissao", T.StringType(), True),
            T.StructField("valorTotal", T.StringType(), True),
            T.StructField("valorOutrasDespesas", T.StringType(), True),
            T.StructField("idx", T.StringType(), True),
            T.StructField("idSuDiagnostico", T.StringType(), True),
            T.StructField("geraComissao", T.StringType(), True),
            T.StructField("idEstrutura", T.StringType(), True),
            T.StructField("idLogin", T.StringType(), True),
            T.StructField("valorUnitComissao", T.StringType(), True),
            T.StructField("dataPrazoLimite", T.StringType(), True),
            T.StructField("dataReservada", T.StringType(), True),
            T.StructField("idTicket", T.StringType(), True),
            T.StructField("origemEndereco", T.StringType(), True),
            T.StructField("justificativaSlaAtrasado", T.StringType(), True),
            T.StructField("origemEnderecoEstrutura", T.StringType(), True),
            T.StructField("dataReagendar", T.StringType(), True),
            T.StructField("dataPrevFinal", T.StringType(), True),
            T.StructField("origemCadastro", T.StringType(), True),
            T.StructField("ultimaAtualizacao", T.StringType(), True)
        ])

        # Convertendo para DataFrame Spark
        spark_df = spark.createDataFrame(pdf, schema=schema)
        logging.info("Data successfully converted to Spark DataFrame.")
        return spark_df
    except Exception as e:
        logging.error(f"Error converting API data to Spark DataFrame: {str(e)}")
        return None

## Process

In [8]:
# Processar e salvar os dados da API
def process_and_save_api_data():
    try:
        # Obter o token de autenticação
        token = get_token(URL_TOKEN, payload)
        if not token:
            raise Exception("No token obtained.")

        # Buscar dados da API usando o token
        api_data = fetch_data_from_api(API_URL, token)
        if not api_data:
            raise Exception("No data obtained from the API.")

        # Converter dados da API para DataFrame Spark
        spark_df = convert_api_data_to_spark_df(api_data)
        if spark_df is None:
            raise Exception("Error converting data to DataFrame.")
        
        # Adicionar metadata e a coluna 'month_key'
        spark_df.select("dataAbertura").show(truncate=False)
        df_with_update_date = F.add_metadata(spark_df)  # Se necessário, adicione metadados como no Script 1
        df_with_month_key = F.add_month_key(df_with_update_date, 'dataAbertura')  # Substitua 'order_date' pela coluna correta

        # Salvar os dados em um arquivo Parquet no MinIO
        output_path = "s3a://landing/isp_performance/landing_ordem_servico_aberto"
        df_with_month_key.write.mode("overwrite").partitionBy('month_key').parquet(output_path)
        logging.info(f"Data successfully saved to MinIO at {output_path}.")

    except Exception as e:
        logging.error(f"Error during processing and saving API data: {str(e)}")

if __name__ == "__main__":
    process_and_save_api_data()
    spark.stop()

2024-10-21 03:29:03,152 - INFO - Token successfully obtained.
2024-10-21 03:29:07,430 - INFO - Fetched data contains 6990 rows and 59 columns.
2024-10-21 03:29:07,444 - INFO - Data sample:   mensagemResposta      dataHoraAnalise  dataHoraEncaminhado  \
0             None                 None                 None   
1             None                 None                 None   
2             None  0000-00-00 00:00:00  0000-00-00 00:00:00   
3             None                 None  2022-10-17 17:42:27   
4             None                 None                 None   

      dataHoraAssumido     dataHoraExecucao idContratoKit preview  \
0                 None                 None         28975    None   
1                 None                 None             0    None   
2  0000-00-00 00:00:00  0000-00-00 00:00:00         34241    None   
3                 None  2022-07-01 09:04:13             0    None   
4                 None                 None             0    None   

       data

+-------------------+
|dataAbertura       |
+-------------------+
|2021-07-29 15:00:20|
|2022-05-02 08:41:59|
|2022-05-23 15:50:37|
|2022-06-24 09:59:58|
|2022-07-01 16:26:43|
|2022-07-02 09:29:08|
|2022-07-12 11:16:48|
|2022-07-22 10:46:01|
|2022-07-31 19:13:29|
|2022-08-12 16:15:50|
|2022-08-16 13:31:24|
|2022-08-20 11:19:33|
|2022-08-20 16:18:50|
|2022-08-31 15:57:47|
|2022-09-03 11:45:59|
|2022-09-08 14:04:49|
|2022-09-14 08:59:44|
|2022-09-15 08:15:59|
|2022-09-19 15:10:15|
|2022-09-20 09:31:30|
+-------------------+
only showing top 20 rows



2024-10-21 03:29:28,854 - INFO - Data successfully saved to MinIO at s3a://landing/isp_performance/landing_ordem_servico_aberto.
