SAP 
Addresses
BusinessPartners
ProductCategories
ProductCategoryText
Products
ProductTexts
SalesOrderItems
SalesOrders_dsp1
SalesOrders

In [0]:
import os

import great_expectations as ge
import pandas as pd
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan

In [0]:
# Obtain the Spark session used by every cell below (getOrCreate reuses an
# already-running session when one exists, e.g. on a managed cluster).
spark = (
    SparkSession.builder
    .appName("EXT_SAP_API")
    .getOrCreate()
)

In [0]:
class Ext_SAP_API:
    """Extract one archive from the SAP demo REST API into a Spark DataFrame.

    Attributes:
        area: First path segment of the endpoint (e.g. ``'SAP'``).
        arquivo: Archive/table name appended after ``area`` (e.g. ``'SalesOrders'``).
        chave: API key, sent as the password of HTTP basic auth (empty user).
        spark: Spark session used to build the DataFrame.
    """

    # Root of the data endpoint; the final URL is URL_BASE/<area>/<arquivo>.
    URL_BASE = "https://datasap-293251100165.herokuapp.com/dados"
    # Seconds to wait for the API; without a timeout `requests` may block forever.
    TIMEOUT_SEGUNDOS = 30

    def __init__(self, area, arquivo, chave, spark):
        self.area = area
        self.arquivo = arquivo
        self.chave = chave
        self.spark = spark

    def get_dados_archive(self):
        """Download the archive and return the decoded JSON payload.

        Returns:
            The parsed JSON body on HTTP 200, or ``None`` when the request
            fails (network error, timeout, non-200 status or invalid JSON).
            A diagnostic message is printed in every failure case.
        """
        url = f"{self.URL_BASE}/{self.area}/{self.arquivo}"
        auth = ("", f"{self.chave}")

        try:
            response = requests.get(url, auth=auth, timeout=self.TIMEOUT_SEGUNDOS)
        except requests.RequestException as erro:
            # Connection errors and timeouts are reported the same way as a
            # bad status so callers only have to handle the `None` result.
            print(f"Erro: Não foi possível obter dados da API. ({erro})")
            return None

        if response.status_code != 200:
            print(
                "Erro: Não foi possível obter dados da API. "
                f"(HTTP {response.status_code})"
            )
            return None

        try:
            return response.json()
        except ValueError:
            # JSON decoding errors raised by `response.json()` derive from ValueError.
            print("Erro: Não foi possível obter dados da API. (resposta não é JSON válido)")
            return None

    def create_dataframe(self):
        """Fetch the archive and convert it to a Spark DataFrame.

        Returns:
            A Spark DataFrame built from the payload, or ``None`` when the
            download failed or the payload is empty (Spark cannot infer a
            schema from an empty dataset).
        """
        dados = self.get_dados_archive()
        if not dados:
            print("Não foi possível obter dados da API.")
            return None
        return self.spark.createDataFrame(dados)

In [0]:
# Extraction parameters: API area and the archive (table) to download.
area = 'SAP'
arquivo = 'SalesOrders'

# The API key must not live in the notebook source; it is read from the
# SAP_API_KEY environment variable (or the cluster's secret-backed env).
chave = os.environ.get("SAP_API_KEY", "")
if not chave:
    print("Aviso: variável de ambiente SAP_API_KEY não definida; a API recusará a autenticação.")

In [0]:
# Build the extractor for the configured area/archive and materialise the
# payload as a Spark DataFrame (None when the extraction failed).
extract = Ext_SAP_API(area, arquivo, chave, spark)
df = extract.create_dataframe()

# Report the outcome; the DataFrame is only rendered when it was created.
if df is None:
    print("Não foi possível criar o Dataframe.")
else:
    print('Dataframe criado com sucesso!')
    display(df)

Dataframe criado com sucesso!


BILLINGSTATUS,CHANGEDAT,CHANGEDBY,CREATEDAT,CREATEDBY,CURRENCY,DELIVERYSTATUS,FISCALYEARPERIOD,FISCVARIANT,GROSSAMOUNT,LIFECYCLESTATUS,NETAMOUNT,NOTEID,PARTNERID,SALESORDERID,SALESORG,TAXAMOUNT
C,20180116,4,20180111,4,USD,C,2018001,K4,13587,C,11888.625,,100000022,500000000,APJ,1698.375
C,20180115,2,20180112,2,USD,C,2018001,K4,12622,C,11044.25,,100000026,500000001,EMEA,1577.75
C,20180120,5,20180115,5,USD,C,2018001,K4,45655,C,39948.125,,100000018,500000002,APJ,5706.875
C,20180120,3,20180115,3,USD,C,2018001,K4,101786,C,89062.75,,100000009,500000003,EMEA,12723.25
C,20180117,8,20180116,8,USD,C,2018001,K4,71684,C,62723.5,,100000025,500000004,EMEA,8960.5
C,20180119,8,20180116,8,USD,C,2018001,K4,104213,C,91186.375,,100000008,500000005,EMEA,13026.625
C,20180120,3,20180117,3,USD,C,2018001,K4,173987,C,152238.625,,100000038,500000006,EMEA,21748.375
C,20180121,4,20180119,4,USD,C,2018001,K4,1398,C,1223.25,,100000020,500000007,APJ,174.75
C,20180122,3,20180120,3,USD,C,2018001,K4,127803,C,111827.625,,100000028,500000008,EMEA,15975.375
C,20180122,4,20180121,4,USD,C,2018001,K4,79101,C,69213.375,,100000021,500000009,APJ,9887.625


Verificação de qualidade

In [0]:
# Fail early with a clear message when the extraction cell produced nothing.
if df is None:
    raise ValueError("Dataframe não disponível: execute a extração antes da validação.")

# Wrap the Spark DataFrame so Great Expectations can record expectations on it.
df_ge = ge.dataset.SparkDFDataset(df)

# Expectation: the table must expose exactly these columns, in this order.
colunas_esperadas = [
    'BILLINGSTATUS',
    'CHANGEDAT',
    'CHANGEDBY',
    'CREATEDAT',
    'CREATEDBY',
    'CURRENCY',
    'DELIVERYSTATUS',
    'FISCALYEARPERIOD',
    'FISCVARIANT',
    'GROSSAMOUNT',
    'LIFECYCLESTATUS',
    'NETAMOUNT',
    'NOTEID',
    'PARTNERID',
    'SALESORDERID',
    'SALESORG',
    'TAXAMOUNT',
]

expectativa_colunas = df_ge.expect_table_columns_to_match_ordered_list(column_list=colunas_esperadas)

# Evaluate every expectation registered on the dataset.
resultado_validacao = df_ge.validate()

# Gate the pipeline on the result: a failed check must stop the run so the
# following cell does not overwrite the transient data with a bad extract.
if resultado_validacao['success']:
    print('Arquivo Ok!')
else:
    print('Erro na validação!')
    raise ValueError(
        "Validação de qualidade falhou. "
        f"Colunas encontradas: {df.columns}; colunas esperadas: {colunas_esperadas}."
    )

Arquivo Ok!


In [0]:
# Destination of the sales-orders extract in the transient area.
# NOTE(review): the variable is named "delta" but the data is written as
# Parquet — confirm which format downstream readers expect.
tmp_delta_path = "/tmp/transient/tb_fat_sales_orders"

# Persist the DataFrame, replacing any previous extract at the same path.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .save(tmp_delta_path)
)