## Configurações iniciais

In [1]:
import sys
# Make the project's `utils` helpers importable — set your local folder here
sys.path.extend(['D:\\__case_ifood\\notebooks\\utils'])

In [5]:
import importlib

import utils

# Reload `utils` so that edits made to the module are picked up without restarting the kernel
# (importlib.reload returns the reloaded module object).
utils = importlib.reload(utils)

from utils import (
    get_bq_client,
    send_parquets_to_bigquery,
    credencial_gcp,
    pasta_projeto,
    semanas,
    bigquery,
    service_account,
    pl,
    pd,
    datetime,
    gc,
    os,
)


In [4]:
# `pasta_projeto` and `credencial_gcp` are no longer set in this cell: both are imported
# from `utils` in the cell above.

# Build the service-account credentials from the key file referenced by `credencial_gcp`
credencial = service_account.Credentials.from_service_account_file(
    credencial_gcp
)

# BigQuery client provided by the project helper.
# NOTE(review): presumably equivalent to
# bigquery.Client(credentials=credencial, project="case-ifood-fsg") — confirm in utils.
client = get_bq_client()


#### Filtro por semana para otimizar o download da tabela no BigQuery

In [None]:
# Download the consolidated sales table one week at a time and save each week as its own
# Parquet file inside `pasta_projeto`.

# NOTE(review): this binds the `pd.Timestamp` class itself (it is never called) and the name is
# not used anywhere in this cell — kept only in case a later cell relies on it; confirm and remove.
system_hour = pd.Timestamp

# The SQL text is the same for every week, so it is built once, outside the loop. The week is
# supplied through the named query parameter `@semana` instead of being interpolated into the
# SQL string.
#
# Query outline:
#   * for each silver table (order, consumer, merchants, ab_test, order_details) keep only the
#     rows whose insert_date equals that table's MAX(insert_date);
#   * LEFT JOIN consumer, merchants, ab_test and order_details onto the orders;
#   * keep only the orders whose week label (`semana`) matches the requested week.
qry = """
         WITH
         -- Busca a maior data de inserção de registros na tabela
            order_last_date AS (
                SELECT
                        MAX(insert_date) last_date
                FROM silver.order
                ),

        -- Seleciona as colunas e as linhas necessárias, filtrando pelos registros inseridos por ultimo
            tbl_order AS (
                SELECT
                        o.order_id,
                        o.order_created_at,
                        o.order_total_amount,
                        o.customer_id,
                        o.cpf,
                        o.customer_name,
                        o.delivery_address_district,
                        o.delivery_address_city,
                        o.delivery_address_state,
                        o.delivery_address_country,
                        o.merchant_id,
                        o.order_scheduled,
                        o.order_scheduled_date,
                        o.origin_platform,
                        CAST(DATE(DATE_TRUNC(o.order_created_at,week)) AS STRING) AS semana
                FROM `silver.order` o
                INNER JOIN order_last_date old
                    ON o.insert_date = old.last_date
                ),

        -- Busca a maior data de inserção de registros na tabela
            consumer_last_date AS (
                SELECT
                        MAX(insert_date) last_date
                FROM silver.consumer c
                ),

        -- Seleciona as colunas e as linhas necessárias, filtrando pelos registros inseridos por ultimo
            tbl_consumer AS (
                SELECT
                        c.customer_id,
                        c.customer_name,
                        c.created_at AS customer_created_at,
                        c.active AS customer_active,
                        CONCAT(c.customer_phone_area,'-',c.customer_phone_number) AS customer_phone,
                        c.language AS customer_language,
                FROM silver.consumer c
                INNER JOIN consumer_last_date cld
                    ON c.insert_date = cld.last_date
                ),

        -- Busca a maior data de inserção de registros na tabela
            merchants_last_date AS (
                SELECT
                        MAX(insert_date) last_date
                FROM silver.merchants
                ),

        -- Seleciona as colunas e as linhas necessárias, filtrando pelos registros inseridos por ultimo
            tbl_merchants AS (
                SELECT
                        m.merchant_id,
                        m.created_at AS merchant_created_at,
                        m.enabled AS merchant_enabled,
                        m.price_range,
                        m.average_ticket,
                        m.takeout_time,
                        m.delivery_time,
                        m.minimum_order_value,
                        m.merchant_city,
                        m.merchant_state
                FROM silver.merchants m
                INNER JOIN merchants_last_date mld
                    ON m.insert_date = mld.last_date
                ),

        -- Busca a maior data de inserção de registros na tabela
            ab_test_last_date AS (
                SELECT
                        MAX(insert_date) last_date
                FROM silver.ab_test
                ),

        -- Seleciona as colunas e as linhas necessárias, filtrando pelos registros inseridos por ultimo
            tbl_ab_test AS (
                SELECT
                        ab.customer_id,
                        ab.is_target
                FROM silver.ab_test ab
                INNER JOIN ab_test_last_date abl
                    ON ab.insert_date = abl.last_date
                ),

        -- Busca a maior data de inserção de registros na tabela
            order_details_last_date AS (
                SELECT
                        MAX(insert_date) last_date
                FROM silver.order_details od
                ),

        -- Seleciona as colunas e as linhas necessárias, filtrando pelos registros inseridos por ultimo
            tbl_order_details AS (
                SELECT
                        od.order_id,
                        od.cpf,
                        od.name AS product,
                        od.quantity,
                        od.unitPrice,
                        od.addition,
                        od.discount,
                        od.type AS product_type,
                        od.sequence,
                FROM silver.order_details od
                INNER JOIN order_details_last_date odl
                    ON od.insert_date = odl.last_date
                ),

        -- Monta a tabela final
                tabela AS (
                            SELECT
                                o.order_id,
                                o.order_created_at,
                                od.product,
                                od.quantity,
                                od.unitPrice,
                                od.addition,
                                od.discount,
                                o.order_total_amount,
                                od.product_type,
                                od.sequence,
                                o.customer_id,
                                o.cpf,
                                COALESCE(c.customer_name, o.customer_name) AS customer_name,
                                c.customer_created_at,
                                c.customer_active,
                                c.customer_phone,
                                ab.is_target,
                                c.customer_language,
                                o.delivery_address_district,
                                o.delivery_address_city,
                                o.delivery_address_state,
                                o.delivery_address_country,
                                o.merchant_id,
                                m.merchant_created_at,
                                m.merchant_enabled,
                                m.price_range,
                                m.average_ticket,
                                m.takeout_time,
                                m.delivery_time,
                                m.minimum_order_value,
                                m.merchant_city,
                                m.merchant_state,
                                IF(od.order_id IS NULL, FALSE, TRUE) has_details,
                                o.order_scheduled,
                                o.order_scheduled_date,
                                o.origin_platform,
                                o.semana
                            FROM tbl_order o
                            LEFT JOIN tbl_consumer c
                                ON o.customer_id = c.customer_id
                            LEFT JOIN tbl_merchants m
                                ON o.merchant_id = m.merchant_id
                            LEFT JOIN tbl_ab_test ab
                                ON o.customer_id = ab.customer_id
                            LEFT JOIN tbl_order_details od
                                ON  o.order_id = od.order_id
                                AND o.cpf = od.cpf
                )
        SELECT * EXCEPT(semana)
        FROM tabela
        WHERE semana = @semana
"""

for semana in semanas:
    print(f"Lendo semana: {semana}")

    # Bind the current week to @semana; str() yields the same text the former f-string
    # interpolation produced.
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("semana", "STRING", str(semana)),
        ]
    )

    # Run the query and materialise the result as a Polars DataFrame via Arrow
    result = client.query(qry, job_config=job_config)
    arrow_table = result.to_arrow()
    df = pl.from_arrow(arrow_table)

    # Write the week's rows to a Parquet file in the project folder
    nome_arquivo = f"sales_{semana}.parquet"
    caminho_arquivo = os.path.join(pasta_projeto, nome_arquivo)
    df.write_parquet(caminho_arquivo)

    print(f"Arquivo salvo: {caminho_arquivo}")

Lendo semana: 2018-12-02
Arquivo salvo: D:\__case_ifood\sales_2018-12-02.parquet
Lendo semana: 2018-12-09
Arquivo salvo: D:\__case_ifood\sales_2018-12-09.parquet
Lendo semana: 2018-12-16
Arquivo salvo: D:\__case_ifood\sales_2018-12-16.parquet
Lendo semana: 2018-12-23
Arquivo salvo: D:\__case_ifood\sales_2018-12-23.parquet
Lendo semana: 2018-12-30
Arquivo salvo: D:\__case_ifood\sales_2018-12-30.parquet
Lendo semana: 2019-01-06
Arquivo salvo: D:\__case_ifood\sales_2019-01-06.parquet
Lendo semana: 2019-01-13
Arquivo salvo: D:\__case_ifood\sales_2019-01-13.parquet
Lendo semana: 2019-01-20
Arquivo salvo: D:\__case_ifood\sales_2019-01-20.parquet
Lendo semana: 2019-01-27
Arquivo salvo: D:\__case_ifood\sales_2019-01-27.parquet


In [5]:
# Drop the last week's DataFrame and Arrow table left over from the download loop,
# then force a garbage-collection pass to release their memory.
del df, arrow_table
gc.collect()

286

In [6]:
# Destination dataset and table for the Parquet upload
dataset_nome = "gold"
tabela_nome = "sales"

# A single UTC timestamp is captured once and passed to the loader so that every inserted
# record carries the same instant, avoiding problems with the table partitioning.
var_timestamp = datetime.datetime.now(tz=datetime.timezone.utc)

# Send the Parquet files found under `pasta_projeto` to BigQuery
send_parquets_to_bigquery(
    pasta_projeto,
    dataset_nome,
    tabela_nome,
    client,
    var_timestamp,
)

Lendo arquivo: sales_2018-12-02.parquet
Enviando para BigQuery: sales_2018-12-02.parquet
Dataset 'case-ifood-fsg.gold' pronto.
Convertido de Polars para Pandas.
Enviando chunk 1/1 com 940370 linhas...
Tabela 'case-ifood-fsg.gold.sales' carregada com 940370 linhas.
Lendo arquivo: sales_2018-12-09.parquet
Enviando para BigQuery: sales_2018-12-09.parquet
Dataset 'case-ifood-fsg.gold' pronto.
Convertido de Polars para Pandas.
Enviando chunk 1/2 com 1000000 linhas...
Enviando chunk 2/2 com 369852 linhas...
Tabela 'case-ifood-fsg.gold.sales' carregada com 1369852 linhas.
Lendo arquivo: sales_2018-12-16.parquet
Enviando para BigQuery: sales_2018-12-16.parquet
Dataset 'case-ifood-fsg.gold' pronto.
Convertido de Polars para Pandas.
Enviando chunk 1/2 com 1000000 linhas...
Enviando chunk 2/2 com 228143 linhas...
Tabela 'case-ifood-fsg.gold.sales' carregada com 1228143 linhas.
Lendo arquivo: sales_2018-12-23.parquet
Enviando para BigQuery: sales_2018-12-23.parquet
Dataset 'case-ifood-fsg.gold' pr