In [0]:
%run ../_utils

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
from pyspark.sql.functions import when, to_date, col, dayofweek, max, collect_set, sum
from pyspark.sql.types import StringType, BooleanType


# Camada GOLD

Na camada gold, as limpezas e ajustes já foram feitos, então essa camada é responsável por aplicar regras de negócio, agregações e junções de dados que convirjam para analises.

In [0]:
tb_name = "olist_gold.orders"
dataset_location = "olist_orders_dataset"
target_location = f"dbfs:/FileStore/delta/brazilian_ecommerce/{dataset_location}/gold"

## 1 - Data ingestion

Conforme o schema disponibilizado, iremos agregar os dados em uma big table que permitirá ~quase~ todas as analises subsequentes

Apenas para fins de teste, iremos agregar apenas reviews e payments à table "fact" orders;
Portanto, iremos carregar essas tabelas


In [0]:
df_orders = spark.read.table("olist_silver.orders") # leituira da delta table central, orders
df_order_reviews = spark.read.table("olist_silver.order_reviews") # leituira da delta table "dim" reviews
df_order_payments = spark.read.table("olist_silver.order_payments") # leituira da delta table "dim" payments
#df_order_items = spark.sql("select * from olist_silver.order_items") # leitura de outra maneira, da delta table "dim" items


## 2 - preparation


### 2.1 order_payments

uma order_id pode ter várias formas de pagamento (geralmente vouchs).  cada pagamento gera um registro

Então iremos agregar, somando em valor de pagamento e pegando o max payment_sequential

In [0]:
df_order_payments = df_order_payments.groupBy("order_id").agg(
    max("payment_sequential").alias("total_payment_sequential"),
    sum("payment_value").alias("total_payment_value"),
    collect_set("payment_type").alias("payment_types"),
)


### 2.2 order_reviews

podemos perceber que existem casos onde existe mais de um review para um mesmo order_id


## 2 - Data Join

In [0]:
print(f"Total de registros ANTES da agregação {df_orders.count()}")

Total de registros ANTES da agregação 99441


In [0]:
df = (df_orders
      .join(df_order_payments, on=['order_id'], how='left')
      .join(df_order_reviews, on=['order_id'], how='left'))

In [0]:
print(f"Total de registros DEPOIS da agregação {df.count()}")

Total de registros DEPOIS da agregação 99992


In [0]:
display(df)

order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,month_year,minutes_to_approve,days_to_deliver,hours_to_deliver,total_elapsed_days,total_elapsed_hours,overdue,delay_hours,delay_days,day_of_week,weekend,total_payment_sequential,total_payment_value,payment_types,review_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02T10:56:33Z,2017-10-02T11:07:15Z,2017-10-04T19:55:00Z,2017-10-10T21:25:13Z,2017-10-18T00:00:00Z,10-2017,10.7,6.06,145.5,8.44,202.48,True,170.58,7.11,Segunda-feira,False,3,38.71,"List(credit_card, voucher)",a54f0611adc9ed256b57ede6b6eb5114,4.0,,"Não testei o produto ainda, mas ele veio correto e em boas condições. Apenas a caixa que veio bem amassada e danificada, o que ficará chato, pois se trata de um presente.",2017-10-11T00:00:00Z,2017-10-12T03:43:48Z
53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24T20:41:37Z,2018-07-26T03:24:27Z,2018-07-26T14:31:00Z,2018-08-07T15:27:45Z,2018-08-13T00:00:00Z,07-2018,1842.83,12.04,288.95,13.78,330.77,True,128.54,5.36,Terça-feira,False,1,141.46,List(boleto),8d5266042046a06655c8db133d120ba5,4.0,Muito boa a loja,Muito bom o produto.,2018-08-08T00:00:00Z,2018-08-08T18:37:50Z
47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08T08:38:49Z,2018-08-08T08:55:23Z,2018-08-08T13:50:00Z,2018-08-17T18:06:29Z,2018-09-04T00:00:00Z,08-2018,16.57,9.18,220.27,9.39,225.46,True,413.89,17.25,Quarta-feira,False,1,179.12,List(credit_card),e73b67b67587f7644d5bd1a52deb1b01,5.0,,,2018-08-18T00:00:00Z,2018-08-22T19:07:58Z
949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18T19:28:06Z,2017-11-18T19:45:59Z,2017-11-22T13:39:59Z,2017-12-02T00:28:42Z,2017-12-15T00:00:00Z,11-2017,17.88,9.45,226.81,13.21,317.01,True,311.52,12.98,Sábado,True,1,72.2,List(credit_card),359d03e676b3c069f62cadba8dd3f6e8,5.0,,O produto foi exatamente o que eu esperava e estava descrito no site e chegou bem antes da data prevista.,2017-12-03T00:00:00Z,2017-12-05T19:21:58Z
ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13T21:18:39Z,2018-02-13T22:20:29Z,2018-02-14T19:46:34Z,2018-02-16T18:17:02Z,2018-02-26T00:00:00Z,02-2018,61.83,1.94,46.51,2.87,68.97,True,221.72,9.24,Terça-feira,False,1,28.62,List(credit_card),e50934924e227544ba8246aeb3770dd4,5.0,,,2018-02-17T00:00:00Z,2018-02-18T13:02:51Z
a4591c265e18cb1dcee52889e2d8acc3,503740e9ca751ccdda7ba28e9ab8f608,delivered,2017-07-09T21:57:05Z,2017-07-09T22:10:13Z,2017-07-11T14:58:04Z,2017-07-26T10:57:55Z,2017-08-01T00:00:00Z,07-2017,13.13,14.83,356.0,16.54,397.01,True,133.03,5.54,Domingo,True,1,175.26,List(credit_card),89b738e70a1ce346db29a20fb2910161,4.0,,,2017-07-27T00:00:00Z,2017-07-27T22:48:30Z
136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11T12:22:08Z,2017-04-13T13:25:17Z,,,2017-05-09T00:00:00Z,04-2017,2943.15,,,,,False,0.0,0.0,Terça-feira,False,1,65.95,List(credit_card),e07549ef5311abcc92ba1784b093fb56,2.0,,fiquei triste por n ter me atendido.,2017-05-13T00:00:00Z,2017-05-13T20:25:42Z
6514b8ad8028c9f2cc2374ded245783f,9bdf08b4b3b52b5526ff42d37d47f222,delivered,2017-05-16T13:10:30Z,2017-05-16T13:22:11Z,2017-05-22T10:07:46Z,2017-05-26T12:55:51Z,2017-06-07T00:00:00Z,05-2017,11.68,4.12,98.8,9.99,239.76,True,275.07,11.46,Terça-feira,False,1,75.16,List(credit_card),07d67dd06ed5f88bef11ef6b464e79ae,5.0,,,2017-05-27T00:00:00Z,2017-05-28T02:59:57Z
76c6e866289321a7c93b82b54852dc33,f54a9f0e6b351c431402b8461ea51999,delivered,2017-01-23T18:29:09Z,2017-01-25T02:50:47Z,2017-01-26T14:16:31Z,2017-02-02T14:08:10Z,2017-03-06T00:00:00Z,01-2017,1941.63,6.99,167.86,9.82,235.65,True,753.86,31.41,Segunda-feira,False,1,35.95,List(boleto),fc4af8aea8ec3f1a3cd181d3d0cadbd5,1.0,,,2017-02-03T00:00:00Z,2017-02-05T01:58:35Z
e69bfb5eb88e0ed6a785585b27e16dbf,31ad1d1b63eb9962463f764d4e6e0c9d,delivered,2017-07-29T11:55:02Z,2017-07-29T12:05:32Z,2017-08-10T19:45:24Z,2017-08-16T17:14:30Z,2017-08-23T00:00:00Z,07-2017,10.5,5.9,141.49,18.22,437.32,True,150.76,6.28,Sábado,True,2,169.76,"List(voucher, credit_card)",abc5655186d40772bd6e410420e6a3ed,5.0,,,2017-08-17T00:00:00Z,2017-08-18T01:47:32Z



## Saving data

In [0]:
save_dataframe(df, format_mode="delta", table_name=tb_name, target_location=target_location)

[LOG] Saving olist_gold.orders delta on dbfs:/FileStore/delta/brazilian_ecommerce/olist_orders_dataset/gold... OK!



## create delta table

TODO: implementar UPSERT

o upsert serve para não precisar reescrever todos os dados, mas aproveitar do Delta para fazer um MERGE, caso um registro antigo tenha uma nova versão e INSERT para os dados que são novos

In [0]:
create_table(table_name=tb_name, target_location=target_location)

[LOG] Creating delta table olist_gold.orders on dbfs:/FileStore/delta/brazilian_ecommerce/olist_orders_dataset/gold... OK!


In [0]:
# exit para fechar a execução
dbutils.notebook.exit("OK")

In [0]:
%sql

select * from olist_gold.orders limit 10

order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,month_year,order_purchase_month,order_purchase_year,minutes_to_approve,days_to_deliver,hours_to_deliver,total_elapsed_days,total_elapsed_hours,overdue,delay_hours,delay_days,number_day_of_week,day_of_week,weekend,total_payment_sequential,total_payment_value,payment_types,review_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02T10:56:33Z,2017-10-02T11:07:15Z,2017-10-04T19:55:00Z,2017-10-10T21:25:13Z,2017-10-18T00:00:00Z,10-2017,10,2017,10.7,6.06,145.5,8.44,202.48,True,170.58,7.11,2,Segunda-feira,False,3,38.71,"List(credit_card, voucher)",a54f0611adc9ed256b57ede6b6eb5114,4,,"Não testei o produto ainda, mas ele veio correto e em boas condições. Apenas a caixa que veio bem amassada e danificada, o que ficará chato, pois se trata de um presente.",2017-10-11T00:00:00Z,2017-10-12T03:43:48Z
53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24T20:41:37Z,2018-07-26T03:24:27Z,2018-07-26T14:31:00Z,2018-08-07T15:27:45Z,2018-08-13T00:00:00Z,07-2018,7,2018,1842.83,12.04,288.95,13.78,330.77,True,128.54,5.36,3,Terça-feira,False,1,141.46,List(boleto),8d5266042046a06655c8db133d120ba5,4,Muito boa a loja,Muito bom o produto.,2018-08-08T00:00:00Z,2018-08-08T18:37:50Z
47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08T08:38:49Z,2018-08-08T08:55:23Z,2018-08-08T13:50:00Z,2018-08-17T18:06:29Z,2018-09-04T00:00:00Z,08-2018,8,2018,16.57,9.18,220.27,9.39,225.46,True,413.89,17.25,4,Quarta-feira,False,1,179.12,List(credit_card),e73b67b67587f7644d5bd1a52deb1b01,5,,,2018-08-18T00:00:00Z,2018-08-22T19:07:58Z
949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18T19:28:06Z,2017-11-18T19:45:59Z,2017-11-22T13:39:59Z,2017-12-02T00:28:42Z,2017-12-15T00:00:00Z,11-2017,11,2017,17.88,9.45,226.81,13.21,317.01,True,311.52,12.98,7,Sábado,True,1,72.2,List(credit_card),359d03e676b3c069f62cadba8dd3f6e8,5,,O produto foi exatamente o que eu esperava e estava descrito no site e chegou bem antes da data prevista.,2017-12-03T00:00:00Z,2017-12-05T19:21:58Z
ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13T21:18:39Z,2018-02-13T22:20:29Z,2018-02-14T19:46:34Z,2018-02-16T18:17:02Z,2018-02-26T00:00:00Z,02-2018,2,2018,61.83,1.94,46.51,2.87,68.97,True,221.72,9.24,3,Terça-feira,False,1,28.62,List(credit_card),e50934924e227544ba8246aeb3770dd4,5,,,2018-02-17T00:00:00Z,2018-02-18T13:02:51Z
a4591c265e18cb1dcee52889e2d8acc3,503740e9ca751ccdda7ba28e9ab8f608,delivered,2017-07-09T21:57:05Z,2017-07-09T22:10:13Z,2017-07-11T14:58:04Z,2017-07-26T10:57:55Z,2017-08-01T00:00:00Z,07-2017,7,2017,13.13,14.83,356.0,16.54,397.01,True,133.03,5.54,1,Domingo,True,1,175.26,List(credit_card),89b738e70a1ce346db29a20fb2910161,4,,,2017-07-27T00:00:00Z,2017-07-27T22:48:30Z
136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11T12:22:08Z,2017-04-13T13:25:17Z,,,2017-05-09T00:00:00Z,04-2017,4,2017,2943.15,,,,,False,0.0,0.0,3,Terça-feira,False,1,65.95,List(credit_card),e07549ef5311abcc92ba1784b093fb56,2,,fiquei triste por n ter me atendido.,2017-05-13T00:00:00Z,2017-05-13T20:25:42Z
6514b8ad8028c9f2cc2374ded245783f,9bdf08b4b3b52b5526ff42d37d47f222,delivered,2017-05-16T13:10:30Z,2017-05-16T13:22:11Z,2017-05-22T10:07:46Z,2017-05-26T12:55:51Z,2017-06-07T00:00:00Z,05-2017,5,2017,11.68,4.12,98.8,9.99,239.76,True,275.07,11.46,3,Terça-feira,False,1,75.16,List(credit_card),07d67dd06ed5f88bef11ef6b464e79ae,5,,,2017-05-27T00:00:00Z,2017-05-28T02:59:57Z
76c6e866289321a7c93b82b54852dc33,f54a9f0e6b351c431402b8461ea51999,delivered,2017-01-23T18:29:09Z,2017-01-25T02:50:47Z,2017-01-26T14:16:31Z,2017-02-02T14:08:10Z,2017-03-06T00:00:00Z,01-2017,1,2017,1941.63,6.99,167.86,9.82,235.65,True,753.86,31.41,2,Segunda-feira,False,1,35.95,List(boleto),fc4af8aea8ec3f1a3cd181d3d0cadbd5,1,,,2017-02-03T00:00:00Z,2017-02-05T01:58:35Z
e69bfb5eb88e0ed6a785585b27e16dbf,31ad1d1b63eb9962463f764d4e6e0c9d,delivered,2017-07-29T11:55:02Z,2017-07-29T12:05:32Z,2017-08-10T19:45:24Z,2017-08-16T17:14:30Z,2017-08-23T00:00:00Z,07-2017,7,2017,10.5,5.9,141.49,18.22,437.32,True,150.76,6.28,7,Sábado,True,2,169.76,"List(voucher, credit_card)",abc5655186d40772bd6e410420e6a3ed,5,,,2017-08-17T00:00:00Z,2017-08-18T01:47:32Z
