
# 1. Imports

In [0]:
from pyspark.sql.functions import (
    datediff
)


# 2. Transformações de Dados


## 2.1 Adicionando coluna calculada em orders

> Teste Prático -> Adicione uma coluna calculada, por exemplo, o valor total de cada transação.

Estou criando uma coluna calculada. Esta coluna, chamada "delivery_time_diff", representa a diferença, em dias, entre duas datas: order_delivered_customer_date e order_estimated_delivery_date (data real de entrega menos data estimada).

delivery_time_diff | Descrição
-------------------| -----------------
&gt; 0             | pedido entregue DEPOIS da data estimada
&lt; 0             | pedido entregue ANTES da data estimada
&equals; 0         | pedido entregue NA data estimada
null               | Pedido não foi entregue. Coluna order_status pode auxiliar a entender o null do order_delivered_customer_date

Com esta coluna calculada, podemos:

- Analisar se o algoritmo que estima a data de entrega está retornando um "resultado satisfatório".
- Analisar para quais estados/cidades o algoritmo de estimativa possui melhor performance.
- Analisar para quais estados/cidades, por exemplo, as entregas são realizadas antes/depois do previsto.
- Gerar relatórios a respeito das transportadoras que mais "atrasam" a entrega, se comparado com a data prevista pelo algoritmo.

In [0]:
# Location of the orders Delta table (silver folder on DBFS)
path_to_delta_table = "dbfs:/FileStore/Datum/KaggleOlistData/silver/delta/orders"

# Load the Delta table into a Spark DataFrame
df_orders = spark.read.load(path_to_delta_table, format="delta")

In [0]:
# Remove the path variable from the notebook namespace; no later visible cell
# references it (the export cell spells out the path literal again).
del path_to_delta_table

In [0]:
# Calculando a diferença de tempo entre as datas estimada e real de entrega
df_orders = df_orders.withColumn(
    "delivery_time_diff",
    datediff(
        df_orders["order_delivered_customer_date"],
        df_orders["order_estimated_delivery_date"],
    ),
)

In [0]:
# Preview the first 10 rows. `limit` keeps the result a Spark DataFrame, so
# `display` renders it with the DataFrame's own schema; `take(10)` would
# instead collect a plain Python list of Row objects onto the driver.
display(df_orders.limit(10))

order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,delivery_time_diff
e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02T10:56:33Z,2017-10-02T11:07:15Z,2017-10-04T19:55:00Z,2017-10-10T21:25:13Z,2017-10-18T00:00:00Z,-8.0
53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24T20:41:37Z,2018-07-26T03:24:27Z,2018-07-26T14:31:00Z,2018-08-07T15:27:45Z,2018-08-13T00:00:00Z,-6.0
47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08T08:38:49Z,2018-08-08T08:55:23Z,2018-08-08T13:50:00Z,2018-08-17T18:06:29Z,2018-09-04T00:00:00Z,-18.0
949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18T19:28:06Z,2017-11-18T19:45:59Z,2017-11-22T13:39:59Z,2017-12-02T00:28:42Z,2017-12-15T00:00:00Z,-13.0
ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13T21:18:39Z,2018-02-13T22:20:29Z,2018-02-14T19:46:34Z,2018-02-16T18:17:02Z,2018-02-26T00:00:00Z,-10.0
a4591c265e18cb1dcee52889e2d8acc3,503740e9ca751ccdda7ba28e9ab8f608,delivered,2017-07-09T21:57:05Z,2017-07-09T22:10:13Z,2017-07-11T14:58:04Z,2017-07-26T10:57:55Z,2017-08-01T00:00:00Z,-6.0
136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11T12:22:08Z,2017-04-13T13:25:17Z,,,2017-05-09T00:00:00Z,
6514b8ad8028c9f2cc2374ded245783f,9bdf08b4b3b52b5526ff42d37d47f222,delivered,2017-05-16T13:10:30Z,2017-05-16T13:22:11Z,2017-05-22T10:07:46Z,2017-05-26T12:55:51Z,2017-06-07T00:00:00Z,-12.0
76c6e866289321a7c93b82b54852dc33,f54a9f0e6b351c431402b8461ea51999,delivered,2017-01-23T18:29:09Z,2017-01-25T02:50:47Z,2017-01-26T14:16:31Z,2017-02-02T14:08:10Z,2017-03-06T00:00:00Z,-32.0
e69bfb5eb88e0ed6a785585b27e16dbf,31ad1d1b63eb9962463f764d4e6e0c9d,delivered,2017-07-29T11:55:02Z,2017-07-29T12:05:32Z,2017-08-10T19:45:24Z,2017-08-16T17:14:30Z,2017-08-23T00:00:00Z,-7.0



# 3. Exportando dados para Delta Lake

In [0]:
# Persist the enriched orders back to the Delta location they were read from.
(
    df_orders.write
    .format("delta")
    .mode("overwrite")  # replace the existing table contents
    .option("mergeSchema", "true")  # let the new delivery_time_diff column extend the schema
    .save("dbfs:/FileStore/Datum/KaggleOlistData/silver/delta/orders")
)

In [0]:
# Drop the DataFrame reference from the notebook namespace now that the
# export cell has run; nothing after this point in the visible cells uses it.
del df_orders