
# 1. Imports

In [0]:
from pyspark.ml.fpm import FPGrowth

from pyspark.sql.functions import (
    collect_set,
    size,
    concat_ws,
)


# 2. Carregando sparkDFs


## 2.1 Order items

In [0]:
# Local do arquivo Delta
path_to_delta_table = "dbfs:/FileStore/Datum/KaggleOlistData/silver/delta/order_items"

# Ler o arquivo Delta como um DataFrame
df_order_items = spark.read.format("delta").load(path_to_delta_table)

In [0]:
display(df_order_items.take(10))

order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19T09:45:35Z,58.9,13.29
00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03T11:05:13Z,239.9,19.93
000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18T14:48:30Z,199.0,17.87
00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15T10:10:18Z,12.99,12.79
00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13T13:57:51Z,199.9,18.14
00048cc3ae777c65dbb7d2a0634bc1ea,1,ef92defde845ab8450f9d70c526ef70f,6426d21aca402a131fc0a5d0960a3c90,2017-05-23T03:55:27Z,21.9,12.69
00054e8431b9d7675808bcb819fb4a32,1,8d4f2bb7e93e6710a28f34fa83ee7d28,7040e82f899a04d1b434b795a43b4617,2017-12-14T12:10:31Z,19.9,11.85
000576fe39319847cbb9d288c5617fa6,1,557d850972a7d6f792fd18ae1400d9b6,5996cddab893a4652a15592fb58ab8db,2018-07-10T12:30:45Z,810.0,70.75
0005a1a1728c9d785b8e2b08b904576c,1,310ae3c140ff94b03219ad0adc3c778f,a416b6a846a11724393025641d4edd5e,2018-03-26T18:31:29Z,145.95,11.65
0005f50442cb953dcd1d21e1fb923495,1,4535b0e1091c278dfd193e5a1d63b39f,ba143b05f0110f0dc71ad71b4466ce92,2018-07-06T14:10:56Z,53.99,11.4



## 2.2 Products

In [0]:
# Local do arquivo Delta
path_to_delta_table = "dbfs:/FileStore/Datum/KaggleOlistData/silver/delta/products"

# Ler o arquivo Delta como um DataFrame
df_products = spark.read.format("delta").load(path_to_delta_table)

In [0]:
display(df_products.take(10))

product_id,product_category_name,product_name_lenght,product_description_lenght,product_weight_g,product_length_cm,product_height_cm,product_width_cm
1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40,287,1,225,16,10
3aa071139cb16b67ca9e5dea641aaa2f,artes,44,276,1,1000,30,18
96bd76ec8810374ed1b65e291975717f,esporte_lazer,46,250,1,154,18,9
cef67bcfe19066a932b7673e239eb23d,bebes,27,261,1,371,26,4
9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37,402,4,625,20,17
41d3672d4792049fa1779bb35283ed13,instrumentos_musicais,60,745,1,200,38,5
732bd381ad09e530fe0a5f457d81becb,cool_stuff,56,1272,4,18350,70,24
2548af3e6e77a690cf3eb6368e9ab61e,moveis_decoracao,56,184,2,900,40,8
37cc742be07708b53a98702e77a21a02,eletrodomesticos,57,163,1,400,27,13
8c92109888e8cdf9d66dc7e463025574,brinquedos,36,1156,1,600,17,10


In [0]:
del path_to_delta_table


# 3. Transformação


## 3.1 Order items

In [0]:
# Selecionando colunas 
df_order_items = df_order_items.select(["order_id", "product_id"])


## 3.2 Products

In [0]:
# Selecionando colunas 
df_products = df_products.select(["product_id", "product_category_name"])
# Remove linhas onde 'product_category_name' é nulo em df_products
df_products = df_products.dropna(subset=["product_category_name"])


## 3.3 Join

In [0]:
df_join = df_order_items.join(
    df_products, df_order_items.product_id == df_products.product_id, "left"
).drop(df_products.product_id)

In [0]:
display(df_join.take(10))

order_id,product_id,product_category_name
00010242fe8c5a6d1ba2dd792cb16214,4244733e06e7ecb4970a6e2683c13e61,cool_stuff
00018f77f2f0320c557190d7a144bdd3,e5f2d52b802189ee658865ca93d83a8f,pet_shop
000229ec398224ef6ca0657da4fc703e,c777355d18b72b67abbeef9df44fd0fd,moveis_decoracao
00024acbcdf0a6daa1e931b038114c75,7634da152a4610f1595efa32f14722fc,perfumaria
00042b26cf59d7ce69dfabb4e55b4fd9,ac6c3623068f30de03045865e4e10089,ferramentas_jardim
00048cc3ae777c65dbb7d2a0634bc1ea,ef92defde845ab8450f9d70c526ef70f,utilidades_domesticas
00054e8431b9d7675808bcb819fb4a32,8d4f2bb7e93e6710a28f34fa83ee7d28,telefonia
000576fe39319847cbb9d288c5617fa6,557d850972a7d6f792fd18ae1400d9b6,ferramentas_jardim
0005a1a1728c9d785b8e2b08b904576c,310ae3c140ff94b03219ad0adc3c778f,beleza_saude
0005f50442cb953dcd1d21e1fb923495,4535b0e1091c278dfd193e5a1d63b39f,livros_tecnicos


In [0]:
del df_order_items, df_products


# 4. Identificando Padrões de Compra

> Teste Prático -> Introduza uma regra mais complexa, como identificar padrões de comportamento de compra ao longo do tempo ou criar categorias personalizadas de produtos com base em determinados critérios.


## 4.1 Identificação de dupla de produtos mais comuns nas orders

In [0]:
df_grouped = df_join.groupBy("order_id").agg(
    collect_set("product_category_name").alias("products")
)
fpGrowth = FPGrowth(itemsCol="products", minSupport=0.0001, minConfidence=0.0001)
model = fpGrowth.fit(df_grouped)

In [0]:
del df_join, df_grouped

In [0]:
# Filtrar para conjuntos de itens que contêm exatamente 2 produtos
frequent_itemsets = model.freqItemsets
df_most_common_product_pair = frequent_itemsets.filter(size(frequent_itemsets["items"]) == 2)

In [0]:
association_rules = model.associationRules

In [0]:
display(association_rules.take(10))

antecedent,consequent,confidence,lift,support
List(moveis_decoracao),List(cama_mesa_banho),0.0108543960303923,0.1137262226542091,0.0007094642531368455
List(moveis_decoracao),List(construcao_ferramentas_iluminacao),0.0017056908047759,0.6897282333771407,0.00011148723977864716
List(moveis_decoracao),List(utilidades_domesticas),0.0037215072104202,0.0624041859998846,0.0002432448867897756
List(moveis_decoracao),List(bebes),0.0018607536052101,0.0636371283229326,0.0001216224433948878
List(moveis_decoracao),List(casa_construcao),0.0020158164056442,0.4059031458761207,0.00013175764701112846
List(moveis_decoracao),List(ferramentas_jardim),0.0026360676073809,0.0739312809976841,0.00017229846147609105
List(cool_stuff),List(bebes),0.0055066079295154,0.1883240824865054,0.000202704072324813
List(cool_stuff),List(cama_mesa_banho),0.0027533039647577,0.0288475617486231,0.0001013520361624065
List(esporte_lazer),List(utilidades_domesticas),0.0014248704663212,0.0238929757698932,0.00011148723977864716
List(esporte_lazer),List(beleza_saude),0.0018134715025906,0.020249884481056,0.0001418928506273691


In [0]:
del model, frequent_itemsets, association_rules

In [0]:
# Transforma a coluna 'items' em uma string, unindo os elementos com um separador
df_most_common_product_pair = df_most_common_product_pair.withColumn(
    "items_str", 
    concat_ws(", ", "items")
).select(["items_str", "freq"])

In [0]:
display(df_most_common_product_pair.take(10))

items_str,freq
"moveis_decoracao, cama_mesa_banho",70
"construcao_ferramentas_iluminacao, moveis_decoracao",11
"perfumaria, beleza_saude",12
"casa_conforto, cama_mesa_banho",43
"utilidades_domesticas, esporte_lazer",11
"utilidades_domesticas, moveis_decoracao",24
"utilidades_domesticas, cama_mesa_banho",20
"bebes, moveis_decoracao",12
"bebes, cool_stuff",20
"bebes, brinquedos",19



# 5. Exportando dados para Delta Lake

> Teste Prático -> "Grave os mesmos dados em formato Delta Lake para aproveitar as funcionalidades de versionamento e transações ACID."

In [0]:
df_most_common_product_pair.write.format("delta").mode("overwrite").option(
    "mergeSchema", "true"
).save("dbfs:/FileStore/Datum/KaggleOlistData/gold/delta/most_common_product_pair")

In [0]:
del df_most_common_product_pair


# 6. Criando Delta Table

In [0]:
%sql

create database if not exists olist;

In [0]:
%sql

use olist;

In [0]:
%sql

drop table if exists most_common_product_pair;

In [0]:
%sql

CREATE TABLE most_common_product_pair
USING DELTA
LOCATION 'dbfs:/FileStore/Datum/KaggleOlistData/gold/delta/most_common_product_pair'