### Premissas:
- Nome: dim_products
- Descrição da tabela: Tabela composta por variáveis qualitativas de produtos.
- Tipo: SCD-2

In [0]:
# Importa bibliotecas
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, ArrayType
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from datetime import datetime

In [0]:
%sql
-- Sanity check: list every product_id that occurs more than once in the
-- silver table. An empty result means product_id is unique at the source.
-- GROUP BY already yields one row per product_id.
SELECT
  product_id,
  COUNT(*)
FROM poc_datum.olist.products_silver
GROUP BY product_id
HAVING COUNT(*) > 1

product_id,count(1)


In [0]:
# Source rows for the dimension: one row per distinct
# (product_id, product_category_name) pair in the silver table.
products_query = """
SELECT DISTINCT
    product_id,
    product_category_name
FROM poc_datum.olist.products_silver 
"""
df_dimensao = spark.sql(products_query)

display(df_dimensao)

product_id,product_category_name
e586ebb6022265ae1eea38f46ffe3ead,beleza_saude
3519403062e217f433e0bbdc52e0b19f,consoles_games
c4344f6de69a96da8febce3f885171f0,consoles_games
afa6d9a2f3ba0d5b06ccd5eb4bd2e29a,esporte_lazer
c8aa3a6754ab1ad362858eca26f58c8b,cama_mesa_banho
37b4783642fb927cc790934843b00b06,moveis_decoracao
8934769521d632e65c27992662f05673,cool_stuff
06b4fb6f7d3ef030f7906bb13dde0728,bebidas
f4b952dd7ab601c02bb1bd06395f45f6,relogios_presentes
659dc4a2860c19265edc2fe7e451329c,malas_acessorios


In [0]:
# Initial load

# Placeholder member for "not informed" products. The key is written as the
# string "-1" so this DataFrame has the same column types as df_dimensao
# (product_id holds string identifiers); an integer -1 would make the
# positional union below depend on implicit long/string type coercion.
data = [("-1", "Não informado")]
column = df_dimensao.schema.fieldNames()
df = spark.createDataFrame( data, schema=column )

# Append the source rows to the placeholder row (positional union, not a join)
dim_products = df.union(df_dimensao)

# Add the default SCD-2 housekeeping columns.
# NOTE: current_timestamp() is evaluated each time an action runs, so the
# ingestion timestamp (and the hash key derived from it below) shown by
# display() can differ from the values written later by saveAsTable().
dim_products = (
    dim_products
    .withColumn( "row_ingestion_timestamp", current_timestamp() )
    .withColumn( "row_version", lit(1) )
    .withColumn( "row_current_indicator", lit(True) )
    .withColumn( "row_effective_date", to_timestamp( lit('1900-01-01 00:00:00'), "yyyy-MM-dd HH:mm:ss") )
    .withColumn( "row_expiration_date", to_timestamp( lit('2200-01-01 00:00:00') , "yyyy-MM-dd HH:mm:ss" ) )
)

# Surrogate key: SHA-256 over "ingestion timestamp|product_id|category".
# A hash is used here instead of monotonically_increasing_id().
dim_products = dim_products.withColumn(
    'sk_dim_products',
    sha2(
        concat_ws(
            "|",
            col("row_ingestion_timestamp"),
            col("product_id"),
            col("product_category_name"),
        ),
        256,
    ),
)

# Final column order: surrogate key, housekeeping columns, business columns
dim_products_select = dim_products.select(
    'sk_dim_products',
    'row_ingestion_timestamp',
    'row_version',
    'row_current_indicator',
    'row_effective_date',
    'row_expiration_date',
    'product_id',
    'product_category_name'
    )

display(dim_products_select)

sk_dim_products,row_ingestion_timestamp,row_version,row_current_indicator,row_effective_date,row_expiration_date,product_id,product_category_name
62a31fcbdf6e6544797514512b4d6373d8ec3ce6c9b59a876db45cbb53b362a4,2024-04-29T13:04:44.377Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,-1,Não informado
e54384c4905cc5468a93590c634c2b10af490cdd0a3bff2a4de9ed89dc1b7b20,2024-04-29T13:04:44.377Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,e586ebb6022265ae1eea38f46ffe3ead,beleza_saude
8eb0253f9ea309e223fb0f6e770d0b830c6818e05fe150c806b759caeacda9f2,2024-04-29T13:04:44.377Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,3519403062e217f433e0bbdc52e0b19f,consoles_games
dff49816193532c061f9a91cfea7b1210ca594883885d9c6ef1e234957364bc9,2024-04-29T13:04:44.377Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,c4344f6de69a96da8febce3f885171f0,consoles_games
62a383da0c4f13f7baf92136940836f340b356a4256697e2101339f4e9b4b7d0,2024-04-29T13:04:44.377Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,afa6d9a2f3ba0d5b06ccd5eb4bd2e29a,esporte_lazer
8bc4a3bba8bbda4161fe527d4615fd3395b315d55ef7816f50ed1ac12db04147,2024-04-29T13:04:44.377Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,c8aa3a6754ab1ad362858eca26f58c8b,cama_mesa_banho
4b57187494ae0660c9093536b453d095c1eb4a3867e48bc8b520fb372fe32b63,2024-04-29T13:04:44.377Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,37b4783642fb927cc790934843b00b06,moveis_decoracao
dbeb653e06ab0a8189ff76c20089884138a54308468fd3964b9a946248be486f,2024-04-29T13:04:44.377Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,8934769521d632e65c27992662f05673,cool_stuff
5e8bfee8f05bc1d40743b43d2f15af01c4cce6d24d92feb030d76ddb260843f9,2024-04-29T13:04:44.377Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,06b4fb6f7d3ef030f7906bb13dde0728,bebidas
1a07ae6aa98a300d6628d43c454216e159bcdf33076c0afefd3b41db7d8ad1d4,2024-04-29T13:04:44.377Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,f4b952dd7ab601c02bb1bd06395f45f6,relogios_presentes


In [0]:
# Persist the initial load as a Delta table in the olist schema.
table_name = 'dim_products'
spark.sql('USE olist')
# Drop through the same variable used by the write below, so both statements
# always target the same table (the name was previously hard-coded here).
spark.sql(f'DROP TABLE IF EXISTS {table_name}')
dim_products_select.write.format("delta").mode('overwrite').saveAsTable(table_name)

### Carga Diferencial (Upsert)

In [0]:
# Test fixture: simulate a category change for one existing product.
# Note that this replaces the df_dimensao built from the silver table.
column = ['product_id', 'product_category_name']
data = [('f4b952dd7ab601c02bb1bd06395f45f6', 'relogios_presentes1')]

df_dimensao = spark.createDataFrame(data=data, schema=column)
df_dimensao.show()

+--------------------+---------------------+
|          product_id|product_category_name|
+--------------------+---------------------+
|f4b952dd7ab601c02...|  relogios_presentes1|
+--------------------+---------------------+



In [0]:
# Incoming rows (candidate inserts/changes)
df_origem = df_dimensao

# Business columns stored in the dimension.
# No ORDER BY: exceptAll is a multiset difference, so sorting the dimension
# beforehand cannot change the result and only adds a global sort.
df_destino = spark.sql("""
    SELECT
        product_id,
        product_category_name
    FROM
        poc_datum.olist.dim_products
""")

# Keep only incoming rows that have no identical
# (product_id, product_category_name) row in the dimension.
# NOTE(review): the comparison includes expired versions, so a product that
# reverts to a category it had before is not detected as a change - confirm
# whether that is intended.
df_dados_novos = df_origem.exceptAll(df_destino)

display(df_dados_novos)

# Expose the new rows to SQL as a temporary view
df_dados_novos.createOrReplaceTempView("temp_dados_novos")

product_id,product_category_name
f4b952dd7ab601c02bb1bd06395f45f6,relogios_presentes1


In [0]:
# Parameters for the differential load
table_merge = 'dim_products'  # target table of the MERGE statements
# One timestamp string (microsecond precision) taken from the driver clock
change_date = f"{datetime.now():%Y-%m-%d %H:%M:%S.%f}"
print(change_date)

2024-04-29 13:04:54.426689


In [0]:
# Stage the incoming rows with everything the two MERGE statements need:
#   sk_dim_products     - surrogate key of the new version (SHA-256 hash)
#   change_date         - timestamp of this load
#   max_sk_dim_products - key of the version to expire (NULL for a new product)
#   max_row_version     - version number for the new row (last version + 1, or 1)
#
# The version to expire is the row flagged as current. Surrogate keys are
# hashes, so MAX(sk_dim_products) over all versions would pick the
# lexicographically largest hash, which is unrelated to the latest version.
# MAX is kept only so the scalar subquery always returns a single value.
spark.sql(f"""
CREATE OR REPLACE TEMPORARY VIEW dados_novos AS 
SELECT
     sha2(concat_ws('|',  '{change_date}',  a.product_id, a.product_category_name), 256) AS sk_dim_products
    ,a.product_id
    ,a.product_category_name
    ,to_timestamp('{change_date}')  AS change_date
    ,(
        SELECT
            MAX(b.sk_dim_products)
        FROM
            {table_merge} as b
        WHERE
            a.product_id = b.product_id
            AND b.row_current_indicator = true
    ) AS max_sk_dim_products
    ,COALESCE(
        (
            SELECT
                MAX(c.row_version) + 1
            FROM
                {table_merge} as c
            WHERE
                a.product_id = c.product_id
        ), 1
    ) AS max_row_version
FROM
    temp_dados_novos AS a
""")

DataFrame[]

In [0]:
%sql
-- Inspect the staged rows before running the MERGE statements
SELECT * FROM dados_novos

sk_dim_products,product_id,product_category_name,change_date,max_sk_dim_products,max_row_version
087ede96187467e78fe9aa5dbb0fe4a04c6ac2d1a5572bddb62f8a095cf75971,f4b952dd7ab601c02bb1bd06395f45f6,relogios_presentes1,2024-04-29T13:04:54.426689Z,453f7f9e28c6fbe9e4e97c18f8ac14d9ac739779527c2e5ab793e2d1ee63364e,2


In [0]:
# Upsert step 1: close the version being replaced.
# The match is restricted to the row still flagged as current, so an already
# expired historical row can never have its expiration date overwritten.
spark.sql(f""" 
MERGE INTO {table_merge} as destino
USING dados_novos 
ON destino.sk_dim_products = dados_novos.max_sk_dim_products
AND destino.row_current_indicator = True

WHEN MATCHED THEN 
  UPDATE SET
   destino.row_expiration_date = to_timestamp('{change_date}')
  ,destino.row_current_indicator = False
  """)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Upsert step 2: insert the new version as the current row.
# - The match only considers the active row, so an expired version with the
#   same (product_id, product_category_name) does not block the insert.
# - row_current_indicator is written as a boolean literal, matching the
#   boolean column created by the initial load, instead of the integer 1.
spark.sql(f"""
MERGE INTO {table_merge} as destino
USING dados_novos 
ON destino.product_id = dados_novos.product_id
AND destino.product_category_name = dados_novos.product_category_name
AND destino.row_current_indicator = True

WHEN NOT MATCHED 
  THEN INSERT (
    sk_dim_products
    ,row_ingestion_timestamp
    ,row_version
    ,row_current_indicator
    ,row_effective_date
    ,row_expiration_date
    ,product_id
    ,product_category_name
  )
  VALUES (
    dados_novos.sk_dim_products
    ,to_timestamp('{change_date}')
    ,dados_novos.max_row_version
    ,True
    ,to_timestamp('{change_date}')
    ,to_timestamp( '2200-01-01 00:00:00')
    ,dados_novos.product_id
    ,dados_novos.product_category_name
  )
""")

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
%sql
-- Verify the upsert: show every stored version of the product changed above
SELECT
  *
FROM poc_datum.olist.dim_products
WHERE
  product_id = 'f4b952dd7ab601c02bb1bd06395f45f6'

sk_dim_products,row_ingestion_timestamp,row_version,row_current_indicator,row_effective_date,row_expiration_date,product_id,product_category_name
087ede96187467e78fe9aa5dbb0fe4a04c6ac2d1a5572bddb62f8a095cf75971,2024-04-29T13:04:54.426689Z,2,True,2024-04-29T13:04:54.426689Z,2200-01-01T00:00:00Z,f4b952dd7ab601c02bb1bd06395f45f6,relogios_presentes1
453f7f9e28c6fbe9e4e97c18f8ac14d9ac739779527c2e5ab793e2d1ee63364e,2024-04-29T13:04:47.445Z,1,False,1900-01-01T00:00:00Z,2024-04-29T13:04:54.426689Z,f4b952dd7ab601c02bb1bd06395f45f6,relogios_presentes
