### Premissas:
- Nome: dim_order_status
- Descrição da tabela: Tabela composta por variáveis qualitativas de status do pedido.
- Tipo: SCD-2

In [0]:
# Importa bibliotecas
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, TimestampType, LongType
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from datetime import datetime

In [0]:
# Cria dataframe com dados para a criação da dimensão
df_dimensao = spark.sql("SELECT DISTINCT order_status FROM poc_datum.olist.inter_sales ORDER BY 1")     
df_dimensao = df_dimensao.withColumn('order_status_id', monotonically_increasing_id() )

# Ordena as colunas
df_dimensao = df_dimensao.select('order_status_id', 'order_status')

# Exibe o resultado
display(df_dimensao)

order_status_id,order_status
0,approved
1,canceled
2,delivered
3,invoiced
4,processing
5,shipped
6,unavailable


In [0]:
# Primeira carga

# Cria DataFrame com os dados de "NÃO INFORMADO"
data = [(-1, "Não informado")]
column = df_dimensao.schema.fieldNames()
df = spark.createDataFrame( data, schema=column )
# df.show()

# Faz o join com o dataframe com dados para a criação da dimensão
dim_order_status = df.union(df_dimensao)
# dim_customers.show()

# Inseri as colunas default de dimensão
dim_order_status = dim_order_status.withColumn( "row_ingestion_timestamp", current_timestamp() ) \
    .withColumn( "row_version", lit(1) ) \
    .withColumn( "row_current_indicator", lit(True) ) \
    .withColumn( "row_effective_date", to_timestamp( lit('1900-01-01 00:00:00'), "yyyy-MM-dd HH:mm:ss") ) \
    .withColumn( "row_expiration_date", to_timestamp( lit('2200-01-01 00:00:00') , "yyyy-MM-dd HH:mm:ss" ) )

# Inseri coluna SK
dim_order_status = dim_order_status.withColumn( 
    'sk_dim_order_status', 
    sha2(concat_ws("|",  
        dim_order_status.row_ingestion_timestamp,
        dim_order_status.order_status_id, 
        dim_order_status.order_status), 256))

# Ordena as colunas
dim_order_status_select = dim_order_status.select( 
    'sk_dim_order_status',
    'row_ingestion_timestamp',
    'row_version',
    'row_current_indicator',
    'row_effective_date',
    'row_expiration_date',
    'order_status_id',
    'order_status'
    )

display(dim_order_status_select)

sk_dim_order_status,row_ingestion_timestamp,row_version,row_current_indicator,row_effective_date,row_expiration_date,order_status_id,order_status
af786bb765f6947dd7b24957e0d7e40ea499dee63f5b8ac23613fd0092bf4a17,2024-04-29T13:14:39.579Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,-1,Não informado
c5d152ae659a9eb074bd035dffa9f8e78c43ae44cfd1dd3bf5c8d5b7d42ec040,2024-04-29T13:14:39.579Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,0,approved
50078d3b42785c3bed7fd5c2af70bc38cb28956c0f9c34f58e9ad1ec8ecc0bd2,2024-04-29T13:14:39.579Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,1,canceled
0564f7b39b6c606de0668bd816f57211d96c9539366481a5ccaae51305f4acce,2024-04-29T13:14:39.579Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,2,delivered
a65209c9e0b9110f7efae7292e01ff2de1a03e4c80042aa10d19be02c503cf1b,2024-04-29T13:14:39.579Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,3,invoiced
6884278d9f9f6c1ec56beb307677effbdbb7bf7d6be40cd244048ba6b6d330b8,2024-04-29T13:14:39.579Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,4,processing
8184a768cc7a9fd348235393b411108de7fdc17ab711eb5b70cb4e2fa1aeb892,2024-04-29T13:14:39.579Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,5,shipped
c03a686765bb39b0cf62687bb7f2cb636a706d443facac95b4f7c12c66f413f0,2024-04-29T13:14:39.579Z,1,True,1900-01-01T00:00:00Z,2200-01-01T00:00:00Z,6,unavailable


In [0]:
table_name = 'dim_order_status'
spark.sql('USE olist')
spark.sql(f'DROP TABLE IF EXISTS dim_order_status')
dim_order_status_select.write.format("delta").mode('overwrite').saveAsTable(table_name)

### Carga Diferencial (Upsert)

In [0]:
#teste
data = [(0,	'approvedD')]
column = ['order_status_id','order_status']

df_dimensao = spark.createDataFrame(data, schema=column)
df_dimensao.show()

+---------------+------------+
|order_status_id|order_status|
+---------------+------------+
|              0|   approvedD|
+---------------+------------+



In [0]:
# Dados Novos
df_origem = df_dimensao

# Dados da dimensão
df_destino = spark.sql("""
    SELECT   
        order_status_id,
        order_status
    FROM 
        poc_datum.olist.dim_order_status
    ORDER BY order_status_id """)

# Realize o EXCEPT (retornar apenas registros novos)
df_dados_novos = df_origem.exceptAll(df_destino)

display(df_dados_novos)

# Cria uma tabela temporária
df_dados_novos.createOrReplaceTempView("temp_dados_novos")

order_status_id,order_status
0,approvedD


In [0]:
# Paramêtros
table_merge = 'dim_order_status'
change_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
print(change_date)

2024-04-29 13:14:48.199435


In [0]:
spark.sql(f"""
CREATE OR REPLACE TEMPORARY VIEW dados_novos AS 
SELECT
     sha2(concat_ws("|",  '{change_date}',  a.order_status_id, a.order_status), 256) AS sk_dim_order_status
    ,a.order_status_id
    ,a.order_status
    ,to_timestamp('{change_date}')  AS change_date
    ,(
        SELECT
            MAX(b.sk_dim_order_status)
        FROM
            dim_order_status as b
        WHERE
            a.order_status_id = b.order_status_id
    ) AS max_sk_dim_order_status
    ,COALESCE(
        (
            SELECT
                MAX(c.row_version) + 1
            FROM
                dim_order_status as c
            WHERE
                a.order_status_id = c.order_status_id
        ), 1
    ) AS max_row_version
FROM
    temp_dados_novos AS a
""")

DataFrame[]

In [0]:
%sql
SELECT
    *
FROM
    dados_novos AS a

sk_dim_order_status,order_status_id,order_status,change_date,max_sk_dim_order_status,max_row_version
9bf231d60e0eac864bd0fef44eb7a2a93330161ca113815ce063e67ae1c1b36e,0,approvedD,2024-04-29T13:14:48.199435Z,1c21607f1701994c4f6add5fdd5b6d39fbd7ec2f523968f8ea903459dceb9c8a,2


In [0]:
spark.sql(f""" 
MERGE INTO {table_merge} as destino
USING dados_novos 
ON destino.sk_dim_order_status = dados_novos.max_sk_dim_order_status

WHEN MATCHED THEN 
  UPDATE SET
   destino.row_expiration_date = to_timestamp('{change_date}') --dados_novos.change_date
  ,destino.row_current_indicator = False
  """)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
spark.sql(f"""
MERGE INTO {table_merge} as destino
USING dados_novos 
ON destino.order_status_id = dados_novos.order_status_id
AND destino.order_status = dados_novos.order_status

WHEN NOT MATCHED 
  THEN INSERT (
    sk_dim_order_status
    , row_ingestion_timestamp
    ,row_version
    ,row_current_indicator
    ,row_effective_date
    ,row_expiration_date
    ,order_status_id
    ,order_status
  )
  VALUES (
    dados_novos.sk_dim_order_status
    ,to_timestamp('{change_date}') --dados_novos.change_date
    ,dados_novos.max_row_version
    ,1
    ,to_timestamp('{change_date}') --dados_novos.change_date
    ,to_timestamp( '2200-01-01 00:00:00')
    ,dados_novos.order_status_id
    ,dados_novos.order_status
  )
""")

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
%sql
SELECT * FROM poc_datum.olist.dim_order_status
WHERE order_status_id = 0

sk_dim_order_status,row_ingestion_timestamp,row_version,row_current_indicator,row_effective_date,row_expiration_date,order_status_id,order_status
9bf231d60e0eac864bd0fef44eb7a2a93330161ca113815ce063e67ae1c1b36e,2024-04-29T13:14:48.199435Z,2,True,2024-04-29T13:14:48.199435Z,2200-01-01T00:00:00Z,0,approvedD
1c21607f1701994c4f6add5fdd5b6d39fbd7ec2f523968f8ea903459dceb9c8a,2024-04-29T13:14:42.494Z,1,False,1900-01-01T00:00:00Z,2024-04-29T13:14:48.199435Z,0,approved
