In [0]:
import dlt
import logging
import re
from pyspark.sql.functions import col, to_timestamp, when, regexp_extract, lit, regexp_replace, udf
from pyspark.sql.types import StringType, LongType, DoubleType

In [0]:
# Named logger used by the silver-layer table definition in this notebook.
logger = logging.getLogger("DLTLoggerSilverNintendo")
logger.setLevel(logging.INFO)

# Attach the stream handler only when the logger has none yet, so that running
# this cell again does not stack handlers and duplicate every log line.
if len(logger.handlers) == 0:
    formatter = logging.Formatter(
        fmt='[%(levelname)s] %(asctime)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [0]:
def extract_memory_info(info):
    """Extract the storage capacity (in GB) mentioned in a product name.

    The first occurrence of "<digits>[whitespace]GB" (case-insensitive) is
    located and returned in a single canonical form, "<digits>GB", so that
    spellings such as "64GB", "64 gb" and "64  Gb" all map to the same value.

    Args:
        info: Text to scan, typically the product name. Any non-string or
            empty value is treated as "no information".

    Returns:
        str: The capacity as "<digits>GB", or '-' when `info` is not a
        non-empty string or contains no capacity.
    """
    if not isinstance(info, str) or not info:
        return '-'

    match = re.search(r'(\d+)\s*GB', info, re.IGNORECASE)
    if match is None:
        return '-'

    # Rebuild the value from the captured digits instead of upper-casing the
    # raw match: the raw match keeps whatever whitespace separated the number
    # from the unit, which would yield several variants of the same capacity.
    return f"{match.group(1)}GB"

In [0]:
@dlt.table(
    name="nintendodatabricks_workspace.silver.consoles_silver",
    comment="Tabela limpa e tratada da camada de ingestão aprticionada por liquid cluster",
    cluster_by=["codigo", "extract"],
)
# One expectation per output column: each checks the column's type and that it is not NULL.
@dlt.expect("codigo_string_not_null", "typeof(codigo) = 'string' AND codigo IS NOT NULL")
@dlt.expect("nome_string_not_null", "typeof(nome) = 'string' AND nome IS NOT NULL")
@dlt.expect("preco_double_not_null", "typeof(preco) = 'double' AND preco IS NOT NULL")
@dlt.expect("desconto_double_not_null", "typeof(desconto) = 'double' AND desconto IS NOT NULL")
@dlt.expect("numero_parcelas_long_not_null", "typeof(numero_parcelas) = 'bigint' AND numero_parcelas IS NOT NULL")
@dlt.expect("valor_prestacao_double_not_null", "typeof(valor_prestacao) = 'double' AND valor_prestacao IS NOT NULL")
@dlt.expect("origem_string_not_null", "typeof(origem) = 'string' AND origem IS NOT NULL")
@dlt.expect("link_string_not_null", "typeof(link) = 'string' AND link IS NOT NULL")
@dlt.expect("memoria_string_not_null", "typeof(memoria) = 'string' AND memoria IS NOT NULL")
@dlt.expect("oled_string_not_null", "typeof(oled) = 'string' AND oled IS NOT NULL")
@dlt.expect("extract_timestamp_not_null", "typeof(extract) = 'timestamp' AND extract IS NOT NULL")
def console_silver():
    """Build the cleaned silver table of consoles from the bronze ingestion table.

    Steps applied to `nintendodatabricks_workspace.bronze.consoles_ing`:

    1. Drop rows whose `codigo` is NULL.
    2. Parse `extract` as a timestamp using the "yyyy-MM-dd HH:mm" format.
    3. Split `parcelamento` into `numero_parcelas` (bigint) and
       `valor_prestacao` (double), then drop `parcelamento`. Both default to
       zero when `parcelamento` is NULL or does not contain the expected
       pattern.
    4. Convert `desconto` to a double; values of the form "<digits>%" are
       divided by 100. Anything that cannot be converted becomes 0.0.
    5. Convert `preco` to a double; values of the form "R$ 1.234,56" have the
       currency marker and thousands separators removed and the decimal comma
       replaced by a dot. Anything that cannot be converted becomes 0.0.
    6. Derive `memoria` (via `extract_memory_info`) and `oled` ('Sim'/'Nao')
       from `nome`.

    Returns:
        DataFrame with the columns codigo, nome, preco, desconto,
        numero_parcelas, valor_prestacao, origem, link, memoria, oled, extract.
    """
    logger.info("Leitura da tabela Striming iniciada...")

    bronze = dlt.read("nintendodatabricks_workspace.bronze.consoles_ing")

    with_timestamp = (
        bronze
        .filter(col("codigo").isNotNull())
        .withColumn("extract", to_timestamp(col("extract"), "yyyy-MM-dd HH:mm"))
    )

    # regexp_extract returns an empty string when the pattern does not match,
    # and an empty string does not cast to a number. The casts below are
    # therefore applied only to a non-empty capture; every other case (NULL or
    # unparseable `parcelamento`) falls back to an explicit, correctly typed zero.
    has_installments = col("parcelamento").isNotNull()
    installment_count = regexp_extract(col("parcelamento"), r'(\d+)x', 1)
    installment_value = regexp_extract(col("parcelamento"), r'R\$ (\d+,\d{2})', 1)

    with_installments = (
        with_timestamp
        .withColumn(
            "numero_parcelas",
            when(
                has_installments & (installment_count != ""),
                installment_count.cast(LongType()),
            ).otherwise(lit(0).cast(LongType())),
        )
        .withColumn(
            "valor_prestacao",
            when(
                has_installments & (installment_value != ""),
                # Decimal comma -> decimal point before the numeric cast.
                regexp_replace(installment_value, ",", ".").cast(DoubleType()),
            ).otherwise(lit(0.0)),
        )
        .drop("parcelamento")
    )

    # "15%" -> 0.15; any other value is cast as-is. Both branches are doubles,
    # so no implicit string/double coercion is involved.
    with_discount = (
        with_installments
        .withColumn(
            "desconto",
            when(
                col("desconto").rlike(r"^\d+%$"),
                regexp_replace(col("desconto"), "%", "").cast(DoubleType()) / 100,
            ).otherwise(col("desconto").cast(DoubleType())),
        )
        .fillna({"desconto": 0.0})
    )

    # "R$ 1.234,56" -> 1234.56: strip "R$", remove thousands dots, then turn
    # the decimal comma into a dot.
    price_is_brl = col("preco").rlike(r"^R\$?\s?\d{1,3}(\.\d{3})*(,\d{2})?$")
    price_digits = regexp_replace(
        regexp_replace(
            regexp_replace(col("preco"), r"R\$", ""),
            r"\.", "",
        ),
        ",", ".",
    )

    with_price = (
        with_discount
        .withColumn(
            "preco",
            when(price_is_brl, price_digits.cast(DoubleType()))
            .otherwise(col("preco").cast(DoubleType())),
        )
        .fillna({"preco": 0.0})
    )

    extrair_memoria_udf = udf(extract_memory_info, StringType())

    enriched = (
        with_price
        .withColumn('memoria', extrair_memoria_udf(col("nome")))
        .withColumn('oled', when(col('nome').rlike('(?i)Oled'), 'Sim').otherwise('Nao'))
    )

    silver = enriched.select(
        "codigo",
        "nome",
        "preco",
        "desconto",
        "numero_parcelas",
        "valor_prestacao",
        "origem",
        "link",
        "memoria",
        "oled",
        "extract",
    )

    logger.info("Carregando dados na Liquid Cluster")

    return silver