In [0]:
import logging
import re
from pyspark.sql.functions import col, to_timestamp, when, regexp_extract, lit, regexp_replace, udf
from pyspark.sql.types import StringType, LongType, DoubleType

In [0]:
# Notebook-scoped logger that emits INFO and above.
logger = logging.getLogger("DeBugLoggerSilverNintendo")
logger.setLevel(logging.INFO)

# logging.getLogger returns the same object for the same name, so the guard
# keeps a re-run of this cell from attaching a second handler to it.
if len(logger.handlers) == 0:
    formatter = logging.Formatter(
        fmt='[%(levelname)s] %(asctime)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [0]:
# Starting DataFrame for this notebook: the consoles table from the "bronze" schema.
df_consoles = spark.table("nintendodatabricks_workspace.bronze.consoles_ing")

In [0]:
# Keep only the rows that have a value in "codigo".
dffiltered = df_consoles.where(col("codigo").isNotNull())

In [0]:
# Parse the "extract" column into a timestamp using minute precision.
dftime = dffiltered.withColumn(
    "extract",
    to_timestamp("extract", format="yyyy-MM-dd HH:mm"),
)

In [0]:
# Number of instalments parsed from the free-text "parcelamento" column:
# the digits immediately before an "x" (e.g. "10x ..." -> 10), stored as LONG.
#
# regexp_extract yields "" when the pattern is absent and NULL when the input
# is NULL. Casting "" to LONG gives NULL (or raises under ANSI mode), so the
# cast is applied only to a non-empty extraction; every other row gets the
# default of 0. Both branches are LONG, so no implicit string/int coercion
# is involved.
parcelas_txt = regexp_extract(col("parcelamento"), r'(\d+)x', 1)
df_com_parcelas = dftime.withColumn(
    "numero_parcelas",
    when(parcelas_txt != "", parcelas_txt.cast(LongType()))
    .otherwise(lit(0).cast(LongType())),
)

In [0]:
# Instalment amount parsed from "parcelamento" and stored as DOUBLE.
#
# The pattern accepts any amount of whitespace after "R$" and both
# "1.234,56" (thousands separator) and "1234,56" / "123,45" forms; the
# grouped alternative is listed first so it wins when separators are present.
prestacao_txt = regexp_extract(
    col("parcelamento"),
    r'R\$\s*(\d{1,3}(?:\.\d{3})+,\d{2}|\d+,\d{2})',
    1,
)

# Brazilian number format -> plain decimal: drop thousands dots, then turn the
# decimal comma into a dot before casting.
prestacao_num = regexp_replace(
    regexp_replace(prestacao_txt, r'\.', ''), ',', '.'
).cast(DoubleType())

# regexp_extract yields "" when nothing matches and NULL for NULL input; in
# both cases the amount defaults to 0.0 instead of becoming NULL.
# "parcelamento" is dropped once both derived columns exist.
df_com_valores = df_com_parcelas.withColumn(
    "valor_prestacao",
    when(prestacao_txt != "", prestacao_num).otherwise(lit(0.0)),
).drop("parcelamento")

In [0]:
# Normalise "desconto" to a fraction stored as DOUBLE:
#   * whole-number percentages such as "15%" become 0.15;
#   * any other value is cast to DOUBLE directly;
#   * values that end up NULL are filled with 0.0.
# The pattern is written as a raw string because "\d" inside a plain literal
# is an invalid escape sequence, which newer Python versions warn about.
# Both branches are DOUBLE, so Spark does not have to coerce the result
# through a string type before the final cast.
desconto_raw = col("desconto")
df_desconto = df_com_valores.withColumn(
    "desconto",
    when(
        desconto_raw.rlike(r"^\d+%$"),
        regexp_replace(desconto_raw, "%", "").cast(DoubleType()) / 100,
    ).otherwise(desconto_raw.cast(DoubleType())),
).fillna({"desconto": 0.0})

In [0]:
# Convert "preco" from Brazilian currency text (e.g. "R$ 1.299,00") to DOUBLE.
preco_txt = col("preco")

# Shape check: "R", optional "$", optional whitespace, 1-3 digits, optional
# ".ddd" thousands groups, optional ",dd" cents.
formato_brl = "^R\\$?\\s?\\d{1,3}(\\.\\d{3})*(,\\d{2})?$"

# Strip the "R$" symbol, drop thousands dots, then swap the decimal comma for a dot.
sem_simbolo = regexp_replace(preco_txt, "R\\$", "")
sem_milhar = regexp_replace(sem_simbolo, "\\.", "")
preco_decimal = regexp_replace(sem_milhar, ",", ".").cast(DoubleType())

df_preço = (
    df_desconto
    # Values that do not match the expected shape are passed through unchanged ...
    .withColumn("preco", when(preco_txt.rlike(formato_brl), preco_decimal).otherwise(preco_txt))
    # ... and the whole column is then cast to DOUBLE,
    .withColumn("preco", col("preco").cast(DoubleType()))
    # with missing results replaced by 0.0.
    .fillna({"preco": 0.0})
)

In [0]:
def extract_memory_info(info):
    """Return the first gigabyte capacity mentioned in ``info``, upper-cased.

    The match is one or more digits, optional whitespace, then "GB" in any
    letter case. The matched text is returned exactly as found apart from
    upper-casing, so any whitespace between number and unit is kept
    ("64 gb" -> "64 GB").

    Args:
        info: Text to search; any non-string or empty value is treated as
            having no capacity.

    Returns:
        The upper-cased match, or '-' when nothing matches.
    """
    # Guard: only non-empty strings are searched.
    if not isinstance(info, str) or not info:
        return '-'

    match = re.search(r'(\d+)\s*(G[Bb])', info, re.IGNORECASE)
    return match.group(0).upper() if match else '-'

# Expose the Python helper to Spark as a UDF producing a string column.
extrair_memoria_udf = udf(f=extract_memory_info, returnType=StringType())

# Derive "memoria" by applying the UDF to the product name in "nome".
df_memoria = df_preço.withColumn(
    'memoria',
    extrair_memoria_udf(col("nome")),
)

In [0]:
# Flag rows whose "nome" contains "oled" in any letter case ("(?i)" makes the
# match case-insensitive); every other row is marked 'Nao'.
nome_tem_oled = col('nome').rlike('(?i)Oled')
df_oled = df_memoria.withColumn(
    'oled',
    when(nome_tem_oled, 'Sim').otherwise('Nao'),
)

In [0]:
# Final projection: keep only these columns, in this order.
df_final = df_oled.select(
    [
        "codigo", "nome",
        "preco", "desconto",
        "numero_parcelas", "valor_prestacao",
        "origem", "link",
        "memoria", "oled",
        "extract",
    ]
)

In [0]:
# Render the final DataFrame in the notebook output for visual inspection.
# NOTE(review): `display` is not imported in this file — presumably the
# notebook environment's built-in helper; confirm.
display(df_final)

In [0]:
# Print the column names and data types of the final DataFrame so the casts
# applied in the earlier cells can be checked.
df_final.printSchema()