In [0]:
# Importações necessárias

from pyspark.sql.functions import (
    array_distinct,
    col,
    explode,
    expr,
    regexp_extract,
    regexp_replace,
    split,
    trim,
    when,
)
from pyspark.sql.types import IntegerType

In [0]:
# Load the game-details records from the bronze layer.
df = spark.read.table("steam.bronze.details_games")

In [0]:
# Promote every sub-field of the `data` struct column to a top-level column
# that carries the same name as the sub-field.
data_struct = df.schema["data"].dataType
for field_name in data_struct.fieldNames():
    df = df.withColumn(field_name, col(f"data.{field_name}"))

# The struct itself and the `success` column are not needed once the
# sub-fields have been extracted.
df = df.drop("data", "success")

In [0]:
def chave_explode(df,column_key):
    """Expand a stringified list of ``{...}`` entries into one row per entry.

    The column is treated as text: the outer square brackets are stripped,
    the text is cut at every ``}, {`` boundary, the remaining curly braces are
    removed, and ``description=...`` / ``id=...`` are pulled out of each piece.
    Presumably the input looks like ``[{description=Action, id=1}, {...}]`` --
    confirm against the bronze table.

    Args:
        df: DataFrame containing ``steam_appid`` and the column ``column_key``.
        column_key: Name of the column holding the stringified list.

    Returns:
        DataFrame with the columns ``steam_appid``, ``id`` (integer) and
        ``description``. ``description`` stops at the first comma of the entry.
    """
    # Drop the leading "[" and the trailing "]".
    unwrapped = regexp_replace(column_key, r'^\[|\]$', "")

    # Mark entry boundaries ("}, {") as "}|{" and split on that marker.
    entries = split(regexp_replace(unwrapped, r'\}, \{', '}|{'), r'\}\|\{')

    # One row per entry, with the curly braces stripped from the entry text.
    per_entry = (
        df.withColumn("column_str", explode(entries))
          .withColumn("column_str", regexp_replace("column_str", r'[\{\}]', ""))
    )

    # Pull the two attributes out of the "key=value" text.
    description = trim(regexp_extract("column_str", r'description=([^,]+)', 1))
    entry_id = regexp_extract("column_str", r'id=(\d+)', 1).cast(IntegerType())

    return per_entry.select(
        "steam_appid",
        entry_id.alias("id"),
        description.alias("description"),
    )

In [0]:
# Genres: one row per (app, genre), built from the `genres` column.
genre = (
    chave_explode(df.select("steam_appid", "genres"), "genres")
    .withColumnRenamed("id", "id_genres")
    .withColumnRenamed("description", "description_genres")
)

# Categories: one row per (app, category), built from the `categories` column.
categories = (
    chave_explode(df.select("steam_appid", "categories"), "categories")
    .withColumnRenamed("id", "id_categories")
    .withColumnRenamed("description", "description_categories")
)


In [0]:

def langueges_list(df,column_key):
    """Convert an HTML-ish, comma-separated language string into an array.

    Processing order: remove ``<strong>``/``</strong>`` tags, cut everything
    from the first ``<br>`` onwards, remove ``*`` markers, split on commas,
    trim each element and drop duplicate elements.

    Args:
        df: DataFrame containing ``steam_appid`` and the column ``column_key``.
        column_key: Name of the text column listing the languages.

    Returns:
        DataFrame with the columns ``steam_appid`` and ``languages``
        (array of strings).
    """
    # Text clean-up, applied innermost first.
    without_tags = regexp_replace(column_key, r'<strong>|</strong>', "")
    without_footnote = regexp_replace(without_tags, r'<br>.*$', "")
    without_stars = regexp_replace(without_footnote, r'\*', "")

    return (
        # Split on commas, tolerating whitespace around them.
        df.withColumn("column_array", split(without_stars, r'\s*,\s*'))
          # Trim every element with a SQL higher-order function.
          .withColumn("column_array", expr("transform(column_array, x -> trim(x))"))
          # De-duplicate and keep only the key plus the resulting array.
          .select("steam_appid", array_distinct("column_array").alias("languages"))
    )


In [0]:
# Build the `languages` DataFrame from the `supported_languages` text column.
languages = langueges_list(
    df.select("steam_appid", "supported_languages"),
    "supported_languages",
)

In [0]:
def extract_min_requirements(df, col):
    """Extract the minimum system requirements into separate text columns.

    The text after ``minimum=`` (up to ``, recommended=`` when present) is
    stripped of HTML tags and whitespace-normalised. Each field is then
    captured from its ``Label:`` up to the next known label or the end of the
    text.

    The previous patterns used negated character classes such as
    ``[^P|M|G|S]+``, which stop at the first ``P``, ``M``, ``G``, ``S`` or
    ``|`` *character* (so "8 GB RAM" became "8"). A lookahead on the next
    label is used instead.

    Args:
        df: DataFrame containing the column named by ``col``.
        col: Name of the requirements text column. Note that this parameter
            shadows the imported ``col`` function inside this body; it is only
            used as a column name here.

    Returns:
        ``df`` with the extra string columns ``OS``, ``Processor``,
        ``Memory``, ``Graphics`` and ``Storage``. A field is an empty string
        when its label is not found.
    """
    # Labels that terminate the value of the preceding field. The first five
    # are the extracted fields; the others are assumed to be labels that can
    # appear in the same text -- TODO confirm against the data and extend.
    extracted_fields = ("OS", "Processor", "Memory", "Graphics", "Storage")
    known_labels = extracted_fields + (
        "DirectX",
        "Network",
        "Hard Drive",
        "Sound Card",
        "Sound",
        "VR Support",
        "Additional Notes",
    )
    # Zero-width check: the next label (optionally preceded by spaces) or end of text.
    next_label = r"(?=\s*\b(?:" + "|".join(known_labels) + r"):|$)"

    # (?s) lets "." cross line breaks so multi-line HTML is still captured.
    df = df.withColumn(
        "min_part",
        regexp_extract(col, r"(?s)minimum=(.*?)(?:,\s*recommended=|$)", 1)
    )

    # Replace HTML tags with a space, then collapse whitespace runs
    # (including line breaks) into single spaces.
    df = df.withColumn(
        "min_clean",
        regexp_replace(regexp_replace("min_part", r"<[^>]*>", " "), r"\s+", " ")
    )

    # Capture each field lazily up to the next label.
    for field_label in extracted_fields:
        df = df.withColumn(
            field_label,
            trim(regexp_extract("min_clean", r"\b" + field_label + r":\s*(.*?)" + next_label, 1))
        )

    return df.drop("min_part", "min_clean")


In [0]:
# Build the `requirements` DataFrame from the `pc_requirements` column and
# keep only the key plus the extracted fields.
requirements = (
    extract_min_requirements(df.select("steam_appid", "pc_requirements"), "pc_requirements")
    .select("steam_appid", "OS", "Processor", "Memory", "Graphics", "Storage")
)

In [0]:
def get_price(df,column_key):
    """Derive ``Currency`` and a numeric ``Price`` from a stringified price record.

    The column is read as text containing ``currency=XXX`` and
    ``final_formatted=<display price>`` entries.

    Fixes over the previous version:
      * ``final_formatted`` was captured with ``[^,}]+``, which stopped at the
        first comma and truncated "R$ 19,99" to "R$ 19". The value is now
        captured up to the next ``, key=`` entry, a closing brace or the end
        of the text.
      * Grouping separators are removed before the decimal comma is turned
        into a dot, so "1.299,99" and "1,299.99" both become 1299.99 instead
        of failing the cast.

    Heuristic: a ``.`` or ``,`` followed by exactly three digits is treated as
    a grouping separator. A display price that really has three decimal places
    would therefore be misread -- NOTE(review): confirm no such currency is
    present in the data.

    Args:
        df: DataFrame containing ``steam_appid`` and the column ``column_key``.
        column_key: Name of the price column.

    Returns:
        DataFrame with ``steam_appid``, ``Currency`` ("-" when missing) and
        ``Price`` (double, 0.0 when missing or unparseable).
    """
    source = col(column_key)

    # Currency code: three upper-case letters.
    currency = regexp_extract(source, r"currency=([A-Z]{3})", 1)

    # Display price, kept whole even when it contains a decimal comma.
    price_text = regexp_extract(source, r"final_formatted=(.*?)(?:,\s*\w+=|\}|$)", 1)
    # Keep only digits and separators (drops currency symbols, spaces, dashes).
    price_text = regexp_replace(price_text, r"[^0-9,.]", "")
    # Drop dangling separators left by formats such as "19,--".
    price_text = regexp_replace(price_text, r"[,.]+$", "")
    # Remove grouping separators: a separator followed by exactly three digits.
    price_text = regexp_replace(price_text, r"[,.](?=\d{3}(?:\D|$))", "")
    # Whatever comma remains is a decimal comma.
    price_value = regexp_replace(price_text, ",", ".").cast("double")

    # Defaults for missing values.
    df = df.withColumn(
        "Currency",
        when(currency.isNull() | (currency == ""), "-").otherwise(currency)
    )
    df = df.withColumn(
        "Price",
        when(price_value.isNull(), 0.0).otherwise(price_value)
    )

    return df.select("steam_appid", "Currency", "Price")


In [0]:
# Build the price DataFrame: get_price normalises the currency code and the
# numeric price taken from `price_overview`.
price = get_price(df.select("steam_appid", "price_overview"), "price_overview")

In [0]:
def get_publishers(df, column_key):
    """Replace a stringified publisher list with its first entry.

    Both the leading ``[`` and the trailing ``]`` are stripped, the text is
    split on commas, and the trimmed first element overwrites ``column_key``.
    Because the split is on every comma, a name that itself contains a comma
    is cut at that comma.

    Args:
        df: DataFrame containing the column ``column_key``.
        column_key: Name of the publishers column.

    Returns:
        ``df`` with ``column_key`` overwritten and the helper columns
        ``publishers_clean`` and ``publishers_array`` added.
    """
    without_brackets = regexp_replace(col(column_key), r"^\[|\]$", "")

    return (
        df.withColumn("publishers_clean", without_brackets)
          # Array of comma-separated pieces.
          .withColumn("publishers_array", split(col("publishers_clean"), ","))
          # First piece only, without surrounding whitespace.
          .withColumn(column_key, trim(col("publishers_array").getItem(0)))
    )
    
    

In [0]:
# Reduce the `publishers` column to a single publisher name per app.
df_publi = get_publishers(df, column_key="publishers")

In [0]:
# Keep only the descriptive columns that go into the silver table.
selected_columns = [
    "steam_appid",
    "name",
    "type",
    "header_image",
    "is_free",
    "publishers",
    "required_age",
    "website",
    "short_description",
]
df_selected = df_publi.select(*selected_columns)

In [0]:
# Left-join every derived DataFrame onto the selected columns by `steam_appid`.
# `genre` and `categories` hold one row per exploded entry, so an app appears
# once per genre/category combination in the result.
df_joined = df_selected
for derived in (genre, categories, languages, requirements, price):
    df_joined = df_joined.join(derived, on="steam_appid", how="left")

In [None]:
# Cast the columns whose type must match the silver table definition.
target_types = {
    "steam_appid": "int",
    "required_age": "int",
    "is_free": "boolean",
}
df_formated = df_joined
for column_name, target_type in target_types.items():
    df_formated = df_formated.withColumn(column_name, col(column_name).cast(target_type))

In [None]:
%sql
-- Create the target schema if it does not exist yet.
-- A schema is addressed as <catalog>.<schema>; the previous statement used the
-- three-part table name (steam.silver.details_games) as the schema name.
CREATE SCHEMA IF NOT EXISTS steam.silver;

In [None]:
%sql
-- Create the silver Delta table if it does not exist yet.
-- The column list mirrors the DataFrame written by the notebook:
-- descriptive columns, genre/category pairs, languages, minimum
-- requirements and price.
CREATE TABLE IF NOT EXISTS steam.silver.details_games (
    steam_appid INT,
    name STRING,
    type STRING,
    header_image STRING,
    is_free BOOLEAN,
    publishers STRING,
    required_age INT,
    website STRING,
    short_description STRING,
    id_genres INT,
    description_genres STRING,
    id_categories INT,
    description_categories STRING,
    languages ARRAY<STRING>,
    OS STRING,
    Processor STRING,
    Memory STRING,
    Graphics STRING,
    Storage STRING,
    Currency STRING,
    Price DOUBLE
)
USING DELTA


In [0]:
# Write the result to the silver layer, replacing the current table contents.
(
    df_formated.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("steam.silver.details_games")
)