In [None]:
from pyspark.sql.functions import col, regexp_replace, split, explode, trim, regexp_extract, expr, array_distinct
from pyspark.sql.types import IntegerType

In [None]:
# Read the bronze-layer games_details table, stored in Delta format.
df = spark.read.load("dbfs:/FileStore/tables/bronze/games_details", format="delta")

In [None]:
# Promote every field of the nested `data` struct to a top-level column of the
# same name, then discard the struct itself together with the `success` column.
for field_name in df.schema["data"].dataType.fieldNames():
    df = df.withColumn(field_name, col(f"data.{field_name}"))

df = df.drop('data', 'success')

In [None]:
def chave_explode(df, column_key):
    """Turn a stringified list of ``{description=..., id=...}`` entries into rows.

    Args:
        df: DataFrame containing ``steam_appid`` and the column ``column_key``.
        column_key: Name of the string column holding the bracketed list.

    Returns:
        DataFrame with the columns ``steam_appid``, ``id`` (integer) and
        ``description``, one row per entry found in the list.
    """
    # Text of the list with its outer square brackets stripped.
    without_brackets = regexp_replace(column_key, r'^\[|\]$', "")

    entries = (
        df.withColumn("column_clean", without_brackets)
        # Rewrite the "}, {" separators as "}|{" and split on that marker.
        .withColumn(
            "column_parts",
            split(regexp_replace("column_clean", r'\}, \{', '}|{'), r'\}\|\{'),
        )
        # One output row per list entry.
        .withColumn("column_str", explode("column_parts"))
        # Drop any remaining curly braces around the entry.
        .withColumn("column_str", regexp_replace("column_str", r'[\{\}]', ""))
    )

    # Pull the two named fields out of each "description=..., id=..." entry.
    description_value = trim(regexp_extract("column_str", r'description=([^,]+)', 1))
    id_value = regexp_extract("column_str", r'id=(\d+)', 1).cast(IntegerType())

    return (
        entries.withColumn("description", description_value)
        .withColumn("id", id_value)
        .select("steam_appid", "id", "description")
    )

In [None]:
# Build the per-game genre and category tables from their stringified list columns.
genre = chave_explode(df.select('steam_appid', 'genres'), 'genres')
categories = chave_explode(df.select('steam_appid', 'categories'), 'categories')

In [None]:

def langueges_list(df, column_key):
    """Parse an HTML-flavoured, comma-separated language string into an array.

    Args:
        df: DataFrame containing ``steam_appid`` and the column ``column_key``.
        column_key: Name of the string column holding the language list.

    Returns:
        DataFrame with ``steam_appid`` and ``languages``, an array of the
        distinct, trimmed entries of the cleaned string.
    """
    # Cleaning is applied in this order: strip <strong> tags, cut everything
    # from the first <br> onwards, then remove asterisks.
    cleaned = regexp_replace(column_key, r'<strong>|</strong>', "")
    cleaned = regexp_replace(cleaned, r'<br>.*$', "")
    cleaned = regexp_replace(cleaned, r'\*', "")

    return (
        df.withColumn("column_clean", cleaned)
        # Split on commas, swallowing the whitespace around them.
        .withColumn("column_array", split("column_clean", r'\s*,\s*'))
        # Trim each element (higher-order SQL function).
        .withColumn("column_array", expr("transform(column_array, x -> trim(x))"))
        # Remove repeated entries.
        .withColumn("languages", array_distinct("column_array"))
        .select('steam_appid', 'languages')
    )


In [None]:
# Per-game array of supported languages.
languages = langueges_list(
    df.select('steam_appid', 'supported_languages'),
    'supported_languages',
)

In [None]:
# Field labels assumed to appear in the flattened requirements text. Each one marks
# where the value of the preceding field ends. Extend the tuple if the data uses
# labels that are not listed here.
_REQUIREMENT_LABELS = (
    "OS",
    "Processor",
    "Memory",
    "Graphics",
    "DirectX",
    "Network",
    "Storage",
    "Hard Drive",
    "Sound Card",
    "VR Support",
    "Additional Notes",
)


def _requirement_pattern(label):
    """Return a regex whose group 1 is the value that follows ``label:``.

    The value is matched lazily and ends right before the next known label
    (see ``_REQUIREMENT_LABELS``) or at the end of the text. An optional ``*``
    between a label and its colon (e.g. ``OS *:``) is tolerated.
    """
    next_label = "|".join(_REQUIREMENT_LABELS)
    return (
        rf"\b{label}\s*\*?\s*:\s*(.*?)\s*"
        rf"(?=\b(?:{next_label})\s*\*?\s*:|$)"
    )


def extract_min_requirements(df, col):
    """Extract the minimum system requirements into one column per field.

    Args:
        df: DataFrame containing the requirements column.
        col: Name of the string column holding the ``minimum=...`` text.
            Note that inside this function the parameter shadows the ``col``
            helper imported from ``pyspark.sql.functions``.

    Returns:
        ``df`` with the extra string columns ``OS``, ``Processor``, ``Memory``,
        ``Graphics`` and ``Storage``. A field that is not found yields an
        empty string.
    """
    # Keep only what follows 'minimum=' (up to ', recommended=' when present).
    # (?s) lets '.' cross line breaks so multi-line HTML is not cut short.
    df = df.withColumn(
        "min_part",
        regexp_extract(col, r"(?s)minimum=(.*?)(?:,\s*recommended=|$)", 1)
    )

    # Replace HTML tags by a space, then collapse whitespace runs (including
    # line breaks) into a single space.
    df = df.withColumn(
        "min_clean",
        regexp_replace(regexp_replace("min_part", r"<.*?>", " "), r"\s+", " ")
    )

    # Each value runs up to the next field label rather than up to the first
    # occurrence of a particular letter, so values such as "Windows 7 SP1" or
    # "4 GB RAM" are kept whole.
    for field_label in ("OS", "Processor", "Memory", "Graphics", "Storage"):
        df = df.withColumn(
            field_label,
            trim(regexp_extract("min_clean", _requirement_pattern(field_label), 1))
        )

    # Final result: the original columns plus the extracted fields.
    return df.drop("min_part", "min_clean")


In [None]:
# Minimum PC requirements per game, keeping only the key and the extracted fields.
requirements = extract_min_requirements(
    df.select("steam_appid", "pc_requirements"),
    "pc_requirements",
).select("steam_appid", "OS", "Processor", "Memory", "Graphics", "Storage")

In [None]:
def get_price(df, column_key):
    """Extract the currency code and final price from a stringified price record.

    Args:
        df: DataFrame containing ``steam_appid`` and the column ``column_key``.
        column_key: Name of the string column holding ``currency=...`` and
            ``final=...`` entries.

    Returns:
        DataFrame with ``steam_appid``, ``Currency`` (three upper-case letters)
        and ``Price`` (the digits after ``final=`` cast to int).
    """
    currency_code = regexp_extract(column_key, r"currency=([A-Z]{3})", 1)
    # NOTE(review): presumably expressed in minor currency units - confirm.
    final_price = regexp_extract(column_key, r"final=(\d+)", 1).cast("int")

    return (
        df.withColumn("Currency", currency_code)
        .withColumn("Price", final_price)
        .select("steam_appid", "Currency", "Price")
    )


In [None]:
# Currency and final price per game.
price = get_price(df.select("steam_appid", "price_overview"), "price_overview")

In [None]:
def get_publishers(df, column_key):
    """Replace a stringified publisher list by its first entry.

    Args:
        df: DataFrame containing the column ``column_key``.
        column_key: Name of the string column holding the bracketed list.

    Returns:
        ``df`` with ``column_key`` overwritten by the trimmed first
        comma-separated entry, plus the helper columns ``publishers_clean``
        and ``publishers_array``.
    """
    # Strip a leading '[' and a trailing ']'.
    without_brackets = regexp_replace(col(column_key), r"^\[|\]$", "")

    return (
        df.withColumn("publishers_clean", without_brackets)
        # Split on commas in case several values are listed.
        .withColumn("publishers_array", split(col("publishers_clean"), ","))
        # Keep only the first value, without surrounding whitespace.
        .withColumn(column_key, trim(col("publishers_array").getItem(0)))
    )
    

In [None]:
# Reduce the `publishers` list to its first entry.
df = get_publishers(df, column_key='publishers')

In [None]:
# Keep only the base attributes of each game.
df = df.select(
    [
        'steam_appid',
        'name',
        'type',
        'header_image',
        'is_free',
        'publishers',
        'required_age',
        'website',
        'short_description',
    ]
)

In [None]:
# Sequential left joins of the derived tables onto the base game attributes.
# `genre` and `categories` both expose columns named `id` and `description`, so
# they are renamed first; otherwise the joined frame carries duplicate, ambiguous
# column names.
# NOTE(review): both tables hold one row per (game, value), so a game appears once
# per genre/category combination in the result - confirm this is intended.
df_joined = (
    df.join(
        genre.withColumnRenamed("id", "genre_id")
        .withColumnRenamed("description", "genre_description"),
        on="steam_appid",
        how="left",
    )
    .join(
        categories.withColumnRenamed("id", "category_id")
        .withColumnRenamed("description", "category_description"),
        on="steam_appid",
        how="left",
    )
    .join(languages, on="steam_appid", how="left")
    .join(requirements, on="steam_appid", how="left")
    .join(price, on="steam_appid", how="left")
)