In [0]:
# Importações necessárias

from pyspark.sql.types import StructType, StructField, StringType, BooleanType
from pyspark.sql.utils import AnalysisException
from pyspark.sql import functions as F

In [0]:
%sql
-- Databricks %sql magic cell: create the steam.bronze schema if it does not already exist.
CREATE SCHEMA IF NOT EXISTS steam.bronze;

In [0]:
%sql
-- Create the bronze Delta table if it does not already exist.
-- The name is fully qualified (catalog.schema.table) so the table is created
-- in the steam.bronze schema regardless of the session's current catalog, and
-- matches the steam.bronze.details_games name that the Python cells of this
-- notebook read from and append to.
-- Every attribute nested under `data` is stored as a STRING.
CREATE TABLE IF NOT EXISTS steam.bronze.details_games (
  success BOOLEAN,
  data STRUCT<
    about_the_game: STRING,
    achievements: STRING,
    background: STRING,
    background_raw: STRING,
    capsule_image: STRING,
    capsule_imagev5: STRING,
    categories: STRING,
    content_descriptors: STRING,
    controller_support: STRING,
    demos: STRING,
    detailed_description: STRING,
    developers: STRING,
    dlc: STRING,
    drm_notice: STRING,
    ext_user_account_notice: STRING,
    genres: STRING,
    header_image: STRING,
    is_free: STRING,
    legal_notice: STRING,
    linux_requirements: STRING,
    mac_requirements: STRING,
    metacritic: STRING,
    movies: STRING,
    name: STRING,
    package_groups: STRING,
    packages: STRING,
    pc_requirements: STRING,
    platforms: STRING,
    price_overview: STRING,
    publishers: STRING,
    ratings: STRING,
    recommendations: STRING,
    release_date: STRING,
    required_age: STRING,
    reviews: STRING,
    screenshots: STRING,
    short_description: STRING,
    steam_appid: STRING,
    support_info: STRING,
    supported_languages: STRING,
    type: STRING,
    website: STRING
  >
)
USING DELTA


In [0]:
%sql
-- Compact the bronze table and Z-Order it to speed up queries.
-- The table name is fully qualified so the command targets the same
-- steam.bronze.details_games table that this notebook reads and appends to,
-- instead of whatever `details_games` resolves to in the current catalog/schema.
-- NOTE(review): Z-Ordering by the whole `data` struct is unlikely to help data
-- skipping; consider a frequently filtered nested field (for example
-- data.steam_appid) after confirming statistics are collected for it.
-- NOTE(review): in notebook order this cell runs before the append cell, so it
-- optimizes the data from previous runs only -- confirm that is intended.
OPTIMIZE steam.bronze.details_games
ZORDER BY (data)

In [0]:
# Schema applied when reading the JSON files.
# Top level: a nullable boolean `success` flag and a nullable `data` struct.
# Every attribute of `data` is a nullable string; the names are listed in the
# exact order in which they appear inside the struct.

schema = StructType([
    StructField("success", BooleanType(), True),
    StructField(
        "data",
        StructType([
            StructField(field_name, StringType(), True)
            for field_name in (
                "about_the_game",
                "achievements",
                "background",
                "background_raw",
                "capsule_image",
                "capsule_imagev5",
                "categories",
                "content_descriptors",
                "controller_support",
                "demos",
                "detailed_description",
                "developers",
                "dlc",
                "drm_notice",
                "ext_user_account_notice",
                "genres",
                "header_image",
                "is_free",
                "legal_notice",
                "linux_requirements",
                "mac_requirements",
                "metacritic",
                "movies",
                "name",
                "package_groups",
                "packages",
                "pc_requirements",
                "platforms",
                "price_overview",
                "publishers",
                "ratings",
                "recommendations",
                "release_date",
                "required_age",
                "reviews",
                "screenshots",
                "short_description",
                "steam_appid",
                "support_info",
                "supported_languages",
                "type",
                "website",
            )
        ]),
        True,
    ),
])



In [0]:
# Load the newly arrived (inbound) JSON files.
# - "multiline" lets the reader parse JSON documents formatted across several lines.
# - The explicit schema keeps column types consistent from run to run.

df_new = spark.read.option("multiline", "true").schema(schema).json(
    "abfss://steam@steamstorageaccount.dfs.core.windows.net/inbound/details/*.json"
)

# Mark the DataFrame for caching so later cells can reuse it.
df_new.cache()

In [0]:
# Load the rows already stored in the bronze layer.
# If looking up the table raises AnalysisException (for example because it has
# not been created yet), fall back to an empty DataFrame with the same schema.

try:
    df_old = spark.read.table("steam.bronze.details_games")
except AnalysisException:
    df_old = spark.createDataFrame(data=[], schema=schema)

# Mark the DataFrame for caching so later cells can reuse it.
df_old.cache()

In [0]:
# Union of the new and the existing rows. The incremental computation below
# does not need it; it is kept only so that any other cell referring to `df`
# keeps working.
df = df_new.unionByName(df_old)

# Incremental rows = distinct rows of the inbound batch that are not yet in bronze.
# `subtract` has EXCEPT DISTINCT semantics, so it:
#   * never returns rows that exist only in bronze (a "count == 1" filter over
#     the union would select those too and re-append them on every run),
#   * keeps one copy of a new row even if it appears several times in the batch.
# EXCEPT matches columns by position, so df_old is first projected into
# df_new's column order to keep the by-name alignment that unionByName gives.
df_incremental = df_new.subtract(df_old.select(*df_new.columns))

In [0]:
# Append the incremental rows to the bronze Delta table.

(
    df_incremental.write
    .format("delta")
    .mode("append")
    .saveAsTable("steam.bronze.details_games")
)