# BIBLIOTECAS START SPARK

In [2]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession

# Carrega as vari√°veis de ambiente
load_dotenv()
s3_endpoint = os.getenv("S3_ENDPOINT")
s3_access_key = os.getenv("S3_ACCESS_KEY")
s3_secret_key = os.getenv("S3_SECRET_KEY")

# Inicializa SparkSession com suporte a Iceberg + MinIO (igual ao seu script original)
spark = SparkSession.builder \
    .appName("ReadIcebergSilverTables") \
    .config("spark.jars", "/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.0.jar") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.sql.catalog.local.warehouse", "s3a://datalake/iceberg") \
    .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint) \
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key) \
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key) \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")


25/04/22 09:44:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# SILVER

In [7]:
# Lista tabelas da camada silver
silver_tables = [row.tableName for row in spark.sql("SHOW TABLES IN local.silver").collect()]

print("\nüìä Tabelas dispon√≠veis na camada Silver:")
for table in silver_tables:
    print(f"- {table}")

# Exemplo: Ler uma tabela espec√≠fica (voc√™ pode escolher qualquer uma da lista)
if silver_tables:
    table_to_read = silver_tables[0]  # Lendo a primeira tabela como exemplo
    print(f"\nüîç Lendo dados da tabela: local.silver.{table_to_read}")
    
    df = spark.read.format("iceberg").load(f"local.silver.{table_to_read}")
    
    # Mostra o schema e algumas linhas
    print("\nüìê Schema:")
    df.printSchema()
    
    print("\nüìù Amostra dos dados (10 primeiras linhas):")
    df.show(10, truncate=False)
    
    # Voc√™ tamb√©m pode fazer consultas SQL diretamente
    print("\nüîé Exemplo de consulta SQL (contagem de registros):")
    spark.sql(f"SELECT COUNT(*) FROM local.silver.{table_to_read}").show()
else:
    print("\n‚ö†Ô∏è Nenhuma tabela encontrada na camada Silver.")

print("\n‚úÖ Leitura conclu√≠da.")


üìä Tabelas dispon√≠veis na camada Silver:
- clientes

üîç Lendo dados da tabela: local.silver.clientes

üìê Schema:
root
 |-- id: string (nullable = true)
 |-- nome: string (nullable = true)
 |-- email: string (nullable = true)
 |-- data_cadastro: string (nullable = true)
 |-- status: string (nullable = true)
 |-- created_at: string (nullable = true)


üìù Amostra dos dados (10 primeiras linhas):
+---+----------+----------------+-------------+------+--------------------------+
|id |nome      |email           |data_cadastro|status|created_at                |
+---+----------+----------------+-------------+------+--------------------------+
|1  |Jo√£o Silva|joao@empresa.com|2025-04-06   |ativo |2025-04-18 13:49:35.601193|
|1  |Jo√£o Silva|joao@empresa.com|2025-04-06   |ativo |2025-04-19 14:06:49.98277 |
|1  |Jo√£o Silva|joao@empresa.com|2025-04-06   |ativo |2025-04-18 13:45:52.2518  |
+---+----------+----------------+-------------+------+--------------------------+


üîé Exemplo de

In [6]:
SHOW TABLES FROM iceberg.local.silver;


SyntaxError: invalid syntax (2720420295.py, line 1)

# VERIFICAR DADOS ATUAIS

In [3]:
spark.sql("SELECT * FROM local.silver.clientes where 1=1 and status='ativo'").show()

                                                                                

+-------+--------------------+--------------------+-------------+------+--------------------+
|     id|                nome|               email|data_cadastro|status|          created_at|
+-------+--------------------+--------------------+-------------+------+--------------------+
|6010017|       Hellena Alves|hellena.alves.13@...|   2025-02-03| ativo|2025-04-21 09:59:...|
|6010045|       Samuel Vieira|samuel.vieira.41@...|   2025-01-07| ativo|2025-04-21 09:59:...|
|6010047|       Bruna Costela|bruna.costela.43@...|   2023-10-06| ativo|2025-04-21 09:59:...|
|6010059|Ana Luiza Casa Gr...|ana.luiza.casa.gr...|   2024-08-02| ativo|2025-04-21 09:59:...|
|6010060|    Guilherme Farias|guilherme.farias....|   2025-03-30| ativo|2025-04-21 09:59:...|
|6010068|        Sarah C√¢mara|sarah.c√¢mara.64@e...|   2024-01-27| ativo|2025-04-21 09:59:...|
|6010076|   Maria Luiza Ramos|maria.luiza.ramos...|   2024-01-09| ativo|2025-04-21 09:59:...|
|6010087|    Mathias da Costa|mathias.da.costa....|   2023

In [3]:
spark.sql("SELECT count(*) FROM local.bronze.clientes").show()

                                                                                

+--------+
|count(1)|
+--------+
|  100000|
+--------+



In [6]:
spark.sql("SELECT * FROM local.bronze.clientes").show()

                                                                                

+-------+--------------------+--------------------+-------------+--------+--------------------+
|     id|                nome|               email|data_cadastro|  status|          created_at|
+-------+--------------------+--------------------+-------------+--------+--------------------+
|6010006|Anthony Gabriel C...|anthony.gabriel.c...|   2025-03-22|   ativo|2025-04-20 15:51:...|
|6010012|Luiz Ot√°vio Rodri...|luiz.ot√°vio.rodri...|   2024-04-24| inativo|2025-04-20 15:51:...|
|6010013|Dra. Maria Sophia...|dra..maria.sophia...|   2024-04-22| inativo|2025-04-20 15:51:...|
|6010021|       Let√≠cia Nunes|let√≠cia.nunes.17@...|   2025-01-15|   ativo|2025-04-20 15:51:...|
|6010022|    Nat√°lia Ferreira|nat√°lia.ferreira....|   2024-10-11| inativo|2025-04-20 15:51:...|
|6010025|      Beatriz Santos|beatriz.santos.21...|   2023-07-31|pendente|2025-04-20 15:51:...|
|6010027|      Rafaela Guerra|rafaela.guerra.23...|   2023-08-27|   ativo|2025-04-20 15:51:...|
|6010028|Maria Fernanda Bo...|mari

# Configurar propriedades da tabela:
Ajuste propriedades para otimizar performance:

In [6]:
spark.sql("""
    ALTER TABLE local.silver.clientes
    SET TBLPROPERTIES (
        'write.format.default'='parquet',
        'write.parquet.compression-codec'='snappy',
        'write.target-file-size-bytes'='134217728',
        'commit.retry.num-retries'='10'
    )
""")

DataFrame[]

# Verificar metadados:

In [13]:
spark.sql("SELECT * FROM local.silver.clientes.metadata_log_entries").show(truncate=False)

+-----------------------+----------------------------------------------------------------+-------------------+----------------+----------------------+
|timestamp              |file                                                            |latest_snapshot_id |latest_schema_id|latest_sequence_number|
+-----------------------+----------------------------------------------------------------+-------------------+----------------+----------------------+
|2025-04-20 01:37:34.36 |s3a://datalake/iceberg/silver/clientes/metadata/v1.metadata.json|5118660716084646725|0               |1                     |
|2025-04-20 01:37:39.305|s3a://datalake/iceberg/silver/clientes/metadata/v2.metadata.json|8236000187873164760|0               |2                     |
|2025-04-20 10:02:40.51 |s3a://datalake/iceberg/silver/clientes/metadata/v3.metadata.json|8236000187873164760|0               |2                     |
|2025-04-20 10:42:08.222|s3a://datalake/iceberg/silver/clientes/metadata/v4.metadata.json|7585

In [21]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta


# Calcular o timestamp de 30 dias atr√°s
current_time = datetime.now()
thirty_days_ago = current_time - timedelta(days=30)
timestamp_str = thirty_days_ago.strftime("%Y-%m-%d %H:%M:%S")

# Executar a limpeza de snapshots
spark.sql(f"""
    CALL local.system.expire_snapshots(
        'local.silver.clientes',
        TIMESTAMP '{timestamp_str}'
    )
""").show()

                                                                                

+------------------------+-----------------------------------+-----------------------------------+----------------------------+----------------------------+------------------------------+
|deleted_data_files_count|deleted_position_delete_files_count|deleted_equality_delete_files_count|deleted_manifest_files_count|deleted_manifest_lists_count|deleted_statistics_files_count|
+------------------------+-----------------------------------+-----------------------------------+----------------------------+----------------------------+------------------------------+
|                       0|                                  0|                                  0|                           0|                           0|                             0|
+------------------------+-----------------------------------+-----------------------------------+----------------------------+----------------------------+------------------------------+



# Alternativamente, realizar uma deduplica√ß√£o manual, mantendo apenas o registro mais recente por id:

In [18]:
spark.sql("""
        SELECT id, nome, email, data_cadastro, status, created_at
            FROM (
                SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY created_at DESC) AS rn
                FROM local.silver.clientes
            ) WHERE rn = 1
        """).show()

+---+----------+----------------+-------------+------+--------------------+
| id|      nome|           email|data_cadastro|status|          created_at|
+---+----------+----------------+-------------+------+--------------------+
|  1|Jo√£o Silva|joao@empresa.com|   2025-04-06| ativo|2025-04-19 14:06:...|
+---+----------+----------------+-------------+------+--------------------+



In [22]:
spark.sql("""
    CREATE OR REPLACE TABLE local.silver.clientes
    USING iceberg
    AS
    SELECT id, nome, email, data_cadastro, status, created_at
    FROM (
        SELECT *,
               ROW_NUMBER() OVER (PARTITION BY id ORDER BY created_at DESC) AS rn
        FROM local.silver.clientes
    ) t
    WHERE rn = 1
""")

                                                                                

DataFrame[]

# SNAPSHOTS

In [14]:
spark.sql("""SELECT * FROM local.silver.clientes FOR TIMESTAMP AS OF '2025-04-18 13:50:00'""").show();

IllegalArgumentException: Cannot find a snapshot older than 2025-04-18T13:50:00+00:00

# LIMPEZA DE SNAPSHOTS 

In [15]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta

# Calcular o timestamp de 7 dias atr√°s
current_time = datetime.now()
seven_days_ago = current_time - timedelta(days=7)
timestamp_str = seven_days_ago.strftime("%Y-%m-%d %H:%M:%S")

# Executar a limpeza de snapshots no cat√°logo 'local'
spark.sql(f"""
    CALL local.system.expire_snapshots('local.silver.clientes', TIMESTAMP '{timestamp_str}')
""").show()

                                                                                

+------------------------+-----------------------------------+-----------------------------------+----------------------------+----------------------------+------------------------------+
|deleted_data_files_count|deleted_position_delete_files_count|deleted_equality_delete_files_count|deleted_manifest_files_count|deleted_manifest_lists_count|deleted_statistics_files_count|
+------------------------+-----------------------------------+-----------------------------------+----------------------------+----------------------------+------------------------------+
|                       0|                                  0|                                  0|                           0|                           0|                             0|
+------------------------+-----------------------------------+-----------------------------------+----------------------------+----------------------------+------------------------------+



In [16]:
spark.sql("SELECT * FROM local.silver.clientes.snapshots").show(truncate=False)

+-----------------------+-------------------+---------+---------+--------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|committed_at           |snapshot_id        |parent_id|operation|manifest_list                                                                                                       |summary                                                                                                                                                                          

# PARTICIONAMENTO

In [12]:
spark.sql("""ALTER TABLE local.silver.clientes SET PARTITION SPEC (year(data_cadastro), month(data_cadastro))""").show();

ParseException: 
[PARSE_SYNTAX_ERROR] Syntax error at or near 'PARTITION'.(line 1, pos 38)

== SQL ==
ALTER TABLE local.silver.clientes SET PARTITION SPEC (year(data_cadastro), month(data_cadastro))
--------------------------------------^^^


In [32]:
spark.sql("""
    SELECT * 
    FROM local.silver.clientes 
    WHERE 1=1
""").show()

+---+----------+----------------+-------------+------+
| id|      nome|           email|data_cadastro|status|
+---+----------+----------------+-------------+------+
|  1|Jo√£o Silva|joao@empresa.com|   2025-04-06| ativo|
+---+----------+----------------+-------------+------+



# BRONZE

In [6]:
# Lista tabelas da camada silver
bronze_tables = [row.tableName for row in spark.sql("SHOW TABLES IN local.bronze").collect()]

print("\nüìä Tabelas dispon√≠veis na camada Bronze:")
for table in bronze_tables:
    print(f"- {table}")

# Exemplo: Ler uma tabela espec√≠fica (voc√™ pode escolher qualquer uma da lista)
if bronze_tables:
    table_to_read = bronze_tables[0]  # Lendo a primeira tabela como exemplo
    print(f"\nüîç Lendo dados da tabela: local.bronze.{table_to_read}")
    
    df = spark.read.format("iceberg").load(f"local.bronze.{table_to_read}")
    
    # Mostra o schema e algumas linhas
    print("\nüìê Schema:")
    df.printSchema()
    
    print("\nüìù Amostra dos dados (10 primeiras linhas):")
    df.show(10, truncate=False)
    
    # Voc√™ tamb√©m pode fazer consultas SQL diretamente
    print("\nüîé Exemplo de consulta SQL (contagem de registros):")
    spark.sql(f"SELECT COUNT(*) FROM local.bronze.{table_to_read}").show()
else:
    print("\n‚ö†Ô∏è Nenhuma tabela encontrada na camada Bronze.")

print("\n‚úÖ Leitura conclu√≠da.")


üìä Tabelas dispon√≠veis na camada Bronze:
- clientes

üîç Lendo dados da tabela: local.bronze.clientes

üìê Schema:
root
 |-- id: string (nullable = true)
 |-- nome: string (nullable = true)
 |-- email: string (nullable = true)
 |-- data_cadastro: string (nullable = true)
 |-- status: string (nullable = true)
 |-- created_at: string (nullable = true)


üìù Amostra dos dados (10 primeiras linhas):


                                                                                

+---+----------+----------------+-------------+------+--------------------------+
|id |nome      |email           |data_cadastro|status|created_at                |
+---+----------+----------------+-------------+------+--------------------------+
|1  |Jo√£o Silva|joao@empresa.com|2025-04-06   |ativo |2025-04-18 13:45:52.2518  |
|1  |Jo√£o Silva|joao@empresa.com|2025-04-06   |ativo |2025-04-18 13:49:35.601193|
|1  |Jo√£o Silva|joao@empresa.com|2025-04-06   |ativo |2025-04-19 14:06:49.98277 |
+---+----------+----------------+-------------+------+--------------------------+


üîé Exemplo de consulta SQL (contagem de registros):
+--------+
|count(1)|
+--------+
|       3|
+--------+


‚úÖ Leitura conclu√≠da.


# TESTES

In [44]:
cutoff = spark.sql("SELECT date_format(date_sub(current_date(), 30), 'yyyy-MM-dd')").first()[0]
filter_expr = f'created_at > to_date("{cutoff}")'

In [45]:
# Verificar o tipo da coluna created_at
schema_data = spark.sql("DESCRIBE TABLE local.silver.clientes").collect()

In [46]:

for row in schema_data:
    if row["col_name"].lower() == "created_at":
        created_at_type = row["data_type"]
        print(f"Tipo da coluna created_at: {created_at_type}")
        break

Tipo da coluna created_at: string


In [2]:
from pyspark.sql import Row
from datetime import datetime, timedelta

try:
    # 1. Detectar tipo da coluna com mais precis√£o
    schema_data = spark.sql("DESCRIBE TABLE local.silver.clientes").collect()
    col_info = next(row for row in schema_data if row["col_name"].lower() == "created_at")
    print(f"Tipo detectado: {col_info['data_type']}")
    
    # 2. Calcular data de corte (30 dias atr√°s)
    cutoff_date = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d %H:%M:%S')
    
    # 3. Construir express√£o de filtro
    if "timestamp" in col_info["data_type"].lower() or "datetime" in col_info["data_type"].lower():
        filter_expr = f"created_at > cast('{cutoff_date}' as timestamp)"
    elif "date" in col_info["data_type"].lower():
        filter_expr = f"created_at > date('{cutoff_date.split()[0]}')"
    else:
        filter_expr = None
    
    # 4. Executar compacta√ß√£o
    if filter_expr:
        print(f"Executando com filtro: {filter_expr}")
        spark.sql(f"""
          CALL local.system.rewrite_data_files(
            table => 'silver.clientes',
            strategy => 'binpack',
            where => '{filter_expr}'
          )
        """)
    else:
        print("Executando compacta√ß√£o sem filtro de data")
        spark.sql("""
          CALL local.system.rewrite_data_files(
            table => 'silver.clientes',
            strategy => 'binpack'
          )
        """)
    
    print("‚úÖ Compacta√ß√£o conclu√≠da com sucesso")

except StopIteration:
    print("‚ùå Coluna 'created_at' n√£o encontrada na tabela")
except Exception as e:
    print(f"‚ùå Erro durante a opera√ß√£o: {str(e)}")

Tipo detectado: string
Executando compacta√ß√£o sem filtro de data
‚úÖ Compacta√ß√£o conclu√≠da com sucesso


In [3]:
spark.sql("SELECT version()").show()

                                                                                

+--------------------+
|           version()|
+--------------------+
|3.5.1 fd86f85e181...|
+--------------------+



In [4]:
spark.sql(f"SELECT count(*) FROM local.silver.clientes WHERE created_at > timestamp('{cutoff_date}')").show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------+
|count(1)|
+--------+
|       3|
+--------+



                                                                                

# ESTUDO SPARK

In [6]:
from datetime import datetime, timedelta

# Definindo o intervalo de datas
start_date = datetime.strptime("2025-01-01", "%Y-%m-%d")
end_date = datetime.strptime("2025-01-02", "%Y-%m-%d")

current_date = start_date
while current_date <= end_date:
    data_str = current_date.strftime("%Y-%m-%d")
    
    query = f"""
        SELECT * FROM local.silver.clientes 
        WHERE status = 'ativo' 
        AND data_cadastro = '{data_str}'
    """
    
    df = spark.sql(query)
    print(f"Clientes cadastrados em {data_str}:")
    df.show()

    current_date += timedelta(days=1)


Clientes cadastrados em 2025-01-01:


                                                                                

+-------+--------------------+--------------------+-------------+------+--------------------+
|     id|                nome|               email|data_cadastro|status|          created_at|
+-------+--------------------+--------------------+-------------+------+--------------------+
|6012290|     Rafael da Cunha|rafael.da.cunha.2...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6024484|       Amanda Guerra|amanda.guerra.144...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6030504|     Raquel Teixeira|raquel.teixeira.2...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6061519|Ana Clara Casa Gr...|ana.clara.casa.gr...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6061708|Ana Laura Montenegro|ana.laura.montene...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6069072|        Antony Pinto|antony.pinto.5906...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6076809|        √çsis da Rosa|√≠sis.da.rosa.6680...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6091119|  Sr. Gustavo Barros|sr..gustavo.barro...|   2025

[Stage 51:>                                                         (0 + 1) / 1]

+-------+--------------------+--------------------+-------------+------+--------------------+
|     id|                nome|               email|data_cadastro|status|          created_at|
+-------+--------------------+--------------------+-------------+------+--------------------+
|6020085|     Lorenzo Martins|lorenzo.martins.1...|   2025-01-02| ativo|2025-04-21 09:59:...|
|6023153|      Julia da Rocha|julia.da.rocha.13...|   2025-01-02| ativo|2025-04-21 09:59:...|
|6027115|Anthony Gabriel F...|anthony.gabriel.f...|   2025-01-02| ativo|2025-04-21 09:59:...|
|6036870|  Vin√≠cius Fernandes|vin√≠cius.fernande...|   2025-01-02| ativo|2025-04-21 09:59:...|
|6040949|      Sophie da Mata|sophie.da.mata.30...|   2025-01-02| ativo|2025-04-21 09:59:...|
|6051607|Sr. Theo Albuquerque|sr..theo.albuquer...|   2025-01-02| ativo|2025-04-21 09:59:...|
|6060347|        Raul Martins|raul.martins.5034...|   2025-01-02| ativo|2025-04-21 09:59:...|
|6067261| Mariah da Concei√ß√£o|mariah.da.concei√ß...|   2

                                                                                

In [8]:
from datetime import datetime, timedelta
from pyspark.sql import SparkSession

# Cria o DataFrame vazio com o mesmo schema da primeira consulta
combined_df = None

start_date = datetime.strptime("2025-01-01", "%Y-%m-%d")
end_date = datetime.strptime("2025-01-02", "%Y-%m-%d")

current_date = start_date
while current_date <= end_date:
    data_str = current_date.strftime("%Y-%m-%d")
    
    query = f"""
        SELECT * FROM local.silver.clientes 
        WHERE status = 'ativo' 
        AND data_cadastro = '{data_str}'
    """
    
    df = spark.sql(query)
    
    if combined_df is None:
        combined_df = df
    else:
        combined_df = combined_df.union(df)
    
    current_date += timedelta(days=1)

# Mostrar o DataFrame final combinado
print("Todos os clientes ativos no intervalo de datas:")
combined_df.show()


Todos os clientes ativos no intervalo de datas:


[Stage 52:>                                                         (0 + 1) / 1]

+-------+--------------------+--------------------+-------------+------+--------------------+
|     id|                nome|               email|data_cadastro|status|          created_at|
+-------+--------------------+--------------------+-------------+------+--------------------+
|6012290|     Rafael da Cunha|rafael.da.cunha.2...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6024484|       Amanda Guerra|amanda.guerra.144...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6030504|     Raquel Teixeira|raquel.teixeira.2...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6061519|Ana Clara Casa Gr...|ana.clara.casa.gr...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6061708|Ana Laura Montenegro|ana.laura.montene...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6069072|        Antony Pinto|antony.pinto.5906...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6076809|        √çsis da Rosa|√≠sis.da.rosa.6680...|   2025-01-01| ativo|2025-04-21 09:59:...|
|6091119|  Sr. Gustavo Barros|sr..gustavo.barro...|   2025

                                                                                