In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName('dataincode') \
    .config("spark.jars", "/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.0.jar") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.local.warehouse", "s3a://datalake/iceberg") \
    .getOrCreate()

#Ajuste de log WARN log para ERROR
spark.sparkContext.setLogLevel("ERROR")

In [3]:
# Importar funções 
from IPython.display import display, HTML

In [5]:
%run ./Includes/Utils.ipynb

In [6]:
%run ./Includes/Datasets.ipynb

## Criar Dataframes e escrever tabelas no catalogo

In [5]:
# Inicio rapido

# help(create_dataframe)
# print(lista_amostras)
create_dataframe?

[0;31mSignature:[0m [0mcreate_dataframe[0m[0;34m([0m[0mschema[0m[0;34m:[0m [0mlist[0m[0;34m,[0m [0mdata[0m[0;34m:[0m [0mlist[0m[0;34m)[0m [0;34m->[0m [0mpyspark[0m[0;34m.[0m[0msql[0m[0;34m.[0m[0mdataframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
This function will creates new Pyspark DataFrames.

Args:
    schema (list): Uma lista com a estrutura da tabela ou schema
    data (list): Uma lista com os dados no formato do schema
Returns:
    df: Retorna um DataFrame do Pyspark

Snippet:
    my_df = create_dataframe(my_schema, my_data)
[0;31mFile:[0m      /tmp/ipykernel_187/2596470257.py
[0;31mType:[0m      function

In [7]:
init_data_df = create_dataframe(columns_schema, init_data)

In [8]:
init_data_df.show(5)

                                                                                

+------------+----------+-----------+----------+----------+----------+---------------+---------+
|order_number|order_date|qty_ordered|unit_price|    status|product_id|product_line_id|  country|
+------------+----------+-----------+----------+----------+----------+---------------+---------+
|       10168|2024-01-23|          5|    98.115|  Disputed|  S10_1949|           1002|   France|
|       10180|2024-01-22|          1|    951.87|In Process|  S10_2016|           1002|   Norway|
|       10188|2024-01-21|         65|    95.202| Cancelled|  S10_4698|           1002|Australia|
|       10201|2024-01-26|          8|    951.17|   On Hold|  S10_4757|           1221|  Finland|
+------------+----------+-----------+----------+----------+----------+---------------+---------+



In [9]:
spark.sql("USE iceberg").show()

++
||
++
++



In [10]:
spark.sql("show catalogs").show()

+-------------+
|      catalog|
+-------------+
|      iceberg|
|spark_catalog|
+-------------+



### Criar Tabela
- ``` df.writeTo(t).create() ``` é equivalente a ``` CREATE TABLE AS SELECT ```
- ``` df.writeTo(t).replace() ``` é equivalente a ``` REPLACE TABLE AS SELECT ```
- ``` df.writeTo(t).append() ``` é equivalente a ``` INSERT INTO ```
- ``` df.writeTo(t).overwritePartitions() ``` é equivalente a dynamic ``` INSERT OVERWRITE ```
fonte: https://iceberg.apache.org/docs/1.6.0/spark-writes/?h=df.writeto%28t%29.create%28%29#writing-with-dataframes

In [11]:
# Escrever tabela no storage
(
    init_data_df
    .writeTo("iceberg.bronze.vendas")
    .partitionedBy("country")
    .createOrReplace()
)

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".    (0 + 4) / 4]
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

In [12]:
spark.sql("SHOW TABLES in bronze").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|   bronze|nyc_taxis|      false|
|   bronze|   vendas|      false|
+---------+---------+-----------+



In [13]:
## Visualizar os dados simples e leve

spark.sql("SELECT * FROM iceberg.bronze.vendas").show()

[Stage 5:>                                                          (0 + 1) / 1]

+------------+----------+-----------+----------+----------+----------+---------------+---------+
|order_number|order_date|qty_ordered|unit_price|    status|product_id|product_line_id|  country|
+------------+----------+-----------+----------+----------+----------+---------------+---------+
|       10180|2024-01-22|          1|    951.87|In Process|  S10_2016|           1002|   Norway|
|       10201|2024-01-26|          8|    951.17|   On Hold|  S10_4757|           1221|  Finland|
|       10168|2024-01-23|          5|    98.115|  Disputed|  S10_1949|           1002|   France|
|       10188|2024-01-21|         65|    95.202| Cancelled|  S10_4698|           1002|Australia|
+------------+----------+-----------+----------+----------+----------+---------------+---------+



                                                                                

In [14]:
## Visualizar os dados

spark.sql("SELECT * FROM iceberg.bronze.vendas").toPandas()

Unnamed: 0,order_number,order_date,qty_ordered,unit_price,status,product_id,product_line_id,country
0,10180,2024-01-22,1,951.87,In Process,S10_2016,1002,Norway
1,10201,2024-01-26,8,951.17,On Hold,S10_4757,1221,Finland
2,10168,2024-01-23,5,98.115,Disputed,S10_1949,1002,France
3,10188,2024-01-21,65,95.202,Cancelled,S10_4698,1002,Australia


In [15]:
## Inserir novos dados

feb_data_df = create_dataframe(columns_schema, feb_data)

feb_data_df.writeTo("iceberg.bronze.vendas").append()

                                                                                

### Primeiras Impressões

In [16]:
## Descrever tabela

spark.sql("DESCRIBE iceberg.bronze.vendas").show()

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|        order_number|   string|   NULL|
|          order_date|   string|   NULL|
|         qty_ordered|   string|   NULL|
|          unit_price|   string|   NULL|
|              status|   string|   NULL|
|          product_id|   string|   NULL|
|     product_line_id|   string|   NULL|
|             country|   string|   NULL|
|# Partition Infor...|         |       |
|          # col_name|data_type|comment|
|             country|   string|   NULL|
+--------------------+---------+-------+



In [17]:
## Inspecionar tabelas

spark.sql("SELECT * FROM iceberg.bronze.vendas.history").show()

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2025-04-13 16:56:...|2182265787075892720|               NULL|               true|
|2025-04-13 16:56:...|2836209064571787602|2182265787075892720|               true|
+--------------------+-------------------+-------------------+-------------------+



In [18]:
spark.sql("SHOW TBLPROPERTIES iceberg.bronze.vendas").toPandas()

Unnamed: 0,key,value
0,current-snapshot-id,2836209064571787602
1,format,iceberg/parquet
2,format-version,2
3,write.parquet.compression-codec,zstd


In [19]:
spark.sql("SELECT COUNT(*) FROM iceberg.bronze.vendas").show()

+--------+
|count(1)|
+--------+
|       9|
+--------+



## Testes

In [20]:
## Para deletar por completo do catalog e storage
spark.sql("DROP TABLE iceberg.bronze.vendas PURGE")

                                                                                

DataFrame[]

In [21]:
spark.stop()