In [9]:
# Install the Spark client libraries into the kernel's environment.
# NOTE(review): this cell carries execution count 9 although it is the first
# cell, i.e. it was last run after the cells below it; run it first (and
# restart the kernel) on a fresh environment. `findspark` is not version-pinned.
%pip install pyspark==3.1.1 findspark

Note: you may need to restart the kernel to use updated packages.


In [1]:
import iceberg_spark

# Start Spark through the project helper; it hands back four handles that the
# remaining cells use: the session, a JVM handle, the Hive URI and the context.
# NOTE(review): the meaning of the three positional arguments is defined by
# iceberg_spark.start_spark and is not visible here — confirm against it.
spark_handles = iceberg_spark.start_spark('minio', 'minio', 'hive')
spark, jvm, hive_uri, sc = spark_handles

# Restrict Spark logging to FATAL-level messages.
sc.setLogLevel('FATAL')

In [2]:
# (Re)create the Iceberg table `sample`, partitioned by `category`,
# then display it to confirm it starts out empty.
create_sample_sql = """
CREATE OR REPLACE TABLE sample (
    id bigint,
    data string,
    category string)
USING iceberg
PARTITIONED BY (category)"""
spark.sql(create_sample_sql)

sample_df = spark.read.table('sample')
sample_df.show()

+---+----+--------+
| id|data|category|
+---+----+--------+
+---+----+--------+



In [3]:
# Append two rows (one per category value) and display the table.
insert_rows_sql = "INSERT INTO sample VALUES (1, 'a', 'orders'), (2, 'b', 'product')"
spark.sql(insert_rows_sql)

sample_df = spark.read.table('sample')
sample_df.show()

+---+----+--------+
| id|data|category|
+---+----+--------+
|  1|   a|  orders|
|  2|   b| product|
+---+----+--------+



In [5]:
# Rewrite the `data` column for every row in the 'orders' category,
# then display the table to show the change.
update_orders_sql = """
UPDATE default.sample
SET data = 'updated_data'
WHERE category = 'orders'"""
spark.sql(update_orders_sql)

sample_df = spark.read.table('sample')
sample_df.show()

+---+------------+--------+
| id|        data|category|
+---+------------+--------+
|  2|           b| product|
|  1|updated_data|  orders|
+---+------------+--------+



In [6]:
# Compact the table's data files using the Iceberg Java classes reached
# through the `jvm` handle returned by start_spark.
TARGET_FILE_SIZE_BYTES = 500 * 1024 * 1024  # 500 MiB

# Load the Hive-backed Iceberg catalog, pointing it at the Hive URI and
# reusing the Hadoop configuration of the running Spark context.
hadoop_conf = sc._jsc.hadoopConfiguration()
catalog_props = {'uri': hive_uri}
catalog = jvm.CatalogUtil.loadCatalog(
    "org.apache.iceberg.hive.HiveCatalog",
    "spark_catalog",
    catalog_props,
    hadoop_conf,
)

# Resolve the table to compact.
table_name = jvm.TableIdentifier.parse("default.sample")
table = catalog.loadTable(table_name)

# Configure and run the rewrite-data-files action.
rewrite_action = jvm.Actions.forTable(table).rewriteDataFiles()
rewrite_action.targetSizeInBytes(TARGET_FILE_SIZE_BYTES).execute()
print("Table compacted")

Table compacted


In [None]:
# Remove every row in the 'orders' category, then display what is left.
delete_orders_sql = "DELETE FROM sample WHERE category = 'orders'"
spark.sql(delete_orders_sql)

sample_df = spark.read.table('sample')
sample_df.show()

In [None]:
# Drop the `sample` table.
drop_sample_sql = "DROP TABLE sample"
spark.sql(drop_sample_sql)

In [7]:
# Display the current contents of `sample`.
# NOTE(review): in top-to-bottom order this cell runs after the cells that
# DELETE from and DROP `sample`; the recorded output only shows rows because
# those two cells were not executed. Confirm the intended cell order before
# relying on "Restart & Run All".
sample_df = spark.read.table('sample')
sample_df.show()

+---+------------+--------+
| id|        data|category|
+---+------------+--------+
|  2|           b| product|
|  1|updated_data|  orders|
+---+------------+--------+



# Writing with DataFrames
See the [Apache Iceberg documentation on writing with DataFrames](https://iceberg.apache.org/spark-writes/#writing-with-dataframes).
<ul>
    <li><code>df.writeTo(t).create()</code> is equivalent to <code>CREATE TABLE AS SELECT</code></li>
    <li><code>df.writeTo(t).replace()</code> is equivalent to <code>REPLACE TABLE AS SELECT</code></li>
    <li><code>df.writeTo(t).append()</code> is equivalent to <code>INSERT INTO</code></li>
    <li><code>df.writeTo(t).overwritePartitions()</code> is equivalent to a dynamic <code>INSERT OVERWRITE</code></li>
</ul>
