# Apache Iceberg in Spark

In [1]:
from pyspark.sql import SparkSession
import os

In [2]:
# Session settings for this notebook: the application name shown in the Spark UI
# and the URL of the standalone cluster master the driver connects to.
APP_NAME = "Intro to Iceberg in Spark"
MASTER_URL = "spark://spark:7077"

# Create the session (or reuse one that already exists in this kernel).
spark = SparkSession.builder.appName(APP_NAME).master(MASTER_URL).getOrCreate()

In [3]:
# Show whether event logging is enabled and where the event logs are written.
for conf_key in ("spark.eventLog.enabled", "spark.eventLog.dir"):
    print(spark.conf.get(conf_key))

true
s3a://spark-events/logs/


In [4]:
# Report the Spark version this session is running on.
print(f"Spark version: {spark.version}")

Spark version: 3.5.3


The following cell is purely for debugging, but you may find it interesting: it prints the configuration for our job, showing the settings loaded from our configuration file. One important setting to note is the location of the Iceberg repository.

In [5]:
# Dump the effective Spark configuration of the running job (debugging aid).
#
# This cell reuses the `spark` session created at the top of the notebook rather
# than re-importing SparkSession and calling getOrCreate() a second time.
#
# Values of credential-like keys (e.g. the S3A access/secret keys) are masked so
# that secrets are never written into the notebook's saved output.
SENSITIVE_KEY_MARKERS = ("secret", "password", "token", "access.key")
REDACTED = "*********(redacted)"

# Print all key-value pairs in the Spark config
for k, v in spark.sparkContext.getConf().getAll():
    if any(marker in k.lower() for marker in SENSITIVE_KEY_MARKERS):
        v = REDACTED
    print(f"{k} = {v}")

spark.eventLog.enabled = true
spark.executor.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -Daws.region=us-east-1
spark.hadoop.fs.s3a.connection.ssl.enabled = false
spark.

In [6]:
# Confirm which cluster master the driver is attached to and where its web UI lives.
sc = spark.sparkContext
print(sc.master)    # expected: spark://spark:7077
print(sc.uiWebUrl)  # URL of this application's web UI

spark://spark:7077
http://c820f10354d4:4040


In [7]:
# List the namespaces currently present in the `ice` catalog (names not truncated).
namespaces_before = spark.sql("SHOW NAMESPACES IN ice")
namespaces_before.show(truncate=False)

+---------+
|namespace|
+---------+
+---------+



In [8]:
# Create the `demo` namespace in the `ice` catalog; a no-op when it already exists.
create_namespace_sql = "CREATE NAMESPACE IF NOT EXISTS ice.demo"
spark.sql(create_namespace_sql)

DataFrame[]

In [9]:
# List the namespaces in the `ice` catalog again to confirm that `demo` now exists.
namespaces_after = spark.sql("SHOW NAMESPACES IN ice")
namespaces_after.show(truncate=False)

+---------+
|namespace|
+---------+
|demo     |
+---------+



In [10]:
# Create the Iceberg table for the demo (no-op if it already exists).
#
# The table is partitioned by a hash bucket of `id` instead of by the raw
# `email` value: every customer has a different email, so identity-partitioning
# on it would create one partition, and one tiny data file, per row.
spark.sql("""
    CREATE TABLE IF NOT EXISTS ice.demo.customers (
        id BIGINT,
        name STRING,
        email STRING
    )
    USING iceberg
    PARTITIONED BY (bucket(4, id))
""")

DataFrame[]

In [11]:
# Seed the table with three customers.
# NOTE: a plain INSERT INTO appends, so re-running this cell adds the same rows again.
seed_customers_sql = """
    INSERT INTO ice.demo.customers VALUES
      (1, 'Alice Smith', 'alice@example.com'),
      (2, 'Bob Johnson', 'bob@example.com'),
      (3, 'Carol Adams', 'carol@example.com')
"""
spark.sql(seed_customers_sql)

DataFrame[]

Select the customers

In [12]:
# Read back every row of the customers table and print it.
all_customers = spark.sql("SELECT * FROM ice.demo.customers")
all_customers.show()

+---+-----------+-----------------+
| id|       name|            email|
+---+-----------+-----------------+
|  3|Carol Adams|carol@example.com|
|  1|Alice Smith|alice@example.com|
|  2|Bob Johnson|  bob@example.com|
+---+-----------+-----------------+



Select the customers whose name contains a lowercase `o`

In [13]:
# Keep only the customers whose name contains a lowercase "o" (LIKE is case-sensitive).
name_filter_sql = "SELECT * FROM ice.demo.customers WHERE name like '%o%'"
spark.sql(name_filter_sql).show()

+---+-----------+-----------------+
| id|       name|            email|
+---+-----------+-----------------+
|  3|Carol Adams|carol@example.com|
|  2|Bob Johnson|  bob@example.com|
+---+-----------+-----------------+



Let's add some more data to our data lake

In [14]:
# Append twenty more customers (ids 4-23) to the table.
# NOTE: a plain INSERT INTO appends, so re-running this cell adds the same rows again.
# The name on id 6 uses a typographic apostrophe (’), so no SQL quote escaping is needed.
more_customers_sql = """
    INSERT INTO ice.demo.customers VALUES
      (4,  'Diego Ramirez',       'diego.ramirez@example.com'),
      (5,  'Maya Patel',          'maya.patel@example.com'),
      (6,  'Liam O’Connor',       'liam.oconnor@example.com'),
      (7,  'Sofia Almeida',       'sofia.almeida@example.com'),
      (8,  'Noah Williams',       'noah.williams@example.com'),
      (9,  'Ava Thompson',        'ava.thompson@example.com'),
      (10, 'Ethan Chen',          'ethan.chen@example.com'),
      (11, 'Olivia Garcia',       'olivia.garcia@example.com'),
      (12, 'Lucas Martin',        'lucas.martin@example.com'),
      (13, 'Emma Robinson',       'emma.robinson@example.com'),
      (14, 'Benjamin Kim',        'benjamin.kim@example.com'),
      (15, 'Isabella Rossi',      'isabella.rossi@example.com'),
      (16, 'James Nguyen',        'james.nguyen@example.com'),
      (17, 'Mila Novak',          'mila.novak@example.com'),
      (18, 'Henry Scott',         'henry.scott@example.com'),
      (19, 'Aria Johnson',        'aria.johnson@example.com'),
      (20, 'Daniela Costa',       'daniela.costa@example.com'),
      (21, 'Jack Wilson',         'jack.wilson@example.com'),
      (22, 'Zoe King',            'zoe.king@example.com'),
      (23, 'Oliver Brown',        'oliver.brown@example.com')
"""
spark.sql(more_customers_sql)

DataFrame[]

Let's run the same query again, now against the larger table

In [15]:
# Re-run the lowercase-"o" name filter, this time over the enlarged table.
customers_with_o = spark.sql("SELECT * FROM ice.demo.customers WHERE name like '%o%'")
customers_with_o.show()

+---+--------------+--------------------+
| id|          name|               email|
+---+--------------+--------------------+
| 22|      Zoe King|zoe.king@example.com|
| 21|   Jack Wilson|jack.wilson@examp...|
| 15|Isabella Rossi|isabella.rossi@ex...|
| 18|   Henry Scott|henry.scott@examp...|
|  7| Sofia Almeida|sofia.almeida@exa...|
| 19|  Aria Johnson|aria.johnson@exam...|
| 20| Daniela Costa|daniela.costa@exa...|
|  4| Diego Ramirez|diego.ramirez@exa...|
|  8| Noah Williams|noah.williams@exa...|
| 17|    Mila Novak|mila.novak@exampl...|
|  9|  Ava Thompson|ava.thompson@exam...|
| 13| Emma Robinson|emma.robinson@exa...|
|  6| Liam O’Connor|liam.oconnor@exam...|
| 23|  Oliver Brown|oliver.brown@exam...|
|  3|   Carol Adams|   carol@example.com|
|  2|   Bob Johnson|     bob@example.com|
+---+--------------+--------------------+



In [16]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()