In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the session. Only the app name and Hive support are set
# here; everything else is expected to come from spark-defaults.conf.
spark = (
    SparkSession.builder
    .appName("Hive-Iceberg-MinIO")
    .enableHiveSupport()
    .getOrCreate()
)

# Status banner. These lines are printed unconditionally; they do not probe
# the metastore or the catalog themselves.
for status_line in (
    "✓ Spark session created!",
    "✓ Connected to Hive Metastore",
    "✓ Iceberg catalog ready",
):
    print(status_line)
print()

# Report the runtime version and the session's current catalog
print(f"Spark version: {spark.version}")
print(f"Default catalog: {spark.catalog.currentCatalog()}")

✓ Spark session created!
✓ Connected to Hive Metastore
✓ Iceberg catalog ready

Spark version: 3.4.1
Default catalog: iceberg


## Create a Database and Table

In [2]:
# Make sure the `demo` namespace exists; IF NOT EXISTS makes this a no-op
# when the namespace is already there.
create_db_sql = "CREATE DATABASE IF NOT EXISTS demo"
spark.sql(create_db_sql)
print("✓ Database 'demo' created!")

# List the namespaces in the current catalog
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

✓ Database 'demo' created!
+---------+
|namespace|
+---------+
|  default|
|     demo|
+---------+



In [3]:
# DDL for the demo table: Iceberg format, partitioned by the day of
# `created_at`. IF NOT EXISTS leaves an existing table untouched.
users_ddl = """
    CREATE TABLE IF NOT EXISTS demo.users (
        id INT,
        name STRING,
        email STRING,
        created_at TIMESTAMP
    )
    USING iceberg
    PARTITIONED BY (days(created_at))
"""
spark.sql(users_ddl)
print("✓ Table 'demo.users' created!")

# List the tables registered under the `demo` namespace
demo_tables_df = spark.sql("SHOW TABLES IN demo")
demo_tables_df.show()

✓ Table 'demo.users' created!
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|     demo|    users|      false|
+---------+---------+-----------+



## Insert and Query Data

In [4]:
# Seed three rows, each stamped with the time of the insert.
# NOTE: INSERT INTO appends, so running this cell again adds the same three
# ids a second time.
seed_users_sql = """
    INSERT INTO demo.users VALUES
    (1, 'Alice', 'alice@example.com', current_timestamp()),
    (2, 'Bob', 'bob@example.com', current_timestamp()),
    (3, 'Charlie', 'charlie@example.com', current_timestamp())
"""
spark.sql(seed_users_sql)
print("✓ Data inserted!")

✓ Data inserted!


In [5]:
# Read back every row currently in the table
users_df = spark.sql("SELECT * FROM demo.users")
users_df.show()

+---+-------+-------------------+--------------------+
| id|   name|              email|          created_at|
+---+-------+-------------------+--------------------+
|  1|  Alice|  alice@example.com|2025-11-30 08:22:...|
|  2|    Bob|    bob@example.com|2025-11-30 08:22:...|
|  3|Charlie|charlie@example.com|2025-11-30 08:22:...|
+---+-------+-------------------+--------------------+



## Updates and Deletes (Iceberg feature)

In [6]:
# Row-level changes, applied in order: each entry is (SQL statement,
# confirmation message printed once the statement returns).
row_level_changes = [
    (
        "UPDATE demo.users SET email = 'alice.new@example.com' WHERE id = 1",
        "✓ Updated Alice's email",
    ),
    (
        "DELETE FROM demo.users WHERE id = 3",
        "✓ Deleted Charlie",
    ),
]
for statement, confirmation in row_level_changes:
    spark.sql(statement)
    print(confirmation)

# Show the table after both changes
spark.sql("SELECT * FROM demo.users").show()

✓ Updated Alice's email
✓ Deleted Charlie
+---+-----+--------------------+--------------------+
| id| name|               email|          created_at|
+---+-----+--------------------+--------------------+
|  1|Alice|alice.new@example...|2025-11-30 08:22:...|
|  2|  Bob|     bob@example.com|2025-11-30 08:22:...|
+---+-----+--------------------+--------------------+



## Time Travel (Iceberg feature)

Every data-changing commit (insert, update, delete) creates a new snapshot, so you can query the table as it existed at any earlier snapshot.

In [2]:
# The `history` metadata table; truncate=False keeps the long snapshot ids
# fully visible.
history_df = spark.sql("SELECT * FROM demo.users.history")
history_df.show(truncate=False)

+-----------------------+-------------------+-------------------+-------------------+
|made_current_at        |snapshot_id        |parent_id          |is_current_ancestor|
+-----------------------+-------------------+-------------------+-------------------+
|2025-11-29 16:37:29.301|8156896692273332934|null               |true               |
|2025-11-29 16:37:41.102|462503457610497467 |8156896692273332934|true               |
|2025-11-29 16:37:41.534|155410955388836578 |462503457610497467 |true               |
|2025-11-29 17:49:11.63 |6866363440748165282|155410955388836578 |true               |
+-----------------------+-------------------+-------------------+-------------------+



In [8]:
# One row per snapshot: its id, commit time and the operation that produced it
snapshots_sql = "SELECT snapshot_id, committed_at, operation FROM demo.users.snapshots"
snapshots_df = spark.sql(snapshots_sql)
snapshots_df.show(truncate=False)

+-------------------+-----------------------+---------+
|snapshot_id        |committed_at           |operation|
+-------------------+-----------------------+---------+
|6264401885849743558|2025-11-30 08:22:46.497|append   |
|6412877392095783392|2025-11-30 08:22:53.794|overwrite|
|1922986721336350599|2025-11-30 08:22:54.46 |overwrite|
+-------------------+-----------------------+---------+



In [None]:
# Time travel: read the table as of its earliest snapshot.
# Snapshot ids differ between environments and runs, so the id is looked up
# from the `snapshots` metadata table instead of being pasted in by hand.
oldest_snapshot = spark.sql("""
    SELECT snapshot_id
    FROM demo.users.snapshots
    ORDER BY committed_at
    LIMIT 1
""").first()

if oldest_snapshot is None:
    # .first() returns None when the metadata table has no rows
    print("No snapshots found - write to demo.users before time travelling")
else:
    snapshot_id = oldest_snapshot["snapshot_id"]
    print(f"Querying demo.users as of snapshot {snapshot_id}")
    # To inspect a different version, substitute any snapshot_id listed by
    # the snapshots query.
    spark.sql(f"SELECT * FROM demo.users VERSION AS OF {snapshot_id}").show()

## Schema Evolution (Iceberg feature)

Add, drop, or rename columns without rewriting existing data files — schema changes are metadata-only operations.

In [9]:
# Add the `age` column only when it is missing. Spark's ALTER TABLE ... ADD
# COLUMN has no IF NOT EXISTS form, and demo.users is created with
# IF NOT EXISTS, so the table (and this column) survives between runs; without
# the guard, re-running this cell fails because the column already exists.
if "age" in spark.table("demo.users").columns:
    print("✓ 'age' column already present")
else:
    spark.sql("ALTER TABLE demo.users ADD COLUMN age INT")
    print("✓ Added 'age' column")

# Describe the table to show the resulting schema and partitioning
spark.sql("DESCRIBE demo.users").show()

✓ Added 'age' column
+--------------+----------------+-------+
|      col_name|       data_type|comment|
+--------------+----------------+-------+
|            id|             int|   null|
|          name|          string|   null|
|         email|          string|   null|
|    created_at|       timestamp|   null|
|           age|             int|   null|
|              |                |       |
|# Partitioning|                |       |
|        Part 0|days(created_at)|       |
+--------------+----------------+-------+



In [10]:
# Fill in the new column for the rows with id 1 and id 2, one statement per row
age_updates = (
    "UPDATE demo.users SET age = 30 WHERE id = 1",
    "UPDATE demo.users SET age = 25 WHERE id = 2",
)
for age_update_sql in age_updates:
    spark.sql(age_update_sql)

# Show the table including the populated `age` column
users_with_age_df = spark.sql("SELECT * FROM demo.users")
users_with_age_df.show()

+---+-----+--------------------+--------------------+---+
| id| name|               email|          created_at|age|
+---+-----+--------------------+--------------------+---+
|  1|Alice|alice.new@example...|2025-11-30 08:22:...| 30|
|  2|  Bob|     bob@example.com|2025-11-30 08:22:...| 25|
+---+-----+--------------------+--------------------+---+



## Table Metadata

In [11]:
# The `files` metadata table: path, size and row count per file.
# truncate=False keeps the full file path readable.
data_files_sql = "SELECT file_path, file_size_in_bytes, record_count FROM demo.users.files"
data_files_df = spark.sql(data_files_sql)
data_files_df.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------+------------------+------------+
|file_path                                                                                                               |file_size_in_bytes|record_count|
+------------------------------------------------------------------------------------------------------------------------+------------------+------------+
|s3a://warehouse/demo.db/users/data/created_at_day=2025-11-30/00000-20-6bdf6a0f-969c-4506-9968-2ec81e2c314d-00001.parquet|1472              |2           |
+------------------------------------------------------------------------------------------------------------------------+------------------+------------+



In [12]:
# The `partitions` metadata table, one row per partition value
partitions_df = spark.sql("SELECT * FROM demo.users.partitions")
partitions_df.show()

+------------+-------+------------+----------+-----------------------------+----------------------------+--------------------------+----------------------------+--------------------------+--------------------+------------------------+
|   partition|spec_id|record_count|file_count|total_data_file_size_in_bytes|position_delete_record_count|position_delete_file_count|equality_delete_record_count|equality_delete_file_count|     last_updated_at|last_updated_snapshot_id|
+------------+-------+------------+----------+-----------------------------+----------------------------+--------------------------+----------------------------+--------------------------+--------------------+------------------------+
|{2025-11-30}|      0|           2|         1|                         1472|                           0|                         0|                           0|                         0|2025-11-30 08:23:...|     2191539070265404586|
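+------------+-------+------------+----------+-----------------------------+----------------------------+--------------------------+----------------------------+--------------------------+--------------------+------------------------+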

## Cleanup (Optional)

In [None]:
# Drop table (uncomment to run)
# spark.sql("DROP TABLE IF EXISTS demo.users")
# print("✓ Table dropped")

# Drop database (uncomment to run)
# spark.sql("DROP DATABASE IF EXISTS demo CASCADE")
# print("✓ Database dropped")

In [None]:
# Stop Spark session (optional, frees resources)
# spark.stop()