# Loading Live Data into Apache Iceberg

In [27]:
from pyspark.sql import SparkSession
import os

In [28]:
# Configure the application name first, then attach to the standalone
# cluster master and create (or reuse) the session.
session_builder = SparkSession.builder.appName("Live Data Loading in Iceberg")
spark = session_builder.master("spark://spark:7077").getOrCreate()

In [3]:
# Sanity-check which master the session is attached to and where its UI lives.
spark_ctx = spark.sparkContext
print(spark_ctx.master)  # expected: spark://spark:7077
print(spark_ctx.uiWebUrl)  # URL of this application's web UI

spark://spark:7077
http://c820f10354d4:4040


In [29]:
# List the namespaces currently present under `ice`, without truncating names.
namespaces_before = spark.sql("SHOW NAMESPACES IN ice")
namespaces_before.show(truncate=False)

+---------+
|namespace|
+---------+
|demo     |
+---------+



In [30]:
# Ensure the `ice.demo` namespace exists; IF NOT EXISTS makes this a no-op on re-run.
create_namespace_sql = "CREATE NAMESPACE IF NOT EXISTS ice.demo"
spark.sql(create_namespace_sql)

DataFrame[]

In [31]:
# Confirm that the `demo` namespace is listed under `ice`.
namespaces_after = spark.sql("SHOW NAMESPACES IN ice")
namespaces_after.show(truncate=False)

+---------+
|namespace|
+---------+
|demo     |
+---------+



In [32]:
# DDL for the orders table: Iceberg format, partitioned by day of `timestamp`.
# IF NOT EXISTS keeps the cell safe to re-run.
create_orders_sql = """
    CREATE TABLE IF NOT EXISTS ice.demo.orders (
        id STRING,
        timestamp TIMESTAMP,
        customer_id INT
    )
    USING iceberg
    PARTITIONED BY (days(timestamp))
"""
spark.sql(create_orders_sql)

DataFrame[]

In [33]:
# DDL for the order line items; `order_id` holds the id of the parent order.
# No partitioning is declared for this table.
create_order_items_sql = """
    CREATE TABLE IF NOT EXISTS ice.demo.order_items (
       id STRING,
       order_id STRING,
       product_id INT,
       quantity INT
     )
     USING iceberg
    """
spark.sql(create_order_items_sql)

DataFrame[]

In [36]:
# Create the product catalog table (Iceberg format).
# IF NOT EXISTS keeps this cell re-runnable, consistent with the orders and
# order_items DDL cells; without it the statement fails once the table exists.
spark.sql("""
CREATE TABLE IF NOT EXISTS ice.demo.products (
    product_id INT,
    name STRING,
    description STRING,
    price DOUBLE
)
USING iceberg
""")

DataFrame[]

At this point, make sure Kafka is running so that live data is flowing.

In this notebook, however, we insert a batch of fake data to simulate the live load instead.

In [37]:
# Seed the product catalog with 25 rows (product_id 1 through 25).
# NOTE: this is a plain INSERT, so every execution appends these rows again;
# run the cell once per fresh table.
seed_products_sql = """
    INSERT INTO ice.demo.products VALUES
        (1, 'Wireless Mouse', 'Ergonomic 2.4GHz wireless mouse with USB receiver', 24.99),
        (2, 'Mechanical Keyboard', 'RGB backlit mechanical keyboard with blue switches', 79.99),
        (3, 'USB-C Hub', '6-in-1 USB-C hub with HDMI, SD card, and Ethernet ports', 39.99),
        (4, 'Noise-Cancelling Headphones', 'Over-ear Bluetooth headphones with active noise cancellation', 129.99),
        (5, 'Webcam HD', '1080p USB webcam with built-in microphone', 49.99),
        (6, 'Gaming Monitor', '27-inch 144Hz 2K gaming monitor with HDR support', 299.99),
        (7, 'Laptop Stand', 'Adjustable aluminum laptop stand with cooling vents', 34.99),
        (8, 'Portable SSD', '1TB USB 3.2 portable solid-state drive', 119.99),
        (9, 'Smartphone Tripod', 'Flexible tripod with Bluetooth remote control', 19.99),
        (10, 'Wireless Charger', 'Fast-charging Qi wireless pad for smartphones', 29.99),
        (11, 'Coffee Maker', '12-cup programmable coffee maker with timer', 59.99),
        (12, 'Electric Kettle', '1.7L stainless steel electric kettle with auto shut-off', 44.99),
        (13, 'Standing Desk', 'Adjustable height standing desk with memory presets', 399.99),
        (14, 'Office Chair', 'Ergonomic office chair with lumbar support and mesh back', 249.99),
        (15, 'LED Desk Lamp', 'Dimmable LED desk lamp with USB charging port', 39.99),
        (16, 'Cookware Set', '10-piece nonstick cookware set with glass lids', 129.99),
        (17, 'Chef Knife', '8-inch stainless steel professional chef knife', 59.99),
        (18, 'Running Shoes', 'Lightweight breathable running shoes, size 10', 89.99),
        (19, 'Yoga Mat', 'Extra thick non-slip yoga mat, 72 x 24 inches', 29.99),
        (20, 'Bluetooth Speaker', 'Portable waterproof Bluetooth speaker with deep bass', 79.99),
        (21, 'Power Bank', '20,000mAh portable power bank with fast charging', 49.99),
        (22, 'Fitness Tracker', 'Smart fitness band with heart rate monitor', 59.99),
        (23, 'Smartwatch', 'Bluetooth smartwatch with notifications and health tracking', 149.99),
        (24, 'Book: Clean Code', 'A Handbook of Agile Software Craftsmanship by Robert C. Martin', 34.99),
        (25, 'Book: Designing Data-Intensive Applications', 'Martin Kleppmann’s guide to scalable and reliable systems', 44.99)
"""
spark.sql(seed_products_sql)

DataFrame[]

The `orders` and `order_items` tables were created above so that they can be populated with the data below.

At this point, make sure Kafka is running so that live data is flowing.

In this notebook, however, we insert a batch of fake data to simulate the live load instead.

Here we will determine the number of customers that we have

In [38]:
# Count the rows in the customers table.
# NOTE(review): no cell shown here creates ice.demo.customers; presumably it
# is loaded by an earlier notebook or job -- confirm before running from scratch.
customer_count_df = spark.sql("SELECT count(*) from ice.demo.customers;")
customer_count_df.show()

+--------+
|count(1)|
+--------+
|      22|
+--------+



Now we will load multiple orders and order items (which should come from something like Kafka or a live data load)

In [39]:
# Simulated "live" orders: 20 rows of (id, timestamp, customer_id).
# NOTE: a plain INSERT appends, so re-running this cell duplicates the rows.
seed_orders_sql = """
INSERT INTO ice.demo.orders VALUES
('3f06bb49-b6cd-4a3b-8f74-2e1f8ebf27a5', timestamp'2025-10-01 09:13:45', 3),
('52d2ce38-3fc1-4a41-bcf1-c82829c68729', timestamp'2025-10-02 14:25:30', 7),
('b12e2c4a-4ff3-46a3-9f92-b6e04b65121a', timestamp'2025-10-02 16:42:19', 15),
('36d8d5cf-b6c3-4c1a-a9b7-14dc9f8249b5', timestamp'2025-10-03 10:58:05', 12),
('61b24c9a-4f2e-4b3a-bf53-236e305a6a80', timestamp'2025-10-04 18:03:17', 8),
('77d308ad-3431-4b6a-bae1-15bfa2760e74', timestamp'2025-10-05 12:47:11', 19),
('2df2f3f4-c20c-47da-bd4f-7135cb7b8b9b', timestamp'2025-10-06 08:24:09', 2),
('c9c4b25b-d2a3-48a5-b2cc-78a435b3a7a0', timestamp'2025-10-07 20:12:51', 21),
('32ed0e61-0287-4a68-8103-01fc23592c62', timestamp'2025-10-08 09:35:33', 5),
('7b4d8bfc-cd4b-4424-9ac8-dc34f47de8a9', timestamp'2025-10-08 17:40:42', 16),
('a6630e05-21b2-4b92-940f-9f03dcb4a8c1', timestamp'2025-10-09 13:10:27', 9),
('adbc34fb-f7d7-42ef-8e09-2c8658adca6f', timestamp'2025-10-10 11:55:08', 1),
('9dc64a8e-cb1a-4b3b-a303-31bcd9e127f9', timestamp'2025-10-10 16:21:55', 4),
('f71e8cf8-2a64-469d-80de-d4b295b7a249', timestamp'2025-10-11 19:44:03', 17),
('82e2e705-7ee2-4da9-a3f8-b2042b38edac', timestamp'2025-10-12 08:31:59', 22),
('4a154b76-0196-45c3-a5aa-0bdb9cbeb418', timestamp'2025-10-13 15:07:23', 14),
('40a1e68b-2682-4b7e-88d9-71c6a004bf2b', timestamp'2025-10-14 11:18:47', 10),
('ea8a2722-25b3-4b74-a02d-76c6ac960b8b', timestamp'2025-10-15 13:59:41', 6),
('9df8ce5c-10aa-4f45-9f9c-f4a64dc17616', timestamp'2025-10-16 09:11:34', 11),
('7e5ed4b7-0534-4639-aeba-2f61b9df253a', timestamp'2025-10-17 20:47:18', 13)
"""
spark.sql(seed_orders_sql)

DataFrame[]

In [40]:
# Simulated line items: rows of (id, order_id, product_id, quantity), two per
# order inserted above.
# NOTE: a plain INSERT appends, so re-running this cell duplicates the rows.
seed_order_items_sql = """
INSERT INTO ice.demo.order_items VALUES
('b3f8c9b6-9f2f-4c3e-9b38-8d3e2f6c1a01', '3f06bb49-b6cd-4a3b-8f74-2e1f8ebf27a5', 3, 2),
('1d2b7a8e-7b6a-4f7a-9a3c-5d6c2b1a9f02', '3f06bb49-b6cd-4a3b-8f74-2e1f8ebf27a5', 5, 1),
('c0a1b2c3-d4e5-46f7-98a1-b2c3d4e5f603', '52d2ce38-3fc1-4a41-bcf1-c82829c68729', 1, 3),
('4e5f6a7b-8c9d-4a1b-b2c3-d4e5f6a7b804', '52d2ce38-3fc1-4a41-bcf1-c82829c68729', 4, 1),
('5a6b7c8d-9e0f-4a1b-92c3-d4e5f6a7b905', 'b12e2c4a-4ff3-46a3-9f92-b6e04b65121a', 2, 2),
('6b7c8d9e-0f1a-41b2-93c4-d5e6f7a8ba06', 'b12e2c4a-4ff3-46a3-9f92-b6e04b65121a', 8, 1),
('7c8d9e0f-1a2b-42c3-94d5-e6f7a8b9cb07', '36d8d5cf-b6c3-4c1a-a9b7-14dc9f8249b5', 14, 1),
('8d9e0f1a-2b3c-43d4-95e6-f7a8b9c0dc08', '36d8d5cf-b6c3-4c1a-a9b7-14dc9f8249b5', 21, 2),
('9e0f1a2b-3c4d-44e5-96f7-a8b9c0d1ed09', '61b24c9a-4f2e-4b3a-bf53-236e305a6a80', 17, 1),
('0f1a2b3c-4d5e-45f6-97a8-b9c0d1e2fe10', '61b24c9a-4f2e-4b3a-bf53-236e305a6a80', 18, 2),
('1a2b3c4d-5e6f-46a7-98b9-c0d1e2f3a011', '77d308ad-3431-4b6a-bae1-15bfa2760e74', 23, 1),
('2b3c4d5e-6f70-47b8-09c1-d1e2f3a4b112', '77d308ad-3431-4b6a-bae1-15bfa2760e74', 24, 1),
('3c4d5e6f-7081-48c9-10d2-e2f3a4b5c213', '2df2f3f4-c20c-47da-bd4f-7135cb7b8b9b', 7, 2),
('4d5e6f70-8192-49da-21e3-f3a4b5c6d314', '2df2f3f4-c20c-47da-bd4f-7135cb7b8b9b', 10, 1),
('5e6f7081-92a3-4aeb-32f4-0a4b5c6d7e15', 'c9c4b25b-d2a3-48a5-b2cc-78a435b3a7a0', 12, 1),
('6f708192-a3b4-4bfc-43a5-1b5c6d7e8f16', 'c9c4b25b-d2a3-48a5-b2cc-78a435b3a7a0', 13, 1),
('708192a3-b4c5-4c0d-54b6-2c6d7e8f9017', '32ed0e61-0287-4a68-8103-01fc23592c62', 9, 1),
('8192a3b4-c5d6-4d1e-65c7-3d7e8f901128', '32ed0e61-0287-4a68-8103-01fc23592c62', 20, 2),
('92a3b4c5-d6e7-4e2f-76d8-4e8f90112229', '7b4d8bfc-cd4b-4424-9ac8-dc34f47de8a9', 15, 1),
('a3b4c5d6-e7f8-4f30-87e9-5f9011222330', '7b4d8bfc-cd4b-4424-9ac8-dc34f47de8a9', 25, 1),
('b4c5d6e7-f809-4031-98fa-601122334431', 'a6630e05-21b2-4b92-940f-9f03dcb4a8c1', 11, 1),
('c5d6e7f8-091a-4132-a90b-711223344532', 'a6630e05-21b2-4b92-940f-9f03dcb4a8c1', 2, 2),
('d6e7f809-1a2b-4233-ba1c-821234455633', 'adbc34fb-f7d7-42ef-8e09-2c8658adca6f', 6, 1),
('e7f8091a-2b3c-4334-cb2d-931345566734', 'adbc34fb-f7d7-42ef-8e09-2c8658adca6f', 3, 3),
('f8091a2b-3c4d-4435-dc3e-a41455667835', '9dc64a8e-cb1a-4b3b-a303-31bcd9e127f9', 8, 1),
('091a2b3c-4d5e-4536-ed4f-b51556678936', '9dc64a8e-cb1a-4b3b-a303-31bcd9e127f9', 19, 1),
('1a2b3c4d-5e6f-4637-fe50-c61666789a37', 'f71e8cf8-2a64-469d-80de-d4b295b7a249', 4, 1),
('2b3c4d5e-6f70-4738-0f61-d7176789ab38', 'f71e8cf8-2a64-469d-80de-d4b295b7a249', 5, 2),
('3c4d5e6f-7081-4839-1072-e818789abc39', '82e2e705-7ee2-4da9-a3f8-b2042b38edac', 1, 1),
('4d5e6f70-8192-493a-2183-f91989abcc40', '82e2e705-7ee2-4da9-a3f8-b2042b38edac', 22, 2),
('5e6f7081-92a3-4a3b-3294-0a1a9abccd41', '4a154b76-0196-45c3-a5aa-0bdb9cbeb418', 13, 1),
('6f708192-a3b4-4b3c-43a5-1b2babcdd042', '4a154b76-0196-45c3-a5aa-0bdb9cbeb418', 7, 1),
('708192a3-b4c5-4c3d-54b6-2c3ccddeef43', '40a1e68b-2682-4b7e-88d9-71c6a004bf2b', 9, 1),
('8192a3b4-c5d6-4d3e-65c7-3d4dddeef044', '40a1e68b-2682-4b7e-88d9-71c6a004bf2b', 17, 1),
('92a3b4c5-d6e7-4e3f-76d8-4e5eee0ff145', 'ea8a2722-25b3-4b74-a02d-76c6ac960b8b', 18, 2),
('a3b4c5d6-e7f8-4f40-87e9-5f6ff01a0246', 'ea8a2722-25b3-4b74-a02d-76c6ac960b8b', 10, 1),
('b4c5d6e7-f809-4041-98fa-607001ab1347', '9df8ce5c-10aa-4f45-9f9c-f4a64dc17616', 24, 1),
('c5d6e7f8-091a-4142-a90b-718112bc2448', '9df8ce5c-10aa-4f45-9f9c-f4a64dc17616', 25, 2),
('d6e7f809-1a2b-4243-ba1c-829223cd3549', '7e5ed4b7-0534-4639-aeba-2f61b9df253a', 20, 1),
('e7f8091a-2b3c-4344-cb2d-93a334de4650', '7e5ed4b7-0534-4639-aeba-2f61b9df253a', 15, 3)
"""
spark.sql(seed_order_items_sql)

DataFrame[]

In [41]:
# Preview the rows that were just loaded into the orders table.
orders_preview = spark.sql("""Select * from ice.demo.orders;""")
orders_preview.show()

+--------------------+-------------------+-----------+
|                  id|          timestamp|customer_id|
+--------------------+-------------------+-----------+
|36d8d5cf-b6c3-4c1...|2025-10-03 10:58:05|         12|
|61b24c9a-4f2e-4b3...|2025-10-04 18:03:17|          8|
|3f06bb49-b6cd-4a3...|2025-10-01 09:13:45|          3|
|52d2ce38-3fc1-4a4...|2025-10-02 14:25:30|          7|
|b12e2c4a-4ff3-46a...|2025-10-02 16:42:19|         15|
|c9c4b25b-d2a3-48a...|2025-10-07 20:12:51|         21|
|32ed0e61-0287-4a6...|2025-10-08 09:35:33|          5|
|7b4d8bfc-cd4b-442...|2025-10-08 17:40:42|         16|
|77d308ad-3431-4b6...|2025-10-05 12:47:11|         19|
|2df2f3f4-c20c-47d...|2025-10-06 08:24:09|          2|
|f71e8cf8-2a64-469...|2025-10-11 19:44:03|         17|
|82e2e705-7ee2-4da...|2025-10-12 08:31:59|         22|
|a6630e05-21b2-4b9...|2025-10-09 13:10:27|          9|
|adbc34fb-f7d7-42e...|2025-10-10 11:55:08|          1|
|9dc64a8e-cb1a-4b3...|2025-10-10 16:21:55|          4|
|ea8a2722-

Verify that every order item references an existing order

In [42]:
# Referential check: count order_items rows whose order_id has no matching
# row in orders (the LEFT JOIN leaves o.id NULL for those). Expected: 0.
missing_order_refs_sql = """
SELECT COUNT(*) AS missing_order_refs
FROM ice.demo.order_items oi
LEFT JOIN ice.demo.orders o
ON oi.order_id = o.id
WHERE o.id IS NULL
"""
missing_order_refs_df = spark.sql(missing_order_refs_sql)
missing_order_refs_df.show()

+------------------+
|missing_order_refs|
+------------------+
|                 0|
+------------------+



Verify that all products are valid

In [43]:
# Referential check: count order_items rows whose product_id has no matching
# row in the products table. Joining against ice.demo.products replaces the
# previous hard-coded 1..25 range, so the check stays correct when the
# product catalog changes. Rows with a NULL product_id are also counted as
# invalid, since they match no product. Expected: 0.
spark.sql("""
SELECT COUNT(*) AS invalid_product_refs
FROM ice.demo.order_items oi
LEFT JOIN ice.demo.products p
ON oi.product_id = p.product_id
WHERE p.product_id IS NULL
""").show()

+--------------------+
|invalid_product_refs|
+--------------------+
|                   0|
+--------------------+



Verify that all customers are valid

In [44]:
# Range check: count orders whose customer_id falls outside 1..22, where 22 is
# the customer count reported by the earlier count query. Expected: 0.
# NOTE(review): this assumes customer ids are the contiguous range 1..22;
# a join against ice.demo.customers would be more robust -- confirm its key column.
invalid_customer_refs_sql = """
SELECT COUNT(*) AS invalid_customer_refs
FROM ice.demo.orders
WHERE customer_id < 1 OR customer_id > 22
"""
invalid_customer_refs_df = spark.sql(invalid_customer_refs_sql)
invalid_customer_refs_df.show()

+---------------------+
|invalid_customer_refs|
+---------------------+
|                    0|
+---------------------+



Verify that every order has an order item

In [45]:
# Completeness check: count orders that have no order_items row at all
# (the LEFT JOIN leaves oi.id NULL for those). Expected: 0.
orders_without_items_sql = """
SELECT COUNT(*) AS orders_without_items
FROM ice.demo.orders o
LEFT JOIN ice.demo.order_items oi
ON o.id = oi.order_id
WHERE oi.id IS NULL
"""
orders_without_items_df = spark.sql(orders_without_items_sql)
orders_without_items_df.show()

+--------------------+
|orders_without_items|
+--------------------+
|                   0|
+--------------------+



In [47]:
# Stop the Spark session now that all loading and validation cells have run.
spark.stop()