In [11]:
import os

from pyspark.sql import SparkSession

# Credentials are taken from the environment when available so that real
# secrets never have to be committed with the notebook. The fallbacks are the
# values this notebook originally hard-coded, so behaviour is unchanged when
# none of these variables are set.
CATALOG_JDBC_USER = os.environ.get("ICEBERG_JDBC_USER", "iceberg")
CATALOG_JDBC_PASSWORD = os.environ.get("ICEBERG_JDBC_PASSWORD", "iceberg")
S3A_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
S3A_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .master("spark://iceberg-test-spark-master:7077")
    .appName("iceberg-test")

    # Required for Docker standalone (client mode): pin the driver's
    # hostname, bind address and ports to fixed values.
    .config("spark.driver.host", "iceberg-test-jupyter")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.driver.port", "7078")
    .config("spark.blockManager.port", "7079")

    # Iceberg SQL extensions
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
    )

    # Iceberg JDBC catalog, registered under the name `iceberg_jdbc`
    .config("spark.sql.catalog.iceberg_jdbc",
            "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg_jdbc.catalog-impl",
            "org.apache.iceberg.jdbc.JdbcCatalog")
    .config("spark.sql.catalog.iceberg_jdbc.uri",
            "jdbc:postgresql://iceberg-test-postgres:5432/iceberg-jdbc-catalog")
    .config("spark.sql.catalog.iceberg_jdbc.jdbc.user", CATALOG_JDBC_USER)
    .config("spark.sql.catalog.iceberg_jdbc.jdbc.password", CATALOG_JDBC_PASSWORD)
    .config("spark.sql.catalog.iceberg_jdbc.jdbc.driver",
            "org.postgresql.Driver")
    .config("spark.sql.catalog.iceberg_jdbc.warehouse",
            "s3a://iceberg/warehouse")

    # MinIO (S3A config); path-style access is enabled for this endpoint.
    .config("spark.hadoop.fs.s3a.endpoint",
            "http://iceberg-test-minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", S3A_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3A_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl",
            "org.apache.hadoop.fs.s3a.S3AFileSystem")

    # Resource limits for this application
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "1g")
    .config("spark.cores.max", "1")

    .getOrCreate()
)

# Confirm which master the session is attached to.
print("Master:", spark.sparkContext.master)


Master: spark://iceberg-test-spark-master:7077


In [2]:
# List the namespaces (schemas) known to the iceberg_jdbc catalog.
namespaces_df = spark.sql("SHOW SCHEMAS FROM iceberg_jdbc")
namespaces_df.show()

+-----------------+
|        namespace|
+-----------------+
|        ecommerce|
|test_schema_trino|
+-----------------+



In [12]:
# Sanity check: pull a handful of rows from the order_items table and print them.
order_items_preview_sql = "select * from iceberg_jdbc.ecommerce.order_items limit 5"
df_check = spark.sql(order_items_preview_sql)
df_check.show()

+------------+------------+------------+------+----------------+
|    order_id|  product_id|   seller_id| price|shipping_charges|
+------------+------------+------------+------+----------------+
|Axfy13Hk4PIk|90K0C1fIyQUf|ZWM05J9LcBSF|223.51|           84.65|
|v6px92oS8cLG|qejhpMGGVcsl|IjlpYfhUbRQs| 170.8|           23.79|
|Ulpf9skrhjfm|qUS5d2pEAyxJ|77p2EYxcM9MD|  64.4|           17.38|
|bwJVWupf2keN|639iGvMyv0De|jWzS0ayv9TGf| 264.5|           30.72|
|Dd0QnrMk9Cj5|1lycYGcsic2F|l1pYW6GBnPMr| 779.9|           30.66|
+------------+------------+------------+------+----------------+



In [13]:
# Load the customers CSV (first row is the header, column types are inferred).
customers_csv_path = "s3a://dataset/ecommerce_order_dataset/train/df_Customers.csv"
df_customers = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(customers_csv_path)
)

# Expose the DataFrame to SQL under a temporary view name.
df_customers.createOrReplaceTempView("customers_raw")

# Create (or replace) the Iceberg table from the temp view's contents.
spark.sql("""CREATE OR REPLACE TABLE iceberg_jdbc.ecommerce.customers
USING iceberg
AS SELECT * FROM customers_raw;
""")

DataFrame[]

In [10]:
# Shut down the Spark session; `spark` cannot run further queries until a
# new session is built.
spark.stop()