In [1]:
import os
import datetime
import pytz
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Drop any inherited SPARK_HOME so that the pyspark package falls back to the
# Spark runtime it ships with instead of a separately installed Spark.
# Restart the Jupyter kernel for this change to take effect.
if 'SPARK_HOME' in os.environ:
    del os.environ['SPARK_HOME']

## DEFINE VARIABLES
# Source database: JDBC URL handed to spark.read.jdbc.
POSTGRES_JDBC_URL: str = "jdbc:postgresql://postgres:5432/poc"

# Iceberg catalog: Nessie REST endpoint and the warehouse root for the catalog.
CATALOG_URI: str = "http://nessie:19120/api/v2"
WAREHOUSE: str = "s3://lakehouse/"

# Object storage: endpoint used for both the Iceberg S3 settings and fs.s3a.
STORAGE_URI: str = "http://minio:9000"

## CONFIGURE SPARK SESSION
# Object-store credentials are taken from the environment when set, so real
# secrets never have to be saved in the notebook. The fallbacks are the values
# this notebook previously hard-coded, which keeps the default behavior unchanged.
S3_ACCESS_KEY = os.environ.get('LAKEHOUSE_S3_ACCESS_KEY', 'admin')
S3_SECRET_KEY = os.environ.get('LAKEHOUSE_S3_SECRET_KEY', 'PASSWORD')

conf = (
    pyspark.SparkConf()
    .setAppName('Iceberg Ingestion')
    # Packages resolved at session start: Postgres JDBC driver, Iceberg Spark
    # runtime, Nessie Spark extensions, and the AWS SDK v2 bundle + HTTP client.
    .set('spark.jars.packages',
         'org.postgresql:postgresql:42.7.3,'
         'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,'
         'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,'
         'software.amazon.awssdk:bundle:2.24.8,'
         'software.amazon.awssdk:url-connection-client:2.24.8')
    .set('spark.sql.extensions',
         'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,'
         'org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
    # Catalog named "iceberg": an Iceberg SparkCatalog backed by Nessie on branch "main".
    .set('spark.sql.catalog.iceberg', 'org.apache.iceberg.spark.SparkCatalog')
    .set('spark.sql.catalog.iceberg.uri', CATALOG_URI)
    .set('spark.sql.catalog.iceberg.ref', 'main')
    .set('spark.sql.catalog.iceberg.authentication.type', 'NONE')
    .set('spark.sql.catalog.iceberg.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
    .set('spark.sql.catalog.iceberg.warehouse', WAREHOUSE)
    # Table data is read/written through Iceberg's S3FileIO against STORAGE_URI.
    .set('spark.sql.catalog.iceberg.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
    .set('spark.sql.catalog.iceberg.client.region', 'us-east-1')
    .set('spark.sql.catalog.iceberg.s3.endpoint', STORAGE_URI)
    .set('spark.sql.catalog.iceberg.s3.access-key-id', S3_ACCESS_KEY)
    .set('spark.sql.catalog.iceberg.s3.secret-access-key', S3_SECRET_KEY)
    .set('spark.sql.catalog.iceberg.s3.path-style-access', 'true')
    # Hadoop S3A settings mirror the same endpoint and credentials.
    # NOTE(review): no hadoop-aws artifact is listed in spark.jars.packages above;
    # confirm S3A is actually available/needed in this environment.
    .set("spark.hadoop.fs.s3a.endpoint", STORAGE_URI)
    .set("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .set("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

## START SPARK SESSION
# Build the session from the configuration object prepared in `conf`.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

# Print a start-up banner stamped with the current US/Eastern wall-clock time.
eastern_now = datetime.datetime.now(pytz.timezone('US/Eastern'))
started_at = eastern_now.strftime("%Y-%m-%d %H:%M:%S")
print(f"Spark Running @ {started_at}")


Spark Running @ 2026-01-11 17:25:03


In [2]:
# Define the JDBC connection properties for the Postgres source
# User and password can be overridden through the environment so real secrets
# are not saved in the notebook; the fallbacks are the original hard-coded values.
properties = {
    "user": os.environ.get("LAKEHOUSE_PG_USER", "poc"),
    "password": os.environ.get("LAKEHOUSE_PG_PASSWORD", "PASSWORD"),
    "driver": "org.postgresql.Driver",
}

# Read the transactions table from Postgres into a Spark DataFrame
transactions_df = spark.read.jdbc(url=POSTGRES_JDBC_URL, table="transactions", properties=properties)

# Show the first rows of the dataset (show() prints at most 20 rows by default)
transactions_df.show()

+-------+----------------+----------------+-----------+----------------+------+
|     id|transaction_date|transaction_type|posted_date|     description|amount|
+-------+----------------+----------------+-----------+----------------+------+
|txn_001|      2024-01-01|          credit| 2024-01-01| Initial Deposit|1000.0|
|txn_002|      2024-01-02|           debit| 2024-01-03|     Coffee Shop|   4.5|
|txn_003|      2024-01-05|           debit| 2024-01-06|Online Bookstore| 25.99|
+-------+----------------+----------------+-----------+----------------+------+



In [3]:
# Manipulate the data
# Multiply each amount by 2. The result is bound to a new name instead of
# overwriting transactions_df, so re-running this cell always doubles the
# values read from Postgres once rather than compounding (x4, x8, ...).
transactions_doubled_df = transactions_df.withColumn("amount", col("amount") * 2)

# Create a namespace
spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg.poc;")

# Write the DataFrame to an Iceberg table in the Nessie catalog
# (createOrReplace makes the write itself safe to repeat).
transactions_doubled_df.writeTo("iceberg.poc.transactions").createOrReplace()

# Verify that the data was written to Iceberg by reading the table
spark.read.table("iceberg.poc.transactions").show()

+-------+----------------+----------------+-----------+----------------+------+
|     id|transaction_date|transaction_type|posted_date|     description|amount|
+-------+----------------+----------------+-----------+----------------+------+
|txn_001|      2024-01-01|          credit| 2024-01-01| Initial Deposit|2000.0|
|txn_002|      2024-01-02|           debit| 2024-01-03|     Coffee Shop|   9.0|
|txn_003|      2024-01-05|           debit| 2024-01-06|Online Bookstore| 51.98|
+-------+----------------+----------------+-----------+----------------+------+

