![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)

### [Docker, Spark, and Iceberg: The Fastest Way to Try Iceberg!](https://tabular.io/blog/docker-spark-and-iceberg/)

In [1]:
import os

for key in os.environ:
    print(key,"\n==", os.environ[key])

PYTHON_SHA256 
== bfb249609990220491a1b92850a07135ed0831e41738cf681d63cf01b2a8fbd1
AWS_S3_ENDPOINT 
== http://minio:9000
HOSTNAME 
== 47d7acc97b7c
SPARK_BEELINE_OPTS 
==  -Djline.terminal=jline.UnsupportedTerminal -Djline.terminal=jline.UnsupportedTerminal
PYTHON_VERSION 
== 3.10.16
PYTHONHASHSEED 
== 0
ICEBERG_VERSION 
== 1.8.1
PYSPARK_DRIVER_PYTHON 
== jupyter-notebook
PYSPARK_DRIVER_PYTHON_OPTS 
== --notebook-dir=/home/iceberg/notebooks --ip='*' --NotebookApp.token='' --NotebookApp.password='' --port=8888 --no-browser --allow-root
AWS_REGION 
== us-east-1
PWD 
== /opt/spark
HOME 
== /root
LANG 
== C.UTF-8
PYTHONSTARTUP 
== /opt/spark/python/pyspark/shell.py
GPG_KEY 
== A035C8C19219BA821ECEA86B64E628F8D684696D
AWS_SECRET_ACCESS_KEY 
== minioroot
NESSIE_URI 
== http://nessie:19120/api/v2
SPARK_MAJOR_VERSION 
== 3.5
PYSPARK_PYTHON 
== python3
CATALOG_TYPE 
== nessie
PYTHONPATH 
== /opt/spark/python/lib/py4j-0.10.9.7-src.zip:/opt/spark/python/:/opt/spark/python:/opt/spark/python/lib/py4

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

NESSIE_URI = os.environ['NESSIE_URI']
REF = "main"
FULL_PATH_TO_WAREHOUSE = os.environ['WAREHOUSE']
AWS_S3_ENDPOINT = os.environ["AWS_S3_ENDPOINT"]
AWS_ACCESS_KEY = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_KEY  = os.environ['AWS_SECRET_ACCESS_KEY']

spark = SparkSession.builder \
    .appName("NessieMinIOIntegration") \
    .config("spark.sql.catalog.AwsDataCatalog.s3.access-key-id", AWS_ACCESS_KEY) \
    .config("spark.sql.catalog.AwsDataCatalog.s3.secret-access-key", AWS_SECRET_KEY) \
    .config("spark.jars.packages", 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.103.3') \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions") \
    .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog") \
    .config("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") \
    .config("spark.sql.catalog.nessie.warehouse", FULL_PATH_TO_WAREHOUSE) \
    .config("spark.sql.catalog.nessie.s3.path-style-access", "true") \
    .config("spark.sql.catalog.nessie.s3.endpoint", AWS_S3_ENDPOINT) \
    .config("spark.sql.catalog.nessie.uri", NESSIE_URI) \
    .config("spark.sql.catalog.nessie.ref", REF) \
    .config("spark.sql.catalog.nessie.authentication.type", "NONE") \
    .config("spark.sql.warehouse.dir", FULL_PATH_TO_WAREHOUSE) \
    .config("spark.sql.catalog.nessie.s3.access-key-id", AWS_ACCESS_KEY) \
    .config("spark.sql.catalog.nessie.s3.secret-access-key", AWS_SECRET_KEY) \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.signing-algorithm", "S3SignerType") \
    .getOrCreate()
'''

    .config("spark.driver.extraJavaOptions", "-Daws.region=eu-central-1") \
    .config("spark.executor.extraJavaOptions", "-Daws.region=eu-central-1") \
'''

25/06/09 10:56:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


'\n\n    .config("spark.driver.extraJavaOptions", "-Daws.region=eu-central-1")     .config("spark.executor.extraJavaOptions", "-Daws.region=eu-central-1") '

In [3]:
df = spark.sql("show tables in nessie.nyc")
df.head()

In [4]:
%%sql

CREATE NAMESPACE IF NOT EXISTS nessie.nyc

25/06/09 10:57:01 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
%%sql

SHOW NAMESPACES IN nessie

namespace
nyc


In [6]:
%%sql

DROP TABLE IF EXISTS nessie.nyc.user_raw

In [10]:
user_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)    
])

user_data = [
    (1, 'John Doe'),
    (2, 'Jane Doe'),
    (3, 'Janett Doe')
]

df = spark.createDataFrame(user_data, user_schema)

In [11]:
df.writeTo("nessie.nyc.user_raw").createOrReplace()

In [12]:
%%sql

select *
from nessie.nyc.user_raw

id,name
1,John Doe
2,Jane Doe
3,Janett Doe
