In [1]:
import pyspark
from pyspark.sql import SparkSession
import os
from dotenv import load_dotenv
from pathlib import Path

# Load variables from a local .env file (if one exists) into the process
# environment before any of them are read below.
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``, failing fast if unset.

    ``os.environ.get`` yields ``None`` for a missing variable; without this
    check that value would be handed straight to the Spark configuration and
    only surface much later as a hard-to-diagnose catalog or S3 error.

    Raises:
        RuntimeError: if the variable is missing or empty.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Required environment variable {name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


# Connection settings for the Nessie catalog and the MinIO object store.
NESSIE_URI = _require_env("NESSIE_URI_TEST")
MINIO_ACCESS_KEY = _require_env("AWS_ACCESS_KEY_ID_TEST")
MINIO_SECRET_KEY = _require_env("AWS_SECRET_ACCESS_KEY_TEST")
MINIO_ENDPOINT = _require_env("AWS_S3_ENDPOINT_TEST")

# Cluster addressing. Both values are machine-specific, so they can be
# overridden from the environment; the defaults are the values this notebook
# was originally written with. Alternatives used during development were
# master "local[*]" and driver host "spark-master".
SPARK_MASTER_URL = os.environ.get(
    "SPARK_MASTER_URL_TEST", "spark://host.docker.internal:7077"
)
SPARK_DRIVER_HOST = os.environ.get("SPARK_DRIVER_HOST_TEST", "192.168.1.11")

# Maven coordinates resolved at session start-up.
SPARK_PACKAGES = [
    "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.103.2",
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1",
    "org.apache.iceberg:iceberg-aws-bundle:1.8.1",
    # MinIO access through the Hadoop S3A connector
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
]

# SQL extensions enabled for the session (Nessie first, then Iceberg).
SPARK_SQL_EXTENSIONS = [
    "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
]

spark = (
    SparkSession.builder
        .appName("Iceberg-Nessie-rest-catalog")
        .master(SPARK_MASTER_URL)
        # Address under which the cluster reaches this driver process.
        .config("spark.driver.host", SPARK_DRIVER_HOST)
        # Resource sizing
        .config("spark.driver.memory", "2g")
        .config("spark.driver.cores", "1")
        .config("spark.executor.instances", "1")
        .config("spark.executor.cores", "2")
        .config("spark.executor.memory", "2g")
        .config("spark.jars.packages", ",".join(SPARK_PACKAGES))
        .config("spark.sql.extensions", ",".join(SPARK_SQL_EXTENSIONS))
        # Iceberg catalog named `nessie`, reached through the REST catalog protocol
        .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.nessie.type", "rest")
        .config("spark.sql.catalog.nessie.uri", NESSIE_URI)
        # MinIO (S3A) configuration: plain HTTP and path-style bucket addressing
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
        .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
        .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .getOrCreate()
)

## Test Iceberg connection

In [2]:
# Smoke test: list the namespaces exposed by the `nessie` catalog.
namespaces = spark.sql("SHOW DATABASES IN nessie")
namespaces.show()

+---------+
|namespace|
+---------+
|     demo|
+---------+



In [9]:
# Create the `demo` namespace in the `nessie` catalog unless it already exists.
create_demo_sql = "CREATE DATABASE IF NOT EXISTS nessie.demo"
spark.sql(create_demo_sql).show()

++
||
++
++



In [4]:
# The sample table lives in the `demo_haha` namespace, which no other cell in
# this notebook creates. Create it here (idempotently) so the cell also works
# on a fresh kernel against a fresh catalog.
spark.sql("CREATE DATABASE IF NOT EXISTS nessie.demo_haha")

# Create the two-column Iceberg sample table if it is not there yet.
spark.sql(
    """
    CREATE TABLE IF NOT EXISTS nessie.demo_haha.sample_table (
        id BIGINT,
        name STRING
    ) USING iceberg
    """
).show()

++
||
++
++



In [5]:
# Append two sample rows to the table.
# NOTE: INSERT INTO appends, so every re-run of this cell adds the same two
# rows again (the recorded SELECT output shows each row twice).
insert_rows_sql = """
    INSERT INTO nessie.demo_haha.sample_table VALUES
    (3, 'Nguuu'),
    (4, 'Lonnn')
    """
spark.sql(insert_rows_sql).show()

++
||
++
++



In [6]:
# Read the sample table back to confirm the rows were written through the catalog.
sample_rows = spark.sql("SELECT * FROM  nessie.demo_haha.sample_table;")
sample_rows.show()

+---+-----+
| id| name|
+---+-----+
|  3|Nguuu|
|  4|Lonnn|
|  3|Nguuu|
|  4|Lonnn|
+---+-----+



## Test MinIO connection

In [3]:
# Load the raw CSV from the `lakehouse` bucket over S3A: the first line is the
# header and column types are inferred from the data.
df = (
    spark.read
        .option("header", True)
        .option("inferSchema", True)
        .csv("s3a://lakehouse/raw/vnstock_1d.csv")
)

In [4]:
# Preview the first 20 rows, with long cell values truncated (the defaults of show()).
df.show(n=20, truncate=True)

+----------+-----+-----+-----+-----+------+------+
|      time| open| high|  low|close|volume|ticker|
+----------+-----+-----+-----+-----+------+------+
|2025-01-02|39.75|39.75|39.75|39.75|     0|   YTC|
|2025-01-03|33.89|33.89|33.89|33.89|   400|   YTC|
|2025-01-06|33.89|34.09|33.89|33.99|   502|   YTC|
|2025-01-07|33.99|33.99|33.99|33.99|     0|   YTC|
|2025-01-08|33.99|33.99|33.99|33.99|     0|   YTC|
|2025-01-09|33.99|33.99|33.99|33.99|     0|   YTC|
|2025-01-10|33.99|33.99|33.99|33.99|     4|   YTC|
|2025-01-13|37.96|37.96|37.96|37.96|   114|   YTC|
|2025-01-14| 32.4| 32.4| 32.4| 32.4|   306|   YTC|
|2025-01-15| 32.4| 32.4| 32.4| 32.4|    39|   YTC|
|2025-01-16| 32.4| 32.4| 32.4| 32.4|   101|   YTC|
|2025-01-17| 32.4| 32.4| 32.4| 32.4|     1|   YTC|
|2025-01-20| 32.4| 32.4| 32.4| 32.4|     0|   YTC|
|2025-01-21| 32.4| 32.4| 32.4| 32.4|     0|   YTC|
|2025-01-22| 32.4| 32.4| 32.4| 32.4|     0|   YTC|
|2025-01-23|34.78|34.78|34.78|34.78|   111|   YTC|
|2025-01-24|34.78|34.78|34.78|3

In [5]:
# Persist the CSV data as a table in the `nessie` catalog, creating the table
# or replacing it if it already exists.
target_table = "nessie.demo.vnstock_1d_test"
df.writeTo(target_table).createOrReplace()