In [1]:
import os
import subprocess
import sys
from typing import List
from pyspark.sql import SparkSession
import logging

# Configure logging once for this notebook and expose a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Base directory that holds the JAR files. Defaults to the original location;
# set DELTA_JARS_HOME to point at another directory without editing this cell.
jars_home = os.environ.get("DELTA_JARS_HOME", '/workspace/delta-jars')

# Required core JARs (absolute paths below jars_home)
jars_list = [
    # Delta Lake
    f"{jars_home}/delta-spark_2.12-3.3.0.jar",
    f"{jars_home}/delta-storage-3.3.0.jar",
    # AWS
    f"{jars_home}/hadoop-aws-3.3.2.jar",
    f"{jars_home}/aws-java-sdk-bundle-1.12.782.jar",
    # Kyuubi
    f"{jars_home}/kyuubi/externals/engines/spark/kyuubi-spark-sql-engine_2.12-1.10.0.jar",
    f"{jars_home}/kyuubi/externals/engines/spark/kyuubi-common_2.12-1.10.0.jar"
]

# Spark only warns about and skips local JARs it cannot find, so report missing
# files up front where the cause (a wrong jars_home) is easy to spot.
# The list itself is left untouched so the Spark configuration stays the same.
missing_jars = [jar_path for jar_path in jars_list if not os.path.exists(jar_path)]
for jar_path in missing_jars:
    logger.warning("JAR not found, Spark will skip it: %s", jar_path)

# Comma-separated string, the format expected by spark.jars
jars = ",".join(jars_list)

# Classpath settings use the platform path separator (':' on Linux), whereas
# spark.jars is comma-separated. Passing the comma-joined string straight to
# extraClassPath yields a single bogus entry, so convert it first.
jars_classpath = os.pathsep.join(jars.split(","))

# MinIO credentials: taken from the environment when set, otherwise fall back
# to the MinIO defaults this notebook used originally. This keeps real
# credentials out of the notebook source.
s3_access_key = os.environ.get("MINIO_ROOT_USER", "minioadmin")
s3_secret_key = os.environ.get("MINIO_ROOT_PASSWORD", "minioadmin")

# Create SparkSession using the builder pattern.
# The session itself is only created later, when getOrCreate() is called.
builder = (SparkSession.builder
           .appName("DeltaExample")
           .master("local[*]")
           # Debug configurations: a single S3A connection and a single attempt
           # (presumably so connectivity problems fail fast instead of retrying)
           .config("spark.hadoop.fs.s3a.connection.maximum", "1")
           .config("spark.hadoop.fs.s3a.attempts.maximum", "1")
           .config("spark.hadoop.fs.s3a.connection.timeout", "5000")
           .config("spark.hadoop.fs.s3a.impl.disable.cache", "true")
           .config("spark.hadoop.fs.s3a.debug.detailed.exceptions", "true")
           # Add jars directly: comma-separated for spark.jars,
           # path-separator-joined for the driver/executor classpaths
           .config("spark.jars", jars)
           .config("spark.driver.extraClassPath", jars_classpath)
           .config("spark.executor.extraClassPath", jars_classpath)
           # Delta Lake configurations
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
           # S3/MinIO configurations (plain-HTTP local endpoint, path-style URLs)
           .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
           .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
           .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")
           .config("spark.hadoop.fs.s3a.path.style.access", "true")
           .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
           .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
           .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
           # Additional Delta Lake configurations
           .config("spark.delta.logStore.class", "io.delta.storage.S3SingleDriverLogStore")
           .config("spark.hadoop.fs.s3a.fast.upload", "true")
           .config("spark.hadoop.fs.s3a.multipart.size", "104857600")  # 100 MiB
           .config("spark.sql.warehouse.dir", "s3a://wba/warehouse"))

# Stop any existing session so the settings on `builder` are applied to a fresh
# session; getOrCreate() would otherwise hand back the one already running.
if 'spark' in locals():
    spark.stop()

# A session may also be running without being bound to the name `spark`
# (e.g. created under another variable); stop that one as well.
# getActiveSession() returns None when nothing is running.
active_session = SparkSession.getActiveSession()
if active_session is not None:
    active_session.stop()


# Create the session (with Hive support enabled)
spark = builder.enableHiveSupport().getOrCreate()

# Optional debugging aids, disabled by default. Uncomment as needed.

# Initialize Delta Lake settings
# spark.sql("SET spark.databricks.delta.formatCheck.enabled=false")

# Access the SparkContext
# sc = spark.sparkContext

# Raise the driver log level to DEBUG
# sc.setLogLevel("DEBUG")

def _overwrite_delta(frame, target, extra_options=None):
    """Overwrite the Delta table at ``target`` with ``frame``.

    The write always uses overwrite mode with ``overwriteSchema`` enabled;
    ``extra_options`` is an optional mapping of further writer options that
    are applied afterwards, in iteration order.
    """
    writer = frame.write.format("delta").mode("overwrite").option("overwriteSchema", "true")
    for option_name, option_value in (extra_options or {}).items():
        writer = writer.option(option_name, option_value)
    writer.save(target)


# Tiny two-row DataFrame used as the payload for the write checks below
data = [(1, "John"), (2, "Jane")]
df = spark.createDataFrame(data, ["id", "name"])
df.show()

# Probe the S3 connection with a plain parquet write, then write the Delta
# table. Any failure in either step falls through to a local-filesystem write.
try:
    print("Testing S3 connection with parquet write...")
    (df.write
       .format("parquet")
       .mode("overwrite")
       .save("s3a://wba/test.parquet"))
    print("S3 connection successful")

    print("Attempting to write Delta table...")
    _overwrite_delta(
        df,
        "s3a://wba/example-table",
        {"delta.compatibility.symlinkFormatManifest.enabled": "false"},
    )
    print("Successfully wrote Delta table")

except Exception as e:
    # Best-effort fallback: report the S3 problem and try a local Delta write
    print(f"Error: {e!s}")
    print("\nTrying local filesystem instead...")
    try:
        local_path = "/tmp/test-delta-table"
        _overwrite_delta(df, local_path)
        print(f"Successfully wrote to {local_path}")
    except Exception as local_e:
        print(f"Error writing to local filesystem: {local_e!s}")

your 131072x1 screen size is bogus. expect trouble
25/04/04 16:43:40 WARN Utils: Your hostname, JBLAPTOPW11 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/04 16:43:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/04/04 16:43:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/04 16:43:41 WARN DependencyUtils: Local jar /workspace/delta-jars/delta-spark_2.12-3.3.0.jar does not exist, skipping.
25/04/04 16:43:41 WARN DependencyUtils: Local jar /workspace/delta-jars/delta-storage-3.3.0.jar does not exist, skipping.
25/04/04 16:43:41 WARN DependencyUtils: Local jar /workspace/delta-jars/hadoop-aws-3.3.2.jar does not exist, skipping.
25/04/04 16:43:41 WARN DependencyUtils: Local jar /workspace/delta-jars/aws-java-sdk-bundle-1.12.782.jar does not exist, skipping.
25/04/04 16:43:41 WARN DependencyUtils: Local jar /workspace/delta-jars/ky

+---+----+
| id|name|
+---+----+
|  1|John|
|  2|Jane|
+---+----+

Testing S3 connection with parquet write...


25/04/04 16:43:51 INFO ParquetUtils: Using default output committer for Parquet: org.apache.parquet.hadoop.ParquetOutputCommitter
25/04/04 16:43:51 INFO FileOutputCommitter: File Output Committer Algorithm version is 1
25/04/04 16:43:51 INFO FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
25/04/04 16:43:51 INFO SQLHadoopMapReduceCommitProtocol: Using user defined output committer class org.apache.parquet.hadoop.ParquetOutputCommitter
25/04/04 16:43:51 INFO FileOutputCommitter: File Output Committer Algorithm version is 1
25/04/04 16:43:51 INFO FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
25/04/04 16:43:51 INFO SQLHadoopMapReduceCommitProtocol: Using output committer class org.apache.parquet.hadoop.ParquetOutputCommitter
25/04/04 16:43:51 INFO CodeGenerator: Code generated in 13.370137 ms
25/04/04 16:43:51 INFO Spa

S3 connection successful
Attempting to write Delta table...


25/04/04 16:43:54 INFO DeltaLog: Creating initial snapshot without metadata, because the directory is empty
25/04/04 16:43:55 INFO DummySnapshot: [tableId=e7c86550-2fbe-43e3-9fd1-7cbe08afdc0e] Created snapshot DummySnapshot(path=s3a://wba/example-table/_delta_log, version=-1, metadata=Metadata(3dae6760-3192-40e2-bfbe-de7fc05fde41,null,null,Format(parquet,Map()),null,List(),Map(),Some(1743803034998)), logSegment=LogSegment(s3a://wba/example-table/_delta_log,-1,List(),org.apache.spark.sql.delta.EmptyCheckpointProvider$@4dbc4421,-1), checksumOpt=None)
25/04/04 16:43:55 INFO DeltaLog: Creating initial snapshot without metadata, because the directory is empty
25/04/04 16:43:55 INFO DummySnapshot: [tableId=3dae6760-3192-40e2-bfbe-de7fc05fde41] Created snapshot DummySnapshot(path=s3a://wba/example-table/_delta_log, version=-1, metadata=Metadata(e5c5e22a-8438-442f-a077-67f4433e0980,null,null,Format(parquet,Map()),null,List(),Map(),Some(1743803035214)), logSegment=LogSegment(s3a://wba/example-t

Successfully wrote Delta table


25/04/04 16:44:06 INFO deprecation: org.apache.hadoop.shaded.io.bytes.per.checksum is deprecated. Instead, use dfs.bytes-per-checksum
25/04/04 16:44:06 INFO CheckpointFileManager: Writing atomically to s3a://wba/example-table/_delta_log/00000000000000000000.crc using temp file s3a://wba/example-table/_delta_log/.00000000000000000000.crc.bea21759-a810-48c6-8e68-5b0368175a58.tmp
25/04/04 16:44:06 INFO CheckpointFileManager: Renamed temp file s3a://wba/example-table/_delta_log/.00000000000000000000.crc.bea21759-a810-48c6-8e68-5b0368175a58.tmp to s3a://wba/example-table/_delta_log/00000000000000000000.crc
