In [None]:
import os
import subprocess
import sys
from pyspark.sql import SparkSession

# Locate the Spark installation; fall back to /opt/spark when SPARK_HOME is unset
spark_home = os.environ.get('SPARK_HOME', '/opt/spark')

# Absolute paths of the Delta Lake and S3A connector JARs under <spark_home>/jars
jars = [
    f"{spark_home}/jars/{jar_name}"
    for jar_name in (
        "delta-core_2.12-2.4.0.jar",
        "delta-storage-2.4.0.jar",
        "hadoop-aws-3.3.2.jar",
        "aws-java-sdk-bundle-1.12.261.jar",
    )
]

# Configure (but do not yet start) a local SparkSession with Delta Lake and
# S3A/MinIO support. Nothing is launched until getOrCreate() is called on it.
builder = (SparkSession.builder
           .appName("DeltaExample")
           .master("local[*]")
           # Debug configurations: one pooled connection, one attempt and a
           # 5000 ms connection timeout, with the FileSystem cache disabled.
           # NOTE(review): presumably meant to make S3A fail fast while
           # troubleshooting — confirm before reusing these values elsewhere.
           .config("spark.hadoop.fs.s3a.connection.maximum", "1")
           .config("spark.hadoop.fs.s3a.attempts.maximum", "1")
           .config("spark.hadoop.fs.s3a.connection.timeout", "5000")
           .config("spark.hadoop.fs.s3a.impl.disable.cache", "true")
           .config("spark.hadoop.fs.s3a.debug.detailed.exceptions", "true")
           # Add jars directly. spark.jars takes a comma-separated list, but
           # the extraClassPath settings are JVM classpaths and must be joined
           # with the platform path separator (":" on Linux), not a comma.
           .config("spark.jars", ",".join(jars))
           .config("spark.driver.extraClassPath", os.pathsep.join(jars))
           .config("spark.executor.extraClassPath", os.pathsep.join(jars))
           # Delta Lake configurations: SQL extension and catalog implementation
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
           # S3/MinIO configurations: static credentials, plain-HTTP endpoint,
           # path-style bucket addressing
           .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
           .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
           .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
           .config("spark.hadoop.fs.s3a.path.style.access", "true")
           .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
           .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
           .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
           # Additional Delta Lake configurations: log store class for S3,
           # fast upload, and a 100 MiB (104857600-byte) multipart size
           .config("spark.delta.logStore.class", "io.delta.storage.S3SingleDriverLogStore")
           .config("spark.hadoop.fs.s3a.fast.upload", "true")
           .config("spark.hadoop.fs.s3a.multipart.size", "104857600")
           .config("spark.sql.warehouse.dir", "/tmp/spark-warehouse"))

# Shut down whatever is still bound to `spark` (e.g. from an earlier run of
# this cell) before a new session is requested.
if 'spark' in locals():
    spark.stop()

# Enable Hive support on the configured builder, then obtain the session
hive_builder = builder.enableHiveSupport()
spark = hive_builder.getOrCreate()

# Switch off the Delta format check for this session
spark.sql("SET spark.databricks.delta.formatCheck.enabled=false")

# Keep a handle on the underlying SparkContext
sc = spark.sparkContext

# Raise log verbosity to DEBUG
sc.setLogLevel("DEBUG")

# Two-row sample DataFrame used for the write smoke tests below
data = [(1, "John"), (2, "Jane")]
df = spark.createDataFrame(data, ["id", "name"])
df.show()

# Exercise the S3 connection with a plain Parquet write, then write a Delta
# table to the same bucket. Any failure falls back to a local Delta write.
try:
    print("Testing S3 connection with parquet write...")
    (
        df.write
        .format("parquet")
        .mode("overwrite")
        .save("s3a://wba/test.parquet")
    )
    print("S3 connection successful")

    print("Attempting to write Delta table...")
    (
        df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .option("delta.compatibility.symlinkFormatManifest.enabled", "false")
        .save("s3a://wba/example-table")
    )
    print("Successfully wrote Delta table")

except Exception as s3_error:
    # Report the S3 failure, then retry the Delta write on the local filesystem
    print(f"Error: {s3_error!s}")
    print("\nTrying local filesystem instead...")
    try:
        local_path = "/tmp/test-delta-table"
        (
            df.write
            .format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .save(local_path)
        )
        print(f"Successfully wrote to {local_path}")
    except Exception as local_error:
        print(f"Error writing to local filesystem: {local_error!s}")

25/02/18 20:36:16 INFO SparkContext: SparkContext is stopping with exitCode 0.
25/02/18 20:36:16 INFO SparkUI: Stopped Spark web UI at http://8d144dbc4ecc:4040
25/02/18 20:36:16 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
25/02/18 20:36:16 INFO MemoryStore: MemoryStore cleared
25/02/18 20:36:16 INFO BlockManager: BlockManager stopped
25/02/18 20:36:16 INFO BlockManagerMaster: BlockManagerMaster stopped
25/02/18 20:36:16 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
25/02/18 20:36:16 INFO SparkContext: Successfully stopped SparkContext
25/02/18 20:36:16 INFO SparkContext: Running Spark version 3.4.4
25/02/18 20:36:16 INFO ResourceUtils: No custom resources configured for spark.driver.
25/02/18 20:36:16 INFO SparkContext: Submitted application: DeltaExample
25/02/18 20:36:16 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , me

+---+----+
| id|name|
+---+----+
|  1|John|
|  2|Jane|
+---+----+

Testing S3 connection with parquet write...


25/02/18 20:36:23 DEBUG wire: http-outgoing-12 << "[read] I/O error: Read timed out"
25/02/18 20:36:23 DEBUG DefaultManagedHttpClientConnection: http-outgoing-12: Close connection
25/02/18 20:36:23 DEBUG DefaultManagedHttpClientConnection: http-outgoing-12: Shutdown connection
25/02/18 20:36:23 DEBUG MainClientExec: Connection discarded
25/02/18 20:36:23 DEBUG PoolingHttpClientConnectionManager: Connection released: [id: 12][route: {}->http://localhost:9000][total available: 0; route allocated: 0 of 1; total allocated: 0 of 1]
25/02/18 20:36:23 DEBUG AmazonHttpClient: Unable to execute HTTP request: Read timed out Request will be retried.
25/02/18 20:36:23 DEBUG request: Retrying Request: HEAD http://localhost:9000 /wba/test.parquet Headers: (amz-sdk-invocation-id: 774a7502-7a66-5eba-cee1-eee8488196c9, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.261 Linux/5.15.167.4-microsoft-standard-WSL2 OpenJDK_64-Bit_Server_VM/17.0.14+7-Debian-1deb12u1 java/1