In [None]:
import os
import subprocess
import sys
from typing import List
from pyspark.sql import SparkSession
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_jar_files(jars_home: str) -> List[str]:
    """
    Recursively find all JAR files below the specified directory.

    Files are matched on a case-insensitive ``.jar`` suffix. Errors hit while
    listing any directory in the tree are raised to the caller instead of
    being skipped silently.

    Args:
        jars_home (str): Root directory to search for JAR files

    Returns:
        List[str]: Sorted list of JAR file paths, each rooted at ``jars_home``
            (empty when no JAR file is found)

    Raises:
        FileNotFoundError: If jars_home doesn't exist
        NotADirectoryError: If jars_home exists but is not a directory
        PermissionError: If a directory in the tree cannot be listed
        OSError: For any other error reported while walking the tree
    """
    def _raise_walk_error(error: OSError) -> None:
        # os.walk ignores listing errors unless an onerror callback re-raises
        # them; without this, unreadable directories would just be skipped.
        raise error

    try:
        if not os.path.exists(jars_home):
            raise FileNotFoundError(f"Directory not found: {jars_home}")
        if not os.path.isdir(jars_home):
            # Walking a regular file would otherwise yield an empty result
            raise NotADirectoryError(f"Not a directory: {jars_home}")

        jar_files = []

        # Walk through directory tree
        for root, _, files in os.walk(jars_home, onerror=_raise_walk_error):
            for file in files:
                if file.lower().endswith('.jar'):
                    full_path = os.path.join(root, file)
                    jar_files.append(full_path)
                    logger.info(f"Found JAR file: {full_path}")

        if not jar_files:
            logger.warning(f"No JAR files found in {jars_home}")

        return sorted(jar_files)  # Sort for consistent ordering

    except PermissionError as e:
        logger.error(f"Permission denied accessing {jars_home}: {str(e)}")
        raise
    except Exception as e:
        # Log once here and let the original exception propagate unchanged
        logger.error(f"Error while scanning for JAR files: {str(e)}")
        raise
    
# Directory that is scanned for the JAR files handed to Spark
jars_home = '/workspace/delta-jars'

try:
    # Collect every JAR below jars_home; the list feeds the Spark configuration
    jars = get_jar_files(jars_home)
except Exception as e:
    # Record the failure, then abort: nothing further can run without the JARs
    logger.error(f"Failed to load JAR files: {str(e)}")
    raise
else:
    # Report how many JARs were discovered
    logger.info(f"Found {len(jars)} JAR files in total")

# Create SparkSession using the builder pattern.
# spark.jars expects a comma-separated list, while the extraClassPath settings
# are JVM classpath strings and must use the platform path separator.
builder = (SparkSession.builder
           .appName("DeltaExample")
           .master("local[*]")
           # Debug configurations: one connection, one attempt, short timeout,
           # no filesystem cache and detailed exceptions
           .config("spark.hadoop.fs.s3a.connection.maximum", "1")
           .config("spark.hadoop.fs.s3a.attempts.maximum", "1")
           .config("spark.hadoop.fs.s3a.connection.timeout", "5000")
           .config("spark.hadoop.fs.s3a.impl.disable.cache", "true")
           .config("spark.hadoop.fs.s3a.debug.detailed.exceptions", "true")
           # Add jars directly
           .config("spark.jars", ",".join(jars))
           .config("spark.driver.extraClassPath", os.pathsep.join(jars))
           .config("spark.executor.extraClassPath", os.pathsep.join(jars))
           # Delta Lake configurations
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
           # S3/MinIO configurations
           # NOTE(review): credentials are hard-coded here; presumably local
           # MinIO defaults -- confirm they are never used outside local dev.
           .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
           .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
           .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
           .config("spark.hadoop.fs.s3a.path.style.access", "true")
           # NOTE(review): this key lacks the "spark.hadoop." prefix used by the
           # other S3A settings -- confirm it actually reaches the Hadoop config.
           .config("fs.s3a.metrics.enabled", "false")
           .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
           .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
           .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
           # Additional Delta Lake configurations
           .config("spark.delta.logStore.class", "io.delta.storage.S3SingleDriverLogStore")
           .config("spark.hadoop.fs.s3a.fast.upload", "true")
           # 104857600 == 100 * 1024 * 1024
           .config("spark.hadoop.fs.s3a.multipart.size", "104857600")
           .config("spark.sql.warehouse.dir", "/tmp/spark-warehouse"))

# If a `spark` name is already bound (e.g. from an earlier run), stop it first
if 'spark' in locals():
    spark.stop()

# Enable Hive support on the builder, then create (or fetch) the session
spark = (
    builder
    .enableHiveSupport()
    .getOrCreate()
)

# Initialize Delta Lake settings
# spark.sql("SET spark.databricks.delta.formatCheck.enabled=false")

# Access the SparkContext
# sc = spark.sparkContext

# Set the log level to DEBUG
# sc.setLogLevel("DEBUG")

# Two-row sample DataFrame (columns: id, name), printed as a quick sanity check
data = [(1, "John"), (2, "Jane")]
df = spark.createDataFrame(data, schema=["id", "name"])
df.show()

# Smoke-test the S3 endpoint with a plain Parquet write, then write the Delta
# table. If either step fails, fall back to a Delta write on the local disk.
try:
    print("Testing S3 connection with parquet write...")
    (
        df.write
        .format("parquet")
        .mode("overwrite")
        .save("s3a://wba/test.parquet")
    )
    print("S3 connection successful")

    print("Attempting to write Delta table...")
    (
        df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .option("delta.compatibility.symlinkFormatManifest.enabled", "false")
        .save("s3a://wba/example-table")
    )
    print("Successfully wrote Delta table")

except Exception as s3_error:
    # Any failure above (Parquet or Delta) lands here and triggers the fallback
    print(f"Error: {str(s3_error)}")
    print("\nTrying local filesystem instead...")
    try:
        local_path = "/tmp/test-delta-table"
        (
            df.write
            .format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .save(local_path)
        )
        print(f"Successfully wrote to {local_path}")
    except Exception as local_error:
        # Best-effort fallback: report the failure without re-raising
        print(f"Error writing to local filesystem: {str(local_error)}")

25/02/20 20:47:45 INFO SparkContext: SparkContext is stopping with exitCode 0.
25/02/20 20:47:45 INFO SparkUI: Stopped Spark web UI at http://c1db4b577bd9:4040
25/02/20 20:47:45 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
25/02/20 20:47:45 INFO MemoryStore: MemoryStore cleared
25/02/20 20:47:45 INFO BlockManager: BlockManager stopped
25/02/20 20:47:45 INFO BlockManagerMaster: BlockManagerMaster stopped
25/02/20 20:47:45 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
25/02/20 20:47:45 INFO SparkContext: Successfully stopped SparkContext
25/02/20 20:47:45 INFO SparkContext: Running Spark version 3.4.4
25/02/20 20:47:45 INFO ResourceUtils: No custom resources configured for spark.driver.
25/02/20 20:47:45 INFO SparkContext: Submitted application: DeltaExample
25/02/20 20:47:45 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , me

+---+----+
| id|name|
+---+----+
|  1|John|
|  2|Jane|
+---+----+

Testing S3 connection with parquet write...


25/02/20 20:47:53 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/02/20 20:47:53 INFO MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
25/02/20 20:47:53 INFO MetricsSystemImpl: s3a-file-system metrics system started
25/02/20 20:47:55 INFO ParquetUtils: Using default output committer for Parquet: org.apache.parquet.hadoop.ParquetOutputCommitter
25/02/20 20:47:55 INFO FileOutputCommitter: File Output Committer Algorithm version is 1
25/02/20 20:47:55 INFO FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
25/02/20 20:47:55 INFO SQLHadoopMapReduceCommitProtocol: Using user defined output committer class org.apache.parquet.hadoop.ParquetOutputCommitter
25/02/20 20:47:55 INFO FileOutputCommitter: File Output Committer Algorithm version is 1
25/02/20 20:47:55 INFO FileOutputCommitter: FileOutputCommitter skip cl

S3 connection successful
Attempting to write Delta table...


25/02/20 20:47:58 INFO DeltaLog: Loading version 1.
25/02/20 20:47:59 INFO DeltaLogFileIndex: Created DeltaLogFileIndex(JSON, numFilesInSegment: 2, totalFileSize: 2903)
25/02/20 20:47:59 INFO FileSourceStrategy: Pushed Filters: 
25/02/20 20:47:59 INFO FileSourceStrategy: Post-Scan Filters: 
25/02/20 20:48:00 INFO CodeGenerator: Code generated in 167.921235 ms
25/02/20 20:48:00 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 213.3 KiB, free 433.9 MiB)
25/02/20 20:48:00 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 36.2 KiB, free 433.9 MiB)
25/02/20 20:48:00 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on c1db4b577bd9:33545 (size: 36.2 KiB, free: 434.3 MiB)
25/02/20 20:48:00 INFO SparkContext: Created broadcast 4 from toString at String.java:4220
25/02/20 20:48:00 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194304 bytes, open cost is considered as scanning 4194304 bytes.
25/02/20 20

Successfully wrote Delta table


25/02/20 20:48:11 INFO Executor: Finished task 49.0 in stage 17.0 (TID 184). 5128 bytes result sent to driver
25/02/20 20:48:11 INFO TaskSetManager: Finished task 49.0 in stage 17.0 (TID 184) in 131 ms on c1db4b577bd9 (executor driver) (50/50)
25/02/20 20:48:11 INFO TaskSchedulerImpl: Removed TaskSet 17.0, whose tasks have all completed, from pool 
25/02/20 20:48:11 INFO DAGScheduler: ShuffleMapStage 17 ($anonfun$recordDeltaOperationInternal$1 at DatabricksLogging.scala:128) finished in 1.911 s
25/02/20 20:48:11 INFO DAGScheduler: looking for newly runnable stages
25/02/20 20:48:11 INFO DAGScheduler: running: Set()
25/02/20 20:48:11 INFO DAGScheduler: waiting: Set()
25/02/20 20:48:11 INFO DAGScheduler: failed: Set()
25/02/20 20:48:11 INFO SparkContext: Starting job: $anonfun$recordDeltaOperationInternal$1 at DatabricksLogging.scala:128
25/02/20 20:48:11 INFO DAGScheduler: Got job 13 ($anonfun$recordDeltaOperationInternal$1 at DatabricksLogging.scala:128) with 1 output partitions
25/02/