# In [None]:  (notebook cell marker left over from export)
#!/usr/bin/env python
# S3 Security Debugging for PySpark Delta Lake

from pyspark.sql import SparkSession
import logging
import sys

# Logging setup: every record goes both to the "s3_debug.log" file and to
# stdout, prefixed with a timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[logging.FileHandler("s3_debug.log"),
              logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)

# Named logger shared by all helpers in this script.
logger = logging.getLogger("s3-debug")


def create_debug_spark_session(app_name="S3 Debug Session",
                               aws_access_key=None,
                               aws_secret_key=None,
                               endpoint_url=None,
                               path_style_access=False,
                               log_level="INFO"):
    """
    Create a Spark session with enhanced logging for S3 debugging.

    The builder is configured for Delta Lake and the S3A connector. Once the
    session exists, every Hadoop configuration key containing ``s3a`` is
    logged, with credential-like values masked.

    Args:
        app_name: Name for the Spark application
        aws_access_key: AWS access key
        aws_secret_key: AWS secret key
        endpoint_url: Custom S3 endpoint (for MinIO, LocalStack, etc.)
        path_style_access: Use path-style access instead of virtual-hosted
            style. Only applied when ``endpoint_url`` is given; otherwise a
            warning is logged and the flag is ignored.
        log_level: Hadoop log level (INFO, DEBUG, etc.)

    Returns:
        The SparkSession produced by ``builder.getOrCreate()``.
    """
    logger.info(f"Creating debug Spark session with log level {log_level}")

    # NOTE(review): this option is passed to the JVMs unchanged from the
    # original script; whether it actually changes the S3A log level depends
    # on the logging backend in use and on when the driver JVM is started --
    # confirm in the target environment.
    s3a_log_option = f"-Dlog4j.logger.org.apache.hadoop.fs.s3a={log_level}"

    # Start building the session
    builder = (SparkSession.builder
               .appName(app_name)
               .config("spark.jars.packages",
                       "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.1")
               .config("spark.sql.extensions",
                       "io.delta.sql.DeltaSparkSessionExtension")
               .config("spark.sql.catalog.spark_catalog",
                       "org.apache.spark.sql.delta.catalog.DeltaCatalog")
               # Enable verbose Hadoop logging
               .config("spark.hadoop.fs.s3a.impl.disable.cache", "true")
               .config("spark.hadoop.fs.s3a.logger.class",
                       "org.apache.commons.logging.impl.Log4JLogger")
               .config("spark.driver.extraJavaOptions", s3a_log_option)
               .config("spark.executor.extraJavaOptions", s3a_log_option)
               .config("spark.hadoop.fs.s3a.committer.name", "directory")
               .config("spark.hadoop.fs.s3a.committer.staging.conflict-mode", "append")
               .config("spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3a",
                       "org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory"))

    # Configure S3 access if credentials are provided
    if aws_access_key and aws_secret_key:
        logger.info("Configuring with explicit AWS credentials")
        builder = (builder
                   .config("spark.hadoop.fs.s3a.access.key", aws_access_key)
                   .config("spark.hadoop.fs.s3a.secret.key", aws_secret_key)
                   .config("spark.hadoop.fs.s3a.aws.credentials.provider",
                           "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"))
    else:
        if aws_access_key or aws_secret_key:
            # A half-specified key pair cannot authenticate; say so instead
            # of silently dropping the value that was passed in.
            logger.warning(
                "Only one of aws_access_key/aws_secret_key was provided; "
                "ignoring it")
        logger.info("Using default AWS credential chain")
        builder = builder.config("spark.hadoop.fs.s3a.aws.credentials.provider",
                                 "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")

    # Configure custom endpoint (for MinIO, etc.)
    if endpoint_url:
        logger.info(f"Using custom S3 endpoint: {endpoint_url}")
        builder = builder.config("spark.hadoop.fs.s3a.endpoint", endpoint_url)

        # For MinIO and other S3-compatible storage systems
        if path_style_access:
            logger.info("Enabling path-style access")
            builder = builder.config(
                "spark.hadoop.fs.s3a.path.style.access", "true")
    elif path_style_access:
        logger.warning(
            "path_style_access is only applied together with endpoint_url; "
            "ignoring it")

    spark = builder.getOrCreate()

    # Print all S3A configurations for debugging
    logger.info("Current S3A configuration:")
    hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()

    # Configuration.iterator() yields Map.Entry objects, so the key has to be
    # extracted with getKey() before it can be compared against a string.
    s3a_keys = sorted(
        entry.getKey() for entry in hadoop_conf.iterator()
        if 's3a' in entry.getKey()
    )

    # Substrings that mark a value as credential material which must not be
    # written to the log (access keys and tokens included, not just secrets).
    sensitive_markers = ("secret", "password", "token",
                         "access.key", "encryption.key")

    for key in s3a_keys:
        lowered_key = key.lower()
        if any(marker in lowered_key for marker in sensitive_markers):
            logger.info(f"{key}: ********")
        else:
            logger.info(f"{key}: {hadoop_conf.get(key)}")

    return spark


def test_s3_connection(spark, s3_path):
    """
    Test basic S3 connectivity by listing objects.

    The listing is done through the Hadoop FileSystem API in the driver JVM,
    so only object metadata is fetched; no object contents are read.

    Args:
        spark: SparkSession
        s3_path: S3 path to test (e.g., s3a://bucket/path/)

    Returns:
        True if the path could be listed, False if any error occurred (the
        error and a diagnosis from ``diagnose_s3_issue`` are logged).
    """
    logger.info(f"Testing S3 connectivity to {s3_path}")
    try:
        jvm = spark.sparkContext._jvm
        hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
        path = jvm.org.apache.hadoop.fs.Path(s3_path)
        # Resolve the filesystem from the path's own scheme so the S3A
        # connector (not the default filesystem) serves the request.
        fs = path.getFileSystem(hadoop_conf)
        entry_count = len(fs.listStatus(path))
        logger.info(
            f"Successfully connected to {s3_path}. Found {entry_count} entries.")
        return True
    except Exception as e:
        error_msg = str(e)
        logger.error(f"Failed to connect to {s3_path}: {error_msg}")
        # The exception is swallowed here, so this is the only place where
        # the caller can be given a hint about the likely cause.
        diagnosis = diagnose_s3_issue(error_msg)
        logger.error(f"Likely cause: {diagnosis['issue']}")
        for i, solution in enumerate(diagnosis['solutions'], 1):
            logger.error(f"  {i}. {solution}")
        return False


def _resolve_filesystem(spark, path_str):
    """Return the Hadoop ``(FileSystem, Path)`` pair that owns ``path_str``."""
    path = spark._jvm.org.apache.hadoop.fs.Path(path_str)
    # Path.getFileSystem selects the filesystem from the path's scheme;
    # FileSystem.get(conf) would return the configured default filesystem,
    # which rejects s3a:// paths.
    return path.getFileSystem(spark._jsc.hadoopConfiguration()), path


def verify_s3_permissions(spark, s3_bucket, test_operations=None):
    """
    Verify S3 permissions by attempting different operations.

    READ lists the bucket root and, if a regular file is present there,
    reads one byte from it. WRITE writes a one-row text dataset to
    ``test_write_permissions.txt`` in the bucket root. DELETE removes that
    dataset again and is only attempted when the WRITE test succeeded.

    Args:
        spark: SparkSession
        s3_bucket: Bucket name (without s3:// prefix)
        test_operations: List of operations to test ['read', 'write', 'delete']

    Returns:
        Dict mapping each attempted operation name to True/False. Operations
        that were not requested, or a DELETE that was skipped, have no entry.
    """
    if test_operations is None:
        test_operations = ['read']

    s3_path = f"s3a://{s3_bucket}/"
    test_data_path = f"{s3_path}test_write_permissions.txt"
    results = {}

    logger.info(f"Verifying S3 permissions on bucket: {s3_bucket}")

    if 'read' in test_operations:
        logger.info("Testing READ permission...")
        try:
            s3a_impl = spark.sparkContext._jsc.hadoopConfiguration().get("fs.s3a.impl")
            logger.info(f"S3A implementation: {s3a_impl}")

            fs, root = _resolve_filesystem(spark, s3_path)
            statuses = fs.listStatus(root)

            # Listing alone does not touch object contents, so also read a
            # single byte from the first regular file when there is one.
            first_file = next((st for st in statuses if st.isFile()), None)
            if first_file is not None:
                stream = fs.open(first_file.getPath())
                try:
                    stream.read()
                finally:
                    stream.close()
            else:
                logger.info(
                    "READ test: no regular file at bucket root; only listing was verified.")

            logger.info(f"READ test: Success. Found {len(statuses)} entries.")
            results['read'] = True
        except Exception as e:
            logger.error(f"READ test: Failed. Error: {str(e)}")
            results['read'] = False

    if 'write' in test_operations:
        logger.info("Testing WRITE permission...")
        try:
            # Try writing a test file
            test_df = spark.createDataFrame([("test",)], ["col1"])
            test_df.write.mode("overwrite").text(test_data_path)
            logger.info("WRITE test: Success")
            results['write'] = True
        except Exception as e:
            logger.error(f"WRITE test: Failed. Error: {str(e)}")
            results['write'] = False

    if 'delete' in test_operations:
        if results.get('write', False):
            logger.info("Testing DELETE permission...")
            try:
                # Try deleting the test file (recursively: the text writer
                # was given this path as its output location).
                fs, path = _resolve_filesystem(spark, test_data_path)
                deleted = bool(fs.delete(path, True))
                logger.info(f"DELETE test: {'Success' if deleted else 'Failed'}")
                results['delete'] = deleted
            except Exception as e:
                logger.error(f"DELETE test: Failed. Error: {str(e)}")
                results['delete'] = False
        else:
            # Without a successfully written test object there is nothing
            # safe to delete; make the omission visible in the log.
            logger.warning(
                "DELETE test: Skipped because the WRITE test did not succeed.")

    return results


def diagnose_s3_issue(error_message):
    """
    Analyze error message and suggest potential fixes.

    The message is searched case-insensitively for known substrings. The
    patterns are checked in the order they are listed below, with the generic
    HTTP status codes last, so that a specific error (e.g. ``NoSuchBucket``)
    is not hidden by the status code that accompanies it.

    Args:
        error_message: The error message from the exception. ``None`` is
            treated as an empty message; other non-string values are
            converted with ``str()``.

    Returns:
        Dict with an ``"issue"`` string and a ``"solutions"`` list for the
        first matching pattern, or the "Unknown issue" entry if none match.
    """
    logger.info(f"Diagnosing S3 error: {error_message}")

    # Ordered from most specific to most generic; the first match wins.
    diagnoses = {
        "No such file or directory": {
            "issue": "Path not found",
            "solutions": [
                "Verify the path exists in the bucket",
                "Check for typos in the path"
            ]
        },
        "Unable to find credentials": {
            "issue": "Missing or invalid credentials",
            "solutions": [
                "Provide explicit AWS credentials",
                "Check environment variables for AWS credentials",
                "Configure instance profile if running on EC2"
            ]
        },
        "Connection timed out": {
            "issue": "Network connectivity issue",
            "solutions": [
                "Check network connectivity to S3",
                "Verify firewall rules allow access to S3",
                "Check VPC endpoint configuration if using VPC"
            ]
        },
        "Credentials expired": {
            "issue": "Temporary credentials have expired",
            "solutions": [
                "Refresh temporary credentials",
                "Use long-term credentials or role assumption"
            ]
        },
        "Signature doesn't match": {
            "issue": "Authentication issue - signature mismatch",
            "solutions": [
                "Check clock synchronization on your machine",
                "Verify access key and secret key are correct",
                "Ensure you're using the correct region for the bucket"
            ]
        },
        "NoSuchBucket": {
            "issue": "Bucket doesn't exist",
            "solutions": [
                "Verify bucket name",
                "Create the bucket if it doesn't exist",
                "Check if you have permission to list buckets"
            ]
        },
        "couldn't be established": {
            "issue": "Endpoint configuration problem",
            "solutions": [
                "Check endpoint URL if using a custom endpoint",
                "Verify SSL/TLS configuration if using HTTPS",
                "Test connectivity to the endpoint from your environment"
            ]
        },
        "path.style.access": {
            "issue": "Path style access issue with S3-compatible storage",
            "solutions": [
                "Set spark.hadoop.fs.s3a.path.style.access to true for MinIO/custom S3",
                "Check if your S3-compatible storage supports virtual-hosted style"
            ]
        },
        # Generic HTTP status fallbacks -- kept last on purpose.
        "Status Code: 403": {
            "issue": "Access Denied - Permission issue",
            "solutions": [
                "Check IAM permissions for the bucket",
                "Verify AWS credentials are correct",
                "Check bucket policy for necessary permissions"
            ]
        },
        "Status Code: 404": {
            "issue": "Not Found - Bucket or path doesn't exist",
            "solutions": [
                "Verify bucket name and region",
                "Check if the specified path exists in the bucket"
            ]
        }
    }

    # Normalise once: tolerate None / non-string input and avoid lowering
    # the (possibly very long) message on every comparison.
    haystack = ("" if error_message is None else str(error_message)).lower()

    for pattern, diagnosis in diagnoses.items():
        if pattern.lower() in haystack:
            return diagnosis

    return {
        "issue": "Unknown issue",
        "solutions": [
            "Enable DEBUG level logging for fs.s3a",
            "Check Hadoop S3A documentation for your specific error",
            "Verify all S3 configuration parameters"
        ]
    }


# Example usage: create a debug session, check connectivity, then probe
# read/write/delete permissions on the bucket named in s3_path.
if __name__ == "__main__":
    from urllib.parse import urlparse

    # Example configurations
    # For AWS S3
    # aws_access_key = "YOUR_AWS_ACCESS_KEY"
    # aws_secret_key = "YOUR_AWS_SECRET_KEY"
    # s3_path = "s3a://your-bucket/path/"
    # endpoint_url = None
    # path_style_access = False

    # For MinIO or other S3-compatible storage
    aws_access_key = "minioadmin"
    aws_secret_key = "minioadmin"
    endpoint_url = "http://minio:9000"
    s3_path = "s3a://ehr/"
    path_style_access = True

    # Kept outside the try block so the finally clause can tell whether a
    # session was actually created.
    spark = None
    try:
        # Create debug session with verbose logging
        spark = create_debug_spark_session(
            aws_access_key=aws_access_key,
            aws_secret_key=aws_secret_key,
            endpoint_url=endpoint_url,
            path_style_access=path_style_access,
            log_level="DEBUG"
        )

        # Test connection
        connection_test = test_s3_connection(spark, s3_path)

        if connection_test:
            logger.info("S3 connection successful! Now testing permissions...")
            # Extract bucket name from path: normally the URL authority
            # ("s3a://bucket/..."); fall back to the first path segment when
            # the authority is empty.
            parsed_path = urlparse(s3_path)
            bucket_name = (parsed_path.netloc
                           or parsed_path.path.lstrip('/').split('/', 1)[0])
            if not bucket_name:
                raise ValueError(
                    f"Could not determine bucket name from {s3_path!r}")

            # Test permissions
            permissions = verify_s3_permissions(
                spark,
                bucket_name,
                test_operations=['read', 'write', 'delete']
            )

            for op, result in permissions.items():
                logger.info(
                    f"Permission {op.upper()}: {'✅ Success' if result else '❌ Failed'}")
        else:
            logger.error("S3 connection failed!")

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Error encountered: {error_msg}")

        # Diagnose the issue
        diagnosis = diagnose_s3_issue(error_msg)

        logger.info("\n=== DIAGNOSIS ===")
        logger.info(f"Issue: {diagnosis['issue']}")
        logger.info("Potential solutions:")
        for i, solution in enumerate(diagnosis['solutions'], 1):
            logger.info(f"  {i}. {solution}")
        logger.info("=================")
    finally:
        # Always release the Spark session, whether the checks passed or not.
        if spark is not None:
            spark.stop()