In [None]:
import os
import sys

import boto3
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

In [None]:
# Bootstrap the Spark / Glue objects used by the rest of the notebook.
# getOrCreate() is used so that re-running this cell does not try to
# construct a second SparkContext.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext.
glueContext = GlueContext(sc)
# Spark session exposed by the GlueContext; used below for DataFrame reads.
spark = glueContext.spark_session
# NOTE(review): the Job is only constructed here; no job.init()/job.commit()
# call is visible in this part of the notebook -- confirm whether one is needed.
job = Job(glueContext)

In [None]:
# Point Hadoop's S3A connector at the MinIO server so Spark can read objects
# from it. Credentials are taken from the MINIO_ACCESS_KEY / MINIO_SECRET_KEY
# environment variables.
hadoop_conf = sc._jsc.hadoopConfiguration()

# Settings are applied in the order listed here (dicts keep insertion order).
_s3a_settings = {
    "fs.s3a.access.key": os.getenv('MINIO_ACCESS_KEY'),
    "fs.s3a.secret.key": os.getenv('MINIO_SECRET_KEY'),
    # Path-style addressing (endpoint/bucket/key) instead of bucket-as-hostname.
    "fs.s3a.path.style.access": "true",
    # Route the plain "s3://" scheme through the S3A filesystem implementation.
    "fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "fs.s3a.endpoint": "http://minio:9000",
}
for _name, _value in _s3a_settings.items():
    hadoop_conf.set(_name, _value)

In [None]:
# Build a boto3 S3 client that talks to the MinIO endpoint.
# Credentials are read from the MINIO_ACCESS_KEY / MINIO_SECRET_KEY
# environment variables rather than being hardcoded in the notebook.
_minio_client_options = {
    'endpoint_url': 'http://minio:9000',
    'aws_access_key_id': os.getenv('MINIO_ACCESS_KEY'),
    'aws_secret_access_key': os.getenv('MINIO_SECRET_KEY'),
}
s3 = boto3.client('s3', **_minio_client_options)

In [None]:
# Define the bucket and prefix (both come from the environment).
minio_bucket = os.getenv('MINIO_BUCKET')
if not minio_bucket:
    # Fail here with a readable message instead of a botocore parameter
    # validation error raised from inside the listing call.
    raise RuntimeError("MINIO_BUCKET environment variable is not set")
# An unset MINIO_PREFIX means "no prefix filter"; boto3 rejects Prefix=None.
minio_prefix = os.getenv('MINIO_PREFIX') or ''

# List the objects in the bucket. A single list_objects_v2 call returns at
# most 1000 keys, so a paginator is used to walk every result page.
object_keys = []
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=minio_bucket, Prefix=minio_prefix):
    # 'Contents' is absent from a page when nothing matches the prefix.
    for obj in page.get('Contents', []):
        key = obj['Key']
        if key.endswith('/'):
            # Presumably a "folder" placeholder object; handing such a path to
            # spark.read.csv would read the directory rather than one file.
            continue
        object_keys.append(key)

# Read each listed object as CSV and preview its first rows.
for key in object_keys:
    print('---', key)
    # The "s3://" scheme is served by S3A via the fs.s3.impl Hadoop setting.
    path_s3 = "s3://{}/{}".format(minio_bucket, key)
    print(path_s3)
    df = spark.read.csv(path_s3, header=True, inferSchema=True)
    # show() prints the rows itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    df.show(5)
    