In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Maven coordinates passed to spark.jars.packages: Delta Lake, its runtime
# dependencies, and the hadoop-aws / AWS SDK jars used by the s3a:// filesystem.
SPARK_PACKAGES = ",".join([
    "io.delta:delta-spark_2.12:3.1.0",
    "io.delta:delta-storage:3.1.0",
    "org.antlr:antlr4-runtime:4.9.3",
    "org.scala-lang:scala-library:2.12.17",
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
])

# Object-store credentials come from the environment when set, so real secrets
# never need to be written into the notebook. The fallbacks are the values the
# notebook previously hardcoded, which keeps the default behaviour unchanged.
S3_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "accesskey")
S3_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "secretkey")

# Build (or reuse) the session: Delta extensions, s3a pointed at the
# http://minio:9000 endpoint with path-style access and no SSL, and a Hive
# metastore catalog whose warehouse directory lives under s3a://wba/warehouse.
spark = (
    SparkSession.builder
    .appName("S3")
    .master("spark://spark-master:7077")
    .config("spark.jars.packages", SPARK_PACKAGES)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.sql.catalogImplementation", "hive")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.hadoop.hive.metastore.warehouse.dir", "s3a://wba/warehouse")
    .getOrCreate()
)

# Sanity check: echo the S3A settings the running session actually picked up.
# The secret key is deliberately redacted so it never lands in saved cell output.
hadoop_conf = spark._jsc.hadoopConfiguration()
print("fs.s3a.endpoint:", hadoop_conf.get("fs.s3a.endpoint"))
print("fs.s3a.access.key:", hadoop_conf.get("fs.s3a.access.key"))
print("fs.s3a.secret.key:", "<redacted>" if hadoop_conf.get("fs.s3a.secret.key") else "<unset>")
print("fs.s3a.path.style.access:", hadoop_conf.get("fs.s3a.path.style.access"))

In [None]:
# List the databases visible through the configured catalog.
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

In [None]:
# Create the test database if it is missing, confirm that it is listed,
# then make it the current database for the following cells.
spark.sql("CREATE DATABASE IF NOT EXISTS warehouse_db")
available_databases = spark.sql("SHOW DATABASES")
available_databases.show()
spark.sql("USE warehouse_db")

In [None]:
# Sample records as (first_name, age) pairs, expanded into one dict per row
# so the DataFrame schema is still inferred from dictionaries.
people = [("bob", 47), ("li", 23), ("leah", 51)]
data = [{"first_name": first_name, "age": age} for first_name, age in people]

df = spark.createDataFrame(data=data)
df.show()

In [None]:
# Save the sample rows as the Delta table "example", using overwrite mode
# with the overwriteSchema option enabled.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("example")
)

In [None]:
# List the tables in the current database.
tables_df = spark.sql("SHOW TABLES")
tables_df.show()

In [None]:
# Read every column of the "example" table back and display it.
# NOTE: this rebinds `df`, which previously held the in-memory sample rows.
df = spark.table("example")
df.show()

In [None]:
# Ask the catalog API for the known databases; the returned list is the cell output.
catalog = spark.catalog
catalog.listDatabases()