In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from spark_fs import init_spark_utils
from spark_init import create_spark_session

spark = create_spark_session("MinioDataLake")
utils = init_spark_utils(spark)

hadoop_conf = spark._jsc.hadoopConfiguration()
print("fs.s3a.endpoint:", hadoop_conf.get("fs.s3a.endpoint"))
print("fs.s3a.access.key:", hadoop_conf.get("fs.s3a.access.key"))
print("fs.s3a.secret.key:", hadoop_conf.get("fs.s3a.secret.key"))
print("fs.s3a.path.style.access:", hadoop_conf.get("fs.s3a.path.style.access"))

fs.s3a.endpoint: http://minio:9000
fs.s3a.access.key: accesskey
fs.s3a.secret.key: secretkey
fs.s3a.path.style.access: true


In [2]:
spark.sql("SHOW DATABASES").show()

+------------+
|   namespace|
+------------+
|     default|
|  sb_digital|
|warehouse_db|
+------------+



In [3]:
# Criar e verificar uma tabela teste
spark.sql("CREATE DATABASE IF NOT EXISTS warehouse_db")
spark.sql("SHOW DATABASES").show()
spark.sql("USE warehouse_db")

+------------+
|   namespace|
+------------+
|     default|
|warehouse_db|
+------------+



DataFrame[]

In [3]:
spark.sql("USE warehouse_db")
spark.sql("SHOW TABLES").show()

+------------+---------+-----------+
|   namespace|tableName|isTemporary|
+------------+---------+-----------+
|warehouse_db|new_table|      false|
+------------+---------+-----------+



In [None]:
data = [
    {"first_name": "bob", "age": 47},
    {"first_name": "li", "age": 23},
    {"first_name": "leah", "age": 51},
    {"first_name": "ivan", "age": 30},
]
df = spark.createDataFrame(data=data)
df.show()

In [None]:
df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("new_table")

In [3]:
df = spark.read.parquet("s3a://wba/warehouse/warehouse_db.db/new_table")
df.show()

+---+----------+
|age|first_name|
+---+----------+
| 51|      leah|
| 30|      ivan|
| 47|       bob|
| 23|        li|
+---+----------+



In [4]:
spark.catalog.listDatabases()

[Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='s3a://wba/warehouse'),
 Database(name='sb_digital', catalog='spark_catalog', description='', locationUri='s3a://wba/warehouse/sb_digital.db'),
 Database(name='warehouse_db', catalog='spark_catalog', description='', locationUri='s3a://wba/warehouse/warehouse_db.db')]