# Exemplo de Conexão Spark com MinIO
Este notebook demonstra como conectar o Spark ao MinIO para escrever e ler dados em formato Delta.

In [1]:
import logging
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# Logging setup: raise the threshold of the "py4j" logger to ERROR and of the
# "pyspark" logger to WARNING so their lower-severity messages are not emitted.
for _logger_name, _threshold in (
    ("pyspark", logging.WARNING),
    ("py4j", logging.ERROR),
):
    logging.getLogger(_logger_name).setLevel(_threshold)

## Configuração do Spark com MinIO

In [None]:
# Spark session wired to MinIO through the S3A connector, with Delta Lake enabled.
# The endpoint and credentials are read from environment variables so that real
# secrets never have to be saved in the notebook; the fallbacks are the local-demo
# values this cell used to hard-code, so it behaves the same when nothing is set.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")
# Keep the SSL flag consistent with the endpoint scheme ("false" for the default
# http endpoint, exactly as before).
MINIO_USE_SSL = "true" if MINIO_ENDPOINT.lower().startswith("https://") else "false"

spark = (
    SparkSession.builder
    .appName("MinIO Delta Example")
    # --- S3A connection to MinIO ---
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", MINIO_USE_SSL)
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    # --- Delta Lake integration ---
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("spark.sql.warehouse.dir", "s3a://datalake/warehouse")
    # --- Extra jars resolved at session start ---
    # NOTE(review): the hadoop-aws / hadoop-common versions presumably have to match
    # the Hadoop build bundled with the Spark image — confirm before changing them.
    .config(
        "spark.jars.packages",
        "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.4,org.apache.hadoop:hadoop-common:3.3.4",
    )
    # --- S3A upload buffering: buffer on disk under /tmp ---
    .config("spark.hadoop.fs.s3a.buffer.dir", "/tmp")
    .config("spark.hadoop.fs.s3a.fast.upload", "true")
    .config("spark.hadoop.fs.s3a.fast.upload.buffer", "disk")
    .getOrCreate()
)

# Lower the log verbosity of the running session.
spark.sparkContext.setLogLevel("WARN")

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f4c8add0-3cd6-40d0-bc57-768d42ebbfea;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.hadoop#hadoop-common;3.3.4 in central
	found org.apache.hadoop.thirdparty#hadoop-shaded-protobuf_3_7;1.1.1 in central
	found org.apache.hadoop#hadoop-annotations;3.3.4 in central
	found org.apache.hadoop.thirdparty#hadoop-shaded-guava;1.1.1 in central
	found com.google.guava#guava;

In [None]:
# Build a small example DataFrame: three (name, age) rows with an explicit schema.
schema = (
    StructType()
    .add("nome", StringType(), True)    # nullable string column
    .add("idade", IntegerType(), True)  # nullable integer column
)

data = [
    ("João", 25),
    ("Maria", 30),
    ("Pedro", 35),
]

df = spark.createDataFrame(data, schema)
df.show()

In [None]:
# Persist the example DataFrame to MinIO as a Delta table, replacing whatever
# is already stored at the target path.
df.write.save(
    "s3a://raw/exemplo_pessoas",
    format="delta",
    mode="overwrite",
)

In [None]:
# Load the Delta table back from MinIO and display its rows.
df_read = spark.read.load(
    "s3a://raw/exemplo_pessoas",
    format="delta",
)
df_read.show()

In [None]:
# Inspect the schema of the DataFrame that was read back from the Delta table
df_read.printSchema()