In [1]:
# Combine the per-tier player tables (bronze, silver, gold, platinum) into a single table

In [2]:
from pyspark.sql.types import StructType
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import DataFrame
from pyspark.sql.streaming import DataStreamWriter
from minio import Minio
from delta.tables import *
import os

def minio_session_spark():
    """Build (or reuse) a local SparkSession wired for Delta Lake on MinIO.

    The session uses master ``local[*]``, enables the Delta Lake SQL extension
    and catalog, and points the S3A filesystem at the ``minio:9000`` endpoint
    with path-style access and SSL disabled.

    The S3A access/secret keys are read from the ``MINIO_ACCESS_KEY`` and
    ``MINIO_SECRET_KEY`` environment variables. When a variable is unset, the
    value that used to be hard-coded here is used, so existing environments
    keep working unchanged.

    Returns:
        SparkSession: the session returned by ``getOrCreate()``.
    """
    # NOTE(review): the fallback literals only exist for backward
    # compatibility; credentials should not live in source. Remove them once
    # the environment variables are provisioned.
    access_key = os.environ.get("MINIO_ACCESS_KEY", "tcc_user")
    secret_key = os.environ.get("MINIO_SECRET_KEY", "Acnmne@a9h!")

    # Join the jar paths programmatically. The previous version continued a
    # string literal across lines with backslashes, which embedded the
    # indentation of each following line into the value, so every path after
    # the first started with whitespace.
    jars_dir = "/home/jovyan/work/jars"
    extra_jars = ",".join(
        f"{jars_dir}/{jar_name}"
        for jar_name in (
            "hadoop-common-3.3.2.jar",
            "hadoop-aws-3.3.2.jar",
            "aws-java-sdk-bundle-1.11.874.jar",
        )
    )

    spark = (
        SparkSession.builder
            .master("local[*]")
            .appName("appMinIO")
            ### Config Fields
            .config('spark.sql.debug.maxToStringFields', 5000)
            .config('spark.debug.maxToStringFields', 5000)
            ### Optimize
            # NOTE(review): these keys look like Delta table properties rather
            # than Spark session settings -- confirm they have any effect here.
            .config("delta.autoOptimize.optimizeWrite", "true")
            .config("delta.autoOptimize.autoCompact", "true")
            ### Delta Table
            .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0")
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            ## MinIO
            .config("spark.hadoop.fs.s3a.endpoint", "minio:9000")
            .config("spark.hadoop.fs.s3a.access.key", access_key)
            .config("spark.hadoop.fs.s3a.secret.key", secret_key)
            .config("spark.hadoop.fs.s3a.path.style.access", "true")
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            ## Jars
            .config("spark.jars", extra_jars)
            .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')
            .getOrCreate()
    )
    return spark

In [3]:
# Create (or reuse) the Spark session configured for MinIO and Delta Lake.
spark = minio_session_spark()

# Report the Spark version exposed by the session.
spark_version = spark.version
print(f"Spark version = {spark_version}")

# Report the Hadoop version, read from the JVM-side
# org.apache.hadoop.util.VersionInfo class via spark._jvm.
hadoop_version_info = spark._jvm.org.apache.hadoop.util.VersionInfo
print(f"Hadoop version = {hadoop_version_info.getVersion()}")

Spark version = 3.3.2
Hadoop version = 3.3.2


In [6]:
# MinIO client connection settings.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment instead.
minio_endpoint = 'minio:9000'
access_key = 'tcc_fia'
secret_key = 'tcc_fia'
secure = False  # Plain HTTP; set to True for HTTPS

# NOTE(review): minio_client is not referenced by the cells visible here --
# confirm whether it is still needed.
minio_client = Minio(
    endpoint=minio_endpoint,
    access_key=access_key,
    secret_key=secret_key,
    secure=secure,
)

# Source bucket and the Delta table paths (one per player tier) read from it.
minio_bucket = 'silver'
silver_tables = [
    'bronze_players',
    'silver_players',
    'gold_players',
    'platina_players',
]

# ALL PLAYERS
# Read every per-tier Delta table from the source bucket and stack them into
# a single DataFrame.
final_df = None
for table_name in silver_tables:
    tier_df = (
        spark
        .read
        .format('delta')
        .load(f"s3a://{minio_bucket}/{table_name}")
    )

    # unionByName matches columns by name. The previous positional union()
    # would silently misalign data if any tier table stored its columns in a
    # different order; a genuinely different column set now fails loudly.
    if final_df is None:
        final_df = tier_df
    else:
        final_df = final_df.unionByName(tier_df)
        

# Save the combined players DataFrame as a Delta table in the gold bucket,
# replacing any existing data and schema at that path.
(
    final_df
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "True")
    # Same target path as before; the placeholder-less f-string plus
    # concatenation was collapsed into one literal.
    .save("s3a://gold/tb_all_players")
)

In [9]:
# Sanity check: list the distinct values of the 'tier' column in the combined table.
(
    final_df
    .select('tier')
    .distinct()
    .show()
)

+--------+
|    tier|
+--------+
|  BRONZE|
|  SILVER|
|    GOLD|
|PLATINUM|
+--------+

