In [1]:
from pyspark.sql.types import StructType
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import DataFrame
from pyspark.sql.streaming import DataStreamWriter
from minio import Minio
from datetime import timedelta
from delta.tables import *
import os

def minio_session_spark():
    """Build (or reuse) a local SparkSession configured for Delta Lake and MinIO.

    The S3A connection settings are taken from the environment instead of
    being hard-coded in the notebook:

    * ``MINIO_ACCESS_KEY`` -- S3A access key (required)
    * ``MINIO_SECRET_KEY`` -- S3A secret key (required)
    * ``MINIO_ENDPOINT``   -- S3A endpoint (optional, defaults to ``minio:9000``;
      e.g. ``http://172.20.0.2:9000`` when addressing the container by IP)

    Returns:
        The ``SparkSession`` produced by ``SparkSession.builder...getOrCreate()``.

    Raises:
        RuntimeError: if a required environment variable is unset or empty.
    """
    # Fail early, with an actionable message, rather than shipping secrets in
    # the notebook source or letting S3A fail later with an opaque auth error.
    required_vars = ("MINIO_ACCESS_KEY", "MINIO_SECRET_KEY")
    missing_vars = [name for name in required_vars if not os.environ.get(name)]
    if missing_vars:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing_vars)
        )

    s3a_endpoint = os.environ.get("MINIO_ENDPOINT", "minio:9000")
    s3a_access_key = os.environ["MINIO_ACCESS_KEY"]
    s3a_secret_key = os.environ["MINIO_SECRET_KEY"]

    # Build the comma-separated jar list explicitly. A backslash-continued
    # string literal would embed the source indentation inside the paths.
    jars_dir = "/home/jovyan/work/jars"
    extra_jars = ",".join(
        f"{jars_dir}/{jar_name}"
        for jar_name in (
            "hadoop-common-3.3.2.jar",
            "hadoop-aws-3.3.2.jar",
            "aws-java-sdk-bundle-1.11.874.jar",
        )
    )

    spark = (
        SparkSession.builder
            .master("local[*]")
            .appName("appMinIO")
            ### Config Fields
            .config('spark.sql.debug.maxToStringFields', 5000)
            .config('spark.debug.maxToStringFields', 5000)
            ### Optimize
            # NOTE(review): these keys are passed as session configs exactly as
            # before; confirm they are honoured by the Delta version in use.
            .config("delta.autoOptimize.optimizeWrite", "true")
            .config("delta.autoOptimize.autoCompact", "true")
            ### Delta Table
            .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0")
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            ## MinIO
            .config("spark.hadoop.fs.s3a.endpoint", s3a_endpoint)
            .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
            .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
            .config("spark.hadoop.fs.s3a.path.style.access", "true")
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            ## Jars
            .config("spark.jars", extra_jars)
            .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')
            .getOrCreate()
    )
    return spark

In [2]:
# Obtain the Spark session configured by minio_session_spark().
spark = minio_session_spark()

# Report the Spark version of the running session.
print(f"Spark version = {spark.version}")

# Report the Hadoop version, queried from the JVM through the py4j gateway.
hadoop_version_info = spark._jvm.org.apache.hadoop.util.VersionInfo
print(f"Hadoop version = {hadoop_version_info.getVersion()}")

Spark version = 3.3.2
Hadoop version = 3.3.2


In [3]:
# Load the Delta table stored under the "gold" bucket.
df = spark.read.format("delta").load("s3a://gold/tb_all_players")

In [4]:
# Preview the table: first 20 rows, long cell values truncated (the defaults).
df.show(n=20, truncate=True)

+----------+---------+--------+--------------------+------------+------+---------------+----+--------------------+----------------+------+-------+----+
|freshBlood|hotStreak|inactive|            leagueId|leaguePoints|losses|      queueType|rank|          summonerId|    summonerName|  tier|veteran|wins|
+----------+---------+--------+--------------------+------------+------+---------------+----+--------------------+----------------+------+-------+----+
|     false|    false|   false|ed208be9-601d-44e...|          30|   134|RANKED_SOLO_5x5|  II|ioXqlbBBxZP56o_uv...|         ojakale|BRONZE|  false| 122|
|     false|    false|   false|799a9e6f-8602-4ce...|          69|     6|RANKED_SOLO_5x5|  II|RrL-LDqyrLPzrtJ5N...|          Filler|BRONZE|  false|   7|
|     false|    false|   false|0323eb98-e427-492...|          75|    64|RANKED_SOLO_5x5|  II|8SSYZKdH6qaddI0Ig...|  Um cafetão top|BRONZE|  false|  88|
|     false|    false|   false|ba5d8a03-2d18-4e1...|          19|    47|RANKED_SOLO_5x5|

In [5]:
# Look up the row(s) belonging to one specific summoner name.
df.filter(df["summonerName"] == "FGS theone").show()

+----------+---------+--------+--------------------+------------+------+---------------+----+--------------------+------------+--------+-------+----+
|freshBlood|hotStreak|inactive|            leagueId|leaguePoints|losses|      queueType|rank|          summonerId|summonerName|    tier|veteran|wins|
+----------+---------+--------+--------------------+------------+------+---------------+----+--------------------+------------+--------+-------+----+
|      true|    false|   false|7165b127-44ee-472...|          22|   139|RANKED_SOLO_5x5|  IV|q4_9UDSTxGSTtVTgx...|  FGS theone|PLATINUM|  false| 158|
+----------+---------+--------+--------------------+------------+------+---------------+----+--------------------+------------+--------+-------+----+



In [6]:
# Count how many players fall into each tier.
df.groupBy("tier").count().withColumnRenamed("count", "player_count").show()

+--------+------------+
|    tier|player_count|
+--------+------------+
|PLATINUM|      213122|
|    GOLD|      263929|
|  SILVER|      257509|
|  BRONZE|      235265|
+--------+------------+



In [35]:
# Export the table as CSV with a header row, replacing any previous export.
# coalesce(1) collapses the data to one partition before writing.
(
    df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv("s3a://datasetv2/tb_all_players")
)

In [None]:
(