In [4]:
from pyspark.sql.types import StructType
from pyspark.sql.types import *
from pyspark.sql import SparkSession
# from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.streaming import DataStreamWriter
from minio import Minio
from datetime import timedelta
from delta.tables import *
import os

def minio_session_spark():
    """Build (or reuse) a local SparkSession set up for Delta Lake on MinIO.

    The session uses master ``local[*]``, enables the Delta SQL extension and
    catalog, and points the S3A filesystem at a MinIO endpoint with path-style
    access and SSL disabled.

    The endpoint and credentials can be overridden without editing the notebook
    through the ``MINIO_ENDPOINT``, ``MINIO_ACCESS_KEY`` and ``MINIO_SECRET_KEY``
    environment variables. When a variable is unset, the value this notebook
    has always used is applied.

    Returns:
        The session produced by ``SparkSession.builder ... .getOrCreate()``.
    """
    # FIXME(security): the fallback literals are credentials committed to source.
    # Prefer exporting the MINIO_* variables, then rotate the key and drop the
    # fallbacks. MINIO_ENDPOINT may also carry a full URL such as
    # "http://172.20.0.2:9000".
    s3a_endpoint = os.environ.get("MINIO_ENDPOINT", "minio:9000")
    s3a_access_key = os.environ.get("MINIO_ACCESS_KEY", "tcc_user")
    s3a_secret_key = os.environ.get("MINIO_SECRET_KEY", "Acnmne@a9h!")

    # Joined with a bare comma so no whitespace ends up inside the value; a
    # backslash-continued string literal would keep the next line's indentation
    # as part of each jar path.
    extra_jars = ",".join([
        "/home/jovyan/work/jars/hadoop-common-3.3.2.jar",
        "/home/jovyan/work/jars/hadoop-aws-3.3.2.jar",
        "/home/jovyan/work/jars/aws-java-sdk-bundle-1.11.874.jar",
    ])

    spark = (
        SparkSession.builder
            .master("local[*]")
            .appName("appMinIO")
            ### Config Fields
            .config('spark.sql.debug.maxToStringFields', 5000)
            .config('spark.debug.maxToStringFields', 5000)
            ### Optimize
            # NOTE(review): these keys look like Delta table-property names;
            # confirm they have any effect when set as session configs.
            .config("delta.autoOptimize.optimizeWrite", "true")
            .config("delta.autoOptimize.autoCompact", "true")
            ### Delta Table
            .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0")
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            ## MinIO
            .config("spark.hadoop.fs.s3a.endpoint", s3a_endpoint)
            .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
            .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
            .config("spark.hadoop.fs.s3a.path.style.access", "true")
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            ## Jars
            .config("spark.jars", extra_jars)
            .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')
            .getOrCreate()
    )
    return spark

In [5]:
# Create the session used by every later cell in this notebook.
spark = minio_session_spark()

# Optional diagnostics: uncomment to print the Spark version of the session.
# print(f"Spark version = {spark.version}")

# Optional diagnostics: uncomment to print the Hadoop version seen by the JVM.
# print(f"Hadoop version = {spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()}")

In [7]:
# Folder name under s3a://bronze/ to read.
rank = 'bronze_I'

# Read that folder as a Delta table through the S3A connector.
df = spark.read.format('delta').load(f"s3a://bronze/{rank}")


In [8]:
# Print a preview of the loaded rows; long string values are truncated in the output.
df.show()

+----------+---------+--------+--------------------+------------+------+---------------+----+--------------------+----------------+------+-------+----+
|freshBlood|hotStreak|inactive|            leagueId|leaguePoints|losses|      queueType|rank|          summonerId|    summonerName|  tier|veteran|wins|
+----------+---------+--------+--------------------+------------+------+---------------+----+--------------------+----------------+------+-------+----+
|     false|    false|   false|376a45f8-a14c-4d3...|           0|    38|RANKED_SOLO_5x5|  IV|GwRX5Jkw2zFpg7hIl...|     WithoutThis|BRONZE|  false|  31|
|     false|    false|   false|eac48c8f-272d-436...|           0|     8|RANKED_SOLO_5x5|  IV|cQ7c13ftchM_c1eFF...|          Guimzt|BRONZE|  false|   3|
|     false|    false|   false|cdf4380e-1acc-463...|           7|   117|RANKED_SOLO_5x5|  IV|Ol-0E7znQLBW6NYpP...|        ciprand1|BRONZE|  false| 127|
|     false|    false|   false|62bf8d69-beb0-4ae...|           0|    81|RANKED_SOLO_5x5|

In [9]:
# Inspect the schema as a list of (column name, Spark SQL type) pairs.
df.dtypes

[('freshBlood', 'boolean'),
 ('hotStreak', 'boolean'),
 ('inactive', 'boolean'),
 ('leagueId', 'string'),
 ('leaguePoints', 'bigint'),
 ('losses', 'bigint'),
 ('queueType', 'string'),
 ('rank', 'string'),
 ('summonerId', 'string'),
 ('summonerName', 'string'),
 ('tier', 'string'),
 ('veteran', 'boolean'),
 ('wins', 'bigint')]