In [1]:
from pyspark.sql.types import StructType
from pyspark.sql.types import *
from pyspark.sql import SparkSession
# from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.streaming import DataStreamWriter
from minio import Minio
from datetime import timedelta
from delta.tables import *
import os
from pyspark.sql.window import Window

def minio_session_spark(endpoint=None, access_key=None, secret_key=None):
    """Build (or reuse) a local SparkSession wired for Delta Lake on MinIO (S3A).

    Connection settings are resolved in this order: explicit argument,
    environment variable, legacy notebook default.

    Parameters
    ----------
    endpoint : str, optional
        Value for ``fs.s3a.endpoint``. Falls back to ``$MINIO_ENDPOINT`` and
        then to ``"minio:9000"``. An explicit URL such as
        ``"http://172.20.0.2:9000"`` can be passed as well.
    access_key : str, optional
        Value for ``fs.s3a.access.key``. Falls back to ``$MINIO_ACCESS_KEY``.
    secret_key : str, optional
        Value for ``fs.s3a.secret.key``. Falls back to ``$MINIO_SECRET_KEY``.

    Returns
    -------
    pyspark.sql.SparkSession
        The session returned by ``getOrCreate()``; when a session is already
        active that one is returned instead of a new one.
    """
    # NOTE(review): the literal fallbacks below keep the notebook runnable as
    # before, but credentials should not live in source control. Prefer the
    # environment variables, then rotate the key and drop the defaults.
    endpoint = endpoint or os.environ.get("MINIO_ENDPOINT", "minio:9000")
    access_key = access_key or os.environ.get("MINIO_ACCESS_KEY", "tcc_user")
    secret_key = secret_key or os.environ.get("MINIO_SECRET_KEY", "Acnmne@a9h!")

    # Joined explicitly so the comma-separated value carries no stray
    # whitespace (the previous backslash-continued literal embedded long runs
    # of spaces between the entries).
    jars = ",".join([
        "/home/jovyan/work/jars/hadoop-common-3.3.2.jar",
        "/home/jovyan/work/jars/hadoop-aws-3.3.2.jar",
        "/home/jovyan/work/jars/aws-java-sdk-bundle-1.11.874.jar",
    ])

    spark = (
        SparkSession.builder
            .master("local[*]")
            .appName("appMinIO")
            ### Config Fields
            .config('spark.sql.debug.maxToStringFields', 5000)
            .config('spark.debug.maxToStringFields', 5000)
            ### Optimize
            # NOTE(review): these two keys have no `spark.` prefix — confirm
            # that Delta actually honours them as session-level settings.
            .config("delta.autoOptimize.optimizeWrite", "true")
            .config("delta.autoOptimize.autoCompact", "true")
            ### Delta Table
            .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0")
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            ## MinIO
            .config("spark.hadoop.fs.s3a.endpoint", endpoint)
            .config("spark.hadoop.fs.s3a.access.key", access_key)
            .config("spark.hadoop.fs.s3a.secret.key", secret_key)
            .config("spark.hadoop.fs.s3a.path.style.access", "true")
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            ## Jars
            .config("spark.jars", jars)
            .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')
            .getOrCreate()
    )
    return spark

spark = minio_session_spark()

In [14]:
def _read_delta(path):
    """Return a DataFrame reading the Delta table stored at `path`.

    Uses the module-level `spark` session.
    """
    return spark.read.format('delta').load(path)


# All four tables are read from the `gold` bucket.
# NOTE(review): bronze/silver/gold/platina presumably name player tiers rather
# than storage layers, which would explain the single bucket — confirm.
df_bronze = _read_delta("s3a://gold/tb_lol_bronze_matchs")
df_silver = _read_delta("s3a://gold/tb_lol_silver_matchs")
df_gold = _read_delta("s3a://gold/tb_lol_gold_matchs")
df_platina = _read_delta("s3a://gold/tb_lol_platina_matchs")

In [6]:
# Names of every bronze-table column whose Spark type is `bigint`.
list_int_cols = [
    col_name
    for col_name, col_type in df_bronze.dtypes
    if col_type == 'bigint'
]

In [18]:
# Display the (column name, Spark type) pairs of the bronze table.
df_bronze.dtypes

[('matchId', 'string'),
 ('puuid', 'string'),
 ('summonerId', 'string'),
 ('championName', 'string'),
 ('role', 'string'),
 ('assists', 'bigint'),
 ('damageDealtToTurrets', 'bigint'),
 ('damageDealtToObjectives', 'bigint'),
 ('detectorWardsPlaced', 'bigint'),
 ('visionScore', 'bigint'),
 ('visionWardsBoughtInGame', 'bigint'),
 ('wardsKilled', 'bigint'),
 ('wardsPlaced', 'bigint'),
 ('enemyMissingPings', 'bigint'),
 ('enemyVisionPings', 'bigint'),
 ('getBackPings', 'bigint'),
 ('goldEarned', 'bigint'),
 ('goldSpent', 'bigint'),
 ('longestTimeSpentLiving', 'bigint'),
 ('magicDamageDealt', 'bigint'),
 ('magicDamageDealtToChampions', 'bigint'),
 ('magicDamageTaken', 'bigint'),
 ('physicalDamageDealt', 'bigint'),
 ('physicalDamageDealtToChampions', 'bigint'),
 ('physicalDamageTaken', 'bigint'),
 ('totalDamageDealt', 'bigint'),
 ('totalDamageDealtToChampions', 'bigint'),
 ('totalDamageTaken', 'bigint'),
 ('totalTimeSpentDead', 'bigint'),
 ('spell1Casts', 'bigint'),
 ('spell2Casts', 'bigint')

In [27]:
# Print count / mean / stddev / min / max of totalDamageDealt for the bronze table.
df_bronze.select('totalDamageDealt').describe().show()

+-------+------------------+
|summary|  totalDamageDealt|
+-------+------------------+
|  count|                30|
|   mean| 85461.53333333334|
| stddev|63468.244113172026|
|    min|             10371|
|    max|            248731|
+-------+------------------+



In [28]:
# Print count / mean / stddev / min / max of totalDamageDealt for the silver table.
df_silver.select('totalDamageDealt').describe().show()

+-------+------------------+
|summary|  totalDamageDealt|
+-------+------------------+
|  count|               206|
|   mean|131928.78155339806|
| stddev| 98245.56793804554|
|    min|                 0|
|    max|            438335|
+-------+------------------+



In [29]:
# Print count / mean / stddev / min / max of totalDamageDealt for the gold table.
df_gold.select('totalDamageDealt').describe().show()

+-------+-----------------+
|summary| totalDamageDealt|
+-------+-----------------+
|  count|              572|
|   mean|92774.94755244756|
| stddev|72959.61077580982|
|    min|                0|
|    max|           413145|
+-------+-----------------+



In [30]:
# Print count / mean / stddev / min / max of totalDamageDealt for the platina table.
df_platina.select('totalDamageDealt').describe().show()

+-------+------------------+
|summary|  totalDamageDealt|
+-------+------------------+
|  count|               330|
|   mean| 66838.79696969697|
| stddev|44566.721576976546|
|    min|                 0|
|    max|            276676|
+-------+------------------+

