In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.utils import AnalysisException

In [8]:
# Create (or reuse) the notebook's SparkSession: client deploy mode, the
# "default" YARN queue, and Kryo serialization with class registration optional.
spark = (
    SparkSession.builder
    .appName("Test App")
    .config("spark.submit.deployMode", "client")
    .config("spark.yarn.queue", "default")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryo.registrationRequired", "false")
    .getOrCreate()
)

In [9]:
# Sanity check: count the rows read from the bronze "matchs" Parquet files.
try:
    # Jupyter only auto-displays a bare expression when it is the last
    # top-level statement of the cell; inside `try:` the value would be
    # silently dropped, so print it explicitly.
    print(
        spark.read
        .parquet("hdfs://hadoop-namenode:8020/datalake/bronze/matchs/*/*.parquet")
        .count()
    )
except AnalysisException as e:
    # Show a short message instead of a full traceback when Spark raises
    # an AnalysisException for this read.
    print(f"Erro do Spark ao acessar os dados: {e}")

2559

In [11]:
# Preview a single untruncated row from the bronze "matchs" Parquet files.
# The `try:` header is required: an `except` clause without it is a
# SyntaxError and the cell cannot run at all.
try:
    spark.read.parquet("hdfs://hadoop-namenode:8020/datalake/bronze/matchs/*/*.parquet").show(1, False)
except AnalysisException as e:
    # Show a short message instead of a full traceback when Spark raises
    # an AnalysisException for this read.
    print(f"Erro do Spark ao acessar os dados: {e}")

+--------------------------+--------------------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# Preview the silver "games" Parquet files: first 10 rows, values untruncated.
try:
    (
        spark.read
        .parquet("hdfs://hadoop-namenode:8020/datalake/silver/games/*/*.parquet")
        .show(n=10, truncate=False)
    )
except AnalysisException as err:
    # Print a short message rather than a traceback on a Spark analysis error.
    print(f"Erro do Spark ao acessar os dados: {err}")

+----------+-------------+-------------+---------+--------------+------+--------+-----------+--------------------+------------------+
|match_id  |game_creation|game_duration|game_mode|game_version  |map_id|queue_id|platform_id|game_start_timestamp|game_end_timestamp|
+----------+-------------+-------------+---------+--------------+------+--------+-----------+--------------------+------------------+
|3030284879|1732751269061|1802         |CLASSIC  |14.23.636.9832|11    |420     |BR1        |1732751296576       |1732753098738     |
|3030214552|1732744594242|2129         |CLASSIC  |14.23.636.9832|11    |420     |BR1        |1732744651922       |1732746780874     |
|3029903932|1732677786802|1475         |CLASSIC  |14.23.636.9832|11    |420     |BR1        |1732677817132       |1732679292578     |
|3029779417|1732666767753|1501         |CLASSIC  |14.23.636.9832|11    |420     |BR1        |1732666798344       |1732668299980     |
|3030226962|1732745905247|1879         |CLASSIC  |14.23.636.98

In [10]:
# Show up to 30 participant rows of one match from the silver "participants"
# Parquet files, ordered by the `win` column, values untruncated.
# NOTE(review): the recorded output for this cell shows only the header (no
# rows) -- confirm that match_id 3028715261 exists in the silver layer.
try:
    (
        spark.read
        .parquet("hdfs://hadoop-namenode:8020/datalake/silver/participants/*/*.parquet")
        .filter(col("match_id") == 3028715261)
        .orderBy("win")
        .show(n=30, truncate=False)
    )
except AnalysisException as err:
    # Print a short message rather than a traceback on a Spark analysis error.
    print(f"Erro do Spark ao acessar os dados: {err}")

+--------+--------------+-----------+-------------+-------+-------------------+----+-----+------+-------+-----+-----+-----+-----+-----+-----+-----+----------+--------------+-----------+-----------+------------------+------------------+-----------+---+
|match_id|participant_id|champion_id|champion_name|team_id|individual_position|lane|kills|deaths|assists|item0|item1|item2|item3|item4|item5|item6|allInPings|itemsPurchased|wardsKilled|wardsPlaced|total_damage_dealt|total_damage_taken|gold_earned|win|
+--------+--------------+-----------+-------------+-------+-------------------+----+-----+------+-------+-----+-----+-----+-----+-----+-----+-----+----------+--------------+-----------+-----------+------------------+------------------+-----------+---+
+--------+--------------+-----------+-------------+-------+-------------------+----+-----+------+-------+-----+-----+-----+-----+-----+-----+-----+----------+--------------+-----------+-----------+------------------+------------------+---------

In [11]:
# Preview the silver "teams_bans" Parquet files: first 10 rows, values untruncated.
try:
    (
        spark.read
        .parquet("hdfs://hadoop-namenode:8020/datalake/silver/teams_bans/*/*.parquet")
        .show(n=10, truncate=False)
    )
except AnalysisException as err:
    # Print a short message rather than a traceback on a Spark analysis error.
    print(f"Erro do Spark ao acessar os dados: {err}")

+----------+-------+--------+-----------+
|match_id  |team_id|ban_turn|champion_id|
+----------+-------+--------+-----------+
|3030098770|100    |4       |104        |
|3029935983|200    |8       |78         |
|3029935983|200    |6       |104        |
|3030187466|100    |2       |7          |
|3030187466|100    |1       |234        |
|3030187466|100    |5       |96         |
|3030173901|100    |2       |72         |
|3029864210|200    |8       |7          |
|3029997395|200    |9       |777        |
|3029935983|200    |9       |799        |
+----------+-------+--------+-----------+
only showing top 10 rows



In [12]:
# Preview the silver "teams_stats" Parquet files: first 10 rows, values untruncated.
try:
    (
        spark.read
        .parquet("hdfs://hadoop-namenode:8020/datalake/silver/teams_stats/*/*.parquet")
        .show(n=10, truncate=False)
    )
except AnalysisException as err:
    # Print a short message rather than a traceback on a Spark analysis error.
    print(f"Erro do Spark ao acessar os dados: {err}")

+----------+-------+-----+-----------+------------+-----------+
|match_id  |team_id|win  |baron_kills|dragon_kills|tower_kills|
+----------+-------+-----+-----------+------------+-----------+
|3030083372|100    |false|0          |0           |1          |
|3030244855|100    |false|0          |0           |0          |
|3029779417|100    |true |1          |2           |11         |
|3030226962|100    |false|1          |0           |5          |
|3030187466|200    |true |1          |3           |10         |
|3029813423|100    |false|1          |1           |6          |
|3030187466|100    |false|1          |2           |8          |
|3030104591|100    |true |1          |1           |9          |
|3030140364|200    |false|0          |0           |1          |
|3030038972|100    |false|1          |1           |9          |
+----------+-------+-----+-----------+------------+-----------+
only showing top 10 rows



In [21]:
# Preview the gold "match_summary" Parquet files: first 10 rows, values untruncated.
try:
    (
        spark.read
        .parquet("hdfs://hadoop-namenode:8020/datalake/gold/match_summary/*/*.parquet")
        .show(n=10, truncate=False)
    )
except AnalysisException as err:
    # Print a short message rather than a traceback on a Spark analysis error.
    print(f"Erro do Spark ao acessar os dados: {err}")

+----------+-------------+-------------+---------+--------------+------+--------+-----------+--------------------+------------------+
|match_id  |game_creation|game_duration|game_mode|game_version  |map_id|queue_id|platform_id|game_start_timestamp|game_end_timestamp|
+----------+-------------+-------------+---------+--------------+------+--------+-----------+--------------------+------------------+
|3030284879|1732751269061|1802         |CLASSIC  |14.23.636.9832|11    |420     |BR1        |1732751296576       |1732753098738     |
|3029991904|1732711544297|1687         |CLASSIC  |14.23.636.9832|11    |420     |BR1        |1732711661696       |1732713348656     |
|3030282492|1732751568469|1054         |CLASSIC  |14.23.636.9832|11    |0       |BR1        |1732751683374       |1732752797968     |
|3030198532|1732742724588|1576         |ARAM     |14.23.636.9832|12    |450     |BR1        |1732742824359       |1732744400424     |
|3029897739|1732676858685|1586         |CLASSIC  |14.23.636.98

In [22]:
# Preview the gold "player_performance" Parquet files: first 10 rows, values untruncated.
try:
    (
        spark.read
        .parquet("hdfs://hadoop-namenode:8020/datalake/gold/player_performance/*/*.parquet")
        .show(n=10, truncate=False)
    )
except AnalysisException as err:
    # Print a short message rather than a traceback on a Spark analysis error.
    print(f"Erro do Spark ao acessar os dados: {err}")

+----------+--------------+-----------+-------------+-------+-----------+------------+-------------+------------------+------------------+-----------------+------------+------------+---------------+----------+
|match_id  |participant_id|champion_id|champion_name|team_id|total_kills|total_deaths|total_assists|total_damage_dealt|total_damage_taken|total_gold_earned|wards_placed|wards_killed|items_purchased|win_status|
+----------+--------------+-----------+-------------+-------+-----------+------------+-------------+------------------+------------------+-----------------+------------+------------+---------------+----------+
|3029761510|1             |42         |Corki        |200    |8          |9           |12           |206346            |34016             |14079            |8           |5           |30             |false     |
|3029761510|2             |43         |Karma        |100    |1          |7           |20           |27292             |23315             |9664             |34  

In [23]:
# Preview the gold "team_performance" Parquet files: first 10 rows, values untruncated.
try:
    (
        spark.read
        .parquet("hdfs://hadoop-namenode:8020/datalake/gold/team_performance/*/*.parquet")
        .show(n=10, truncate=False)
    )
except AnalysisException as err:
    # Print a short message rather than a traceback on a Spark analysis error.
    print(f"Erro do Spark ao acessar os dados: {err}")

+----------+-------+----------+-----------------+------------------+-----------------+
|match_id  |team_id|win_status|total_baron_kills|total_dragon_kills|total_tower_kills|
+----------+-------+----------+-----------------+------------------+-----------------+
|3029761510|100    |true      |1                |4                 |11               |
|3029761510|200    |false     |0                |1                 |2                |
|3029770896|100    |true      |1                |3                 |6                |
|3029770896|200    |false     |0                |0                 |3                |
|3029779417|100    |true      |1                |2                 |11               |
|3029779417|200    |false     |0                |1                 |0                |
|3029781884|100    |true      |1                |4                 |11               |
|3029781884|200    |false     |0                |1                 |2                |
|3029789956|100    |false     |0           