In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, year, month, when, lit, mean
from pyspark.sql.functions import coalesce
from pyspark.sql.functions import concat_ws, lower

In [None]:
# Spark session settings: Delta Lake + S3A packages, the Delta SQL extension
# and the Delta catalog implementation.
spark_settings = {
    "spark.jars.packages": "io.delta:delta-core_2.12:1.0.0,org.apache.hadoop:hadoop-aws:3.3.1",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = SparkSession.builder.appName("SilverZone")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Bronze-layer inputs, both stored as Delta tables.
player_df = (
    spark.read
    .format("delta")
    .load("s3a://bronze/soccer/player")
)
player_attr_df = (
    spark.read
    .format("delta")
    .load("s3a://bronze/soccer/player_attributes")
)

In [None]:
# 1. Join the player table with its attribute rows.
#
# Any non-key column that exists in BOTH tables would otherwise appear twice
# in the joined frame, which makes later `col(...)` references ambiguous and
# makes the Delta write fail on duplicate column names. Such columns are
# renamed on the attributes side with an "attr_" prefix so no data is lost.
# (Which columns actually overlap depends on the bronze schema, which is not
# visible here; when nothing overlaps this is a no-op.)
join_key = "player_api_id"
player_columns = set(player_df.columns)
overlapping_cols = [
    name for name in player_attr_df.columns
    if name in player_columns and name != join_key
]

player_attr_unique = player_attr_df
for name in overlapping_cols:
    player_attr_unique = player_attr_unique.withColumnRenamed(name, f"attr_{name}")

# Joining on the column name keeps a single `player_api_id` column (taken from
# the player side for a left join), so no explicit drop of the key is needed.
# Note: the key column becomes the first column of the result.
players_enhanced = player_df.join(player_attr_unique, on=join_key, how="left")

# Derived player columns, built in a single chain:
#   * birth_date / attributes_date: raw timestamp strings parsed into dates
#     (the raw `birthday` and `date` columns are dropped afterwards)
#   * birth_year: calendar year of birth_date
#   * height_m: height / 100 (centimetres to metres)
#   * weight_kg: weight * 0.453592 (pounds to kilograms)
#   * bmi: body-mass index, weight_kg / height_m^2
raw_timestamp_format = "yyyy-MM-dd HH:mm:ss"

birth_date_expr = to_date("birthday", raw_timestamp_format)
attributes_date_expr = to_date("date", raw_timestamp_format)
height_m_expr = col("height") / 100
weight_kg_expr = col("weight") * 0.453592
bmi_expr = col("weight_kg") / (col("height_m") ** 2)

players_enhanced = (
    players_enhanced
    .withColumn("birth_date", birth_date_expr)
    .withColumn("attributes_date", attributes_date_expr)
    .drop("date", "birthday")
    .withColumn("birth_year", year("birth_date"))
    .withColumn("height_m", height_m_expr)
    .withColumn("weight_kg", weight_kg_expr)
    .withColumn("bmi", bmi_expr)
)

# Fill nulls in numeric attribute columns with the column mean.
#
# Excluded from imputation:
#   * key columns (`id` / `*_id`): an "average id" is meaningless;
#   * `birth_year`: it is a partition column of the Silver table, and
#     coalescing it with a floating-point mean would turn it into a double
#     (partition values like 1987.0) and invent birth years.
# Note: imputed integer columns are widened to double, because the mean used
# as the replacement value is a double.
numeric_types = {"tinyint", "smallint", "int", "bigint", "float", "double"}
numeric_cols = [
    col_name
    for col_name, dtype in players_enhanced.dtypes
    if dtype in numeric_types
    and col_name != "birth_year"
    and col_name != "id"
    and not col_name.endswith("_id")
]

if numeric_cols:
    # Compute every mean in ONE aggregation job instead of one job per column.
    column_means = players_enhanced.agg(
        *[mean(col(col_name)).alias(col_name) for col_name in numeric_cols]
    ).first()

    # A single select keeps the original column order and avoids building a
    # deep plan out of many chained withColumn calls.
    players_enhanced = players_enhanced.select(*[
        coalesce(col(col_name), lit(column_means[col_name])).alias(col_name)
        if col_name in numeric_cols
        else col(col_name)
        for col_name in players_enhanced.columns
    ])

# Player classification.
# Rules are evaluated in order; the first match wins:
#   1. gk_diving > 50             -> Goalkeeper
#   2. defensive work rate "high" -> Defender
#   3. attacking work rate "high" -> Forward
#   4. anything else              -> Midfielder
# The work-rate comparison is case-insensitive so that "High", "high" and
# "HIGH" are all recognised (the casing used in the bronze data is not visible
# here -- TODO confirm).
defensive_rate = lower(col("defensive_work_rate"))
attacking_rate = lower(col("attacking_work_rate"))

players_enhanced = players_enhanced.withColumn(
    "position_category",
    when(col("gk_diving") > 50, "Goalkeeper")
    .when(defensive_rate == "high", "Defender")
    .when(attacking_rate == "high", "Forward")
    .otherwise("Midfielder")
)

# Persist the Silver table as Delta, partitioned by birth year and position
# category; any existing data at the target path is overwritten.
players_silver_writer = (
    players_enhanced.write
    .format("delta")
    .partitionBy("birth_year", "position_category")
    .mode("overwrite")
)
players_silver_writer.save("s3a://silver/soccer/players_enhanced")

print("Tabela players_enhanced criada na camada Silver!")

In [None]:
# Load the raw match table from the Bronze layer.
# The URI must be `s3a://<bucket>/<key>`: with three slashes the authority
# (bucket name) is empty and the S3A filesystem cannot resolve a bucket.
match_df = spark.read.format("delta").load("s3a://bucket-teste/soccer/bronze/match")

# Parse the raw timestamp string into a date and drop the raw column.
matches_cleaned = match_df.withColumn(
    "match_date",
    to_date("date", "yyyy-MM-dd HH:mm:ss")
).drop("date")

# Derive the season from the match date.
# A season spans two calendar years, so the calendar year of the match is only
# the season's start year for matches played in the first half of the season.
# A match played before SEASON_START_MONTH belongs to the season that started
# in the previous calendar year (e.g. 2009-03-01 -> 2008/2009).
# NOTE(review): assumes seasons start in July or later -- confirm for the
# leagues in this dataset.
SEASON_START_MONTH = 7

matches_cleaned = matches_cleaned.withColumn(
    "season_start_year",
    when(month("match_date") >= SEASON_START_MONTH, year("match_date"))
    .otherwise(year("match_date") - 1)  # stays null when match_date is null
).withColumn(
    "season_end_year",
    col("season_start_year") + 1
).withColumn(
    "season_formatted",
    concat_ws("/", col("season_start_year"), col("season_end_year"))
)

# Match outcome from the home side's perspective.
# "Draw" is assigned only when both scores are known and equal; when either
# score is null the result stays null instead of being reported as a draw.
home_goals = col("home_team_goal")
away_goals = col("away_team_goal")

matches_cleaned = matches_cleaned.withColumn(
    "match_result",
    when(home_goals > away_goals, "Home Win")
    .when(home_goals < away_goals, "Away Win")
    .when(home_goals == away_goals, "Draw")
)

# Total goals scored in the match (null when either score is null).
matches_cleaned = matches_cleaned.withColumn(
    "total_goals",
    home_goals + away_goals
)

# Average home-win odds across bookmakers, computed per match (row-wise).
#
# `mean()` is an aggregate function over ONE column across rows; it cannot be
# given a list of columns and cannot be used in withColumn without a window.
# The row-wise average is therefore built explicitly: the sum of the non-null
# odds divided by how many of them are non-null.
odds_columns = ["B365H", "BWH", "IWH", "PSH", "WHH", "VCH", "PSCH"]

# Only use bookmaker columns that actually exist in the match table (the
# bronze schema is not visible here), and report the ones that were skipped.
available_odds = [name for name in odds_columns if name in matches_cleaned.columns]
missing_odds = [name for name in odds_columns if name not in available_odds]
if missing_odds:
    print(f"Colunas de odds ausentes na tabela match (ignoradas): {missing_odds}")

if available_odds:
    odds_values = [col(name).cast("double") for name in available_odds]
    # Python's built-in sum() folds the Column expressions with `+`.
    odds_total = sum(coalesce(value, lit(0.0)) for value in odds_values)
    odds_present = sum(value.isNotNull().cast("int") for value in odds_values)
    # Null when no bookmaker quoted the match (avoids dividing by zero).
    avg_home_odds_expr = when(odds_present > 0, odds_total / odds_present)
else:
    avg_home_odds_expr = lit(None).cast("double")

matches_cleaned = matches_cleaned.withColumn(
    "avg_home_odds",
    avg_home_odds_expr
)

# Persist the cleaned matches as a Delta table in the Silver layer, partitioned
# by season start year and league; existing data at the path is overwritten.
matches_silver_writer = (
    matches_cleaned.write
    .format("delta")
    .partitionBy("season_start_year", "league_id")
    .mode("overwrite")
)
matches_silver_writer.save("s3a://soccer/silver/matches_cleaned")

print("Tabela matches_cleaned criada na camada Silver!")



In [None]:
# Show the schema and row count of each Silver table.
# Maps each table name to the partition columns it was written with.
silver_tables = {
    "players_enhanced": ["birth_year", "position_category"],
    "matches_cleaned": ["season_start_year", "league_id"]
}

# Each table is read back from the exact location the notebook wrote it to.
# The two tables do not share a common prefix (players_enhanced lives under
# s3a://silver/soccer/, matches_cleaned under s3a://soccer/silver/), so a
# single f-string template would point at a path that was never written.
silver_paths = {
    "players_enhanced": "s3a://silver/soccer/players_enhanced",
    "matches_cleaned": "s3a://soccer/silver/matches_cleaned",
}

for table, partitions in silver_tables.items():
    df = spark.read.format("delta").load(silver_paths[table])
    print(f"\nSchema da tabela {table}:")
    df.printSchema()
    print(f"\nContagem de registros: {df.count()}")
    print(f"\nPartições: {partitions}")