In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, year, when, lit, mean, coalesce, concat_ws, upper
from delta.tables import DeltaTable

In [2]:
# Configurar acesso ao MinIO
minio_config = {
    "fs.s3a.access.key": "admin",
    "fs.s3a.secret.key": "senhasegura",
    "fs.s3a.endpoint": "http://minio:9000",
    "fs.s3a.path.style.access": "true",
    "fs.s3a.connection.ssl.enabled": "false",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "fs.s3a.connection.ssl.enabled": "false",
    "spark.hadoop.fs.s3a.impl.disable.cache": "true",
    "fs.spark.hadoop.fs.s3a.attempts.maximum":"5"
}

def create_spark_session() -> SparkSession:
    spark = (
        SparkSession.builder \
            .appName("GoldZone") \
            .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0,org.apache.hadoop:hadoop-aws:3.3.1") 
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") 
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 
            .config("spark.hadoop.fs.s3minio.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            .getOrCreate()
    )
    spark.sparkContext.setLogLevel("WARN")
    return spark

spark = create_spark_session()

for key, value in minio_config.items():
    spark._jsc.hadoopConfiguration().set(key, value)

In [3]:
spark.sql("CREATE DATABASE IF NOT EXISTS gold")

DataFrame[]

In [4]:
def transform_matches():
    print("Transformando tabela matches...")
    
    matches = spark.read.format("delta").options(**minio_config).load("s3a://silver/soccer/matches_cleaned")
    
    gold_matches = matches.select(
        col("id").alias("match_id"),
        col("season_formatted").alias("season"),
        col("match_date"),
        col("home_team_api_id").alias("home_team_id"),
        col("away_team_api_id").alias("away_team_id"),
        col("home_team_goal").alias("home_score"),
        col("away_team_goal").alias("away_score"),
        col("league_id"),
        col("season_start_year"),
        col("match_result"),
        col("total_goals"),
        col("avg_home_odds"),
        col("avg_draw_odds"),
        col("avg_away_odds")
    ).dropDuplicates(["match_id"])

    gold_matches = gold_matches.fillna({
        "home_score": 0,
        "away_score": 0,
        "total_goals": 0,
        "avg_home_odds": 0,
        "avg_draw_odds": 0,
        "avg_away_odds": 0
    })
    
    gold_matches.write.format("delta") \
        .options(**minio_config) \
        .partitionBy("season_start_year", "league_id") \
        .mode("overwrite") \
        .save("s3a://gold/soccer/matches")
    
    print("Transformação de matches concluída!")

In [5]:
def transform_players():
    print("Transformando tabela players...")
    
    players = spark.read.format("delta").options(**minio_config).load("s3a://silver/soccer/players_enhanced")
    
    gold_players = players.select(
        col("player_api_id").alias("player_id"),
        col("player_name"),
        col("birth_date"),
        col("birth_year"),
        col("height_m"),
        col("weight_kg"),
        col("bmi"),
        col("position_category"),
        col("overall_rating"),
        col("potential"),
        col("preferred_foot")
    ).dropDuplicates(["player_id"])
    
    gold_players.write.format("delta") \
        .options(**minio_config) \
        .mode("overwrite") \
        .save("s3a://gold/soccer/players")
    
    print("Transformação de players concluída!")

In [6]:
transform_matches()
transform_players()

spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW gold_matches AS
SELECT * FROM delta.`s3a://gold/soccer/matches`
""")

spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW gold_players AS
SELECT * FROM delta.`s3a://gold/soccer/players`
""")

print("\nTabelas na camada Gold:")
spark.sql("SHOW TABLES IN gold").show()

Transformando tabela matches...
Transformação de matches concluída!
Transformando tabela players...
Transformação de players concluída!

Tabelas na camada Gold:
+---------+------------+-----------+
|namespace|   tableName|isTemporary|
+---------+------------+-----------+
|         |gold_matches|       true|
|         |gold_players|       true|
+---------+------------+-----------+

