In [2]:
# ========================================
# SILVER â†’ GOLD (FINAL - CLEAN VERSION)
# ========================================

# 1) STOP SparkSession cÅ© náº¿u cÃ³
try:
    spark.stop()
except:
    pass

# 2) START SparkSession má»›i
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window
import os

spark = (
    SparkSession.builder
    .appName("Silverâ†’Gold Modeling (FINAL CLEAN)")
    .config("spark.jars.packages",
            "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262")
    # Náº¿u dÃ¹ng MinIO:
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", "admin")
    .config("spark.hadoop.fs.s3a.secret.key", "admin12345")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .getOrCreate()
)

print("âœ… Spark Started:", spark.version)

# 3) Path Silver / Gold
SILVER_BASE = "s3a://transfermarkt-silver"
GOLD_BASE   = "s3a://transfermarkt-gold"

# 4) HÃ m tiá»‡n Ã­ch
def normalize(col):
    return F.lower(F.trim(
        F.regexp_replace(
            F.regexp_replace(col, r"\s+", " "),
            r"[^a-zA-Z0-9\s\-\_]", ""
        )
    ))

def make_key(df, business_cols, key_name):
    w = Window.orderBy(*business_cols)
    return df.withColumn(key_name, F.dense_rank().over(w))

def read_silver(name):
    path = f"{SILVER_BASE}/{name}"
    print("â–¶ Reading SILVER:", path)
    return spark.read.parquet(path)

def write_gold(df, name, mode="overwrite"):
    path = f"{GOLD_BASE}/{name}"
    print("ðŸ’¾ Writing GOLD:", path)
    df.write.mode(mode).parquet(path)
    return path

# 5) LOAD SILVER DATA
players_silver     = read_silver("players")
clubs_silver       = read_silver("clubs")
transfers_silver   = read_silver("transfers")
performance_silver = read_silver("performance")
# 6) CHUáº¨N HÃ“A PLAYERS
plr = players_silver
if "name" not in plr.columns:
    fn = F.coalesce(F.col("first_name"), F.lit(""))
    ln = F.coalesce(F.col("last_name"), F.lit(""))
    plr = plr.withColumn("name", F.trim(F.concat_ws(" ", fn, ln)))

plr = plr.select(
    *[c for c in ["player_id","name","nationality","position","age"] if c in plr.columns]
).dropDuplicates(["player_id"])

plr = plr.withColumn("name_norm", normalize(F.col("name")))

# 7) CHUáº¨N HÃ“A CLUBS
clb = clubs_silver
if "club_name" not in clb.columns and "name" in clb.columns:
    clb = clb.withColumnRenamed("name", "club_name")

clb = clb.select(
    *[c for c in ["club_id","club_name","league","country"] if c in clb.columns]
)

clb = clb.dropDuplicates(["club_name"]) \
         .withColumn("club_name_norm", normalize(F.col("club_name")))

# 8) DIMENSIONS
dim_players = make_key(plr.dropDuplicates(["player_id"]), ["player_id"], "player_key")
dim_clubs   = make_key(clb.dropDuplicates(["club_name_norm"]), ["club_name_norm"], "club_key")
tx = transfers_silver.select(
    *[c for c in ["player_id","club_from","club_to","season","fee","transfer_date"]
      if c in transfers_silver.columns]
)

tx = tx.withColumn("club_from_norm", normalize(F.col("club_from"))) \
       .withColumn("club_to_norm",   normalize(F.col("club_to")))   \
       .withColumn("season", F.col("season").cast("string"))        \
       .withColumn("fee", F.col("fee").cast("double"))

# Map keys
clubs_map   = dim_clubs.select("club_key","club_name_norm")
players_map = dim_players.select("player_id","player_key")

# JOIN club_from
tx = tx.join(
    clubs_map.withColumnRenamed("club_key","club_from_key")
             .withColumnRenamed("club_name_norm","club_from_norm_dim"),
    tx["club_from_norm"] == F.col("club_from_norm_dim"),
    "left"
)

# JOIN club_to
tx = tx.join(
    clubs_map.withColumnRenamed("club_key","club_to_key")
             .withColumnRenamed("club_name_norm","club_to_norm_dim"),
    tx["club_to_norm"] == F.col("club_to_norm_dim"),
    "left"
)

# JOIN player_key
tx = tx.join(players_map, "player_id", "left")

# FACT TRANSFERS
fact_transfers = tx.select(
    "player_key","player_id",
    "club_from_key","club_to_key",
    "club_from","club_to",
    "season","fee","transfer_date"
)
# Chuáº©n hÃ³a performance
perf = performance_silver.select(
    *[c for c in ["player_id","goals","assists","minutes_played"] if c in performance_silver.columns]
).dropna(subset=["player_id"])

# JOIN láº¥y player_key + age tá»« dim_players
dim_players_age = dim_players.select("player_id","player_key","age")

fact_performance = perf.join(dim_players_age, "player_id", "left") \
    .select("player_key","player_id","goals","assists","minutes_played","age")
write_gold(dim_players, "dim_players")
write_gold(dim_clubs,   "dim_clubs")
write_gold(fact_transfers, "fact_transfers")
write_gold(fact_performance, "fact_performance")

print("""
====================================================
âœ… GOLD STAR SCHEMA CREATED SUCCESSFULLY (NO UNKNOWN)
- dim_players
- dim_clubs
- fact_transfers (season tháº­t)
- fact_performance (NO season, cÃ³ age)
====================================================
""")
print("\n--- ROW COUNTS ---")
print("dim_players:", dim_players.count())
print("dim_clubs:", dim_clubs.count())
print("fact_transfers:", fact_transfers.count())
print("fact_performance:", fact_performance.count())

print("\n--- NULL KEY CHECK ---")
dim_players.select(F.count(F.when(F.col("player_key").isNull(),1)).alias("null_player_key")).show()
dim_clubs.select(F.count(F.when(F.col("club_key").isNull(),1)).alias("null_club_key")).show()
fact_transfers.select(F.count(F.when(F.col("player_key").isNull(),1)).alias("null_player_key")).show()
fact_performance.select(F.count(F.when(F.col("player_key").isNull(),1)).alias("null_player_key")).show()

print("\n DONE!")


âœ… Spark Started: 3.5.0
â–¶ Reading SILVER: s3a://transfermarkt-silver/players
â–¶ Reading SILVER: s3a://transfermarkt-silver/clubs
â–¶ Reading SILVER: s3a://transfermarkt-silver/transfers
â–¶ Reading SILVER: s3a://transfermarkt-silver/performance
ðŸ’¾ Writing GOLD: s3a://transfermarkt-gold/dim_players
ðŸ’¾ Writing GOLD: s3a://transfermarkt-gold/dim_clubs
ðŸ’¾ Writing GOLD: s3a://transfermarkt-gold/fact_transfers
ðŸ’¾ Writing GOLD: s3a://transfermarkt-gold/fact_performance

âœ… GOLD STAR SCHEMA CREATED SUCCESSFULLY (NO UNKNOWN)
- dim_players
- dim_clubs
- fact_transfers (season tháº­t)
- fact_performance (NO season, cÃ³ age)


--- ROW COUNTS ---
dim_players: 32601
dim_clubs: 439
fact_transfers: 79646
fact_performance: 1706806

--- NULL KEY CHECK ---
+---------------+
|null_player_key|
+---------------+
|              0|
+---------------+

+-------------+
|null_club_key|
+-------------+
|            0|
+-------------+

+---------------+
|null_player_key|
+---------------+
|            