In [3]:
# =============================================================
# Notebook: 02_silver_to_gold_modeling.ipynb
# Mục tiêu: Silver → Gold (Star Schema) cho Transfermarkt (Final v4)
# =============================================================

# ----------------------------
# 0) CẤU HÌNH CHUNG
# ----------------------------
MODE = "minio"   # or "local"

# Storage roots per mode as (silver, gold); an unknown MODE raises KeyError.
_STORAGE_ROOTS = {
    "minio": ("s3a://transfermarkt-silver", "s3a://transfermarkt-gold"),
    "local": ("/home/jovyan/work/silver", "/home/jovyan/work/gold"),
}

SILVER_BASE, GOLD_BASE = _STORAGE_ROOTS[MODE]

# ----------------------------
# 1) SPARK + KẾT NỐI MINIO
# ----------------------------
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window
import os

# Build (or reuse) the Spark session. The hadoop-aws and AWS SDK bundle
# packages are requested so that s3a:// paths can be used.
spark = (
    SparkSession.builder
    .appName("Silver→Gold Modeling (Transfermarkt Final)")
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.jars.packages",
            "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262")
    .getOrCreate()
)

if MODE == "minio":
    # Endpoint and credentials may be supplied through environment variables
    # so that secrets do not have to live in the notebook. The fallbacks are
    # the values that were previously hard-coded here, so behaviour is
    # unchanged when the variables are not set.
    s3a_settings = {
        "fs.s3a.endpoint": os.environ.get("MINIO_ENDPOINT", "http://minio:9000"),
        "fs.s3a.access.key": os.environ.get("MINIO_ACCESS_KEY", "admin"),
        "fs.s3a.secret.key": os.environ.get("MINIO_SECRET_KEY", "admin12345"),
        "fs.s3a.path.style.access": "true",
        "fs.s3a.connection.ssl.enabled": "false",
    }
    # Fetch the Hadoop configuration once instead of once per setting.
    hadoop_conf = spark._jsc.hadoopConfiguration()
    for s3a_key, s3a_value in s3a_settings.items():
        hadoop_conf.set(s3a_key, s3a_value)

print("✅ Spark:", spark.version)
print("SILVER_BASE:", SILVER_BASE)
print("GOLD_BASE:", GOLD_BASE)

# ----------------------------
# 2) FOLDER GOLD (LOCAL)
# ----------------------------
# In local mode create the gold output directory (no error if it already
# exists); in any other mode just report the bucket being used.
if MODE != "local":
    print("⚙️ Using existing MinIO bucket: transfermarkt-gold")
else:
    os.makedirs(GOLD_BASE, exist_ok=True)

# ----------------------------
# 3) HÀM TIỆN ÍCH
# ----------------------------
def normalize(col):
    """Normalise a string column so names can be joined with fewer mismatches.

    Steps, in order:
      1. remove every character that is not an ASCII letter, a digit,
         whitespace, ``-`` or ``_`` (non-ASCII letters are removed too);
      2. collapse each run of whitespace into a single space;
      3. trim both ends;
      4. lower-case.

    Characters are removed *before* whitespace is collapsed. Doing it the
    other way round leaves a double space wherever a removed symbol sat
    between two spaces (e.g. ``"A & B"``), which would stop it matching
    ``"A B"``.

    Args:
        col: Spark Column holding the raw name.

    Returns:
        A Column expression producing the normalised name.
    """
    stripped = F.regexp_replace(col, r"[^a-zA-Z0-9\s\-\_]", "")
    collapsed = F.regexp_replace(stripped, r"\s+", " ")
    return F.lower(F.trim(collapsed))

def make_key(df, business_cols, key_name):
    """Add an integer surrogate key column computed with ``dense_rank``.

    The rank is taken over a single window ordered by ``business_cols`` with
    no partitioning, so rows sharing the same business-column values get the
    same key.

    NOTE(review): keys depend on the set of values present in ``df``; they are
    only reproducible across runs if that set does not change.

    Args:
        df: Input DataFrame.
        business_cols: Column names that define the ordering of the key.
        key_name: Name of the key column to add.

    Returns:
        ``df`` with the extra ``key_name`` column.
    """
    ordering = Window.orderBy(*business_cols)
    rank_expr = F.dense_rank().over(ordering)
    return df.withColumn(key_name, rank_expr)

def read_silver(name):
    """Read the parquet dataset ``name`` under ``SILVER_BASE``.

    Prints the resolved path before reading and returns the DataFrame.
    """
    source = "{}/{}".format(SILVER_BASE, name)
    print("▶ Reading:", source)
    silver_df = spark.read.parquet(source)
    return silver_df

def write_gold(df, name, mode="overwrite"):
    """Write ``df`` as parquet to ``GOLD_BASE/<name>``.

    Args:
        df: DataFrame to persist.
        name: Dataset name, used as the last path segment.
        mode: Spark save mode (defaults to ``"overwrite"``).

    Returns:
        The path that was written to.
    """
    target = "{}/{}".format(GOLD_BASE, name)
    print("💾 Writing:", target)
    writer = df.write.mode(mode)
    writer.parquet(target)
    return target

# ----------------------------
# 4) ĐỌC SILVER
# ----------------------------
# Load the four Silver datasets, in this order.
players_silver, clubs_silver, transfers_silver, performance_silver = (
    read_silver(dataset)
    for dataset in ("players", "clubs", "transfers", "performance")
)

# ----------------------------
# 5) CHUẨN HOÁ DỮ LIỆU
# ----------------------------
# Players: ensure a "name" column exists, keep the known columns that are
# present, keep one row per player_id and add the normalised name.
plr = players_silver
if "name" not in plr.columns:
    # Build the name from first/last name, treating NULLs as empty strings.
    fn = F.coalesce(F.col("first_name"), F.lit(""))
    ln = F.coalesce(F.col("last_name"), F.lit(""))
    full_name = F.trim(F.concat_ws(" ", fn, ln))
    plr = plr.withColumn("name", full_name)

player_cols = [
    c
    for c in ("player_id", "name", "nationality", "position", "age", "dob")
    if c in plr.columns
]
plr = (
    plr.select(*player_cols)
    .dropDuplicates(["player_id"])
    .withColumn("name_norm", normalize(F.col("name")))
)

# Clubs: expose the club name as "club_name", keep the known columns that
# are present, de-duplicate by raw name and add the normalised join key.
clb = clubs_silver
has_club_name = "club_name" in clb.columns
if not has_club_name and "name" in clb.columns:
    clb = clb.withColumnRenamed("name", "club_name")

club_cols = [
    c
    for c in ("club_id", "club_name", "league", "country")
    if c in clb.columns
]
clb = clb.select(*club_cols).dropDuplicates(["club_name"])
clb = clb.withColumn("club_name_norm", normalize(F.col("club_name")))

# Transfers: keep the known columns that are present, add normalised club
# names for the joins, and cast season to string and fee to double.
transfer_cols = [
    c
    for c in ("player_id", "club_from", "club_to", "season", "fee", "transfer_date")
    if c in transfers_silver.columns
]
tx = transfers_silver.select(*transfer_cols)
for club_col in ("club_from", "club_to"):
    if club_col in tx.columns:
        tx = tx.withColumn(f"{club_col}_norm", normalize(F.col(club_col)))
tx = tx.withColumn("season", F.col("season").cast("string"))
tx = tx.withColumn("fee", F.col("fee").cast("double"))

# Performance: when the Silver data has no season / age column, add a
# placeholder ("Unknown" season, NULL integer age) so the select below works.
perf = performance_silver
perf_placeholders = (
    ("season", F.lit("Unknown")),
    ("age", F.lit(None).cast("int")),
)
for placeholder_name, placeholder_value in perf_placeholders:
    if placeholder_name not in perf.columns:
        perf = perf.withColumn(placeholder_name, placeholder_value)

perf_cols = ["player_id", "season", "goals", "assists", "minutes_played", "age"]
perf = perf.select(*perf_cols)

# ----------------------------
# 6) DIMENSIONS
# ----------------------------
# One row per player_id, keyed by player_id.
players_unique = plr.dropDuplicates(["player_id"])
dim_players = make_key(players_unique, ["player_id"], "player_key")

# One row per normalised club name, keyed by that normalised name.
clubs_unique = clb.dropDuplicates(["club_name_norm"])
dim_clubs = make_key(clubs_unique, ["club_name_norm"], "club_key")

# ----------------------------
# 7) FACT TRANSFERS
# ----------------------------
# Lookups from business identifiers to surrogate keys.
players_map = dim_players.select("player_id", "player_key")
clubs_map = dim_clubs.select("club_key", "club_name_norm")

# Resolve the origin club first, then the destination club, by normalised
# name. Left joins keep transfers whose club has no match in the dimension
# (their key stays NULL).
for side in ("from", "to"):
    club_lookup = (
        clubs_map
        .withColumnRenamed("club_key", f"club_{side}_key")
        .withColumnRenamed("club_name_norm", f"club_{side}_norm_dim")
    )
    tx = tx.join(
        club_lookup,
        tx[f"club_{side}_norm"] == F.col(f"club_{side}_norm_dim"),
        "left",
    )

# Attach the player surrogate key.
tx = tx.join(players_map, on="player_id", how="left")

fact_transfer_cols = [
    "player_key", "player_id",
    "club_from_key", "club_to_key",
    "club_from", "club_to",
    "season", "fee", "transfer_date",
]
fact_transfers = tx.select(*fact_transfer_cols)

# ----------------------------
# 8) FACT PERFORMANCE
# ----------------------------
# Attach the player surrogate key (left join: unmatched rows keep a NULL key).
perf_with_keys = perf.join(players_map, on="player_id", how="left")
fact_performance = perf_with_keys.select(
    "player_key", "player_id", "season",
    "goals", "assists", "minutes_played", "age",
)

# ----------------------------
# 9) DIM_SEASONS
# ----------------------------
# Distinct non-NULL seasons that appear in either fact table.
transfer_seasons = fact_transfers.select("season")
performance_seasons = fact_performance.select("season")
seasons_src = (
    transfer_seasons
    .union(performance_seasons)
    .dropna()
    .dropDuplicates(["season"])
)
dim_seasons = make_key(seasons_src.orderBy("season"), ["season"], "season_key")

# ----------------------------
# 10) GHI GOLD
# ----------------------------
# Persist every Gold table, in this order, using the default overwrite mode.
gold_tables = (
    (dim_players, "dim_players"),
    (dim_clubs, "dim_clubs"),
    (fact_transfers, "fact_transfers"),
    (fact_performance, "fact_performance"),
    (dim_seasons, "dim_seasons"),
)
for gold_df, gold_name in gold_tables:
    write_gold(gold_df, gold_name)

print("""
====================================================
✅ GOLD STAR SCHEMA CREATED SUCCESSFULLY!
- Dimensions: dim_players, dim_clubs, dim_seasons
- Facts: fact_transfers, fact_performance
====================================================
""")

# ----------------------------
# 11) DATA QUALITY CHECKS
# ----------------------------
def null_counts(df, cols):
    """Return a one-row DataFrame counting NULLs in each of ``cols``.

    Each output column is named ``null_<col>``.
    """
    counters = []
    for name in cols:
        is_missing = F.col(name).isNull()
        # The when() has no otherwise(), so only rows where the column is
        # NULL produce a value for count() to count.
        flagged = F.when(is_missing, name)
        counters.append(F.count(flagged).alias(f"null_{name}"))
    return df.select(counters)

# Row counts for both dimensions and both fact tables.
print("\n--- COUNTS ---")
count_targets = (
    ("dim_players", dim_players),
    ("dim_clubs", dim_clubs),
    ("fact_transfers", fact_transfers),
    ("fact_performance", fact_performance),
)
for table_label, table_df in count_targets:
    print(f"{table_label}:", table_df.count())

# NULL counts for the key columns of each table.
print("\n--- NULL KEYS ---")
null_key_targets = (
    (dim_players, ["player_key", "player_id"]),
    (dim_clubs, ["club_key"]),
    (fact_transfers, ["player_key", "club_from_key", "club_to_key"]),
    (fact_performance, ["player_key"]),
)
for checked_df, key_cols in null_key_targets:
    null_counts(checked_df, key_cols).show()


✅ Spark: 3.5.0
SILVER_BASE: s3a://transfermarkt-silver
GOLD_BASE: s3a://transfermarkt-gold
⚙️ Using existing MinIO bucket: transfermarkt-gold
▶ Reading: s3a://transfermarkt-silver/players
▶ Reading: s3a://transfermarkt-silver/clubs
▶ Reading: s3a://transfermarkt-silver/transfers
▶ Reading: s3a://transfermarkt-silver/performance
💾 Writing: s3a://transfermarkt-gold/dim_players
💾 Writing: s3a://transfermarkt-gold/dim_clubs
💾 Writing: s3a://transfermarkt-gold/fact_transfers
💾 Writing: s3a://transfermarkt-gold/fact_performance
💾 Writing: s3a://transfermarkt-gold/dim_seasons

✅ GOLD STAR SCHEMA CREATED SUCCESSFULLY!
- Dimensions: dim_players, dim_clubs, dim_seasons
- Facts: fact_transfers, fact_performance


--- COUNTS ---
dim_players: 32601
dim_clubs: 439
fact_transfers: 79646
fact_performance: 1706806

--- NULL KEYS ---
+---------------+--------------+
|null_player_key|null_player_id|
+---------------+--------------+
|              0|             0|
+---------------+--------------+

+-----