In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, trim, lower, year, current_date, concat_ws, when

# 1. Initialise the SparkSession and configure S3A access to MinIO.
#
# Connection settings are read from the environment so that credentials do
# not have to live in the notebook source. The fallbacks are the values that
# were previously hard-coded here, so behaviour is unchanged when the
# variables are not set.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "admin12345")

# Only enable SSL when the endpoint is actually served over HTTPS.
MINIO_USE_SSL = MINIO_ENDPOINT.lower().startswith("https://")

spark = (
    SparkSession.builder
    .appName("Transfermarkt ETL - Bronze to Silver (Real DataRaw)")
    # hadoop-aws and the AWS SDK bundle provide the s3a:// filesystem.
    .config("spark.jars.packages",
            "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262")
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    # Path-style access (endpoint/bucket/key) instead of virtual-host buckets.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", str(MINIO_USE_SSL).lower())
    .getOrCreate()
)

# Bucket roots for the raw (bronze) input and the cleaned (silver) output.
BRONZE = "s3a://transfermarkt-bronze"
SILVER = "s3a://transfermarkt-silver"

# --- 2. Players ---
# Read the raw players CSV from the bronze bucket, normalise the player name,
# derive an `age` column when a birth date is present, and write the result
# to the silver bucket as Parquet.
print("Processing players")
df_players = spark.read.csv(f"{BRONZE}/players.csv", header=True, inferSchema=True)

# Fall back to "first_name last_name" when `name` is NULL. concat_ws skips
# NULL inputs and yields "" (never NULL) when both parts are missing, so the
# normalised value is mapped back to NULL when it is empty; otherwise the
# dropna below would keep rows that have no usable name.
full_name = concat_ws(" ", trim(col("first_name")), trim(col("last_name")))
normalised_name = trim(lower(F.coalesce(col("name"), full_name)))

df_players_clean = (
    df_players
    # `when` without `otherwise` evaluates to NULL for empty names.
    .withColumn("name", when(normalised_name != "", normalised_name))
    .dropna(subset=["name"])
)

if "date_of_birth" in df_players_clean.columns:
    # Age in completed years. A plain difference of calendar years would
    # overstate the age by one for players whose birthday has not yet
    # occurred in the current year.
    df_players_clean = df_players_clean.withColumn(
        "age",
        F.floor(
            F.months_between(current_date(), col("date_of_birth")) / 12
        ).cast("int"),
    )

df_players_clean.write.mode("overwrite").parquet(f"{SILVER}/players")
print("Silver: players done")

# --- 3. Clubs ---
# Read the raw clubs CSV, expose the club name as a normalised `club_name`
# column, keep one row per distinct name, and write the result to silver.
print("Processing clubs")
df_clubs = spark.read.csv(f"{BRONZE}/clubs.csv", header=True, inferSchema=True)

# Rename first so every later step refers to `club_name`.
clubs_named = df_clubs.withColumnRenamed("name", "club_name")
clubs_with_name = clubs_named.dropna(subset=["club_name"])

# Lower-case and trim so that names differing only by case or surrounding
# whitespace collapse to the same key before de-duplication.
clubs_normalised = clubs_with_name.withColumn(
    "club_name", F.trim(F.lower(F.col("club_name")))
)
df_clubs_clean = clubs_normalised.dropDuplicates(["club_name"])

df_clubs_clean.write.mode("overwrite").parquet(f"{SILVER}/clubs")
print("Silver: clubs done")

# --- 4. Transfers ---
# Read the raw transfers CSV, shorten the column names, normalise the club
# names, store the season as a string, default missing fees to 0, drop rows
# without a player id, and write the result to silver.
print("Processing transfers")
df_transfers = spark.read.csv(f"{BRONZE}/transfers.csv", header=True, inferSchema=True)

# Source column -> silver column.
transfer_renames = {
    "from_club_name": "club_from",
    "to_club_name": "club_to",
    "transfer_season": "season",
    "transfer_fee": "fee",
}

df_transfers_clean = df_transfers
for source_name, target_name in transfer_renames.items():
    df_transfers_clean = df_transfers_clean.withColumnRenamed(source_name, target_name)

# Same trim + lower-case normalisation for both club name columns.
for club_column in ("club_from", "club_to"):
    df_transfers_clean = df_transfers_clean.withColumn(
        club_column, F.trim(F.lower(F.col(club_column)))
    )

season_as_text = F.col("season").cast("string")
# A NULL fee is replaced by 0; any other value is kept as is.
fee_or_zero = F.when(F.col("fee").isNotNull(), F.col("fee")).otherwise(0)

df_transfers_clean = (
    df_transfers_clean
    .withColumn("season", season_as_text)
    .withColumn("fee", fee_or_zero)
    .dropna(subset=["player_id"])
)

df_transfers_clean.write.mode("overwrite").parquet(f"{SILVER}/transfers")
print("Silver: transfers done")

# --- 5. Performance ---
# Build the per-appearance performance table from the raw appearances CSV:
# keep the player id plus whichever metric columns exist, cast the metrics
# to int, drop rows without a player id, and write the result to silver.
print("Processing performance from appearances")
df_appear = spark.read.csv(f"{BRONZE}/appearances.csv", header=True, inferSchema=True)

# Every row is keyed by player_id, so fail early with a clear message rather
# than with a column-resolution error further down.
if "player_id" not in df_appear.columns:
    raise ValueError(
        "appearances.csv has no 'player_id' column; cannot build the performance table"
    )

# Keep the relevant metric columns that are actually present in the file.
metric_cols = [
    c for c in ("goals", "assists", "minutes_played") if c in df_appear.columns
]
cols = ["player_id", *metric_cols]

# Cast only the metrics that were selected; casting a column that is absent
# from the file would raise even though the selection above tolerates it.
df_perf_clean = (
    df_appear
    .select(col("player_id"), *[col(c).cast("int").alias(c) for c in metric_cols])
    .dropna(subset=["player_id"])
)

df_perf_clean.write.mode("overwrite").parquet(f"{SILVER}/performance")
print("Silver: performance done")

# --- 6. Check the Silver results ---
# Read each silver table back and print its row count and column count.
print("\nSilver dataset summary:")
for t in ("players", "clubs", "transfers", "performance"):
    df = spark.read.parquet(f"{SILVER}/{t}")
    n_rows = df.count()
    n_cols = len(df.columns)
    print(f"{t}: count = {n_rows}, cols = {n_cols}")


Processing players
Silver: players done
Processing clubs
Silver: clubs done
Processing transfers
Silver: transfers done
Processing performance from appearances
Silver: performance done

Silver dataset summary:
players: count = 32601, cols = 24
clubs: count = 439, cols = 17
transfers: count = 79646, cols = 10
performance: count = 1706806, cols = 4
