In [1]:
# =============================================================
# EXPORT GOLD (Parquet in MinIO) → PostgreSQL (for Superset)
# =============================================================
 
from pyspark.sql import SparkSession
 
# ----------------------------
# 1) Spark session
# ----------------------------
# Maven coordinates handed to spark.jars.packages as one comma-separated string.
_spark_packages = ",".join(
    [
        "org.apache.hadoop:hadoop-aws:3.3.4",
        "com.amazonaws:aws-java-sdk-bundle:1.12.262",
        "org.postgresql:postgresql:42.7.3",  # JDBC driver
    ]
)

# Configure the builder step by step, then create (or reuse) the session.
_builder = SparkSession.builder.appName("Export Gold to PostgreSQL")
_builder = _builder.config("spark.sql.session.timeZone", "UTC")
_builder = _builder.config("spark.jars.packages", _spark_packages)
spark = _builder.getOrCreate()
 
# ----------------------------
# 2) MinIO configuration (S3A)
# ----------------------------
# S3A settings are applied to the session's Hadoop configuration in this order.
_s3a_settings = (
    ("fs.s3a.endpoint", "http://minio:9000"),
    ("fs.s3a.access.key", "admin"),
    ("fs.s3a.secret.key", "admin12345"),
    ("fs.s3a.path.style.access", "true"),
    ("fs.s3a.connection.ssl.enabled", "false"),
)

_hadoop_conf = spark._jsc.hadoopConfiguration()
for _s3a_key, _s3a_value in _s3a_settings:
    _hadoop_conf.set(_s3a_key, _s3a_value)

print("✅ Spark ready!")
 
# ----------------------------
# 3) Gold layer locations (Parquet)
# ----------------------------
GOLD_BASE = "s3a://transfermarkt-gold"

# Table name -> Parquet path; each path is the table name appended to GOLD_BASE.
tables = {
    table: f"{GOLD_BASE}/{table}"
    for table in (
        "dim_players",
        "dim_clubs",
        "fact_transfers",
        "fact_performance",
    )
}
 
# ----------------------------
# 4) PostgreSQL connection settings (docker-compose)
# ----------------------------
# Credentials and driver class used for the JDBC connection.
jdbc_properties = dict(
    user="hive",
    password="metastore",
    driver="org.postgresql.Driver",
)

# JDBC URL of the target database.
jdbc_url = "jdbc:postgresql://postgres:5432/metastore"
 
# ----------------------------
# 5) Export Parquet → PostgreSQL
# ----------------------------
# Every table is exported independently (best-effort): if reading or writing
# one table fails, the error is printed, the table is recorded as failed, and
# the loop moves on to the next table. The final summary reports success only
# when no table was skipped.
failed_tables = []

for table_name, path in tables.items():
    try:
        print(f"▶ Reading Parquet: {path}")
        df = spark.read.parquet(path)

        print(f"💾 Writing to PostgreSQL: public.{table_name}")
        (
            df.write
            .format("jdbc")
            .option("url", jdbc_url)
            .option("dbtable", f"public.{table_name}")
            .option("user", jdbc_properties["user"])
            .option("password", jdbc_properties["password"])
            .option("driver", jdbc_properties["driver"])
            .mode("overwrite")  # or 'append'
            .save()
        )
    except Exception as e:
        # Deliberately broad: this is the per-table boundary, and one failing
        # table must not abort the remaining exports.
        failed_tables.append(table_name)
        print(f"❌ Skipped {table_name} | Error: {e}")
    else:
        print(f"✅ Done: {table_name}")

if failed_tables:
    # Do not claim success when at least one table was skipped.
    print(
        f"\n⚠ {len(failed_tables)}/{len(tables)} tables were NOT exported: "
        f"{', '.join(failed_tables)}"
    )
else:
    print("\n🎉 ALL GOLD TABLES EXPORTED TO POSTGRESQL SUCCESSFULLY!")
    print("➡ Superset now can connect and create dashboards.")


✅ Spark ready!
▶ Reading Parquet: s3a://transfermarkt-gold/dim_players
💾 Writing to PostgreSQL: public.dim_players
✅ Done: dim_players
▶ Reading Parquet: s3a://transfermarkt-gold/dim_clubs
💾 Writing to PostgreSQL: public.dim_clubs
✅ Done: dim_clubs
▶ Reading Parquet: s3a://transfermarkt-gold/fact_transfers
💾 Writing to PostgreSQL: public.fact_transfers
✅ Done: fact_transfers
▶ Reading Parquet: s3a://transfermarkt-gold/fact_performance
💾 Writing to PostgreSQL: public.fact_performance
✅ Done: fact_performance

🎉 ALL GOLD TABLES EXPORTED TO POSTGRESQL SUCCESSFULLY!
➡ Superset now can connect and create dashboards.
