In [None]:

# Databricks Notebook
# ===================================
# 03_Silver_Layer
# ===================================

from pyspark.sql.functions import *
from pyspark.sql.types import *

# Announce the start of the silver-layer build (three lines: rule, title, rule).
print(
    "=" * 60,
    "シルバーレイヤー構築開始 (Unity Catalog & saveAsTable)",
    "=" * 60,
    sep="\n",
)

# -------------------------
# Output location (Unity Catalog)
# -------------------------
CATALOG_NAME = "azure_databricks_test"  # change to match your environment
SCHEMA_NAME = "ad_analytics"  # your schema name
# "catalog.schema" prefix; appending ".<table>" yields the full three-part name.
DB_3PART = ".".join((CATALOG_NAME, SCHEMA_NAME))

# Switch to the target catalog, then create the schema if it is missing.
# Only the schema is created here; the catalog must already exist.
spark.sql("USE CATALOG " + CATALOG_NAME)
spark.sql("CREATE SCHEMA IF NOT EXISTS " + DB_3PART)

# -------------------------
# Load the bronze tables
# -------------------------
# One DataFrame per bronze source; the unpacking order matches the order of
# the table names in the tuple below.
bronze_items, bronze_ads, bronze_transactions = (
    spark.table(f"{DB_3PART}.{bronze_name}")
    for bronze_name in ("bronze_items", "bronze_digital_ads", "bronze_transactions")
)

# ===================================
# 1. Product master cleansing
# ===================================
# Builds silver_items from bronze_items: keeps active rows that have an
# item_id, parses launch_date, derives profit_margin and price_tier, and
# de-duplicates on item_id before writing a Unity Catalog managed table.
print("\n[1/3] 商品マスタクレンジング中...")

silver_items = (
    bronze_items
    .filter(col("item_id").isNotNull())
    # Column comparison (builds a Spark expression), not a Python identity test.
    .filter(col("is_active") == True)
    .withColumn("launch_date", to_date(col("launch_date")))
    # Profit margin in percent; only computed when price > 0 to avoid a
    # division by zero, otherwise NULL.
    .withColumn(
        "profit_margin",
        when(col("price") > 0, ((col("price") - col("cost")) / col("price")) * 100).otherwise(lit(None))
    )
    # Price tier. Branches are evaluated in order, so "Medium" covers
    # 5000 <= price < 15000. "High" is matched explicitly instead of being the
    # catch-all, so a NULL price yields a NULL tier rather than "High".
    .withColumn(
        "price_tier",
        when(col("price") < 5000, "Low")
        .when(col("price") < 15000, "Medium")
        .when(col("price") >= 15000, "High")
    )
    # NOTE(review): when several rows share an item_id, which one survives is
    # not controlled here - confirm that is acceptable.
    .dropDuplicates(["item_id"])
    .select(
        "item_id",
        "item_name",
        "category_l1",
        "category_l2",
        "category_l3",
        "brand",
        "price",
        "cost",
        "profit_margin",
        "price_tier",
        "color",
        "size",
        "season",
        "launch_date",
        "ingestion_timestamp"
    )
)

# Save as a Unity Catalog managed table (three-part name, saveAsTable).
(
    silver_items.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{DB_3PART}.silver_items")
)

# Count the table that was just written instead of calling
# silver_items.count(), which would re-run the whole bronze scan and dedup.
print(f"✓ 商品マスタクレンジング完了: {spark.table(f'{DB_3PART}.silver_items').count():,}件")

# ===================================
# 2. Ad data cleansing
# ===================================
# Builds silver_digital_ads from bronze_digital_ads: drops rows without a
# campaign_id or with negative clicks/cost, derives calendar columns and the
# CTR / CPC metrics, normalizes utm_campaign, and de-duplicates on
# (campaign_id, ad_id, date) before writing a table partitioned by date.
print("\n[2/3] 広告データクレンジング中...")

silver_ads = (
    bronze_ads
    .filter(col("campaign_id").isNotNull())
    # These comparisons also drop rows where clicks or cost is NULL.
    .filter(col("clicks") >= 0)
    .filter(col("cost") >= 0)
    .withColumn("date", to_date(col("date")))
    .withColumn("year_month", date_format(col("date"), "yyyy-MM"))
    # Spark's dayofweek numbers days 1 (Sunday) through 7 (Saturday).
    .withColumn("day_of_week", dayofweek(col("date")))
    # CTR in percent; only computed when impressions > 0, otherwise NULL.
    .withColumn("ctr", when(col("impressions") > 0, (col("clicks") / col("impressions")) * 100).otherwise(lit(None)))
    # CPC; only computed when clicks > 0, otherwise NULL.
    .withColumn("cpc", when(col("clicks") > 0, col("cost") / col("clicks")).otherwise(lit(None)))
    # Trim and lower-case utm_campaign.
    .withColumn("utm_campaign", lower(trim(col("utm_campaign"))))
    .dropDuplicates(["campaign_id", "ad_id", "date"])
    .select(
        "campaign_id",
        "campaign_name",
        "ad_id",
        "ad_platform",
        "ad_format",
        "target_url",
        "impressions",
        "clicks",
        "cost",
        "ctr",
        "cpc",
        "date",
        "year_month",
        "day_of_week",
        "utm_source",
        "utm_medium",
        "utm_campaign",
        "target_category",
        "ingestion_timestamp"
    )
)

# Save as a Unity Catalog managed table, partitioned by the date column.
(
    silver_ads.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("date")
    .saveAsTable(f"{DB_3PART}.silver_digital_ads")
)

# Count the table that was just written instead of calling silver_ads.count(),
# which would re-execute the full transformation over the bronze data.
print(f"✓ 広告データクレンジング完了: {spark.table(f'{DB_3PART}.silver_digital_ads').count():,}件")

# ===================================
# 3. Transaction data cleansing
# ===================================
# Builds silver_transactions from bronze_transactions: keeps rows with a
# transaction_id and positive price/quantity, derives time columns and
# total_amount, hashes the user email, flags campaign-attributed rows, and
# de-duplicates on transaction_id before writing a table partitioned by
# transaction_date.
print("\n[3/3] トランザクションデータクレンジング中...")

silver_transactions = (
    bronze_transactions
    .filter(col("transaction_id").isNotNull())
    # These comparisons also drop rows where price or quantity is NULL.
    .filter(col("price") > 0)
    .filter(col("quantity") > 0)
    .withColumn("transaction_timestamp", to_timestamp(col("transaction_timestamp")))
    .withColumn("transaction_date", to_date(col("transaction_timestamp")))
    .withColumn("year_month", date_format(col("transaction_timestamp"), "yyyy-MM"))
    .withColumn("hour", hour(col("transaction_timestamp")))
    # Spark's dayofweek numbers days 1 (Sunday) through 7 (Saturday).
    .withColumn("day_of_week", dayofweek(col("transaction_date")))
    .withColumn("total_amount", col("price") * col("quantity"))
    # SHA-256 of the email; the raw user_email column is not selected below,
    # so only the hash is written to the silver table.
    # NOTE(review): the email is hashed as-is (no trim/lower-casing) - confirm
    # it is normalized upstream if hashes must match across sources.
    .withColumn("user_email_hashed", sha2(col("user_email"), 256))
    # Trim and lower-case utm_campaign.
    .withColumn("utm_campaign", lower(trim(col("utm_campaign"))))
    # A blank utm_campaign (empty after trimming) is non-null but names no
    # campaign, so it must not flag the row as ad-driven.
    .withColumn(
        "is_paid_ad",
        when(col("utm_campaign").isNotNull() & (col("utm_campaign") != ""), lit(True)).otherwise(lit(False))
    )
    .dropDuplicates(["transaction_id"])
    .select(
        "transaction_id",
        "transaction_timestamp",
        "transaction_date",
        "year_month",
        "hour",
        "day_of_week",
        "item_id",
        "user_id",
        "user_email_hashed",
        "quantity",
        "price",
        "total_amount",
        "referrer_url",
        "landing_page_url",
        "utm_source",
        "utm_medium",
        "utm_campaign",
        "device_type",
        "session_id",
        "conversion_time_minutes",
        "is_paid_ad",
        "ingestion_timestamp"
    )
)

# Save as a Unity Catalog managed table, partitioned by transaction_date.
(
    silver_transactions.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("transaction_date")
    .saveAsTable(f"{DB_3PART}.silver_transactions")
)

# Count the table that was just written instead of calling
# silver_transactions.count(), which would re-execute the full transformation.
print(f"✓ トランザクションデータクレンジング完了: {spark.table(f'{DB_3PART}.silver_transactions').count():,}件")

# ===================================
# Data quality report
# ===================================
def _retention_label(silver_count: int, bronze_count: int) -> str:
    """Return silver/bronze as a one-decimal percentage, or "N/A" if bronze is empty.

    Guarding the zero case keeps the report from raising ZeroDivisionError
    when a bronze table has no rows.
    """
    if bronze_count == 0:
        return "N/A"
    return f"{(silver_count / bronze_count) * 100:.1f}%"


print("\n" + "="*60)
print("シルバーレイヤー構築完了 (UC & saveAsTable)")
print("="*60)

print("\n【データ品質レポート】")

# Row counts, bronze vs. silver, read back from the saved tables.
bronze_items_count = spark.table(f"{DB_3PART}.bronze_items").count()
silver_items_count = spark.table(f"{DB_3PART}.silver_items").count()
print(f"商品マスタ: {bronze_items_count:,} → {silver_items_count:,} ({_retention_label(silver_items_count, bronze_items_count)})")

bronze_ads_count = spark.table(f"{DB_3PART}.bronze_digital_ads").count()
silver_ads_count = spark.table(f"{DB_3PART}.silver_digital_ads").count()
print(f"広告データ: {bronze_ads_count:,} → {silver_ads_count:,} ({_retention_label(silver_ads_count, bronze_ads_count)})")

bronze_txn_count = spark.table(f"{DB_3PART}.bronze_transactions").count()
silver_txn_count = spark.table(f"{DB_3PART}.silver_transactions").count()
print(f"トランザクション: {bronze_txn_count:,} → {silver_txn_count:,} ({_retention_label(silver_txn_count, bronze_txn_count)})")

# Sample preview: the first five rows of each silver table, sorted so the
# rows shown are stable between runs.
print("\n【シルバー: 商品マスタ（先頭5件）】")
display(
    spark.table(f"{DB_3PART}.silver_items")
    .sort("item_id")
    .limit(5)
)

print("\n【シルバー: 広告データ（先頭5件）】")
display(
    spark.table(f"{DB_3PART}.silver_digital_ads")
    .sort("date", "campaign_id")
    .limit(5)
)

print("\n【シルバー: トランザクション（先頭5件）】")
display(
    spark.table(f"{DB_3PART}.silver_transactions")
    .sort("transaction_id")
    .limit(5)
)