In [None]:

# Databricks Notebook
# ===================================
# 02_Bronze_Layer
# ===================================

from pyspark.sql.types import *
from pyspark.sql.functions import *

# -------------------------
# Input data (Unity Catalog Volume holding the raw sample files)
# -------------------------
VOLUME_PATH = "/Volumes/azure_databricks_test/default/sample_data"

# -------------------------
# Output location (Unity Catalog)
# -------------------------
CATALOG_NAME = "azure_databricks_test"  # adjust to match your environment
SCHEMA_NAME = "ad_analytics"  # your schema name
# Fully qualified "catalog.schema" prefix used for every table written below.
DB_3PART = CATALOG_NAME + "." + SCHEMA_NAME

# Prepare the target: switch to the catalog, then create the schema
# only when it does not exist yet.
for setup_statement in (
    f"USE CATALOG {CATALOG_NAME}",
    f"CREATE SCHEMA IF NOT EXISTS {DB_3PART}",
):
    spark.sql(setup_statement)

# Start-of-run banner.
print("=" * 68)
print(f"ブロンズレイヤー構築開始 (catalog.schema = {DB_3PART})")
print("=" * 68)

# ============================================================
# 1. Product master (JSON): each file's top level is an array,
#    so the files are read in multiline mode
# ============================================================

print("\n[1/3] 商品マスタ（JSON配列）取り込み中...")

# (1) Read the JSON files as-is; the schema is inferred by Spark.
items_parsed_df = spark.read.option("multiline", "true").json(f"{VOLUME_PATH}/items/*.json")

# (2) Attach ingestion metadata in a single projection on top of the file scan.
#     `file_name` is taken from the hidden `_metadata` column so that every row
#     records the actual file it came from (previously a constant directory
#     literal, which lost per-file lineage).
items_bronze_df = items_parsed_df.withColumns({
    "ingestion_timestamp": current_timestamp(),
    "source_system": lit("product_master_db"),
    "file_name": col("_metadata.file_path"),
})

# (3) Persist as a Unity Catalog managed Delta table, replacing data and schema.
items_bronze_table = f"{DB_3PART}.bronze_items"
items_bronze_df.write \
    .format("delta").mode("overwrite").option("overwriteSchema", "true") \
    .saveAsTable(items_bronze_table)

# Count what was actually persisted. Calling count() on the lazy source
# DataFrame would re-read and re-parse every JSON file a second time.
items_row_count = spark.table(items_bronze_table).count()
print(f"✓ 商品マスタ取り込み完了: {items_row_count:,}件")

# ============================================================
# 2. Digital ads data (CSV)
# ============================================================

print("\n[2/3] 広告データ（CSV）取り込み中...")

# Explicit schema: avoids an inference pass over the CSV files and pins the
# column types. All fields are nullable.
ads_schema = StructType([
    StructField("campaign_id", StringType(), True),
    StructField("campaign_name", StringType(), True),
    StructField("ad_id", StringType(), True),
    StructField("ad_platform", StringType(), True),
    StructField("ad_format", StringType(), True),
    StructField("target_url", StringType(), True),
    StructField("impressions", IntegerType(), True),
    StructField("clicks", IntegerType(), True),
    StructField("cost", DoubleType(), True),
    StructField("date", StringType(), True),          # kept as a raw string in bronze
    StructField("utm_source", StringType(), True),
    StructField("utm_medium", StringType(), True),
    StructField("utm_campaign", StringType(), True),
    StructField("target_category", StringType(), True)
])

# Read every "*-ads.csv" file; the first line of each file is a header.
ads_raw_df = spark.read.schema(ads_schema) \
    .option("header", "true") \
    .csv(f"{VOLUME_PATH}/digital_ads/*-ads.csv")

# Attach ingestion metadata in a single projection on top of the file scan.
# `file_name` is taken from the hidden `_metadata` column so that every row
# records the actual file it came from (previously a constant directory
# literal, which lost per-file lineage).
ads_bronze_df = ads_raw_df.withColumns({
    "ingestion_timestamp": current_timestamp(),
    "source_system": lit("google_ads_api"),
    "file_name": col("_metadata.file_path"),
})

# Persist as a Delta table, replacing both data and schema.
ads_bronze_table = f"{DB_3PART}.bronze_digital_ads"
ads_bronze_df.write \
    .format("delta").mode("overwrite").option("overwriteSchema", "true") \
    .saveAsTable(ads_bronze_table)

# Count what was actually persisted. Calling count() on the lazy source
# DataFrame would re-read and re-parse every CSV file a second time.
ads_row_count = spark.table(ads_bronze_table).count()
print(f"✓ 広告データ取り込み完了: {ads_row_count:,}件")


# ============================================================
# 3. Transaction data (CSV)
# ============================================================

print("\n[3/3] トランザクションデータ（CSV）取り込み中...")

# Explicit schema: avoids an inference pass over the CSV files and pins the
# column types. All fields are nullable.
transactions_schema = StructType([
    StructField("transaction_id", StringType(), True),
    StructField("transaction_timestamp", StringType(), True),  # kept as a raw string in bronze
    StructField("item_id", StringType(), True),
    StructField("user_email", StringType(), True),
    StructField("user_id", StringType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("price", IntegerType(), True),
    StructField("referrer_url", StringType(), True),
    StructField("landing_page_url", StringType(), True),
    StructField("utm_source", StringType(), True),
    StructField("utm_medium", StringType(), True),
    StructField("utm_campaign", StringType(), True),
    StructField("device_type", StringType(), True),
    StructField("session_id", StringType(), True),
    StructField("conversion_time_minutes", IntegerType(), True)
])

# Read every "*-transactions.csv" file; the first line of each file is a header.
transactions_raw_df = spark.read.schema(transactions_schema) \
    .option("header", "true") \
    .csv(f"{VOLUME_PATH}/transactions/*-transactions.csv")

# Attach ingestion metadata in a single projection on top of the file scan.
# `file_name` is taken from the hidden `_metadata` column so that every row
# records the actual file it came from (previously a constant directory
# literal, which lost per-file lineage).
transactions_bronze_df = transactions_raw_df.withColumns({
    "ingestion_timestamp": current_timestamp(),
    "source_system": lit("ec_site_db"),
    "file_name": col("_metadata.file_path"),
})

# Persist as a Delta table, replacing both data and schema.
transactions_bronze_table = f"{DB_3PART}.bronze_transactions"
transactions_bronze_df.write \
    .format("delta").mode("overwrite").option("overwriteSchema", "true") \
    .saveAsTable(transactions_bronze_table)

# Count what was actually persisted. Calling count() on the lazy source
# DataFrame would re-read and re-parse every CSV file a second time.
transactions_row_count = spark.table(transactions_bronze_table).count()
print(f"✓ トランザクション取り込み完了: {transactions_row_count:,}件")


# ============================================================
# Completion check (SHOW TABLES / first rows / total row count)
# ============================================================

print("\n" + "=" * 68)
print("ブロンズレイヤー構築完了")
print("=" * 68)

# List every table in the target schema.
print("\n【テーブル一覧 (SHOW TABLES)】")
tables_listing_df = spark.sql(f"SHOW TABLES IN {DB_3PART}")
display(tables_listing_df)

# For each bronze table, show the first five rows and the total row count.
print("\n【テーブル先頭表示 & 総件数】")
for bronze_table in ("bronze_items", "bronze_digital_ads", "bronze_transactions"):
    qualified_name = f"{DB_3PART}.{bronze_table}"
    print(f"\n== {qualified_name} ==")
    display(spark.table(qualified_name).limit(5))
    print(f"record count: {spark.table(qualified_name).count()}")
