# 项目：HR_PROJECT
## 本文件用于定义所有表对象的 DDL 语句
https://learn.microsoft.com/zh-cn/azure/databricks/sql/get-started/data-warehousing-concepts

https://learn.microsoft.com/zh-cn/azure/databricks/lakehouse/medallion

# 项目：HR_PROJECT
## 本文件用于定义所有表对象的 DDL 语句
https://learn.microsoft.com/zh-cn/azure/databricks/sql/get-started/data-warehousing-concepts

https://learn.microsoft.com/zh-cn/azure/databricks/lakehouse/medallion

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType, TimestampNTZType,TimestampType
from pyspark.sql.functions import current_timestamp


# Table schemas, keyed by table name. Every column in every table is nullable.
def _nullable_fields(*names, dtype=StringType):
    """Return one nullable StructField per name, all of type ``dtype``, in order."""
    return [StructField(name, dtype(), True) for name in names]


def _audit_fields():
    """Return the DB_CREATED_DATE / DB_UPDATED_DATE timestamp column pair."""
    return _nullable_fields(
        "DB_CREATED_DATE", "DB_UPDATED_DATE", dtype=TimestampNTZType
    )


# NOTE(review): DEPARTMENT_ID / MANAGER_ID / LOCATION_ID are LongType in the
# dim_* tables but StringType in fct_hr_employees and the raw_* tables —
# confirm this difference is intended before joining on these keys.
schemas = {
    # Ingestion layer (Bronze): every column is kept as a string.
    "raw_hr_employees": StructType(
        _nullable_fields(
            "EMPLOYEE_ID",
            "FIRST_NAME",
            "LAST_NAME",
            "EMAIL",
            "PHONE_NUMBER",
            "HIRE_DATE",
            "JOB_ID",
            "SALARY",
            "MANAGER_ID",
            "DEPARTMENT_ID",
        )
    ),
    "raw_hr_employees_countries": StructType(
        _nullable_fields("COUNTRY_ID", "COUNTRY_NAME")
    ),
    "raw_hr_employees_departments": StructType(
        _nullable_fields(
            "DEPARTMENT_ID", "DEPARTMENT_NAME", "MANAGER_ID", "LOCATION_ID"
        )
    ),
    "raw_hr_employees_locations": StructType(
        _nullable_fields(
            "LOCATION_ID",
            "STREET_ADDRESS",
            "POSTAL_CODE",
            "CITY",
            "STATE_PROVINCE",
            "COUNTRY_ID",
        )
    ),
    # Cleansed layer (Silver): typed columns plus the two audit timestamps.
    "fct_hr_employees": StructType(
        _nullable_fields(
            "EMPLOYEE_ID", "FIRST_NAME", "LAST_NAME", "EMAIL", "PHONE_NUMBER"
        )
        + _nullable_fields("HIRE_DATE", dtype=TimestampNTZType)
        + _nullable_fields("JOB_ID", "SALARY", "MANAGER_ID", "DEPARTMENT_ID")
        + _audit_fields()
    ),
    "dim_hr_employees_countries": StructType(
        _nullable_fields("COUNTRY_ID", "COUNTRY_NAME") + _audit_fields()
    ),
    "dim_hr_employees_departments": StructType(
        _nullable_fields("DEPARTMENT_ID", dtype=LongType)
        + _nullable_fields("DEPARTMENT_NAME")
        + _nullable_fields("MANAGER_ID", "LOCATION_ID", dtype=LongType)
        + _audit_fields()
    ),
    "dim_hr_employees_locations": StructType(
        _nullable_fields("LOCATION_ID", dtype=LongType)
        + _nullable_fields(
            "STREET_ADDRESS",
            "POSTAL_CODE",
            "CITY",
            "STATE_PROVINCE",
            "COUNTRY_ID",
        )
        + _audit_fields()
    ),
    # Aggregation layer (Gold): one wide, denormalised employee table.
    "dm_hr_employees_wide": StructType(
        _nullable_fields(
            "EMPLOYEE_ID", "FIRST_NAME", "LAST_NAME", "EMAIL", "PHONE_NUMBER"
        )
        + _nullable_fields("HIRE_DATE", dtype=TimestampNTZType)
        + _nullable_fields(
            "JOB_ID",
            "SALARY",
            "MANAGER_FIRST_NAME",
            "MANAGER_LAST_NAME",
            "DEPARTMENT_NAME",
            "STREET_ADDRESS",
            "POSTAL_CODE",
            "CITY",
            "STATE_PROVINCE",
            "COUNTRY_NAME",
        )
        + _audit_fields()
    ),
}

# Table-creation helper
def create_table(
    table_name,
    schema,
    comment="",
    *,
    db_name="uc_data001.uc_schema001",
    mode="overwrite",
):
    """Create a Delta table by saving an empty DataFrame with the given schema.

    Args:
        table_name: Unqualified table name; it is prefixed with ``db_name``.
        schema: Schema passed to ``spark.createDataFrame`` for the empty frame.
        comment: Text passed to the writer as its ``"comment"`` option.
        db_name: Namespace prepended to ``table_name``. Defaults to the
            namespace this helper previously hard-coded.
        mode: Save mode handed to ``DataFrameWriter.mode``. Defaults to
            ``"overwrite"``, the mode this helper previously hard-coded.

    NOTE: per the Spark save-mode documentation, ``"overwrite"`` replaces the
    existing contents of a table with the same name, so re-running with the
    default discards any data already loaded. Pass ``mode="ignore"`` to leave
    existing tables untouched.
    """
    full_table_name = f"{db_name}.{table_name}"

    # An empty DataFrame carries only the schema, so saving it creates the
    # table without any rows.
    # `spark` is not defined in this file; presumably it is the session
    # injected by the notebook runtime — confirm before running elsewhere.
    empty_df = spark.createDataFrame([], schema)

    # Save as a Delta table under the fully qualified name.
    (
        empty_df.write.format("delta")
        .mode(mode)
        .option("comment", comment)
        .saveAsTable(full_table_name)
    )

    print(f"表 {full_table_name} 创建成功")

# Create every table declared in `schemas`
def create_all_tables():
    """Call ``create_table`` for each entry in ``schemas``, in dict order.

    Each table gets a comment of the form ``<layer label>表：<table name>``,
    where the layer label is chosen from the table-name prefix.
    """
    for name, table_schema in schemas.items():
        # Pick the layer label from the table-name prefix; any name that is
        # neither raw_* nor fct_*/dim_* falls through to the Gold label.
        if name.startswith("raw_"):
            layer_label = "接入层(Bronze)"
        elif name.startswith(("fct_", "dim_")):
            layer_label = "清洗层(Silver)"
        else:
            layer_label = "汇总层(Gold)"

        create_table(name, table_schema, f"{layer_label}表：{name}")

# Run the bulk table creation as soon as this cell/script executes.
create_all_tables()