In [0]:
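# Full reset (destructive): uncomment to drop the three catalogs and, because of CASCADE, everything inside them.
#spark.sql('DROP CATALOG IF EXISTS governance_prod CASCADE')
#spark.sql('DROP CATALOG IF EXISTS domain_dev CASCADE')
#spark.sql('DROP CATALOG IF EXISTS domain_prod CASCADE')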

In [0]:
# Create the catalogs and the schemas inside each of them. Every statement
# uses IF NOT EXISTS, so this cell can be re-run without failing on objects
# that are already there.
catalog_schemas = {
    'governance_prod': ['metadata', 'metrics'],
    'domain_dev': ['bronze_analytics', 'silver_analytics', 'gold_analytics'],
    # domain_prod gets the same medallion schemas as domain_dev: the schema
    # registry saved to governance_prod.metadata.schemas lists them under
    # this catalog, so they must exist for that registry to be accurate.
    'domain_prod': ['bronze_analytics', 'silver_analytics', 'gold_analytics'],
}

for catalog_name, schema_names in catalog_schemas.items():
    # The catalog has to exist before any schema can be created in it.
    spark.sql(f'CREATE CATALOG IF NOT EXISTS {catalog_name}')
    for schema_name in schema_names:
        spark.sql(f'CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name}')


In [0]:
#GOVERNANCE - METADATA - CATALOGS
# Seeds governance_prod.metadata.catalogs with one row per catalog and then
# attaches a comment to every column of the saved table.

# Column name -> column comment. The key order is also used as the column
# order of the DataFrame built below.
columns_comments = {
    "catalog_id": "Identifier of the catalog",
    "catalog_name": "Name of the catalog",
    "domain": "Domain of the catalog",
    "environment": "Environment of the catalog among dev and prod",
    "description": "Description of the catalog",
    "location": "Location of the catalog",
    "created_at": "Date of creation of the catalog",
    "owner": "Owner of the catalog",
}

# Values shared by every catalog row. created_at is kept as a plain string,
# so the column is stored as a string rather than a timestamp.
catalog_location = 'Databricks'
catalog_created_at = '2025-11-27 00:00:00'
catalog_owner = 'armando.n90@gmail.com'

# (catalog_id, catalog_name, domain, environment, description)
catalog_specs = [
    (1, 'governance_prod', 'governance', 'prod', 'Governance data and Metadata'),
    (2, 'domain_dev', 'domain', 'dev', 'Development environment for custom domain'),
    (3, 'domain_prod', 'domain', 'prod', 'Production environment for custom domain'),
]
catalog_rows = [
    spec + (catalog_location, catalog_created_at, catalog_owner)
    for spec in catalog_specs
]

metadata_catalog = spark.createDataFrame(data=catalog_rows, schema=list(columns_comments))

# Quick visual check of what is about to be written.
print(metadata_catalog.count())
metadata_catalog.show(10)

# 'overwrite' replaces the previous contents of the table on every run.
metadata_catalog.write.format('delta').mode('overwrite').saveAsTable('governance_prod.metadata.catalogs')

# Column comments are applied after the write, one ALTER statement per column.
comment_statements = [
    f"ALTER TABLE governance_prod.metadata.catalogs ALTER COLUMN {column} COMMENT '{comment}'"
    for column, comment in columns_comments.items()
]
for statement in comment_statements:
    spark.sql(statement)

In [0]:
#GOVERNANCE - METADATA - SCHEMAS
# Seeds governance_prod.metadata.schemas with one row per schema and then
# attaches a comment to every column of the saved table.

# Column name -> column comment. The key order is also used as the column
# order of the DataFrame built below.
columns_comments = {
    "schema_id": "Identifier of the schema",
    "catalog_id": "Identifier of the catalog",
    "schema_name": "Name of the schema",
    "description": "Description of the schema",
    "created_at": "Date of creation of the schema",
    "owner": "Owner of the schema",
}

# Values shared by every schema row. created_at is kept as a plain string,
# so the column is stored as a string rather than a timestamp.
schema_created_at = '2025-11-27 00:00:00'
schema_owner = 'armando.n90@gmail.com'

# (schema_id, catalog_id, schema_name, description); catalog_id points at the
# ids used in the catalogs registry (1 = governance_prod, 2 = domain_dev,
# 3 = domain_prod).
schema_specs = [
    (1, 1, 'metadata', 'Metadata'),
    (2, 1, 'metrics', 'Metrics'),
    (3, 2, 'bronze_analytics', 'Bronze medallion for analytics'),
    (4, 2, 'silver_analytics', 'Silver medallion for analytics'),
    (5, 2, 'gold_analytics', 'Gold medallion for analytics'),
    (6, 3, 'bronze_analytics', 'Bronze medallion for analytics'),
    (7, 3, 'silver_analytics', 'Silver medallion for analytics'),
    (8, 3, 'gold_analytics', 'Gold medallion for analytics'),
]
schema_rows = [spec + (schema_created_at, schema_owner) for spec in schema_specs]

# NOTE: despite its name, metadata_table holds the schema rows.
metadata_table = spark.createDataFrame(data=schema_rows, schema=list(columns_comments))

# Quick visual check of what is about to be written.
print(metadata_table.count())
metadata_table.show(10)

# 'overwrite' replaces the previous contents of the table on every run.
metadata_table.write.format('delta').mode('overwrite').saveAsTable('governance_prod.metadata.schemas')

# Column comments are applied after the write, one ALTER statement per column.
comment_statements = [
    f"ALTER TABLE governance_prod.metadata.schemas ALTER COLUMN {column} COMMENT '{comment}'"
    for column, comment in columns_comments.items()
]
for statement in comment_statements:
    spark.sql(statement)


In [0]:
#GOVERNANCE - METADATA - TABLES
# Table-level registry. Column comments are declared inline in the DDL, and
# the table is only created when it does not exist yet, so re-running this
# cell leaves an existing table (and its rows) untouched.
tables_ddl = """CREATE TABLE IF NOT EXISTS governance_prod.metadata.tables 
             (
                table_id INTEGER COMMENT 'Identifier of the table', 
                schema_id INTEGER COMMENT 'Identifier of the schema', 
                table_name STRING COMMENT 'Name of the table', 
                etl_module STRING COMMENT 'Name of the ETL module that ingests the table', 
                write_mode STRING COMMENT 'Write mode of the table among overwrite_partition, overwrite, merge and cdc', 
                quality STRING COMMENT 'Quality of the table among bronze, silver and gold', 
                table_type STRING COMMENT 'Type of the table among table, view and materialized', 
                version INTEGER COMMENT 'Version of the table', 
                current_flag BOOLEAN COMMENT 'Flag to indicate if the record is the current one',
                valid_from TIMESTAMP COMMENT 'Date of validity of the record',
                valid_to TIMESTAMP COMMENT 'Date of invalidity of the record', 
                description STRING COMMENT 'Description of the table',
                owner STRING COMMENT 'Owner of the table',
                retention_policy STRING COMMENT 'Retention policy of the table among permanent and temporary'
             )"""

spark.sql(tables_ddl)

In [0]:
#GOVERNANCE - METADATA - TABLES DETAIL
# Column-level registry: one row per column of a registered table. Column
# comments are declared inline in the DDL, and the table is only created when
# it does not exist yet.
spark.sql("""CREATE TABLE IF NOT EXISTS governance_prod.metadata.tables_detail 
             (
                table_id INTEGER COMMENT 'Identifier of the table', 
                column_id INTEGER COMMENT 'Identifier of the column', 
                column_name STRING COMMENT 'Name of the column', 
                rename_from STRING COMMENT 'Name of the column on the previous layer',
                data_type STRING COMMENT 'Data type of the column', 
                ordinal_position INTEGER COMMENT 'Ordinal position of the column',
                is_primary_key BOOLEAN COMMENT 'Flag to indicate if the column is a primary key',
                is_nullable BOOLEAN COMMENT 'Flag to indicate if the column is nullable',
                is_partition BOOLEAN COMMENT 'Flag to indicate if the column is a partition',
                is_pii BOOLEAN COMMENT 'Flag to indicate if the column is a personal identification information',
                validations STRING COMMENT 'Validation in JSON string',
                comment STRING COMMENT 'Comment of the column'
             )""")

# CREATE TABLE IF NOT EXISTS leaves an already-existing table untouched, so
# the corrected rename_from comment (it used to read "prevous") is re-applied
# explicitly to cover tables created before the typo was fixed.
spark.sql("ALTER TABLE governance_prod.metadata.tables_detail ALTER COLUMN rename_from COMMENT 'Name of the column on the previous layer'")

In [0]:
#GOVERNANCE - METRICS - INGESTIONS
# Per-ingestion metrics table. The table is created only when missing; the
# column comments are (re)applied on every run by the ALTER statements below.
spark.sql("""CREATE TABLE IF NOT EXISTS governance_prod.metrics.ingestions (table_id INTEGER, 
                catalog_name STRING, schema_name STRING, table_name STRING, batch_id STRING, loaded_at TIMESTAMP, etl_module STRING, 
                write_mode STRING, initial_rows LONG, final_rows LONG, accumulated_rows LONG, quality STRING)""")

# Column name -> column comment.
columns_comments = {
    "table_id": "Identifier of the table",
    "catalog_name": "Name of the catalog",
    "schema_name": "Name of the schema",
    "table_name": "Name of the table",
    "batch_id": "Identifier of the ingested batch",
    "loaded_at": "Date of loading",
    "etl_module": "Name of the ETL module that created the table",
    "write_mode": "Write mode of the table among overwrite_partition, overwrite, merge and cdc",
    "initial_rows": "Number of rows in the batch when the table was read",
    # Fixed duplicated word: previously "when the the table was written".
    "final_rows": "Number of rows in the batch when the table was written",
    "accumulated_rows": "Number of rows accumulated in the table after the ingestion",
    "quality": "Quality of the table among bronze, silver and gold"
}

# One ALTER statement per column, issued in the dictionary's order.
for column, comment in columns_comments.items():
    spark.sql(f"ALTER TABLE governance_prod.metrics.ingestions ALTER COLUMN {column} COMMENT '{comment}'")

In [0]:
#GOVERNANCE - METRICS - EVENT ERRORS
# Error-event table. The table is created only when missing; the column
# comments are (re)applied on every run by the ALTER statements below.
event_errors_ddl = """CREATE TABLE IF NOT EXISTS governance_prod.metrics.event_errors (
    error_event_id STRING, batch_id STRING, screen_code STRING, catalog_name STRING, schema_name STRING,
    table_name STRING, column_name STRING, record_identifier STRING, original_value STRING, replaced_value STRING,
    error_condition STRING
)"""
spark.sql(event_errors_ddl)

# Column name -> column comment.
columns_comments = {
    "error_event_id": "Unique identifier of the error event",
    "batch_id": "Unique identifier of the batch",
    "screen_code": "Unique code of the screen",
    "catalog_name": "Name of the catalog",
    "schema_name": "Name of the schema",
    "table_name": "Name of the table",
    "column_name": "Name of the column",
    "record_identifier": "Unique identifier of the record",
    "original_value": "Original value of the record",
    "replaced_value": "Replaced value of the record",
    "error_condition": "Error condition of the record"
}

# Build every ALTER statement first, then run them in the dictionary's order.
comment_statements = [
    f"ALTER TABLE governance_prod.metrics.event_errors ALTER COLUMN {column} COMMENT '{comment}'"
    for column, comment in columns_comments.items()
]
for statement in comment_statements:
    spark.sql(statement)

In [0]:
# Ad-hoc inspection of one governance table. Only the event_errors query is
# active; swap it for one of the commented alternatives to look at another table.
#test = spark.sql('SELECT * FROM governance_prod.metadata.catalogs')
#test = spark.sql('SELECT * FROM governance_prod.metadata.schemas')
#test = spark.sql('SELECT * FROM governance_prod.metadata.tables')
#test = spark.sql('SELECT * FROM governance_prod.metadata.tables_detail')
#test = spark.sql('SELECT * FROM governance_prod.metrics.ingestions')
inspection_query = 'SELECT * FROM governance_prod.metrics.event_errors'
test = spark.sql(inspection_query)
# truncate=False prints full cell values instead of shortening long strings.
test.show(truncate=False)

In [0]:
# Destructive: uncomment to delete every row from the event_errors table.
#spark.sql('DELETE FROM governance_prod.metrics.event_errors')