# Bulk table column description generator


Customization required
- Adjust the LLM prompt passed to `ai_query` in `get_table_comments` to suit your data and naming conventions

Authors
- Scott Eade
- Sierra Yap

In [0]:
# Notebook input widgets as (widget name, label) pairs; every widget starts out empty.
for _widget_name, _widget_label in (
  ("Catalog", "Enter Catalog Name (Mandatory):"),
  ("Schema", "Enter Schema Name (Optional):"),
  ("Table", "Enter Table Name (Optional):"),
  ("Output Path", "Enter Output Path (Mandatory):"),
  ("Model Serving Endpoint Name", "Model Serving Endpoint Name (Mandatory):"),
):
  dbutils.widgets.text(_widget_name, "", _widget_label)

In [0]:
# Read the current value of each widget, in the order the widgets are declared.
catalog, schema, table, output_path, endpoint_name = (
  dbutils.widgets.get(widget_name)
  for widget_name in (
    "Catalog",
    "Schema",
    "Table",
    "Output Path",
    "Model Serving Endpoint Name",
  )
)

In [0]:
# Echo the chosen parameters as one comma-separated line for a quick sanity check.
print(catalog, schema, table, output_path, endpoint_name, sep=",")

In [0]:
# Function to retrieve the table column comments for a given catalog, schema, table.
def get_table_comments(catalog: str, schema: str = "", table: str = ""):
  """Return existing and LLM-generated comments for the columns in scope.

  Queries ``system.information_schema.columns`` and, through ``ai_query``,
  asks the model serving endpoint named by the notebook-level
  ``endpoint_name`` for a one-sentence description of every selected column.

  Args:
    catalog: Catalog to inspect. Required.
    schema: Optional schema name; empty means every schema in the catalog.
    table: Optional table name; empty means every table. The filter is
      applied whether or not ``schema`` is given, so a table name on its own
      is no longer silently ignored.

  Returns:
    The DataFrame produced by ``spark.sql`` with the columns table_catalog,
    table_schema, table_name, column_name, ordinal_position, replace_comment
    (true when the existing comment is NULL or empty), existing_comment and
    new_comment, ordered by catalog, schema, table and ordinal position.

  Raises:
    ValueError: If ``catalog`` is empty, or if ``endpoint_name`` is empty or
      contains a quote/backslash that would break the SQL string literal it
      is embedded in.
  """
  if not catalog:
    raise ValueError("catalog is required")
  # The endpoint name is spliced into the SQL text as a string literal (the other
  # inputs are bound as parameters), so reject characters that could terminate
  # or escape that literal.
  if not endpoint_name or "'" in endpoint_name or "\\" in endpoint_name:
    raise ValueError(
      "endpoint_name must be non-empty and must not contain quotes or backslashes"
    )

  # Only bind the parameters that the final statement actually references.
  filters = ["table_catalog = :catalog"]
  args = {"catalog": catalog}
  if schema:
    filters.append("table_schema = :schema")
    args["schema"] = schema
  if table:
    filters.append("table_name = :table")
    args["table"] = table
  where_clause = " AND ".join(filters)

  # NOTE(review): ai_query is selected for every row, including columns whose
  # replace_comment is false - confirm that generating a description for
  # already-documented columns is intended.
  query = f"""
    SELECT c.table_catalog, c.table_schema, c.table_name, c.column_name, c.ordinal_position, c.comment IS NULL or length(c.comment) == 0 AS replace_comment, c.comment AS existing_comment
    , ai_query('{endpoint_name}', 'Generate a 1 sentence description of the type of information that the column "' || c.column_name || '" from the table "' || c.table_name || '" in schema "' || c.table_schema || '" within the catalog "' || c.table_catalog || '" would contain (the data type of the column is "' || c.data_type || '"). This will be used as a column description, so there is no need to mention that this is a column within a schema within a catalog.') AS new_comment
    FROM system.information_schema.columns AS c
    JOIN system.information_schema.tables AS t USING (table_catalog, table_schema, table_name)
    WHERE {where_clause}
    ORDER BY table_catalog, table_schema, table_name, ordinal_position
    """
  return spark.sql(query, args=args)

In [0]:
# Build the comment DataFrame for the selected catalog/schema/table and show it for review.
# NOTE(review): the DataFrame is presumably re-evaluated on every action, so each
# display/write may call the model again - confirm, and persist the result if the
# reviewed text must match the exported text.
commented_columns = get_table_comments(catalog=catalog, schema=schema, table=table)
display(commented_columns)

### Optional: override generated column comments after review

In [0]:
from pyspark.sql import functions as F

# Manual overrides: one tuple per column whose generated comment should be replaced,
# in the form (catalog, schema, table, column, replacement comment). For example:
# updates = [
#     (catalog, "fgac", "customer", "record_id", "This is the updated comment for customer's record_id."),
#     (catalog, "fgac", "customer_pii_data_parquet", "ssn", "This is the updated comment for SSN in customer_pii_data_parquet."),
# ]

updates = []

if updates:
  # Columns that identify a single table column in both DataFrames
  key_columns = ["table_catalog", "table_schema", "table_name", "column_name"]

  # Turn the override tuples into a DataFrame keyed like the generated comments
  updates_df = spark.createDataFrame(updates, key_columns + ["updated_comment"])

  # Left join keeps every generated row; rows without an override get a NULL updated_comment
  joined_comments = commented_columns.join(updates_df, on=key_columns, how="left")

  # Prefer the override when present, otherwise keep the generated comment
  resolved_comment = F.coalesce(F.col("updated_comment"), F.col("new_comment"))
  commented_columns_updated = (
      joined_comments
      .withColumn("new_comment", resolved_comment)
      .drop("updated_comment")  # the helper column is no longer needed
  )
else:
  # Nothing to override: pass the generated comments through unchanged
  commented_columns_updated = commented_columns

In [0]:
# Show the comments, including any manual overrides, for a final check before they are written out.
display(commented_columns_updated)

In [0]:
# Write the reviewed comments from a single partition under <Output Path>/json.
# "Output Path" is mandatory: without this guard an empty value would silently target "/json".
if not output_path:
  raise ValueError("Output Path is required")
# Drop trailing slashes so the target never contains an empty path segment ("//json").
export_root = output_path.rstrip("/")

# Choose your desired file format
# commented_columns_updated.coalesce(1).write.mode("overwrite").option("header", "true").csv(export_root + "/csv")
commented_columns_updated.coalesce(1).write.mode("overwrite").option("header", "true").json(export_root + "/json")