Add an experimental dbt-sql template #1059

Merged Feb 19, 2024 (29 commits)
Commits (all by lennartkats-db):

* 4fee665 Add a dbt template (Dec 12, 2023)
* b3a5ef8 Use a template for VS Code settings (Dec 13, 2023)
* c81d139 Tweak message (Dec 19, 2023)
* c900cff Update (Dec 19, 2023)
* 16f26a3 Add tests (Dec 24, 2023)
* cd52c83 Merge remote-tracking branch 'databricks/main' into dbt-template (Dec 28, 2023)
* d85d4c4 Fix test (Dec 28, 2023)
* 419bd27 Merge remote-tracking branch 'databricks/main' into dbt-template (Jan 8, 2024)
* 9030f56 Add template (Jan 13, 2024)
* 45ea8db Improve catalog handling (Jan 13, 2024)
* 0268c88 Minor tweaks (Jan 13, 2024)
* 94ebd9a Update template to use materialized views & streaming tables (Jan 20, 2024)
* 14bc1fa Add conditional (Jan 20, 2024)
* 1501298 Improve template (Jan 20, 2024)
* 6fc5ed4 Offer an option to use personal schemas (Jan 22, 2024)
* 220a1ea Merge remote-tracking branch 'databricks/main' into dbt-template (Jan 22, 2024)
* 99f920e Fix ANSI mode (Jan 24, 2024)
* af0dd6d Merge remote-tracking branch 'databricks/main' into dbt-template (Jan 24, 2024)
* 1099eed Don't ask for a "production" schema, just assume "default" (Jan 25, 2024)
* 33c5e91 Explain mode: development (Jan 25, 2024)
* 7275310 Change project layout based on OSS team feedback (Jan 26, 2024)
* de7bd78 Improve DX with default_catalog helper (Jan 27, 2024)
* 8e7c6a1 Remove from list of templates for now (Jan 28, 2024)
* 18c6b70 Update README.md (Jan 28, 2024)
* a660efa Merge remote-tracking branch 'databricks/main' into dbt-template (Jan 28, 2024)
* 2f52ff1 Mark as experimental (Jan 29, 2024)
* e041148 Restore sql-dbt template in hidden form (Feb 19, 2024)
* 00bf2fe Merge remote-tracking branch 'databricks/main' into dbt-template (Feb 19, 2024)
* e5fb708 Copy-editing (Feb 19, 2024)
8 changes: 8 additions & 0 deletions libs/template/templates/dbt-sql/README.md
@@ -0,0 +1,8 @@
# dbt template

This folder provides a template for using dbt-core with Databricks Asset Bundles.
It follows the standard dbt project structure and has an additional `resources`
directory to define Databricks resources such as jobs that run dbt models.

* Learn more about dbt and its standard project structure here: https://docs.getdbt.com/docs/build/projects.
* Learn more about Databricks Asset Bundles here: https://docs.databricks.com/en/dev-tools/bundles/index.html
46 changes: 46 additions & 0 deletions libs/template/templates/dbt-sql/databricks_template_schema.json
@@ -0,0 +1,46 @@
{
"welcome_message": "\nWelcome to the dbt template for Databricks Asset Bundles!",
"properties": {
"project_name": {
"type": "string",
"pattern": "^[A-Za-z_][A-Za-z0-9_]+$",
"pattern_match_failure_message": "Name must consist of letters, numbers, and underscores.",
"default": "my_dbt_project",
"description": "\nPlease provide a unique name for this project.\nproject_name",
"order": 1
},
"workspace_host_override": {
"comment": "We explicitly ask users for the workspace_host since we ask for a http_path below. A downside of doing this is that {{user_name}} may not be correct if they pick a different workspace than the one from the current profile.",
"type": "string",
"pattern": "^https:\\/\\/[^/]+$",
"pattern_match_failure_message": "URL must be of the form https://my.databricks.host",
"description": "\nPlease provide the workspace URL to use.\nworkspace_url",
"default": "{{workspace_host}}",
"order": 2
},
"http_path": {
"type": "string",
"pattern": "^/sql/.\\../warehouses/[a-z0-9]+$",
[Review comment, Contributor]: How does this pattern work without + or *?

[Reply, Contributor Author]: I'm not sure I understand this question. There is a + in there?

[Reply, Contributor Author]: Or maybe you're referring to the `.\\..` part? That matches a version, like 1.0.
"pattern_match_failure_message": "Path must be of the form /sql/1.0/warehouses/abcdef1234567890",
"description": "\nPlease provide the HTTP Path of the SQL warehouse you would like to use with dbt during development\nYou can find this path by clicking on \"Connection Details\" for your SQL warehouse.\nhttp_path [example: /sql/1.0/warehouses/abcdef1234567890]",
"order": 3
},
"catalog": {
"type": "string",
"default": "",
"pattern": "^\\w*$",
"pattern_match_failure_message": "Invalid catalog name.",
"description": "\nPlease provide an initial catalog (leave blank if you would not want to use an initial catalog).\ncatalog",
"order": 4
},
"schema": {
"type": "string",
"default": "default",
"pattern": "^\\w+$",
"pattern_match_failure_message": "Invalid schema name.",
"description": "\nPlease provide a default schema for this project.\nNote that you can pick a different schema for local development when you first use the 'dbt init' command.\nschema",
"order": 4
}
},
"success_message": "\n📊 Your new project has been created in the '{{.project_name}}' directory!\nPlease refer to the README.md file for \"getting started\" instructions."
}
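The `http_path` pattern discussed in the review thread above can be sanity-checked directly. A quick sketch in Python (the function name is illustrative; note the JSON string `"^/sql/.\\../warehouses/[a-z0-9]+$"` unescapes to the regex below):

```python
import re

# The template's HTTP path pattern, with JSON escaping unwrapped.
# ".\.." matches a version segment such as "1.0".
HTTP_PATH_PATTERN = r"^/sql/.\../warehouses/[a-z0-9]+$"

def is_valid_http_path(path: str) -> bool:
    """Check a SQL warehouse HTTP path against the template's pattern."""
    return re.fullmatch(HTTP_PATH_PATTERN, path) is not None

print(is_valid_http_path("/sql/1.0/warehouses/abcdef1234567890"))  # True
print(is_valid_http_path("/sql/warehouses/abcdef1234567890"))      # False
```

This confirms the author's reply: the `+` is in the warehouse-id part, while `.\..` accepts any `x.y`-style version.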
7 changes: 7 additions & 0 deletions libs/template/templates/dbt-sql/library/versions.tmpl
@@ -0,0 +1,7 @@
{{define "latest_lts_dbr_version" -}}
13.3.x-scala2.12
{{- end}}

{{define "latest_lts_db_connect_version_spec" -}}
>=13.3,<13.4
{{- end}}
9 changes: 9 additions & 0 deletions libs/template/templates/dbt-sql/template/__preamble.tmpl
@@ -0,0 +1,9 @@
# Preamble

This file contains only template directives; it is skipped for the actual output.

{{skip "__preamble"}}

{{if eq .project_name "dbt"}}
{{fail "Project name 'dbt' is not supported"}}
{{end}}
@@ -0,0 +1,3 @@
# Typings for Pylance in Visual Studio Code
# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
from databricks.sdk.runtime import *
@@ -0,0 +1,8 @@
{
  "recommendations": [
    "databricks.databricks",
    "ms-python.vscode-pylance",
    "redhat.vscode-yaml",
    "databricks.sqltools-databricks-driver"
  ]
}
@@ -0,0 +1,30 @@
{
  "python.analysis.stubPath": ".vscode",
  "databricks.python.envFile": "${workspaceFolder}/.env",
  "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
  "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
  "python.testing.pytestArgs": [
    "."
  ],
  "python.testing.unittestEnabled": false,
  "python.testing.pytestEnabled": true,
  "python.analysis.extraPaths": ["src"],
  "files.exclude": {
    "**/*.egg-info": true,
    "**/__pycache__": true,
    ".pytest_cache": true
  },
  "python.envFile": "${workspaceFolder}/.databricks/.databricks.env",
  "python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
  "sqltools.connections": [
    {
      "connectionMethod": "VS Code Extension (beta)",
      "catalog": "hive_metastore",
      "previewLimit": 50,
      "driver": "Databricks",
      "name": "databricks",
      "path": "/sql/1.0/warehouses/ec7fa4bd0f0afc8f"
    }
  ],
  "sqltools.autoConnectTo": ""
}
119 changes: 119 additions & 0 deletions libs/template/templates/dbt-sql/template/{{.project_name}}/README.md
@@ -0,0 +1,119 @@
# {{.project_name}}

The '{{.project_name}}' project was generated using the dbt template for
Databricks Asset Bundles. It follows the standard dbt project structure
and has an additional `resources` directory to define Databricks resources such as jobs
that run dbt models.

* Learn more about dbt and its standard project structure here: https://docs.getdbt.com/docs/build/projects.
* Learn more about Databricks Asset Bundles here: https://docs.databricks.com/en/dev-tools/bundles/index.html

## Development setup

1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html

2. Authenticate to your Databricks workspace:
```
$ databricks configure
```

3. Install dbt

To install dbt, you need a recent version of Python. For the instructions below,
we assume `python3` refers to the Python version you want to use. On some systems,
you may need to refer to a different Python version, e.g. `python` or `/usr/bin/python`.

Run these instructions from the `{{.project_name}}` directory. We recommend making
use of a Python virtual environment and installing dbt as follows:

```
$ python3 -m venv .venv
$ . .venv/bin/activate
$ pip install -r requirements-dev.txt
```

4. Initialize your dbt profile

Use `dbt init` to initialize your profile.

```
$ dbt init
```

Note that dbt authentication uses personal access tokens by default
(see https://docs.databricks.com/dev-tools/auth/pat.html).
You can use OAuth as an alternative, but this currently requires manual configuration.
See https://github.com/databricks/dbt-databricks/blob/main/docs/oauth.md
for general instructions, or https://community.databricks.com/t5/technical-blog/using-dbt-core-with-oauth-on-azure-databricks/ba-p/46605
for advice on setting up OAuth for Azure Databricks.

To set up additional profiles, such as a 'prod' profile,
see https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles.

5. Activate dbt so it can be used from the terminal

```
$ . .venv/bin/activate
```

## Local development with dbt

Use `dbt` to [run this project locally using a SQL warehouse](https://docs.databricks.com/partners/prep/dbt.html):

```
$ dbt seed
$ dbt run
```

(Did you get an error that the dbt command could not be found? You may need
to try the last step from the development setup above to re-activate
your Python virtual environment!)

Use `dbt test` to run tests generated from yml files such as `models/schema.yml`
and any SQL tests from `tests/`:

```
$ dbt test
```

## Deploying to Databricks with Databricks Asset Bundles

Databricks Asset Bundles can be used to deploy to Databricks and to execute
dbt commands as a job using Databricks Workflows. See
https://docs.databricks.com/dev-tools/bundles/index.html to learn more.

Use the Databricks CLI to deploy a development copy of this project to a workspace:

```
$ databricks bundle deploy --target dev
```

(Note that "dev" is the default target, so the `--target` parameter
is optional here.)

This deploys everything that's defined for this project.
For example, the default template would deploy a job called
`[dev yourname] {{.project_name}}_job` to your workspace.
You can find that job by opening your workspace and clicking on **Workflows**.

To run the deployed job, use the "run" command:
```
$ databricks bundle run --target dev
```

To deploy a production copy, type:

```
$ databricks bundle deploy --target prod
```

## IDE support

Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
https://docs.databricks.com/dev-tools/vscode-ext.html. Third-party extensions
related to dbt may further enhance your dbt development experience!

## CI/CD

See https://docs.databricks.com/dev-tools/bundles/ci-cd.html for documentation
on CI/CD setup.
@@ -0,0 +1,43 @@
# This is a Databricks asset bundle definition for {{.project_name}}.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: {{.project_name}}

include:
  - resources/*.yml

# Variable declarations. These variables are assigned in the dev/prod targets below.
variables:
  warehouse_id:
    description: The warehouse to use
  catalog:
    description: The catalog to use
  schema:
    description: The schema to use

# Deployment targets.
targets:
  dev:
    default: true
    mode: development
    workspace:
      host: {{.workspace_host_override}}
    variables:
      warehouse_id: {{index ((regexp "[^/]+$").FindStringSubmatch .http_path) 0}}
      catalog: {{.catalog}}
      schema: {{.schema}} # tip: use ${workspace.current_user.short_name} if you want your own schema

  prod:
    mode: production
    workspace:
      host: {{.workspace_host_override}}
    variables:
      warehouse_id: {{index ((regexp "[^/]+$").FindStringSubmatch .http_path) 0}}
      catalog: {{.catalog}}
      schema: {{.schema}}
    {{- if not is_service_principal}}
    run_as:
      # This runs as {{user_name}} in production. We could also use a service principal here
      # using service_principal_name (see the Databricks documentation).
      user_name: {{user_name}}
    {{- end}}
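The `warehouse_id` values in the targets above are derived from the `http_path` answer with the Go template regexp `"[^/]+$"`, i.e. "everything after the last slash". The same extraction, sketched in Python for illustration (the function name is hypothetical):

```python
import re

def warehouse_id_from_http_path(http_path: str) -> str:
    """Take the last path segment, as the template's regexp "[^/]+$" does."""
    match = re.search(r"[^/]+$", http_path)
    if match is None:
        raise ValueError(f"no warehouse id found in {http_path!r}")
    return match.group(0)

print(warehouse_id_from_http_path("/sql/1.0/warehouses/abcdef1234567890"))
# abcdef1234567890
```

Because the schema already validates `http_path` against `/sql/<version>/warehouses/<id>`, the last segment is always the warehouse ID.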
@@ -0,0 +1,36 @@

# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: '{{.project_name}}'
version: '1.0.0'
config-version: 2

# This setting configures which "profile" dbt uses for this project.
profile: '{{.project_name}}'

# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that models in this project can be
# found in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]

clean-targets: # directories to be removed by `dbt clean`
  - "target"
  - "dbt_packages"

# Configuring models
# Full documentation: https://docs.getdbt.com/docs/configuring-models

# In this example config, we tell dbt to build all models in the example/
# directory as views. These settings can be overridden in the individual model
# files using the `{{"{{"}} config(...) {{"}}"}}` macro.
models:
  {{.project_name}}:
    # Config indicated by + and applies to all files under models/example/
    example:
      +materialized: view
@@ -0,0 +1,27 @@

/*
Welcome to your first dbt model!
Did you know that you can also configure models directly within SQL files?
This will override configurations stated in dbt_project.yml
Try changing "table" to "view" below
*/

{{ config(materialized='table') }}

with source_data as (

    select 1 as id
    union all
    select null as id

)

select *
from source_data

/*
Uncomment the line below to remove records with null `id` values
*/

-- where id is not null
@@ -0,0 +1,6 @@

-- Use the `ref` function to select from other models

select *
from {{ ref('my_first_dbt_model') }}
where id = 1
@@ -0,0 +1,21 @@

version: 2

models:
  - name: my_first_dbt_model
    description: "A starter dbt model"
    columns:
      - name: id
        description: "The primary key for this table"
        tests:
          - unique
          - not_null

  - name: my_second_dbt_model
    description: "A starter dbt model"
    columns:
      - name: id
        description: "The primary key for this table"
        tests:
          - unique
          - not_null
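The `unique` and `not_null` tests declared above compile to SQL checks that return failing rows. Conceptually, they behave like this rough Python sketch (not dbt's actual implementation):

```python
def failing_not_null(rows):
    """Rows that would fail dbt's not_null test on `id`."""
    return [r for r in rows if r["id"] is None]

def failing_unique(rows):
    """id values that would fail dbt's unique test (appear more than once)."""
    seen, dupes = set(), set()
    for r in rows:
        if r["id"] in seen:
            dupes.add(r["id"])
        seen.add(r["id"])
    return sorted(dupes)

# my_first_dbt_model produces one non-null and one null id:
rows = [{"id": 1}, {"id": None}]
print(failing_not_null(rows))  # [{'id': None}]
print(failing_unique(rows))    # []
```

Note that the starter model emits a null `id` row until its commented-out `where id is not null` filter is enabled, so the `not_null` test would flag that row, which is the tutorial point of the example.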