diff --git a/.github/workflows/create_next_pr.yml b/.github/workflows/create_next_pr.yml new file mode 100644 index 0000000000..becef0db65 --- /dev/null +++ b/.github/workflows/create_next_pr.yml @@ -0,0 +1,18 @@ +on: + push: + branches: + - "current" + +jobs: + pull-request: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: pull-request + uses: repo-sync/pull-request@v2 + with: + source_branch: "current" + destination_branch: "next" + pr_title: "Merge current branch into next" + pr_body: "*An automated PR to keep the next branch up to date with current*" + github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/_redirects b/_redirects index 185fd106e6..ab0e4890b0 100644 --- a/_redirects +++ b/_redirects @@ -6,6 +6,7 @@ /dbt-cli/installation-guides/ubuntu-debian /dbt-cli/install/overview 302 /dbt-cli/installation-guides/windows /dbt-cli/install/overview 302 /dbt-cli/installation /dbt-cli/install/overview 302 +/dbt-jinja-functions /reference/dbt-jinja-functions 302 /docs /docs/introduction 302 /docs/adapter /docs/writing-code-in-dbt/jinja-context/adapter 302 /docs/analyses /docs/building-a-dbt-project/analyses 302 @@ -13,6 +14,7 @@ /docs/archival /docs/building-a-dbt-project/archival 302 /docs/artifacts /docs/dbt-cloud/using-dbt-cloud/artifacts 302 /docs/best-practices /guides/best-practices 302 +/docs/guides/best-practices /guides/best-practices 302 /docs/bigquery-configs /reference/resource-configs/bigquery-configs 302 /docs/building-a-dbt-project/building-models/bigquery-configs /reference/resource-configs/bigquery-configs 302 /docs/building-a-dbt-project/building-models/configuring-models /reference/model-configs @@ -96,6 +98,9 @@ /docs/global-cli-flags /reference/global-cli-flags 302 /docs/graph /docs/writing-code-in-dbt/jinja-context/graph 302 /docs/guides/writing-custom-schema-tests /docs/guides/writing-custom-generic-tests +/docs/guides/best-practices /guides/best-practices +/docs/guides/best-practices#choose-your-materializations-wisely /guides/best-practices 302 +/docs/guides/best-practices#version-control-your-dbt-project /guides/legacy/best-practices#version-control-your-dbt-project 302 /docs/hooks /docs/building-a-dbt-project/hooks-operations 302 /docs/init /reference/commands/init 302 /docs/install-from-source /dbt-cli/installation 302 @@ -251,6 +256,8 @@ /docs/writing-code-in-dbt/macros /docs/building-a-dbt-project/jinja-macros 302 /docs/writing-code-in-dbt/using-jinja /guides/getting-started/learning-more/using-jinja 302 /faqs/getting-help/ /guides/legacy/getting-help 302 +/migration-guide/upgrading-to-0-17-0 /guides/migration/versions 302 +/migration-guide/upgrading-to-0-18-0 /guides/migration/versions 302 /reference/accounts /dbt-cloud/api 302 /reference/api /dbt-cloud/api 302 /reference/connections /dbt-cloud/api 302 @@ -278,6 +285,7 @@ https://tutorial.getdbt.com/* https://docs.getdbt.com/:splat 301! 
/reference/project-configs/modules-paths /reference/project-configs/packages-install-path 302 /docs/dbt-cloud/using-dbt-cloud/cloud-slack-notifications /docs/dbt-cloud/using-dbt-cloud/cloud-notifications 302 /reference/warehouse-profiles/presto-profile /reference/profiles.yml 302 +/setting-up /guides/getting-started/getting-set-up/setting-up-bigquery 302 /tutorial/setting-up /guides/getting-started 302 /tutorial/test-and-document-your-project /guides/getting-started/building-your-first-project/test-and-document-your-project 302 /tutorial/build-your-first-models /guides/getting-started/building-your-first-project/build-your-first-models 302 @@ -295,15 +303,23 @@ https://tutorial.getdbt.com/* https://docs.getdbt.com/:splat 301! /tutorial/learning-more/* /guides/getting-started/learning-more/:splat 301 /tutorial/getting-set-up/* /guides/getting-started/getting-set-up/:splat 301 /tutorial/building-your-first-project/* /guides/getting-started/building-your-first-project/:splat 301 +/tutorial/refactoring-legacy-sql /guides/getting-started/learning-more/refactoring-legacy-sql 302 # migration and legacy guides -/docs/guides/migration-guide/upgrading-from-0-10-to-0-11 /guides/migration/versions/upgrading-to-0-11-0 302 -/docs/guides/migration-guide/upgrading-to-014 /guides/migration/versions/upgrading-to-0-14-0 302 -/docs/upgrading-to-014 /guides/migration/versions/upgrading-to-0-14-0 302 -/docs/upgrading-to-0-14-1 /guides/migration/versions/upgrading-to-0-14-1 302 -/docs/upgrading-to-0-16-0 /guides/migration/versions/upgrading-to-0-16-0 302 +/docs/guides/migration-guide/upgrading-to-0-14-0 /guides/migration/versions 302 +/docs/guides/migration-guide/upgrading-to-0-15-0 /guides/migration/versions 302 +/docs/guides/migration-guide/upgrading-to-0-16-0 /guides/migration/versions 302 +/docs/guides/migration-guide/upgrading-to-0-17-0 /guides/migration/versions 302 +/docs/guides/migration-guide/upgrading-to-0-18-0 /guides/migration/versions 302 +/docs/guides/migration-guide/upgrading-to-0-19-0 /guides/migration/versions 302 +/docs/guides/migration-guide/upgrading-from-0-10-to-0-11 /guides/migration/versions 302 +/docs/guides/migration-guide/upgrading-to-014 /guides/migration/versions 302 +/docs/upgrading-to-014 /guides/migration/versions 302 +/docs/upgrading-to-0-14-1 /guides/migration/versions 302 +/docs/upgrading-to-0-16-0 /guides/migration/versions 302 /docs/guides/migration-guide/upgrading-to-0-20-0 /guides/migration/versions/upgrading-to-v0.20 302 /docs/guides/migration-guide/upgrading-to-0-21-0 /guides/migration/versions/upgrading-to-v0.21 302 /docs/guides/migration-guide/upgrading-to-1-0-0 /guides/migration/versions/upgrading-to-v1.0 302 +/docs/guides/migration-guide/upgrading-to-v1.0 /guides/migration/versions/upgrading-to-v1.0 302 /docs/guides/getting-help /guides/legacy/getting-help 302 /docs/guides/migration-guide/* /guides/migration/versions/:splat 301! 
/docs/guides/best-practices /guides/best-practices diff --git a/website/blog/2019-05-01-how-we-structure-dbt-projects.md b/website/blog/2019-05-01-how-we-structure-dbt-projects.md index 8f38ec3de3..361373ae9d 100644 --- a/website/blog/2019-05-01-how-we-structure-dbt-projects.md +++ b/website/blog/2019-05-01-how-we-structure-dbt-projects.md @@ -24,11 +24,11 @@ It’s important to note that **this is not the only, or the objectively best, w * our views on data model design; which in turn are influenced by: * the kinds of analytics problems we are solving for clients -* the data stack we typically work within, in which multiple data sources are loaded by third party tools, and the data warehouse is optimized for analytical queries (therefore we aren’t tightly bounded by performance optimization considerations). +* the data stack we typically work within, in which multiple data sources are loaded by third party tools, and the is optimized for analytical queries (therefore we aren’t tightly bounded by performance optimization considerations). Our opinions are **almost guaranteed to change over time** as we update our views on modeling, are exposed to more analytics problems, and data stacks evolve. It’s also worth clearly stating here: the way we structure dbt projects makes sense for our projects, but may not be the best fit for yours! This article exists on Discourse so that we can have a conversation – I would love to know how others in the community are structuring their projects. -In comparison, the (recently updated) [best practices](/docs/guides/best-practices) reflect principles that we believe to be true for any dbt project. Of course, these two documents go hand in hand – our projects are structured in such a way that makes the those principles easy to observe, in particular: +In comparison, the (recently updated) [best practices](/guides/best-practices) reflect principles that we believe to be true for any dbt project. Of course, these two documents go hand in hand – our projects are structured in such a way that makes the those principles easy to observe, in particular: * Limit references to raw data * Rename and recast fields once @@ -127,7 +127,7 @@ Some dbt users prefer to have one `.yml` file per model (e.g. `stg_braintree__cu Earlier versions of the dbt documentation recommended implementing “base models” as the first layer of transformation – and we used to organize and name our models in this way, for example `models/braintree/base/base_payments.sql`. -We realized that while the reasons behind this convention were valid, the naming was an opinion, so in our recent update to the [best practices](/docs/guides/best-practices), we took the mention of base models out. Instead, we replaced it with the principles of “renaming and recasting once” and “limiting the dependencies on raw data”. +We realized that while the reasons behind this convention were valid, the naming was an opinion, so in our recent update to the [best practices](/guides/best-practices), we took the mention of base models out. Instead, we replaced it with the principles of “renaming and recasting once” and “limiting the dependencies on raw data”. 
That being said, in our dbt projects every source flows through exactly one model of the following form: diff --git a/website/blog/2021-02-05-dbt-project-checklist.md b/website/blog/2021-02-05-dbt-project-checklist.md index 4e07b57631..19ff0cd3ed 100644 --- a/website/blog/2021-02-05-dbt-project-checklist.md +++ b/website/blog/2021-02-05-dbt-project-checklist.md @@ -87,7 +87,7 @@ This post is the checklist I created to guide our internal work, and I’m shari ## ✅ Project structure ------------------------------------------------------------------------------------------------------------------------------------------------------ -* If you are using dimensional modeling techniques, do you have staging and marts models? +* If you are using techniques, do you have staging and marts models? * Do they use table prefixes like ‘fct\_’ and ‘dim\_’? * Is the code modular? Is it one transformation per one model? * Are you filtering as early as possible? @@ -156,7 +156,7 @@ This post is the checklist I created to guide our internal work, and I’m shari **Useful links** -* [Version control](/docs/guides/best-practices/#version-control-your-dbt-project) +* [Version control](/guides/legacy/best-practices#version-control-your-dbt-project) * [dbt Labs' PR Template](/blog/analytics-pull-request-template) ## ✅ Documentation diff --git a/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md b/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md index 14985782bc..15c27988d5 100644 --- a/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md +++ b/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md @@ -148,7 +148,7 @@ This approach is nearly identical to the former (completely separate repositorie * Does not prevent conflicting business logic or duplicate macros * All models must have unique names across all packages -\*\* The project will include the information from the dbt projects but might be missing information that is pulled from your data warehouse if you are on multiple Snowflake accounts/Redshift instances. This is because dbt is only able to query the information schema from that one connection. +\*\* The project will include the information from the dbt projects but might be missing information that is pulled from your if you are on multiple Snowflake accounts/Redshift instances. This is because dbt is only able to query the information schema from that one connection. ## So… to mono-repo or not to mono-repo? ------------------------------------------------------------------------------- diff --git a/website/blog/2021-09-11-sql-dateadd.md b/website/blog/2021-09-11-sql-dateadd.md index d020dd5b41..eae9a50096 100644 --- a/website/blog/2021-09-11-sql-dateadd.md +++ b/website/blog/2021-09-11-sql-dateadd.md @@ -81,7 +81,7 @@ I am sorry - that’s just a blank 2x2 matrix. I've surrendered to just searchin But couldn’t we be doing something better with those keystrokes, like typing out and then deleting a tweet? -dbt (and the [dbt_utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/#dateadd-source-macros-cross_db_utils-dateadd-sql-) macro package) helps us smooth out these wrinkles of writing SQL across data warehouses. +dbt (and the [dbt_utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/#dateadd-source-macros-cross_db_utils-dateadd-sql-) macro package) helps us smooth out these wrinkles of writing SQL across data warehouses. 
Instead of looking up the syntax each time you use it, you can just write it the same way each time, and the macro compiles it to run on your chosen warehouse: diff --git a/website/blog/2021-11-22-primary-keys.md b/website/blog/2021-11-22-primary-keys.md index fe29d47726..0467565ee2 100644 --- a/website/blog/2021-11-22-primary-keys.md +++ b/website/blog/2021-11-22-primary-keys.md @@ -89,7 +89,7 @@ Having tests configured and running in production using the [`dbt test`](https:/ Does your warehouse even _support_ primary keys at all? If it does, how can you actually find out if a table has a primary key set, and what that primary key is? -Let’s walk through primary key support + access across the major cloud data warehouse platforms. +Let’s walk through primary key support + access across the major cloud platforms. ### TL;DR on primary key support across warehouses diff --git a/website/blog/2021-11-22-sql-surrogate-keys.md b/website/blog/2021-11-22-sql-surrogate-keys.md index c9cbabeb16..05422ba350 100644 --- a/website/blog/2021-11-22-sql-surrogate-keys.md +++ b/website/blog/2021-11-22-sql-surrogate-keys.md @@ -156,7 +156,7 @@ output: | `null` | 123 | \|123 | -Let’s take a look at how generating surrogate keys specifically looks in practice across data warehouses, and how you can use one simple dbt macro ([dbt_utils.surrogate_key](https://github.com/dbt-labs/dbt-utils#surrogate_key-source)) to abstract away the null value problem. +Let’s take a look at how generating surrogate keys specifically looks in practice across data warehouses, and how you can use one simple dbt macro ([dbt_utils.surrogate_key](https://github.com/dbt-labs/dbt-utils#surrogate_key-source)) to abstract away the null value problem. ### A surrogate_key macro to the rescue diff --git a/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md b/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md index 0b4699e4f6..6c8de43458 100644 --- a/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md +++ b/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md @@ -69,7 +69,7 @@ In addition to learning the basic pieces of dbt, we're familiarizing ourselves w If we decide not to do this, we end up missing out on what the dbt workflow has to offer. If you want to learn more about why we think analytics engineering with dbt is the way to go, I encourage you to read the [dbt Viewpoint](/docs/about/viewpoint)! -In order to learn the basics, we’re going to [port over the SQL file](/tutorial/refactoring-legacy-sql) that powers our existing "patient_claim_summary" report that we use in our KPI dashboard in parallel to our old transformation process. We’re not ripping out the old plumbing just yet. In doing so, we're going to try dbt on for size and get used to interfacing with a dbt project. +In order to learn the basics, we’re going to [port over the SQL file](/guides/getting-started/learning-more/refactoring-legacy-sql) that powers our existing "patient_claim_summary" report that we use in our KPI dashboard in parallel to our old transformation process. We’re not ripping out the old plumbing just yet. In doing so, we're going to try dbt on for size and get used to interfacing with a dbt project. 
**Project Appearance** diff --git a/website/blog/2022-02-07-customer-360-view-census-playbook.md b/website/blog/2022-02-07-customer-360-view-census-playbook.md index c254919c55..8da0a48837 100644 --- a/website/blog/2022-02-07-customer-360-view-census-playbook.md +++ b/website/blog/2022-02-07-customer-360-view-census-playbook.md @@ -75,7 +75,7 @@ JaffleGaggle has to keep track of information about their interactions with thei All of these questions require aggregating + syncing data from application usage, workspace information, and orders into the CRM for the sales team to have at their fingertips. -This aggregation process requires an analytics warehouse, as all of these things need to be synced together outside of the application database itself to incorporate other data sources (billing / events information, past touchpoints in the CRM, etc). Thus, we can create our fancy customer 360 within JaffleGaggle’s data warehouse, which is a standard project for a B2B company’s data team. +This aggregation process requires an analytics warehouse, as all of these things need to be synced together outside of the application database itself to incorporate other data sources (billing / events information, past touchpoints in the CRM, etc). Thus, we can create our fancy customer 360 within JaffleGaggle’s , which is a standard project for a B2B company’s data team. **Diving into data modeling** diff --git a/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md b/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md index 5e65586c7d..2f8a48c7a1 100644 --- a/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md +++ b/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md @@ -30,7 +30,7 @@ Enter this story. I’m Nate and I manage the Analytics Engineering team at [Sma ## State of Analytics Before Analytics Engineering -Smartsheet, in general, has a great analytics setup. Strong data engineering and data analytics teams. A cloud data warehouse and an on-prem BI tool for front-end data visibility.  However, even with that foundation, there were some limitations under the hood requiring action: +Smartsheet, in general, has a great analytics setup. Strong data engineering and data analytics teams. A cloud and an on-prem BI tool for front-end data visibility.  However, even with that foundation, there were some limitations under the hood requiring action: ### (1) Multiple undocumented transformation databases diff --git a/website/blog/2022-04-19-complex-deduplication.md b/website/blog/2022-04-19-complex-deduplication.md index fdb3cf1f60..be61eae643 100644 --- a/website/blog/2022-04-19-complex-deduplication.md +++ b/website/blog/2022-04-19-complex-deduplication.md @@ -15,7 +15,7 @@ Let’s get rid of these dupes and send you on your way to do the rest of the *s -You’re here because your duplicates are *special* duplicates. These special dupes are not the basic ones that have same exact values in every column and duplicate primary keys that can be easily fixed by haphazardly throwing in a `distinct` (yeah that’s right, I called using `distinct` haphazard!). These are *partial* duplicates, meaning your entity of concern's primary key is not unique *on purpose* (or perhaps you're just dealing with some less than ideal data syncing). You may be capturing historical, type-two slowly changing dimensional data, or incrementally building a table with an append-only strategy, because you actually want to capture some change over time for the entity your recording. 
(Or, as mentioned, your loader may just be appending data indiscriminately on a schedule without much care for your time and sanity.) Whatever has brought you here, you now have a table where the is not your entity’s primary key, but instead the entity’s primary key + the column values that you’re tracking. Confused? Let’s look at an example. +You’re here because your duplicates are *special* duplicates. These special dupes are not the basic ones that have same exact values in every column and duplicate primary keys that can be easily fixed by haphazardly throwing in a `distinct` (yeah that’s right, I called using `distinct` haphazard!). These are *partial* duplicates, meaning your entity of concern's primary key is not unique *on purpose* (or perhaps you're just dealing with some less than ideal data syncing). You may be capturing historical, type-two slowly changing dimensional data, or incrementally building a table with an append-only strategy, because you actually want to capture some change over time for the entity your recording. (Or, as mentioned, your loader may just be appending data indiscriminately on a schedule without much care for your time and sanity.) Whatever has brought you here, you now have a table where the is not your entity’s primary key, but instead the entity’s primary key + the column values that you’re tracking. Confused? Let’s look at an example. Here’s your raw table: diff --git a/website/blog/2022-05-17-stakeholder-friendly-model-names.md b/website/blog/2022-05-17-stakeholder-friendly-model-names.md index 6d57d7f333..f074c7ff8c 100644 --- a/website/blog/2022-05-17-stakeholder-friendly-model-names.md +++ b/website/blog/2022-05-17-stakeholder-friendly-model-names.md @@ -40,7 +40,7 @@ Analysts are interfacing with data from the outside in. They are in meetings wit - Precomputed views/tables in a BI tool - Read-only access to the dbt Cloud IDE docs -- Full list of tables and views in their data warehouse +- Full list of tables and views in their #### Precomputed views/tables in a BI tool diff --git a/website/blog/2022-06-30-coalesce-sql.md b/website/blog/2022-06-30-coalesce-sql.md new file mode 100644 index 0000000000..4abbbcdd8e --- /dev/null +++ b/website/blog/2022-06-30-coalesce-sql.md @@ -0,0 +1,85 @@ +--- +title: "COALESCE SQL function: Why we love it" +description: "The COALESCE SQL function is an incredibly useful function that allows you to fill in unhelpful blank values that may show up in your data." +slug: coalesce-sql-love-letter + +authors: [kira_furuichi] + +tags: [SQL Magic] +hide_table_of_contents: false + +date: 2022-06-30 +is_featured: false +--- + +It’s inevitable in the field of analytics engineering: you’re going to encounter moments when there’s mysterious or unhelpful blank values in your data. Null values surely have their time and place, but when you need those null values filled with more meaningful data, COALESCE comes to the rescue. + +COALESCE is an incredibly useful function that allows you to fill in unhelpful blank values that may show up in your data. In the words of analytics engineer [Lauren Benezra](https://docs.getdbt.com/author/lauren_benezra), you will probably almost never see a data model that doesn’t use COALESCE somewhere. + + + +> **What is a SQL Function?** +> +> At a high level, a function takes an input (or multiple inputs) and returns a manipulation of those inputs. 
Some common SQL functions are [EXTRACT](https://docs.getdbt.com/blog/extract-sql-love-letter/), [LOWER](https://docs.getdbt.com/blog/lower-sql-love-letter/), and [DATEDIFF](https://docs.getdbt.com/blog/datediff-sql-love-letter/). For example, the LOWER function takes a string value and returns an all lower-case version of that input string. + +## How to use the COALESCE function + +In formal terms, using the COALESCE function on a series of values will return the first non-null value.  + +The general syntax for using the COALESCE function looks like the following: + +```sql +coalesce(<column_1>, <column_2>, ...) +``` + +You can have as many input values/columns to the COALESCE function as you like, but remember: order is important here since the first non-null value is the one that is returned. In practice, you’ll likely only ever use the COALESCE function with two inputs: a column and the value you want to fill null values of that column with. + +> **Fun Fact** +> The COALESCE function is used in the [surrogate_key](https://docs.getdbt.com/blog/sql-surrogate-keys) macro to replace null column values. + +### Data warehouse support for the COALESCE function + +Most, if not all, modern data warehouses support the COALESCE function; [Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/conditional_expressions#coalesce), [Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_COALESCE.html), [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/coalesce.html), [Postgres](https://www.postgresqltutorial.com/postgresql-tutorial/postgresql-coalesce/), and [Databricks](https://docs.databricks.com/sql/language-manual/functions/coalesce.html) all support the COALESCE function. In addition, the syntax to use COALESCE is the same across all of them. + +## COALESCE SQL function example + +Let’s look at an actual example using COALESCE. Below, we have an `orders` table with three column values: an `order_id`, `order_date`, and `order_status`. + +| **order_id** | **order_date** | **order_status** | +| ------------ | -------------- | ---------------- | +| 12389 | 2022-01-02 | | +| 34553 | 2020-04-23 | returned | +| 78411 | 2022-06-06 | | + +If you do a little exploration on this table, you would see that there are only two unique values for `order_status`: NULL and `returned`. As we said before, null values have their time and place, but if you first look at this table, the null value for an order could mean many things–has the order been processed? Was the order successful? + +In this `orders` table, you can assume here that any NULL `order_status` value means that the order was not returned. To make this more clear to anyone who looks at this table, you can utilize a COALESCE function to return a newer, more readable `order_status`. + +```sql +select + order_id, + order_date, + coalesce(order_status, 'not_returned') as order_status +from orders +``` + +Running this query would return the following: + +| **order_id** | **order_date** | **order_status** | +| ------------ | -------------- | ---------------- | +| 12389 | 2022-01-02 | not_returned | +| 34553 | 2020-04-23 | returned | +| 78411 | 2022-06-06 | not_returned | + +Now, there are no null values in the `order_status` column since any null value was replaced by a `not_returned` string. Order 34553’s `order_status` remained unchanged because its original `order_status` was the first non-null value passed in the COALESCE function. 
By providing more context into what these null values mean, anyone who looks at this table can quickly understand the order status for a specific order. + +> **Important:** +> COALESCE has a straightforward use case—fill missing values with values you specify—but you also want to ensure you’re not changing non-empty values when using it. This is where the order of the input values to the COALESCE function are important: from left to right, the first non-null value is the one that’s returned. + +## Why we love it + +We checked our data team’s dbt project, and we used the COALESCE function over 100 times. We like the COALESCE function so much we named the [annual data conference on analytics engineering](https://coalesce.getdbt.com/) after it. + +At its core, the COALESCE function is an efficient way to fill in missing column values with values you specify. You can additionally use COALESCE across most, if not all, modern data warehouses and there’s [no tricky cross-database syntax like there is for DATEADD](https://docs.getdbt.com/blog/sql-dateadd). + +Thank you COALESCE for always finding our moments of emptiness, and filling them with valuable stuff. diff --git a/website/blog/2022-06-30-extract-sql-function.md b/website/blog/2022-06-30-extract-sql-function.md new file mode 100644 index 0000000000..6378da835b --- /dev/null +++ b/website/blog/2022-06-30-extract-sql-function.md @@ -0,0 +1,85 @@ +--- +title: "EXTRACT SQL function: Why we love it" +description: "In this post, we’re going to give a deep-dive into the EXTRACT function, how it works, and why we use it. The EXTRACT function allows you to extract a specified date part from a date/time. " +slug: extract-sql-love-letter + +authors: [kira_furuichi] + +tags: [SQL Magic] +hide_table_of_contents: false + +date: 2022-06-30 +is_featured: false +--- +There are so many different date functions in SQL—you have [DATEDIFF](https://docs.getdbt.com/blog/datediff-sql-love-letter/), [DATEADD](https://docs.getdbt.com/blog/sql-dateadd), DATE_PART, and [DATE_TRUNC](https://docs.getdbt.com/date-trunc-sql) to name a few. They all have their different use cases and understanding how and when they should be used is a SQL fundamental to get down. Are any of those as easy to use as the EXTRACT function? Well, that debate is for another time… + +In this post, we’re going to give a deep dive into the EXTRACT function, how it works, and why we use it. + + + +The EXTRACT function allows you to extract a specified date part from a date/time. For example, if you were to extract the month from the date February 14, 2022, it would return 2 since February is the second month in the year. + +> **What is a SQL function?** +> At a high level, a function takes an input (or multiple inputs) and returns a manipulation of those inputs. Some common SQL functions are [COALESCE](https://docs.getdbt.com/blog/coalesce-sql-love-letter/), [LOWER],(https://docs.getdbt.com/blog/lower-sql-love-letter/) and [DATEDIFF](https://docs.getdbt.com/blog/datediff-sql-love-letter/). For example, the COALESCE function takes a group of values and returns the first non-null value from that group. + +## How to use the EXTRACT function + +One of our favorite things about the EXTRACT function is how readable it is. Sometimes you may encounter SQL functions and not immediately understand what the arguments are and what the expected output should be. (We’re looking at you, SPLIT_PART.) The EXTRACT function isn’t like that. 
+ +To use the EXTRACT function, you’ll simply specify the date part you want extracted out and the field you want to extract from. You can extract many different date parts, but you’ll most commonly see year, month, week of year, or quarter extracted from a date. + +```sql +extract(<date_part> from <date/time field>) +``` + +Depending on the data warehouse you use, the value returned from an EXTRACT function is often a numeric value or the same date type as the input `<date/time field>`. Read the [documentation for your data warehouse](#data-warehouse-support-for-extract-function) to better understand EXTRACT outputs. + +> **Note:** +> You may additionally see a comma used in place of the ‘from’ in the EXTRACT function, like `extract(<date_part>, <date/time field>)`. We feel that using that ‘from’ in the function makes it a little more readable. + +### The DATE_PART function + +You may also see the DATE_PART function used in place of the EXTRACT function. Both DATE_PART and EXTRACT perform the same functionality, it’s just a matter of preference on which one you want to use. + +> **Postgres & DATE_PART:** +> This is overly pedantic and you’ll likely never encounter an issue with DATE_PART and EXTRACT evaluating to differences in values that truly matter, but it’s worth noting. Postgres’ DATE_PART and EXTRACT functions would previously evaluate to the same output. However, with Postgres 14, the [EXTRACT function now returns a numeric type instead of an 8-byte float.](https://stackoverflow.com/questions/38442340/difference-between-extractyear-from-timestamp-function-and-date-partyear-t) + +### Data warehouse support for the EXTRACT function + +[Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/datetime_functions#extract), [Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_EXTRACT_function.html), [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/extract.html), [Postgres](https://www.postgresqltutorial.com/postgresql-date-functions/postgresql-extract/), and [Databricks](https://docs.databricks.com/sql/language-manual/functions/extract.html) all support the EXTRACT function. In addition, the syntax to use EXTRACT is the same across all of them. + +## EXTRACT function example + +Let’s take this to an actual example! We’re going to use the [jaffle shop](https://github.com/dbt-labs/jaffle_shop/blob/main/models/orders.sql), a simple dataset and dbt project, to help us. The jaffle shop’s `orders` model has some fields around an order’s status, order date, and order amount. + +You can extract different time-based values (weeks, months, years, etc.) from the `order_date` in the `orders` model using the following code: + +```sql +select + order_id, + order_date, + extract(week from order_date) as order_week, + extract(month from order_date) as order_month, + extract(year from order_date) as order_year +from {{ ref('orders') }} +``` + +After running this query, your results would look a little something like this: + +| **order_id** | **order_date** | **order_week** | **order_month** | **order_year** | +| ------------ | -------------- | -------------- | --------------- | -------------- | +| 1 | 2018-01-01 | 1 | 1 | 2018 | +| 9 | 2018-01-12 | 2 | 1 | 2018 | +| 72 | 2018-03-23 | 12 | 3 | 2018 | + +As you can see, this query extracted the week of year, month of year, and year from the `order_date`. + +## Why we love it + +We’re going to be honest: EXTRACT isn’t the most widely used SQL function in our dbt project. 
However, EXTRACT has its time and place:  + +* Fiscal calendars: If your business uses fiscal years, or calendars that differ from the normal 12-month cycle, EXTRACT functions can help create alignment between fiscal calendars and normal calendars +* Ad hoc analysis: EXTRACT functions are useful in ad hoc analyses and queries when you need to look at values grouped by date periods or for period comparisons + +Extract is a consistent, helpful, and straightforward function–what more could we ask for from a ~~friend~~ function? + diff --git a/website/blog/2022-06-30-lower-sql-function.md b/website/blog/2022-06-30-lower-sql-function.md new file mode 100644 index 0000000000..f1c3a81184 --- /dev/null +++ b/website/blog/2022-06-30-lower-sql-function.md @@ -0,0 +1,83 @@ +--- +title: "LOWER SQL function: Why we love it" +description: "The LOWER SQL Function allows you to return a string value as an all lowercase string. It’s an effective way to create consistent capitalization for string values across your data." +slug: lower-sql-love-letter + +authors: [kira_furuichi] + +tags: [SQL Magic] +hide_table_of_contents: false + +date: 2022-06-30 +is_featured: false +--- + +We’ve all been there: + +* In a user signup form, user A typed in their name as Kira Furuichi, user B typed it in as john blust, and user C wrote DAvid KrevitT (what’s up with that, David??) +* Your backend application engineers are adamant customer emails are in all caps +* All of your event tracking names are lowercase + +In the real world of human imperfection, opinions, and error, string values are likely to take inconsistent capitalization across different data sources (or even within the same data source). There’s always a little lack of rhyme or reason for why some values are passed as upper or lowercase, and it’s not worth the headache to unpack that. + +So how do you create uniformity for string values that you collect across all your data sources? The LOWER function! + + + +Using the LOWER function on a string value will return the input as an all lowercase string. It’s an effective way to create consistent capitalization for string values across your data. + +> **What is a SQL function?** +> At a high level, a function takes an input (or multiple inputs) and returns a manipulation of those inputs. Some common SQL functions are [COALESCE](https://docs.getdbt.com/blog/coalesce-sql-love-letter/), [EXTRACT](https://docs.getdbt.com/blog/extract-sql-love-letter), and [DATEDIFF](https://docs.getdbt.com/blog/datediff-sql-love-letter/). For example, the COALESCE function takes a group of values and returns the first non-null value from that group. + +## How to use the LOWER function + +The syntax for using the LOWER function looks like the following: + +```sql +lower('') +``` + +Executing this command in a SELECT statement will return the lowercase version of the input string. You may additionally use the LOWER function in WHERE clauses and joins. + +### Data warehouse support for the LOWER function + +[Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#lower), [Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_LOWER.html), [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/lower.html), [Postgres](https://www.postgresqltutorial.com/postgresql-string-functions/postgresql-letter-case-functions/), and [Databricks](https://docs.databricks.com/sql/language-manual/functions/lower.html) all support the LOWER function. 
In addition, the syntax to use LOWER is the same across all of them. + +## LOWER SQL function example + +Let’s take this to an actual example! Below, you’ll see the first three rows from the `customers` in the [jaffle_shop](https://github.com/dbt-labs/jaffle_shop), a simple dataset and dbt project, that has three columns: `customer_id`, `first_name`, and `last_name`. + +| **customer_id** | **first_name** | **last_name** | +| --------------- | -------------- | ------------- | +| 1 | Michael | P. | +| 2 | Shawn | M. | +| 3 | Kathleen | P. | + +You can lower the first name and last name of the `customers` model using the following code: + +```sql +select + customer_id, + lower(first_name) as first_name, + lower(last_name) as last_name +from {{ ref('customers') }} +``` + +After running this query, the `customers` table will look a little something like this: + +| **customer_id** | **first_name** | **last_name** | +| --------------- | -------------- | ------------- | +| 1 | michael | p. | +| 2 | shawn | m. | +| 3 | kathleen | p. | + +Now, all characters in the `first_name` and `last_name` columns are lowercase. + +> **Note:** +> Changing all string columns to lowercase to create uniformity across data sources typically happens in our dbt project’s [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging). There are a few reasons for that: data cleanup and standardization, such as aliasing, casting, and lowercasing, should ideally happen in staging models to create downstream uniformity. It’s also more performant in downstream models that join on string values to join on strings that are of all the same casing versus having to join and perform lowercasing at the same time. + +## Why we love it + +Let’s go back to our chaotic trio of users A, B, and C who all used different capitalizations to type in their names. If you don’t create consistent capitalization for string values, how would a business user know what to exactly filter for in their BI tool? A business user could filter a name field on “John Blust” since that’s what they would expect it to look like, only to get zero results back. By creating a consistent capitalization format (upper or lowercase) for all string values in your data models, you, therefore, create some expectations for business users in your BI tool. + +There will most likely never be 100% consistency in your data models, but doing all that you can to mitigate that chaos will make your life and the life of your business users hopefully a little easier. Use the LOWER function to create a consistent casing for all strings in your data sources. diff --git a/website/blog/2022-07-05-date-trunc-sql-love-letter.md b/website/blog/2022-07-05-date-trunc-sql-love-letter.md new file mode 100644 index 0000000000..5f0791c68f --- /dev/null +++ b/website/blog/2022-07-05-date-trunc-sql-love-letter.md @@ -0,0 +1,101 @@ +--- +title: "DATE_TRUNC SQL function: Why we love it" +description: "The DATE_TRUNC function will truncate a date or time to the first instance for a given date part maintaining a date format. Wordy, wordy, wordy! What does this really mean?" +slug: date-trunc-sql + +authors: [kira_furuichi] + +tags: [sql magic] +hide_table_of_contents: true + +date: 2022-07-05 +is_featured: false +--- +In general, data people prefer the more granular over the less granular. 
[Timestamps > dates](https://docs.getdbt.com/blog/when-backend-devs-spark-joy#signs-the-data-is-sparking-joy), daily data > weekly data, etc.; having data at a more granular level always allows you to zoom in. However, you’re likely looking at your data at a somewhat zoomed-out level—weekly, monthly, or even yearly. To do that, you’re going to need a handy dandy function that helps you round out date or time fields. + +The DATE_TRUNC function will truncate a date or time to the first instance of a given date part. Wordy, wordy, wordy! What does this really mean? If you were to truncate `2021-12-13` out to its month, it would return `2021-12-01` (the first day of the month). + +Using the DATE_TRUNC function, you can truncate to the weeks, months, years, or other date parts for a date or time field. This can make date/time fields easier to read, as well as help perform cleaner time-based analyses. + + + +> **What is a SQL function?** +> +> At a high level, a function takes an input (or multiple inputs) and returns a manipulation of those inputs. Some common SQL functions are [COALESCE](https://getdbt.com/sql-foundations/coalesce-sql-love-letter/), [LOWER](https://getdbt.com/sql-foundations/lower-sql-love-letter/), and [EXTRACT](https://getdbt.com/sql-foundations/extract-sql-love-letter/). For example, the COALESCE function takes a group of values and returns the first non-null value from that group. + +Overall, it’s a great function to use to help you aggregate your data into specific date parts while keeping a date format. However, the DATE_TRUNC function isn’t your swiss army knife–it’s not able to do magic or solve all of your problems (we’re looking at you [star](https://getdbt.com/sql-foundations/star-sql-love-letter/)). Instead, DATE_TRUNC is your standard kitchen knife—it’s simple and efficient, and you almost never start cooking (data modeling) without it. + +## How to use the DATE_TRUNC function + +For the DATE_TRUNC function, there are two arguments you must pass in: + +* The date part: This is the days/months/weeks/years (level) you want your field to be truncated out to +* The date/time you want to be truncated + +The DATE_TRUNC function can be used in SELECT statements and WHERE clauses. + +Most, if not all, modern cloud data warehouses support some type of the DATE_TRUNC function. There may be some minor differences between the argument order for DATE_TRUNC across data warehouses, but the functionality very much remains the same. + +Below, we’ll outline some of the slight differences in the implementation between some of the data warehouses. + +### The DATE_TRUNC function in Snowflake and Databricks + +In [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/date_trunc.html) and [Databricks](https://docs.databricks.com/sql/language-manual/functions/date_trunc.html), you can use the DATE_TRUNC function using the following syntax: + +```sql +date_trunc(<date_part>, <date/time field>) +``` + +In these platforms, the `<date_part>` is passed in as the first argument in the DATE_TRUNC function. + +### The DATE_TRUNC function in Google BigQuery and Amazon Redshift + +In [Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions#date_trunc) and [Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_DATE_TRUNC.html), the `<date/time field>` is passed in as the first argument and the `<date_part>` is the second argument. 
+ +```sql +date_trunc(<date/time field>, <date_part>) +``` + +> **Note:** +> BigQuery’s DATE_TRUNC function supports the truncation of date types, whereas Snowflake, Redshift, and Databricks’ `<date/time field>` can be a date or timestamp data type. BigQuery also supports DATETIME_TRUNC and TIMESTAMP_TRUNC functions to support truncation of more granular date/time types. + +## A dbt macro to remember + +Why Snowflake, Amazon Redshift, Databricks, and Google BigQuery decided to use different implementations of essentially the same function is beyond us and it’s not worth the headache trying to figure that out. Instead of remembering if the `<date_part>` or the `<date/time field>` comes first (which, let’s be honest, we can literally never remember), you can rely on a dbt Core macro to help you get away from finicky syntax. + +With dbt v1.2, [adapters](https://docs.getdbt.com/docs/available-adapters) now support [cross-database macros](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros) to help you write certain functions, like [DATE_TRUNC](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros#date_trunc) and [DATEDIFF](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros#datediff), without having to memorize sticky function syntax. + +> **Note:** +> Previously, [dbt_utils](https://github.com/dbt-labs/dbt-utils), a package of macros and tests that data folks can use to help write more DRY code in their dbt project, powered cross-database macros. Now, cross-database macros are available **regardless of whether dbt utils is installed or not.** + +Using the [jaffle shop](https://github.com/dbt-labs/jaffle_shop/blob/main/models/orders.sql), a simple dataset and dbt project, you can truncate the `order_date` from the `orders` table using the dbt [DATE_TRUNC macro](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros#date_trunc): + +```sql +select + order_id, + order_date, + {{ date_trunc("week", "order_date") }} as order_week, + {{ date_trunc("month", "order_date") }} as order_month, + {{ date_trunc("year", "order_date") }} as order_year +from {{ ref('orders') }} +``` + +Running the above would produce the following sample results: + +| order_id | order_date | order_week | order_month | order_year | +|:---:|:---:|:---:|:---:|:---:| +| 1 | 2018-01-01 | 2018-01-01 | 2018-01-01 | 2018-01-01 | +| 70 | 2018-03-12 | 2018-03-12 | 2018-03-01 | 2018-01-01 | +| 91 | 2018-03-31 | 2018-03-26 | 2018-03-01 | 2018-01-01 | + +The `order_week`, `order_month`, and `order_year` fields are the truncated values from the `order_date` field. + +**A mild word of warning:** If you’re using the DATE_TRUNC function to modify fields or create new ones, it’s important that you use strong naming conventions for these fields. Since the output from the DATE_TRUNC function looks like a normal date, other data folks or business users may not understand that it’s an altered field and may mistake it for the actual date something happened. + +## Why we love it + +The DATE_TRUNC function is a great way to do data analysis and data modeling that needs to happen at a zoomed-out date part. It’s often used for time-based work, such as customer retention modeling or analysis. The DATE_TRUNC function also allows you to keep the date format of a field which allows for the most ease and compatibility in most BI (business intelligence) tools. + +TL;DR – DATE_TRUNC is a handy, widely-used SQL function—and dbt has made it even simpler to start using! 
+ +*This post is a part of the SQL love letters—a series on the SQL functions the dbt Labs data team members use and love. You can find [the entire collection here](https://getdbt.com/sql-foundations/top-sql-functions).* diff --git a/website/blog/2022-07-12-change-data-capture-metrics.md b/website/blog/2022-07-12-change-data-capture-metrics.md new file mode 100644 index 0000000000..478b78ff20 --- /dev/null +++ b/website/blog/2022-07-12-change-data-capture-metrics.md @@ -0,0 +1,378 @@ +--- +title: "Change data capture for metrics in dbt" +description: "Metrics are user-defined and will likely change over time. Capturing a historical view of your metrics is comlex. Grace Goheen walks you through how to do it in this blog!" +slug: change-data-capture-metrics + +authors: [grace_goheen] + +tags: [analytics craft] +hide_table_of_contents: false + +date: 2022-07-14 +is_featured: true +--- + +Metrics are quantitative measurements of values in your data. Because metrics are user-defined, they can and likely will change over time. If you want to capture a historical view of your metrics, these definitional changes add complexity. + +There are many reasons you, as an analytics engineer, may want to capture the complete version history of metrics: + +- You’re in an industry with a very high standard for data governance +- You need to track big OKRs over time to report back to your stakeholders +- You want to build a window to view history with both forward and backward compatibility + +These are often high-stakes situations! So accuracy in tracking changes in metrics is key. + + + +If you’ve encountered this problem before, you know it’s a tricky one. dbt is [idempotent](https://discourse.getdbt.com/t/understanding-idempotent-data-transformations/518) - it recreates tables at runtime with the `CREATE TABLE AS` syntax. Because of this, the concepts of “version history” and “backups” aren’t intrinsic to dbt. + +Let’s imagine a specific scenario. Joanne is an analytics engineer for a large e-commerce company. The head of sales just messaged her the following question: + +“Can you tell me the revenue for January 2022 for all clothing products?” + +On the surface, this may seem like a simple question. But what if the calculation of revenue has changed since January 2022? Should Joanne calculate the revenue using the current formula or the formula that was used in January 2022? What if the source data for January changed after the month closed? Should Joanne use the source data as it was on January 30th, 2022 or the source data as it is now? + +All of these questions bubble up to our main theme: *How can you capture historical versions of metrics using dbt?* + +Sorry, Joanne. The TL;DR is - “it depends.” + +When I first encountered this problem, it took time and effort to: + +1. think through the possible solutions + +and + +2. determine which solution best suited my needs + +The goal of this article is to eliminate step one – to provide you with a menu of solutions I’ve encountered so you can spend less time ideating and more time considering the nuances of your specific use-case. + +I’ll start by discussing a basic version of the scenario I first encountered – a misapplication of dbt’s snapshot functionality. 
Then, I’ll outline a couple of solutions: + +- **Downstream Incremental Model**: Build an incremental model downstream of your metrics model to “grab” every point-in-time version +- **Upstream Snapshots**: Build snapshots on all of your sources to capture changes in your raw data and calculate all versions of history every time you execute a `dbt run` + +Finally, I’ll discuss the pros and cons of each solution to give you a head start on step two. + +## Scenario + +Let’s return to Joanne. Using dbt and her favorite BI tool, Joanne has created an income report to track monthly revenue for each product category. + +You can imagine her DAG as shown below, where `fct_income` captures revenue metrics per month for each product category. + +![](/img/blog/2022-07-12-change-data-capture-metrics/fct-income-dag.png) + +Joanne executes a `dbt run` on January 30th, 2022 and queries the resulting table: + +```sql +select * from fct_income where month_year = "January 2022" +``` + +She gets the following output: + +| month_year | product_category | revenue | run_timestamp | +|:---:|:---:|:---:|:---:| +| January 2022 | clothing | 100 | 01/30/22 12:00:00 | +| January 2022 | electronics | 200 | 01/30/22 12:00:00 | +| January 2022 | books | 100 | 01/30/22 12:00:00 | + +But a few days later, her source data changes for January - a manufacturing cost was dated incorrectly, and now has been updated in the source. Joanne executes a `dbt run` again on February 3rd. Now when she queries `fct_income`, she gets the following output: + +| month_year | product_category | revenue | run_timestamp | +|:---:|:---:|:---:|:---:| +| January 2022 | clothing | **50** | 02/03/22 16:00:00 | +| January 2022 | electronics | **150** | 02/03/22 16:00:00 | +| January 2022 | books | **200** | 02/03/22 16:00:00 | + +A few days later, Joanne finds a bug in her `dbt code`. She fixes the bug and executes a dbt run again on February 10th. Now, when she queries `fct_income`, she gets the following output: + +| month_year | product_category | revenue | run_timestamp | +|:---:|:---:|:---:|:---:| +| January 2022 | clothing | **52** | 02/10/22 08:00:00 | +| January 2022 | electronics | **152** | 02/10/22 08:00:00 | +| January 2022 | books | **202** | 02/10/22 08:00:00 | + +When the head of sales messages Joanne the following question: “Can you tell me the revenue for January 2022 for all clothing products?”, she’s unsure which number to give: 100, 50, or 52. + +![](/img/blog/2022-07-12-change-data-capture-metrics/revenue-meme.png) + +Because of this complexity, she decides to capture the history of her income report so that she can easily swap between versions in her BI tool. + +Her goal is to capture **all** versions of the `fct_income` model for January. Something like this: + +| month_year | product_category | revenue | run_timestamp | +|:---:|:---:|:---:|:---:| +| January 2022 | clothing | 100 | 01/30/22 12:00:00 | +| January 2022 | electronics | 200 | 01/30/22 12:00:00 | +| January 2022 | books | 300 | 01/30/22 12:00:00 | +| January 2022 | clothing | 50 | 02/03/22 16:00:00 | +| January 2022 | electronics | 150 | 02/03/22 16:00:00 | +| January 2022 | books | 200 | 02/03/22 16:00:00 | +| January 2022 | clothing | 52 | 02/10/22 08:00:00 | +| January 2022 | electronics | 152 | 02/10/22 08:00:00 | +| January 2022 | books | 202 | 02/10/22 08:00:00 | + +In order to achieve this **long table of history**, she decides to start [snapshotting](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots) `fct_income`. 
+ +```sql +{% snapshot snapshot_fct_income %} + +{{ + config( + target_database='analytics', + target_schema='snapshots', + unique_key='id', + + strategy='check', + check_cols=['revenue'], + ) +}} + +select + month_year || ' - ' || product_category as id, + * +from {{ ref('fct_income') }} + +{% endsnapshot %} +``` + +The output of `snapshot_fct_income` looks like this: + +| id | month_year | product_category | revenue | run_timestamp | dbt_valid_from | dbt_valid_to | +|:---:|:---:|:---:|:---:|:---:|:---:|:---:| +| January 2022 - clothing | January 2022 | clothing | 100 | 01/30/22 12:00:00 | 01/30/22 12:00:00 | 02/03/22 16:00:00 | +| January 2022 - electronics | January 2022 | electronics | 200 | 01/30/22 12:00:00 | 01/30/22 12:00:00 | 02/03/22 16:00:00 | +| January 2022 - books | January 2022 | books | 300 | 01/30/22 12:00:00 | 01/30/22 12:00:00 | 02/03/22 16:00:00 | +| January 2022 - clothing | January 2022 | clothing | 50 | 02/03/22 16:00:00 | 02/03/22 16:00:00 | 02/10/22 08:00:00 | +| January 2022 - electronics | January 2022 | electronics | 150 | 02/03/22 16:00:00 | 02/03/22 16:00:00 | 02/10/22 08:00:00 | +| January 2022 - books | January 2022 | books | 200 | 02/03/22 16:00:00 | 02/03/22 16:00:00 | 02/10/22 08:00:00 | +| January 2022 - clothing | January 2022 | clothing | 52 | 02/10/22 08:00:00 | 02/10/22 08:00:00 | NULL | +| January 2022 - electronics | January 2022 | electronics | 152 | 02/10/22 08:00:00 | 02/10/22 08:00:00 | NULL | +| January 2022 - books | January 2022 | books | 202 | 02/10/22 08:00:00 | 02/10/22 08:00:00 | NULL | + +Each month now has multiple versions of revenue, and the sales department is responsible for determining which version is “correct.” +In order to keep track of which version has been marked as “correct” by the sales department, Joanne creates a seed file to capture which version of the `fct_income` model is the correct one for each month. The output of her seed `income_report_versions` looks like this: + +| month_year | correct_version | comment | +|:---:|:---:|:---:| +| January 2022 | 02/10/22 08:00:00 | Approved by Lucy | + +Her final DAG now looks like this: + +![](/img/blog/2022-07-12-change-data-capture-metrics/income-report-versions-dag.png) + +She's snapshotting `fct_income`, joining the seed file with the snapshot, then exposing the final output to her BI tool. The final output of `stg_snapshot_fct_income` looks like this: + +| month_year | product_category | revenue | run_timestamp | correct_version | +|:---:|:---:|:---:|:---:|:---:| +| January 2022 | clothing | 100 | 01/30/22 12:00:00 | FALSE | +| January 2022 | electronics | 200 | 01/30/22 12:00:00 | FALSE | +| January 2022 | books | 300 | 01/30/22 12:00:00 | FALSE | +| January 2022 | clothing | 50 | 02/03/22 16:00:00 | FALSE | +| January 2022 | electronics | 150 | 02/03/22 16:00:00 | FALSE | +| January 2022 | books | 200 | 02/03/22 16:00:00 | FALSE | +| January 2022 | clothing | 52 | 02/10/22 08:00:00 | TRUE | +| January 2022 | electronics | 152 | 02/10/22 08:00:00 | TRUE | +| January 2022 | books | 202 | 02/10/22 08:00:00 | TRUE | + +This method *technically* works. Joanne can track what she needs: + +- source data changes +- business logic changes + +And she can easily switch versions by adding a filter on her BI layer. 
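For reference, a minimal sketch of what that joining model, `stg_snapshot_fct_income`, could look like, assuming the snapshot and seed columns shown above and that the seed's `correct_version` stores the approved `run_timestamp` for each month (timestamps may need casting depending on how the seed is typed):

```sql
select
    snap.month_year,
    snap.product_category,
    snap.revenue,
    snap.run_timestamp,
    -- flag the version of each month that the sales team has approved
    coalesce(snap.run_timestamp = versions.correct_version, false) as correct_version
from {{ ref('snapshot_fct_income') }} as snap
left join {{ ref('income_report_versions') }} as versions
    on snap.month_year = versions.month_year
```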
+ +However, this method causes long job times and adds potentially unnecessary complexity – one of the reasons our [best practices](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots#snapshot-query-best-practices) recommend only using snapshots to track changes in your source data, rather than your final models. + +Below, you’ll find two solutions that are more effective than snapshotting a final model, as well as the pros and cons of each method. + +## Solution #1: Downstream Incremental Model + +Instead of using snapshots, Joanne could create an [incremental model](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models) downstream of `fct_income` to “grab” every point-in-time version of `fct_income` – let’s call this incremental model `int_income_history` and assume it has the following config block: + +```sql +{{ + config( + materialized='incremental' + ) +}} +``` + +By materializing `int_income_history` as incremental but *not* including a `unique_key` config, dbt will only execute `INSERT` statements – new rows will be added, but old rows will remain unchanged. + +The rest of `int_income_history` would look like this: + +```sql +... + +select + * +from {{ ref('fct_income') }} +{% if is_incremental() %} + where true +{% endif %} +``` + +There are a few additional configs that Joanne might find helpful: + +- she can use the `on_schema_change` config to handle schema changes if new columns are added and/or deleted from `fct_income` +- she can also set the `full_refresh` config to false in order to prevent accidental loss of the historical data +- she can build this table in a custom `schema` if she wants to enforce specific role-based permissions for this historical table +- she can specify a time-grain `unique_key` if she wants to reduce the amount of versions being captured + - for example, if she only wants to capture the final version of each day she could set `unique_key = date_trunc('day', run_timestamp)`. This is excluded from the example below, as we are making the assumption that Joanne does indeed want to capture every version of `fct_income` + +The final config block for `int_income_history` might look something like this: + +```sql +{{ + config( + materialized='incremental', + full_refresh=false, + schema='history', + on_schema_change='sync_all_columns' + ) +}} +``` + +As a final step, Joanne would create `fct_income_history` to join in the seed file to determine which versions are “correct”. 
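The joining logic for that final step mirrors the earlier `stg_snapshot_fct_income` sketch, just reading from the incremental history model instead of the snapshot (again assuming the seed stores the approved `run_timestamp` per month):

```sql
select
    history.month_year,
    history.product_category,
    history.revenue,
    history.run_timestamp,
    -- true only for the run the sales team signed off on
    coalesce(history.run_timestamp = versions.correct_version, false) as correct_version
from {{ ref('int_income_history') }} as history
left join {{ ref('income_report_versions') }} as versions
    on history.month_year = versions.month_year
```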
Her new DAG looks like this, where `int_income_history` is an incremental model without a unique key: + +![](/img/blog/2022-07-12-change-data-capture-metrics/int-income-history-dag.png) + +The final output of `fct_income_history` would look identical to `stg_snapshot_fct_income` from her initial approach: + +| month_year | product_category | revenue | run_timestamp | correct_version | +|:---:|:---:|:---:|:---:|:---:| +| January 2022 | clothing | 100 | 01/30/22 12:00:00 | FALSE | +| January 2022 | electronics | 200 | 01/30/22 12:00:00 | FALSE | +| January 2022 | books | 300 | 01/30/22 12:00:00 | FALSE | +| January 2022 | clothing | 50 | 02/03/22 16:00:00 | FALSE | +| January 2022 | electronics | 150 | 02/03/22 16:00:00 | FALSE | +| January 2022 | books | 200 | 02/03/22 16:00:00 | FALSE | +| January 2022 | clothing | 52 | 02/10/22 08:00:00 | TRUE | +| January 2022 | electronics | 152 | 02/10/22 08:00:00 | TRUE | +| January 2022 | books | 202 | 02/10/22 08:00:00 | TRUE | + +## Solution #2: Upstream Snapshots + +Alternatively, Joanne could snapshot her source data and add flexibility to her modeling so that all historical versions are calculated *at the same time*. Let’s look at our example. + +Joanne could track changes in the source data by adding snapshots directly on top of her raw data. + +![](/img/blog/2022-07-12-change-data-capture-metrics/snapshots-dag.png) + +This would *change the grain* of these `stg_` tables, so she would see a row for each version of each field. The staging models will contain the history of each record. + +Remember the source data change Joanne noticed — a manufacturing cost was dated incorrectly (Junkuary 2022 instead of January 2022). With this solution, the `costs_snapshot` model will pick up this change: + +```sql +{% snapshot costs_snapshot %} + +{{ + config( + target_database='analytics', + target_schema='snapshots', + unique_key='cost_id', + + strategy='timestamp', + updated_at='updated_at', + ) +}} + +select * from {{ source('source', 'costs') }} + +{% endsnapshot %} +``` + +| cost_id | month_year | cost | updated_at | dbt_valid_from | dbt_valid_to | +|:---:|:---:|:---:|:---:|:---:|:---:| +| 1 | Junkuary 2022 | 50 | 01/15/22 12:00:00 | 01/15/22 12:00:00 | 02/03/22 12:00:00 | +| 1 | January 2022 | 50 | 02/03/22 12:00:00 | 02/03/22 12:00:00 | NULL | + +:::note Note +Because snapshots only capture changes detected at the time the dbt snapshot command is executed, it is technically possible to miss some changes to your source data. You will have to consider how often you want to run this snapshot command in order to capture the history you need. +::: + +The original `fct_income` model now calculates the metrics for each version of source data, every time Joanne executes a `dbt run`. In other words, the downstream `fct_` models are **version-aware**. Because of this, Joanne changes the name of `fct_income` to `fct_income_history` to be more descriptive. + +In order to track changes in business logic, she can apply each version of logic to the relevant records and union the results together. + +Remember the bug Joanne found in her dbt code.
With this solution, she can track this change in business logic in the `stg_costs` model: + +```sql +-- apply the old logic for any records that were valid on or before the logic change +select + cost_id, + ..., + cost + tax as final_cost, -- old logic + 1 || '-' || dbt_valid_from as version +from costs_snapshot +where dbt_valid_from <= to_timestamp('02/10/22 08:00:00') + +union all + +-- apply the new logic for any records that were valid after the logic change +select + cost_id, + ..., + cost as final_cost, -- new logic + 2 || '-' || dbt_valid_from as version +from costs_snapshot +where to_timestamp('02/10/22 08:00:00') between dbt_valid_from and coalesce(dbt_valid_to, to_timestamp('01/01/99 00:00:00')) +``` +| cost_id | month_year | cost | tax | final_cost | version | +|:---:|:---:|:---:|:---:|:---:|:---:| +| 1 | Junkuary 2022 | 50 | 1 | 51 | 1 - 01/15/22 12:00:00 | +| 1 | January 2022 | 50 | 1 | 51 | 1 - 02/03/22 12:00:00 | +| 1 | January 2022 | 50 | 1 | 50 | 2 - 02/03/22 12:00:00 | + +The contents of the seed `income_report_versions` would look slightly different to match the change in version definition: + +| month_year | correct_version | comment | +|:---:|:---:|:---:| +| January 2022 | 2 - 02/03/22 12:00:00 | Approved by Lucy | + +After joining in the seed file (check out [Tackling the complexity of joining snapshots](https://docs.getdbt.com/blog/joining-snapshot-complexity)), her new DAG looks like this: + +![](/img/blog/2022-07-12-change-data-capture-metrics/final-dag.png) + +The final output of `fct_income_history` would accomplish the same goal as `stg_snapshot_fct_income` from her initial approach: + +| month_year | product_category | revenue | version | correct_version | +|:---:|:---:|:---:|:---:|:---:| +| January 2022 | clothing | 100 | 1 - 01/15/22 12:00:00 | FALSE | +| January 2022 | electronics | 200 | 1 - 01/15/22 12:00:00 | FALSE | +| January 2022 | books | 300 | 1 - 01/15/22 12:00:00 | FALSE | +| January 2022 | clothing | 50 | 1 - 02/03/22 12:00:00 | FALSE | +| January 2022 | electronics | 150 | 1 - 02/03/22 12:00:00 | FALSE | +| January 2022 | books | 200 | 1 - 02/03/22 12:00:00 | FALSE | +| January 2022 | clothing | 52 | 2 - 02/03/22 12:00:00 | TRUE | +| January 2022 | electronics | 152 | 2 - 02/03/22 12:00:00 | TRUE | +| January 2022 | books | 202 | 2 - 02/03/22 12:00:00 | TRUE | + +## Final thoughts + +Both of these solutions allow Joanne to achieve her desired output – a table containing all versions of income metrics for a given month – while improving the workflow and the efficiency of the final model. + +However, each has its advantages and disadvantages. + +**Solution #1: Downstream Incremental Model** + +| Pros | Cons | +|:---:|:---:| +| incremental models without unique keys are fast | this isn't really the intended use of the incremental materialization | +| | Joanne has no way to re-calculate prior versions if her historical table is accidentally lost | + +**Solution #2: Upstream Snapshots** + +| Pros | Cons | +|:---:|:---:| +| Joanne doesn't have to worry about losing historical data | snapshots are highly complex and require more institutional knowledge for Joanne's team | +| | every time Joanne wants to make a code change that affects her metric calculations, she'll have to remember to apply the change to each set of relevant records and union the outputs together | + +When deciding between the two solutions, you should consider the following: + +- How often is your source data changing? +- How many bug fixes do you anticipate? +- How fast do you need this job to be?
+- How much visibility do you need into why a change in historic metric occurred? + +💡 What do you think? Is there another, more optimal, solution? diff --git a/website/blog/authors.yml b/website/blog/authors.yml index d2b2b3c7f6..af70084bc0 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -239,3 +239,24 @@ christine_berger: job_title: Senior Analytics Engineer organization: dbt Labs image_url: /img/blog/authors/christine-berger.jpeg + +grace_goheen: + name: Grace Goheen + job_title: Analytics Engineer + organization: dbt Labs + image_url: /img/blog/authors/grace-goheen.jpeg + links: + - url: https://www.linkedin.com/in/gracegoheen/ + icon: fa-linkedin + +jonathan_natkin: + name: Jon "Natty" Natkins + job_title: Regional Director, Solutions Architecture + organization: dbt Labs + description: Natty also writes about startups, equity, data, and more in his Substack called [Semi-Structured](http://semistructured.substack.com/). + image_url: /img/blog/authors/jonathan-natkins.jpeg + links: + - url: https://www.linkedin.com/in/nattyice/ + icon: fa-linkedin + - url: https://twitter.com/nattyice + icon: fa-twitter diff --git a/website/blog/categories.yml b/website/blog/categories.yml index cd486b0bde..2a45e6529e 100644 --- a/website/blog/categories.yml +++ b/website/blog/categories.yml @@ -15,10 +15,6 @@ display_title: dbt tutorials description: Best practices in the usage of our favorite data transformation tool. is_featured: true -- name: release notes - display_title: Release notes - description: Notable updates and new features in dbt Cloud. - is_featured: true - name: dbt updates display_title: dbt product updates description: An archive of monthly product updates from the dbt Labs team. diff --git a/website/blog/maching-learning-dbt-baton-pass.md b/website/blog/maching-learning-dbt-baton-pass.md index 9eed523800..8d49b5aa13 100644 --- a/website/blog/maching-learning-dbt-baton-pass.md +++ b/website/blog/maching-learning-dbt-baton-pass.md @@ -42,7 +42,7 @@ This happens because the “normal” way of doing things lacks long-term & expl ### Here’s what happened -After some initial planning, I knew we had this raw data living somewhere in our data warehouse. It was easy to make sense of this starting point for our work together. I wrote dbt transformations to massage this raw data and joined a couple tables together based on intuition of what variables mattered: daily active usage, number of users, amount paid, historical usage, etc. +After some initial planning, I knew we had this raw data living somewhere in our . It was easy to make sense of this starting point for our work together. I wrote dbt transformations to massage this raw data and joined a couple tables together based on intuition of what variables mattered: daily active usage, number of users, amount paid, historical usage, etc. The ML engineer stepped in from here. She was used to doing her statistics and preprocessing in python [pandas](https://pandas.pydata.org/) and [scikit-learn](https://scikit-learn.org/stable/index.html). Before she opened up her Jupyter notebook, we had a heart-to-heart conversation and realized the same work could be done through dbt. Preprocessing could be done through this [open source dbt package](https://github.com/omnata-labs/dbt-ml-preprocessing/tree/1.1.0/#dbt-ml-preprocessing) and there were plenty of others like it in the [package registry](https://hub.getdbt.com/). 
diff --git a/website/docs/dbt-cli/configure-your-profile.md b/website/docs/dbt-cli/configure-your-profile.md index 4b5958026c..3ac0cf1407 100644 --- a/website/docs/dbt-cli/configure-your-profile.md +++ b/website/docs/dbt-cli/configure-your-profile.md @@ -9,7 +9,7 @@ description: "Configure your profile using the command line." ## Connecting to your warehouse using the command line -When you invoke dbt from the command line, dbt parses your `dbt_project.yml` and obtains the `profile` name, which dbt needs to connect to your data warehouse. +When you invoke dbt from the command line, dbt parses your `dbt_project.yml` and obtains the `profile` name, which dbt needs to connect to your . diff --git a/website/docs/docs/building-a-dbt-project/building-models.md b/website/docs/docs/building-a-dbt-project/building-models.md index 99ec1d8d7b..b1d53f3720 100644 --- a/website/docs/docs/building-a-dbt-project/building-models.md +++ b/website/docs/docs/building-a-dbt-project/building-models.md @@ -22,7 +22,7 @@ A model is a `select` statement. Models are defined in `.sql` files (typically i - The name of the file is used as the model name - Models can be nested in subdirectories within the `models` directory -When you execute the [`dbt run` command](run), dbt will build this model in your data warehouse by wrapping it in a `create view as` or `create table as` statement. +When you execute the [`dbt run` command](run), dbt will build this model in your by wrapping it in a `create view as` or `create table as` statement. For example, consider this `customers` model: diff --git a/website/docs/docs/building-a-dbt-project/building-models/configuring-incremental-models.md b/website/docs/docs/building-a-dbt-project/building-models/configuring-incremental-models.md index 2e1c47421d..22af3bbf36 100644 --- a/website/docs/docs/building-a-dbt-project/building-models/configuring-incremental-models.md +++ b/website/docs/docs/building-a-dbt-project/building-models/configuring-incremental-models.md @@ -5,7 +5,7 @@ id: "configuring-incremental-models" ## About incremental models -Incremental models are built as tables in your data warehouse. The first time a model is run, the is built by transforming _all_ rows of source data. On subsequent runs, dbt transforms _only_ the rows in your source data that you tell dbt to filter for, inserting them into the target table which is the table that has already been built. +Incremental models are built as tables in your . The first time a model is run, the is built by transforming _all_ rows of source data. On subsequent runs, dbt transforms _only_ the rows in your source data that you tell dbt to filter for, inserting them into the target table which is the table that has already been built. Often, the rows you filter for on an incremental run will be the rows in your source data that have been created or updated since the last time dbt ran. As such, on each dbt run, your model gets built incrementally. diff --git a/website/docs/docs/building-a-dbt-project/building-models/materializations.md b/website/docs/docs/building-a-dbt-project/building-models/materializations.md index fb7f9ef307..a703814df3 100644 --- a/website/docs/docs/building-a-dbt-project/building-models/materializations.md +++ b/website/docs/docs/building-a-dbt-project/building-models/materializations.md @@ -91,7 +91,7 @@ When using the `table` materialization, your model is rebuilt as a expression. 
* **Pros:** * You can still write reusable logic - * Ephemeral models can help keep your data warehouse clean by reducing clutter (also consider splitting your models across multiple schemas by [using custom schemas](using-custom-schemas)). + * Ephemeral models can help keep your clean by reducing clutter (also consider splitting your models across multiple schemas by [using custom schemas](using-custom-schemas)). * **Cons:** * You cannot select directly from this model. * Operations (e.g. macros called via `dbt run-operation` cannot `ref()` ephemeral nodes) diff --git a/website/docs/docs/building-a-dbt-project/building-models/using-custom-aliases.md b/website/docs/docs/building-a-dbt-project/building-models/using-custom-aliases.md index c382c2c666..5675c64ae3 100644 --- a/website/docs/docs/building-a-dbt-project/building-models/using-custom-aliases.md +++ b/website/docs/docs/building-a-dbt-project/building-models/using-custom-aliases.md @@ -8,7 +8,7 @@ id: "using-custom-aliases" When dbt runs a model, it will generally create a relation (either a `table` or a `view`) in the database. By default, dbt uses the filename of the model as the identifier for this relation in the database. This identifier can optionally be overridden using the `alias` model configuration. ### Why alias model names? -The names of schemas and tables are effectively the "user interface" of your data warehouse. Well-named schemas and tables can help provide clarity and direction for consumers of this data. In combination with [custom schemas](using-custom-schemas), model aliasing is a powerful mechanism for designing your warehouse. +The names of schemas and tables are effectively the "user interface" of your . Well-named schemas and tables can help provide clarity and direction for consumers of this data. In combination with [custom schemas](using-custom-schemas), model aliasing is a powerful mechanism for designing your warehouse. ### Usage The `alias` config can be used to change the name of a model's identifier in the database. The following shows examples of database identifiers for models both with, and without, a supplied `alias`. diff --git a/website/docs/docs/building-a-dbt-project/documentation.md b/website/docs/docs/building-a-dbt-project/documentation.md index 3bd7e78aaa..711dc8130a 100644 --- a/website/docs/docs/building-a-dbt-project/documentation.md +++ b/website/docs/docs/building-a-dbt-project/documentation.md @@ -17,7 +17,7 @@ Good documentation for your dbt models will help downstream consumers discover a dbt provides a way to generate documentation for your dbt project and render it as a website. The documentation for your project includes: * **Information about your project**: including model code, a DAG of your project, any tests you've added to a column, and more. -* **Information about your data warehouse**: including column data types, and sizes. This information is generated by running queries against the information schema. +* **Information about your **: including column data types, and sizes. This information is generated by running queries against the information schema. 
Here's a screenshot of an example docs site (you can find the whole site [here](https://www.getdbt.com/mrr-playbook/#!/overview)): diff --git a/website/docs/docs/building-a-dbt-project/hooks-operations.md b/website/docs/docs/building-a-dbt-project/hooks-operations.md index fb6778f212..64942b245a 100644 --- a/website/docs/docs/building-a-dbt-project/hooks-operations.md +++ b/website/docs/docs/building-a-dbt-project/hooks-operations.md @@ -39,7 +39,7 @@ Hooks are a more-advanced capability that enable you to run custom SQL, and leve -In order to streamline hooks and automatically apply grants when your dbt model runs, we recommend using [`grants` resource-config](/reference/resource-configs/grants). + If (and only if) you can't leverage the [`grants` resource-config](/reference/resource-configs/grants), you can use `post-hook` to perform more advanced workflows: @@ -238,7 +238,7 @@ Full usage docs can for the `run-operation` command can be found [here](run-oper These examples from the community highlight some of the use-cases for hooks and operations! -* [In-depth discussion of granting privileges using hooks and operations](https://discourse.getdbt.com/t/the-exact-grant-statements-we-use-in-a-dbt-project/430) +* [In-depth discussion of granting privileges using hooks and operations, for dbt Core versions prior to 1.2](https://discourse.getdbt.com/t/the-exact-grant-statements-we-use-in-a-dbt-project/430) * [Staging external tables](https://github.com/dbt-labs/dbt-external-tables) * [Performing a zero copy clone on Snowflake to reset a dev environment](https://discourse.getdbt.com/t/creating-a-dev-environment-quickly-on-snowflake/1151/2) * [Running `vacuum` and `analyze` on a Redshift warehouse](https://github.com/dbt-labs/redshift/tree/0.2.3/#redshift_maintenance_operation-source) diff --git a/website/docs/docs/building-a-dbt-project/metrics.md b/website/docs/docs/building-a-dbt-project/metrics.md index 3ca76eb0c6..1a7d595bd0 100644 --- a/website/docs/docs/building-a-dbt-project/metrics.md +++ b/website/docs/docs/building-a-dbt-project/metrics.md @@ -29,6 +29,8 @@ A metric is a timeseries aggregation over a that supports ze In v1.0, dbt supports metric definitions as a new node type. Like [exposures](exposures), metrics participate in the dbt DAG and can be expressed in YAML files. By defining metrics in dbt projects, you encode crucial business logic in tested, version-controlled code. Further, you can expose these metrics definitions to downstream tooling, which drives consistency and precision in metric reporting. +For more information on querying the metrics defined in your dbt project, please reference the readme in the [dbt_metrics package.](https://github.com/dbt-labs/dbt_metrics) + ### Benefits of defining metrics **Use metric specifications in downstream tools** @@ -95,7 +97,7 @@ metrics: | Field | Description | Example | Required? | |-------------|-------------------------------------------------------------|---------------------------------|-----------| | name | A unique identifier for the metric | new_customers | yes | -| model | The dbt model that powers this metric | dim_customers | yes | +| model | The dbt model that powers this metric | dim_customers | yes (no for `expression` metrics)yes | | label | A short for name / label for the metric | New Customers | no | | description | Long form, human-readable description for the metric | The number of customers who.... 
| no | | type | The type of calculation to perform when evaluating a metric | count_distinct | yes | @@ -106,6 +108,59 @@ metrics: | filters | A list of filters to apply before calculating the metric | See below | no | | meta | Arbitrary key/value store | {team: Finance} | no | + +### Available types + +| Metric Type | Description | +|----------------|----------------------------------------------------------------------------| +| count | This metric type will apply the `count` aggregation to the specified field | +| count_distinct | This metric type will apply the `count` aggregation to the specified field, with an additional distinct statement inside the aggregation | +| sum | This metric type will apply the `sum` aggregation to the specified field | +| average | This metric type will apply the `average` aggregation to the specified field | +| min | This metric type will apply the `min` aggregation to the specified field | +| max | This metric type will apply the `max` aggregation to the specified field | +| expression | This metric type is defined as any **non-aggregating** calculation of 1 or more metrics Not yet available — added in v1.2 | + + + +### Expression Metrics +In v1.2, support was added for `expression` metrics, which are defined as non-aggregating calculations of 1 or more metrics. By defining these metrics, you are able to create metrics like: +- ratios +- subtractions +- any arbitrary calculation + +As long as the two+ base metrics (the metrics that comprise the `expression` metric) share the specified `time_grains` and `dimensions`, those attributes can be used in any downstream metrics macro. + +An example definition of an `expression` metric is: + + + + +```yaml +# models/marts/product/schema.yml +version: 2 + +models: + - name: dim_customers + ... + +metrics: + - name: average_revenue_per_customer + label: Average Revenue Per Customer + description: "The average revenue received per customer" + + type: expression + sql: "{{metric('total_revenue')}} / {{metric('count_of_customers')}}" + + timestamp: order_date + time_grains: [day, week, month] + dimensions: + - had_discount + - order_country + +``` + + ### Filters Filters should be defined as a list of dictionaries that define predicates for the metric. Filters are combined using AND clauses. For more control, users can (and should) include the complex logic in the model powering the metric. diff --git a/website/docs/docs/building-a-dbt-project/projects.md b/website/docs/docs/building-a-dbt-project/projects.md index 87a1647a81..aefe335345 100644 --- a/website/docs/docs/building-a-dbt-project/projects.md +++ b/website/docs/docs/building-a-dbt-project/projects.md @@ -43,7 +43,7 @@ To create a new dbt project when developing in dbt Cloud: * Click the hamburger menu, and then `Home`. * Switch the project in the header bar to your new "dbt Tutorial" project. 3. Complete the project setup flow: - * Connect to your data warehouse + * Connect to your * Add a repository — either choose a managed repository, or connect to an existing, but bare, repository. 
diff --git a/website/docs/docs/building-a-dbt-project/seeds.md b/website/docs/docs/building-a-dbt-project/seeds.md index d13d570959..a0b935461f 100644 --- a/website/docs/docs/building-a-dbt-project/seeds.md +++ b/website/docs/docs/building-a-dbt-project/seeds.md @@ -8,7 +8,7 @@ id: "seeds" * [`seed` command](seed) ## Getting started -Seeds are CSV files in your dbt project (typically in your `seeds` directory), that dbt can load into your data warehouse using the `dbt seed` command. +Seeds are CSV files in your dbt project (typically in your `seeds` directory), that dbt can load into your using the `dbt seed` command. Seeds can be referenced in downstream models the same way as referencing models — by using the [`ref` function](/reference/dbt-jinja-functions/ref). diff --git a/website/docs/docs/contributing/building-a-new-adapter.md b/website/docs/docs/contributing/building-a-new-adapter.md index f0eb5ffed6..0cceadd22e 100644 --- a/website/docs/docs/contributing/building-a-new-adapter.md +++ b/website/docs/docs/contributing/building-a-new-adapter.md @@ -36,7 +36,7 @@ The more you can answer Yes to the below questions, the easier your adapter deve - Does your organization have an established process for publishing open source software? -It is easiest to build an adapter for dbt when the following the data warehouse/platform in question has: +It is easiest to build an adapter for dbt when the following the /platform in question has: - a conventional ANSI-SQL interface (or as close to it as possible), - a mature connection library/SDK that uses ODBC or Python DB 2 API, and - a way to enable developers to iterate rapidly with both quick reads and writes diff --git a/website/docs/docs/contributing/testing-a-new-adapter.md b/website/docs/docs/contributing/testing-a-new-adapter.md index cc2f56184b..96e96dc06d 100644 --- a/website/docs/docs/contributing/testing-a-new-adapter.md +++ b/website/docs/docs/contributing/testing-a-new-adapter.md @@ -203,7 +203,7 @@ In the course of creating and maintaining your adapter, it's likely that you wil 2. **Optional tests**, for second-order functionality that is common across plugins, but not required for basic use. Your plugin can opt into these test cases by inheriting existing ones, or reimplementing them with adjustments. For now, this category includes all tests located outside the `basic` subdirectory. More tests will be added as we convert older tests defined on dbt-core and mature plugins to use the standard framework. -3. **Custom tests**, for behavior that is specific to your adapter / data platform. Each data warehouse has its own specialties and idiosyncracies. We encourage you to use the same `pytest`-based framework, utilities, and fixtures to write your own custom tests for functionality that is unique to your adapter. +3. **Custom tests**, for behavior that is specific to your adapter / data platform. Each has its own specialties and idiosyncracies. We encourage you to use the same `pytest`-based framework, utilities, and fixtures to write your own custom tests for functionality that is unique to your adapter. If you run into an issue with the core framework, or the basic/optional test cases—or if you've written a custom test that you believe would be relevant and useful for other adapter plugin developers—please open an issue or PR in the `dbt-core` repository on GitHub. 
diff --git a/website/docs/docs/core-versions.md b/website/docs/docs/core-versions.md index e1bd7d47c3..0c8ff11930 100644 --- a/website/docs/docs/core-versions.md +++ b/website/docs/docs/core-versions.md @@ -16,7 +16,7 @@ dbt Core releases follow [semantic versioning](https://semver.org/). The policie - We are no longer releasing new patches for minor versions prior to v1.0. - As of June 30, 2022, dbt Cloud will remove support for dbt Core versions older than v1.0. At that point, we will also remove v0.20 and v0.21 from the version dropdown on this website. -- You can read the [specific version migration guides](/docs/guides/migration-guide) to understand changes to each version. Each migration guide will link to pages of documentation that were added or updated. Those pages of documentation will also include "Changelog" notes, which you can toggle to see notes on specific changes from each older version. +- You can read the [specific version migration guides](/guides/migration/versions) to understand changes to each version. Each migration guide will link to pages of documentation that were added or updated. Those pages of documentation will also include "Changelog" notes, which you can toggle to see notes on specific changes from each older version. ## Version support starting with v1.0 diff --git a/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-setting-up-bigquery-oauth.md b/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-setting-up-bigquery-oauth.md index 48349020da..ac059b5836 100644 --- a/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-setting-up-bigquery-oauth.md +++ b/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-setting-up-bigquery-oauth.md @@ -12,7 +12,7 @@ dbt Cloud supports [OAuth authentication](https://cloud.google.com/bigquery/docs :::info Some Pre-Work Required -Before setting up a Client ID & Secret, you'll have to have your existing BigQuery Settings in order. We recommend using a Service Account JSON file, and have a walkthrough for that [here](setting-up#generate-bigquery-credentials) - you will also need to set up an [OAuth Consent Screen](https://support.google.com/cloud/answer/6158849) if you haven't already! +Before setting up a Client ID & Secret, you'll have to have your existing BigQuery Settings in order. We recommend using a Service Account JSON file, and have a walkthrough for that [here](/guides/getting-started/getting-set-up/setting-up-bigquery#generate-bigquery-credentials) - you will also need to set up an [OAuth Consent Screen](https://support.google.com/cloud/answer/6158849) if you haven't already! ::: diff --git a/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database.md b/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database.md index 2ef4726b44..fedfb8f0c7 100644 --- a/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database.md +++ b/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database.md @@ -19,7 +19,7 @@ any database grants. -Allowing these IP addresses only enables the connection to your data warehouse. However, you might want to send API requests from your restricted network to the dbt Cloud API. For example, you could use the API to send a POST request that [triggers a job to run](https://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun). Using the dbt Cloud API requires that you allow the `cloud.getdbt.com` subdomain. 
For more on the dbt Cloud architecture, see "[Deployment architecture](deployment-architecture)." +Allowing these IP addresses only enables the connection to your . However, you might want to send API requests from your restricted network to the dbt Cloud API. For example, you could use the API to send a POST request that [triggers a job to run](https://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun). Using the dbt Cloud API requires that you allow the `cloud.getdbt.com` subdomain. For more on the dbt Cloud architecture, see "[Deployment architecture](deployment-architecture)." ## Connecting to Redshift and Postgres diff --git a/website/docs/docs/dbt-cloud/cloud-quickstart.md b/website/docs/docs/dbt-cloud/cloud-quickstart.md index b756c8f5d5..7b78613a9e 100644 --- a/website/docs/docs/dbt-cloud/cloud-quickstart.md +++ b/website/docs/docs/dbt-cloud/cloud-quickstart.md @@ -17,7 +17,7 @@ Let's get started! ## Create a connection -dbt Cloud uses this connection to _connect_ to your database when running jobs and transformation queries. Depending on the type of data warehouse you're using, you'll need to supply [different configuration parameters](docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database). dbt Cloud natively supports connections to Snowflake, BigQuery, Redshift, Apache Spark, Databricks, and Postgres. +dbt Cloud uses this connection to _connect_ to your database when running jobs and transformation queries. Depending on the type of you're using, you'll need to supply [different configuration parameters](docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database). dbt Cloud natively supports connections to Snowflake, BigQuery, Redshift, Apache Spark, Databricks, and Postgres. After picking a data warehouse type, a form will be generated where you can populate your warehouse's credentials. These credentials are encrypted at rest, and dbt Cloud never stores credentials in plaintext. diff --git a/website/docs/docs/dbt-cloud/deployments/architecture.md b/website/docs/docs/dbt-cloud/deployments/architecture.md index 370a308247..53026b580c 100644 --- a/website/docs/docs/dbt-cloud/deployments/architecture.md +++ b/website/docs/docs/dbt-cloud/deployments/architecture.md @@ -32,7 +32,7 @@ In addition to the application components, there are a few critical dependencies ### Data Warehouse Interaction -dbt Cloud's primary role is as a data processor, not a data store. The dbt Cloud application enables users to dispatch SQL to the warehouse for transformation purposes. However, it is possible for users to dispatch SQL that returns customer data into the dbt Cloud application. This data is never persisted and will only exist in memory on the instance in question. In order to properly lock down customer data, it is critical that proper data warehouse permissioning is applied to prevent improper access or storage of sensitive data. +dbt Cloud's primary role is as a data processor, not a data store. The dbt Cloud application enables users to dispatch SQL to the warehouse for transformation purposes. However, it is possible for users to dispatch SQL that returns customer data into the dbt Cloud application. This data is never persisted and will only exist in memory on the instance in question. In order to properly lock down customer data, it is critical that proper permissioning is applied to prevent improper access or storage of sensitive data. 
### Deployment Architecture diff --git a/website/docs/docs/dbt-cloud/on-premises/setup.md b/website/docs/docs/dbt-cloud/on-premises/setup.md index 1e0d9bf051..b0823e6e65 100644 --- a/website/docs/docs/dbt-cloud/on-premises/setup.md +++ b/website/docs/docs/dbt-cloud/on-premises/setup.md @@ -85,7 +85,7 @@ Each user can have a specific role on each account. For more information on each A new version of dbt Cloud will appear on the Version History page in your Configuration Console anytime any of the following happen: -- A new version of the dbt Cloud code is released. This typically happens every two weeks, and each new version will be accompanied by a [changelog](/docs/dbt-cloud/cloud-changelog). +- A new version of the dbt Cloud code is released. This typically happens every two weeks, and each new version will be accompanied by a [changelog](/docs/dbt-cloud/release-notes). - Any configuration change is applied to your application via the Configuration Console. - Anytime an edit is applied to your Kubernetes configs via the overlays mechanism built into kots. diff --git a/website/docs/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration-with-github.md b/website/docs/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration-with-github.md index c632974ae0..bd362d1319 100644 --- a/website/docs/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration-with-github.md +++ b/website/docs/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration-with-github.md @@ -31,7 +31,7 @@ When a [dbt Cloud CI job is set up](cloud-enabling-continuous-integration#config -When the run is complete, dbt Cloud will update the PR in GitHub, GitLab, or Azure DevOps with a status message indicating the results of the run, letting you know if the models and tests ran successfully or not. And finally, once the pull request is closed or merged, dbt Cloud will delete the temporary schema from your data warehouse. +When the run is complete, dbt Cloud will update the PR in GitHub, GitLab, or Azure DevOps with a status message indicating the results of the run, letting you know if the models and tests ran successfully or not. And finally, once the pull request is closed or merged, dbt Cloud will delete the temporary schema from your . ### GitHub pull request example diff --git a/website/docs/docs/faqs/define-a-column-type.md b/website/docs/docs/faqs/define-a-column-type.md index 13f2844bc6..11f7248bda 100644 --- a/website/docs/docs/faqs/define-a-column-type.md +++ b/website/docs/docs/faqs/define-a-column-type.md @@ -16,7 +16,7 @@ from {{ ref('stg_orders') }} -Many modern data warehouses now support `::` syntax as a shorthand for `cast( as )`. +Many modern s now support `::` syntax as a shorthand for `cast( as )`. diff --git a/website/docs/docs/faqs/load-raw-data-with-seed.md b/website/docs/docs/faqs/load-raw-data-with-seed.md index 142ce37979..2cf6e76774 100644 --- a/website/docs/docs/faqs/load-raw-data-with-seed.md +++ b/website/docs/docs/faqs/load-raw-data-with-seed.md @@ -6,4 +6,4 @@ Seeds should **not** be used to load raw data (for example, large CSV exports fr Since seeds are version controlled, they are best suited to files that contain business-specific logic, for example a list of country codes or user IDs of employees. -Loading CSVs using dbt's seed functionality is not performant for large files. Consider using a different tool to load these CSVs into your data warehouse. +Loading CSVs using dbt's seed functionality is not performant for large files. 
Consider using a different tool to load these CSVs into your . diff --git a/website/docs/docs/faqs/loading-data.md b/website/docs/docs/faqs/loading-data.md index 28d53210cf..9ed6e09eb8 100644 --- a/website/docs/docs/faqs/loading-data.md +++ b/website/docs/docs/faqs/loading-data.md @@ -1,7 +1,7 @@ --- title: How do I load data into my warehouse? --- -dbt assumes that you already have a copy of your data, in your data warehouse. We recommend you use an off-the-shelf tool like [Stitch](https://www.stitchdata.com/) or [Fivetran](https://fivetran.com/) to get data into your warehouse. +dbt assumes that you already have a copy of your data, in your . We recommend you use an off-the-shelf tool like [Stitch](https://www.stitchdata.com/) or [Fivetran](https://fivetran.com/) to get data into your warehouse. **Can dbt be used to load data?** diff --git a/website/docs/docs/introduction.md b/website/docs/docs/introduction.md index 8205986132..b6eaa1ae8a 100644 --- a/website/docs/docs/introduction.md +++ b/website/docs/docs/introduction.md @@ -29,7 +29,7 @@ Want to check out a sample project? Have a look at our [Jaffle Shop](https://git ::: ### Database Connections -dbt connects to your data warehouse to run data transformation queries. As such, you’ll need a data warehouse with source data loaded in it to use dbt. dbt natively supports connections to Snowflake, BigQuery, Redshift and Postgres data warehouses, and there’s a number of community-supported adapters for other warehouses (see [docs](available-adapters)). +dbt connects to your to run data transformation queries. As such, you’ll need a data warehouse with source data loaded in it to use dbt. dbt natively supports connections to Snowflake, BigQuery, Redshift and Postgres data warehouses, and there’s a number of community-supported adapters for other warehouses (see [docs](available-adapters)). When you define your connection, you’ll also be able to specify the target schema where dbt should create your models as tables and views. See [Managing environments](managing-environments) for more information on picking target schema names. diff --git a/website/docs/faqs/Accounts/dbt-specific-jinja.md b/website/docs/faqs/Accounts/dbt-specific-jinja.md index eed87dd575..e262349100 100644 --- a/website/docs/faqs/Accounts/dbt-specific-jinja.md +++ b/website/docs/faqs/Accounts/dbt-specific-jinja.md @@ -5,4 +5,4 @@ sidebar_label: 'dbt-specific Jinja' id: dbt-specific-jinja --- -There are certain expressions that are specific to dbt — these are documented in the [Jinja function reference](dbt-jinja-functions) section of these docs. Further, docs blocks, snapshots, and materializations are custom Jinja _blocks_ that exist only in dbt. +There are certain expressions that are specific to dbt — these are documented in the [Jinja function reference](/reference/dbt-jinja-functions) section of these docs. Further, docs blocks, snapshots, and materializations are custom Jinja _blocks_ that exist only in dbt. diff --git a/website/docs/faqs/Jinja/which-jinja-docs.md b/website/docs/faqs/Jinja/which-jinja-docs.md index 6ef7581023..5272509e8b 100644 --- a/website/docs/faqs/Jinja/which-jinja-docs.md +++ b/website/docs/faqs/Jinja/which-jinja-docs.md @@ -8,5 +8,5 @@ id: which-jinja-docs If you are stuck with a Jinja issue, it can get confusing where to check for more information. We recommend you check (in order): 1. [Jinja's Template Designer Docs](https://jinja.palletsprojects.com/page/templates/): This is the best reference for most of the Jinja you'll use -2. 
[Our Jinja function reference](dbt-jinja-functions): This documents any additional functionality we've added to Jinja in dbt. +2. [Our Jinja function reference](/docs/building-a-dbt-project/jinja-macros#related-reference-docs): This documents any additional functionality we've added to Jinja in dbt. 3. [Agate's table docs](https://agate.readthedocs.io/page/api/table.html): If you're operating on the result of a query, dbt will pass it back to you as an agate table. This means that the methods you call on the belong to the Agate library rather than Jinja or dbt. diff --git a/website/docs/faqs/Models/create-dependencies.md b/website/docs/faqs/Models/create-dependencies.md index c804ce5997..5ed6f3f5b7 100644 --- a/website/docs/faqs/Models/create-dependencies.md +++ b/website/docs/faqs/Models/create-dependencies.md @@ -44,4 +44,4 @@ Found 2 models, 28 tests, 0 snapshots, 0 analyses, 130 macros, 0 operations, 0 s Done. PASS=2 WARN=0 ERROR=0 SKIP=0 TOTAL=2 ``` -To get some practice with this, we recommend you complete the [tutorial](tutorial/getting-started.md) to build your first dbt project +To get some practice with this, we recommend you complete the [tutorial](/guides/getting-started) to build your first dbt project diff --git a/website/docs/faqs/Project/define-a-column-type.md b/website/docs/faqs/Project/define-a-column-type.md index 63b0fe64a9..0f15f9e689 100644 --- a/website/docs/faqs/Project/define-a-column-type.md +++ b/website/docs/faqs/Project/define-a-column-type.md @@ -20,7 +20,7 @@ from {{ ref('stg_orders') }} -Many modern data warehouses now support `::` syntax as a shorthand for `cast( as )`. +Many modern s now support `::` syntax as a shorthand for `cast( as )`. diff --git a/website/docs/faqs/Seeds/load-raw-data-with-seed.md b/website/docs/faqs/Seeds/load-raw-data-with-seed.md index ef65a36d61..e33780f5dc 100644 --- a/website/docs/faqs/Seeds/load-raw-data-with-seed.md +++ b/website/docs/faqs/Seeds/load-raw-data-with-seed.md @@ -10,4 +10,4 @@ Seeds should **not** be used to load raw data (for example, large CSV exports fr Since seeds are version controlled, they are best suited to files that contain business-specific logic, for example a list of country codes or user IDs of employees. -Loading CSVs using dbt's seed functionality is not performant for large files. Consider using a different tool to load these CSVs into your data warehouse. +Loading CSVs using dbt's seed functionality is not performant for large files. Consider using a different tool to load these CSVs into your . diff --git a/website/docs/faqs/Warehouse/loading-data.md b/website/docs/faqs/Warehouse/loading-data.md index 448920b1d4..3fb13f139b 100644 --- a/website/docs/faqs/Warehouse/loading-data.md +++ b/website/docs/faqs/Warehouse/loading-data.md @@ -5,7 +5,7 @@ sidebar_label: 'Recommendations on tools to get data into your warehouse' id: loading-data --- -dbt assumes that you already have a copy of your data, in your data warehouse. We recommend you use an off-the-shelf tool like [Stitch](https://www.stitchdata.com/) or [Fivetran](https://fivetran.com/) to get data into your warehouse. +dbt assumes that you already have a copy of your data, in your . We recommend you use an off-the-shelf tool like [Stitch](https://www.stitchdata.com/) or [Fivetran](https://fivetran.com/) to get data into your warehouse. 
**Can dbt be used to load data?** diff --git a/website/docs/guides/getting-started/building-your-first-project/schedule-a-job.md b/website/docs/guides/getting-started/building-your-first-project/schedule-a-job.md index 6f8a436102..e9b172565c 100644 --- a/website/docs/guides/getting-started/building-your-first-project/schedule-a-job.md +++ b/website/docs/guides/getting-started/building-your-first-project/schedule-a-job.md @@ -48,7 +48,7 @@ Congratulations 🎉! You've just deployed your first dbt project! ## Next steps -Congratulations! Now that you've got a working dbt project, you can read about dbt [best practices](/docs/guides/best-practices). +Congratulations! Now that you've got a working dbt project, you can read about dbt [best practices](/guides/best-practices). You can improve your dbt skills with these fun exercises: diff --git a/website/docs/guides/getting-started/getting-set-up/setting-up-bigquery.md b/website/docs/guides/getting-started/getting-set-up/setting-up-bigquery.md index 7478133ec9..08a6359ab3 100644 --- a/website/docs/guides/getting-started/getting-set-up/setting-up-bigquery.md +++ b/website/docs/guides/getting-started/getting-set-up/setting-up-bigquery.md @@ -88,7 +88,7 @@ You will learn how to connect dbt Cloud to Google BigQuery so that you can lever -In order to let dbt connect to your warehouse, you'll need to generate a keyfile. This is analogous to using a database user name and password with most other data warehouses. +In order to let dbt connect to your warehouse, you'll need to generate a keyfile. This is analogous to using a database user name and password with most other data warehouses. 1. Go to the [BigQuery credential wizard](https://console.cloud.google.com/apis/credentials/wizard). Make sure your new project is selected in the header. If you do not see your account or project, click your profile picture to the right and verify your are using the correct email account. 2. Select **+ Create Credentials** then select **Service account**. diff --git a/website/docs/guides/getting-started/learning-more/getting-started-dbt-core.md b/website/docs/guides/getting-started/learning-more/getting-started-dbt-core.md index 4a04f3a86d..32213e2c9b 100644 --- a/website/docs/guides/getting-started/learning-more/getting-started-dbt-core.md +++ b/website/docs/guides/getting-started/learning-more/getting-started-dbt-core.md @@ -87,7 +87,7 @@ To create your dbt project: ### Connect to BigQuery -When developing locally, dbt connects to your data warehouse using a [profile](/dbt-cli/configure-your-profile), which is a yaml file with all the connection details to your warehouse. +When developing locally, dbt connects to your using a [profile](/dbt-cli/configure-your-profile), which is a yaml file with all the connection details to your warehouse. 1. Create a file in the `~/.dbt/` directory named `profiles.yml`. 2. Move your BigQuery keyfile into this directory. 
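For context, a keyfile-based BigQuery entry in `profiles.yml` generally looks something like the sketch below; the profile name, project, dataset, and file path are placeholders to replace with your own values.

```yaml
jaffle_shop:            # placeholder profile name, must match the profile in dbt_project.yml
  target: dev
  outputs:
    dev:
      type: bigquery
      method: service-account
      project: your-gcp-project-id                 # placeholder GCP project
      dataset: dbt_your_name                       # placeholder development dataset/schema
      keyfile: /Users/you/.dbt/your-keyfile.json   # placeholder path to the keyfile from the previous step
      threads: 4
```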
diff --git a/website/docs/guides/getting-started/learning-more/refactoring-legacy-sql.md b/website/docs/guides/getting-started/learning-more/refactoring-legacy-sql.md index 97f2c9715a..aa4073763d 100644 --- a/website/docs/guides/getting-started/learning-more/refactoring-legacy-sql.md +++ b/website/docs/guides/getting-started/learning-more/refactoring-legacy-sql.md @@ -42,7 +42,7 @@ Once you've copied it over, you'll want to `dbt run` to execute the query and po This step may sound simple, but if you're porting over an existing set of SQL transformations to a new SQL dialect, you will need to consider how your legacy SQL dialect differs from your new SQL flavor, and you may need to modify your legacy code to get it to run at all. -This will commonly happen if you're migrating from a [stored procedure workflow on a legacy database](https://getdbt.com/analytics-engineering/case-for-elt-workflow/) into dbt + a cloud data warehouse. +This will commonly happen if you're migrating from a [stored procedure workflow on a legacy database](https://getdbt.com/analytics-engineering/case-for-elt-workflow/) into dbt + a cloud . Functions that you were using previously may not exist, or their syntax may shift slightly between SQL dialects. diff --git a/website/docs/guides/legacy/best-practices.md b/website/docs/guides/legacy/best-practices.md index ed52971017..08e0a80e57 100644 --- a/website/docs/guides/legacy/best-practices.md +++ b/website/docs/guides/legacy/best-practices.md @@ -86,7 +86,7 @@ Our [style guide](https://github.com/dbt-labs/corp/blob/master/dbt_style_guide.m ::: ### Consider the information architecture of your data warehouse -When a user connects to a data warehouse via a SQL client, they often rely on the names of schemas, relations, and columns, to understand the data they are presented with. To improve the information architecture of a data warehouse, we: +When a user connects to a via a SQL client, they often rely on the names of schemas, relations, and columns, to understand the data they are presented with. To improve the information architecture of a data warehouse, we: * Use [custom schemas](using-custom-schemas) to separate relations into logical groupings, or hide intermediate models in a separate schema. Generally, these custom schemas align with the directories we use to group our models, and are configured from the `dbt_project.yml` file. * Use prefixes in names (for example, `stg_`, `fct_` and `dim_`) to indicate which relations should be queried by end users. diff --git a/website/docs/guides/legacy/building-packages.md b/website/docs/guides/legacy/building-packages.md index 226dd3dca3..505c1c93b0 100644 --- a/website/docs/guides/legacy/building-packages.md +++ b/website/docs/guides/legacy/building-packages.md @@ -78,7 +78,7 @@ If you wish to support multiple warehouses, we have a number of tricks up our sl - If you're working on a modeling package, you may notice that you need write different models for each warehouse (for example, if the EL tool you are working with stores data differently on each warehouse). In this case, you can write different versions of each model, and use the [`enabled` config](enabled), in combination with [`target.type`](/reference/dbt-jinja-functions/target) to enable the correct models — check out [this package](https://github.com/fivetran/dbt_facebook_ads_creative_history/blob/master/dbt_project.yml#L11-L16) as an example. -If your package has only been written to work for one data warehouse, make sure you document this in your package README. 
+If your package has only been written to work for one , make sure you document this in your package README. ### Use specific model names _Modeling packages only_ diff --git a/website/docs/guides/legacy/debugging-errors.md b/website/docs/guides/legacy/debugging-errors.md index 07726ceb59..72529da3f8 100644 --- a/website/docs/guides/legacy/debugging-errors.md +++ b/website/docs/guides/legacy/debugging-errors.md @@ -319,7 +319,7 @@ Your dbt DAG is not acyclic, and needs to be fixed! ## Database Errors -The thorniest errors of all! These errors come from your data warehouse, and dbt passes the message on. You may need to use your warehouse docs (i.e. the Snowflake docs, or BigQuery docs) to debug these. +The thorniest errors of all! These errors come from your , and dbt passes the message on. You may need to use your warehouse docs (i.e. the Snowflake docs, or BigQuery docs) to debug these. ``` $ dbt run diff --git a/website/docs/guides/legacy/managing-environments.md b/website/docs/guides/legacy/managing-environments.md index f3d9035e3c..a62b584bd5 100644 --- a/website/docs/guides/legacy/managing-environments.md +++ b/website/docs/guides/legacy/managing-environments.md @@ -10,7 +10,7 @@ In software engineering, environments are used to enable engineers to develop an In traditional software engineering, different environments often use completely separate architecture. For example, the dev and prod versions of a website may use different servers and databases. -Data warehouses can also be designed to have separate environments – the _production_ environment refers to the relations (i.e. schemas, tables, and views) that your end users query (often through a BI tool). +Data warehouses can also be designed to have separate environments – the _production_ environment refers to the relations (i.e. schemas, tables, and views) that your end users query (often through a BI tool). ## How do I maintain different environments with dbt? dbt makes it easy to maintain separate production and development environments through the use of targets within a profile. A typical profile when using dbt locally (i.e. running from your command line) will have a target named `dev`, and have this set as the default. This means that while making changes, your objects will be built in your _development_ target, without affecting production queries made by your end users. Once you are confident in your changes, you can deploy the code to _production_, by running your dbt project with a _prod_ target. diff --git a/website/docs/guides/legacy/navigating-the-docs.md b/website/docs/guides/legacy/navigating-the-docs.md index e65a439b7b..4a289ef50e 100644 --- a/website/docs/guides/legacy/navigating-the-docs.md +++ b/website/docs/guides/legacy/navigating-the-docs.md @@ -46,7 +46,7 @@ How analytics engineers use dbt to solve their tactical problems, e.g.: * Version controlling UDFs * Writing a custom schema test for not null * Snowflake shares + dbt -* Permission schemes in a data warehouse +* Permission schemes in a Usually these are write-ups where there is no one perfect answer (unlike the “I’m stuck” questions on Stack Overflow), instead, you might need to dig into the “why” or discuss tradeoffs of your approach in these articles. 
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures.md index 47bc828a55..aae8b373b2 100644 --- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures.md +++ b/website/docs/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures.md @@ -9,7 +9,7 @@ One of the more common situations that new dbt adopters encounter is a historica Before getting into the meat of conversion, it’s worth noting that DML statements will not always illustrate a comprehensive set of columns and column types that an original table might contain. Without knowing the DDL to create the table, it’s impossible to know precisely if your conversion effort is apples-to-apples, but you can generally get close. -If your data warehouse supports `SHOW CREATE TABLE`, that can be a quick way to get a comprehensive set of columns you’ll want to recreate. If you don’t have the DDL, but are working on a substantial stored procedure, one approach that can work is to pull column lists out of any DML statements that modify the table, and build up a full set of the columns that appear. +If your supports `SHOW CREATE TABLE`, that can be a quick way to get a comprehensive set of columns you’ll want to recreate. If you don’t have the DDL, but are working on a substantial stored procedure, one approach that can work is to pull column lists out of any DML statements that modify the table, and build up a full set of the columns that appear. As for ensuring that you have the right column types, since models materialized by dbt generally use `CREATE TABLE AS SELECT` or `CREATE VIEW AS SELECT` as the driver for object creation, tables can end up with unintended column types if the queries aren’t explicit. For example, if you care about `INT` versus `DECIMAL` versus `NUMERIC`, it’s generally going to be best to be explicit. The good news is that this is easy with dbt: you just cast the column to the type you intend. diff --git a/website/docs/guides/migration/versions/06-upgrading-to-v1.2.md b/website/docs/guides/migration/versions/06-upgrading-to-v1.2.md index 3e647ebba6..e137578776 100644 --- a/website/docs/guides/migration/versions/06-upgrading-to-v1.2.md +++ b/website/docs/guides/migration/versions/06-upgrading-to-v1.2.md @@ -13,121 +13,25 @@ title: "Upgrading to v1.2 (prerelease)" There are no breaking changes for end users of dbt. We are committed to providing backwards compatibility for all versions 1.x. If you encounter an error upon upgrading, please let us know by [opening an issue](https://github.com/dbt-labs/dbt-core/issues/new). -## For maintainers of adapter plugins - -### Cross-database Macros - -In [dbt-core#5298](https://github.com/dbt-labs/dbt-core/pull/5298), we migrated a collection of ["cross-database macros"](cross-database-macros) from [dbt-utils](https://github.com/dbt-labs/dbt-utils) to dbt-core. Default implementations are automatically inherited by adapters and included in the testing suite. Adapter maintainers may need to override the implementation of one or more macros to align with database-specific syntax or optimize performance. For details on the testing suite, see: ["Testing a new adapter"](testing-a-new-adapter). - -The TL;DR rationale for this work is: -1. Simplify dbt-utils development -2. 
Allow some packages to no longer depend on dbt-utils as a package -3. Provide adapter maintainers tests that can but used in the adapter repo CI, as opposed to in a shim package - -As for how to make it happen, looking at the following PRs for dbt-Labs-maintained adapters show it clearly: - -- [dbt-bigquery#192](https://github.com/dbt-labs/dbt-bigquery/pull/192) -- [dbt-redshift#120](https://github.com/dbt-labs/dbt-redshift/pull/120) -- [dbt-snowflake#162](https://github.com/dbt-labs/dbt-snowflake/pull/162) - - -### Grants - -Managing access grants is one of the most asked for features from dbt users. We’re delivering this capability, but naturally there’s variance across data platforms as to how grants work, so time for adapter maintainers to roll their sleeves up. You might get lucky and not have to override any of them, but in case you do, below are descriptions of the new methods and macros, grouped into level of complexity (start with the easy ones first!) - -:::info Note -This new functionality does not add users, only grants access. You'll have to handle adding users elsewhere, and it has [implications for the GRANT adapter tests](#testing-grants-for-your-adapter). -::: - -Pull requests for adding grants for dbt Labs-maintained adapters should be very useful as a reference, for example [dbt-bigquery#212](https://github.com/dbt-labs/dbt-bigquery/pull/212). - -#### Overrideable macros and methods - -The two macros below are simple Boolean-toggles (i.e. `True/False` value) indicating whether certain features are available for your database. The default of both of these macros are `True`, because we believe that all databases should support these ergonomic features. However, we've built for flexibility, so overriding these macros for your adapter, will handle the case where your database doesn't support these features. - -| macro | description | global project’s default | example override | -| --- | --- | --- | --- | -| `copy_grants()` | when an object is fully replaced on your database, do grants copy over? e.g. on Postgres this is never true, on Spark this is different for views vs. non-Delta tables vs. Delta tables, on Snowflake it depends on the user-supplied `copy_grants` configuration. true by default, which means “play it safe”: grants MIGHT have copied over, so dbt will run an extra query to check them + calculate diffs. | [`default__copy_grants()`](https://github.com/dbt-labs/dbt-core/blob/c25260e5dd2afa237a30db115605ece9629443d1/core/dbt/include/global_project/macros/adapters/apply_grants.sql#L3-L21)| [`snowflake__copy_grants()`](https://github.com/dbt-labs/dbt-snowflake/blob/d53c327e20c91522b4792ede75bbe50e16a9d9c3/dbt/include/snowflake/macros/adapters.sql#L297-L300) | -| `support_multiple_grantees_per_dcl_statement()` | does this database support `grant {privilege} to user_a, user_b, ...`? or do `user_a` + `user_b` need their own separate grant statements? | [`default__support_multiple_grantees_per_dcl_statement()`](https://github.com/dbt-labs/dbt-core/blob/c25260e5dd2afa237a30db115605ece9629443d1/core/dbt/include/global_project/macros/adapters/apply_grants.sql#L24-L39) | [`spark__support_multiple_grantees_per_dcl_statement()`](https://github.com/dbt-labs/dbt-spark/blob/9109fe1babaab92cbe1c58868977c7a9c998c2a8/dbt/include/spark/macros/apply_grants.sql#L28-L30) | - -If the above macros do not suffice, then at least one of these `get_*_sql()` macros will need to be overwritten. They're all one-liners and might need small syntax tweaks to work on your database. 
- -| macro | description | global project’s version | example override | -| --- | --- | --- | --- | -| `get_show_grant_sql()` | SQL that returns the `CURRENT` grants (privilege-grantee pairs) for a given relation | [`default__get_show_grant_sql()`](https://github.com/dbt-labs/dbt-core/blob/c25260e5dd2afa237a30db115605ece9629443d1/core/dbt/include/global_project/macros/adapters/apply_grants.sql#L63-L65) | [`redshift__get_show_grant_sql()`](https://github.com/dbt-labs/dbt-redshift/blob/9a3492a1c3394496c9061252d54c87caa112821a/dbt/include/redshift/macros/adapters/apply_grants.sql#L1-L27) | -| `get_grant_sql()` | generate a GRANT statement for a given relation given a privilege-grantee(s) pairing. grantees will be a list of grantees if supported by this database, otherwise just one. | [`default__get_grant_sql()`](https://github.com/dbt-labs/dbt-core/blob/ct-660-grant-sql/core/dbt/include/global_project/macros/adapters/apply_grants.sql#L36-L45) | [`spark__get_grant_sql()`](https://github.com/dbt-labs/dbt-spark/blob/bff1348931efb60a41831429c795498008d2d3ac/dbt/include/spark/macros/apply_grants.sql#L17-L29) | -| `get_revoke_sql()` | generate a REVOKE statement for a given relation given a privilege-grantee(s) pairing. grantees will be a list of grantees if supported by this database, otherwise just one. | [`default__get_revoke_sql()`](https://github.com/dbt-labs/dbt-core/blob/c25260e5dd2afa237a30db115605ece9629443d1/core/dbt/include/global_project/macros/adapters/apply_grants.sql#L81-L83) | [`bigquery__get_revoke_sql()`](https://github.com/dbt-labs/dbt-bigquery/blob/942d460fc60beb87325871903b26afee7e5f4d85/dbt/include/bigquery/macros/adapters/apply_grants.sql#L18-L20) | -| any custom materialization (or override of a default materialization) | you have to add the lines for fetching and applying the grants `{% set grant_config = config.get('grants') %}` and `{% do apply_grants(target_relation, grant_config) %}` by default, the `should_revoke` argument of `apply_grants` is `True`. dbt will first run a query to “show” grants, then calculate diffs, then apply revoke/grant statements. you can use the `should_revoke` macro to determine whether this extra step is necessary. in cases where dbt is fully replacing an object, or creating one for the first time, grants may not be carried over — so it may be more efficient to skip the “show” step and just add the grants. | | BigQuery’s [custom](https://github.com/dbt-labs/dbt-bigquery/blob/942d460fc60beb87325871903b26afee7e5f4d85/dbt/include/bigquery/macros/materializations/incremental.sql#L155-L156) incremental [materialization](https://github.com/dbt-labs/dbt-bigquery/blob/942d460fc60beb87325871903b26afee7e5f4d85/dbt/include/bigquery/macros/materializations/incremental.sql#L204) | - -If the above sets of macros still aren't cutting it, here's the final depth of complexity in which to wade. - -| macro | description | global project’s version | example override | -| --- | --- | --- | --- | -| `get_dcl_statement_list()` | Unpacks grant_config. For each privilege-grantee(s) pairing, call either get_grant_sql or get_revoke_sql and return a list of all needed statements | [`default__get_dcl_statement_list()`](https://github.com/dbt-labs/dbt-core/blob/c25260e5dd2afa237a30db115605ece9629443d1/core/dbt/include/global_project/macros/adapters/apply_grants.sql#L92-L112) | | -| `call_dcl_statements()` | Call all DCL statements, i.e. actually run them against the database. This is the culmination of apply_grants. 
By default, this generates one big string (every grant/revoke statement, separated by ;), but some adapters will need to execute these differently. | [`default__call_dcl_statements()`](https://github.com/dbt-labs/dbt-core/blob/c25260e5dd2afa237a30db115605ece9629443d1/core/dbt/include/global_project/macros/adapters/apply_grants.sql#L119-L132) | | -| `Adapter.standardize_grants_dict()` | Input: result from query to “show grants” Returns: a dictionary of structure `{"privilege_name": [list, of, grantees], ...}` —> matches the structure of the user-supplied `grant_config` | (this is a python method in the `core/dbt/adapters/base/impl.py`'s [`BaseAdapter.standardize_grants_dict()`](https://github.com/dbt-labs/dbt-core/blob/c25260e5dd2afa237a30db115605ece9629443d1/core/dbt/adapters/base/impl.py#L542-L567) | | +### For consumers of dbt artifacts (metadata) -##### Testing grants with your adapter +The manifest schema version has been updated to `v6`. The relevant changes are: +- Change to `config` default, which includes a new `grants` property with default value `{}` +- Addition of a `metrics` property, to any node which could reference metrics using the `metric()` function -The tests for grants are implemented in the same way as the pytest tests that were introduced in dbt-core v1.1.0, in that they are importable and can you create adapter-specific child classes of each test in your repo. for example see how [dbt-bigquery implements the tests](https://github.com/dbt-labs/dbt-bigquery/blob/main/tests/functional/adapter/test_grants.py). Notice the `BaseGrantsBigQuery` in which the mapping dict of standard privileges to BigQuery-specific privilege names. +For users of [state-based selection](understanding-state): This release also includes new logic declaring forwards compatibility for older manifest versions. While running dbt Coree v1.2, it should be possible to use `state:modified --state ...` selection against a manifest produced by dbt Core v1.0 or v1.1. -```python -class BaseGrantsBigQuery(BaseGrants): - def privilege_grantee_name_overrides(self): - return { - "select": "roles/bigquery.dataViewer", - "insert": "roles/bigquery.dataEditor", - "fake_privilege": "roles/invalid", - "invalid_user": "user:fake@dbtlabs.com", - } -``` - -It is also worth noting that in your test database, you need to have create three users. If your integration test database is persistent, you'll only need to add the users to the database once, if the database is set up and torn down within the CI testing, you'll need to have the users added as part of your CI testing (or even the docker image). - - -In the example test.env, the users are [prescribed as environment variables](https://github.com/dbt-labs/dbt-snowflake/blob/1247bbabad12f264b1880d429e6fd025544ffe38/test.env.example#L33-L35) as follows: - -```bash -DBT_TEST_USER_1=dbt_test_role_1 -DBT_TEST_USER_2=dbt_test_role_2 -DBT_TEST_USER_3=dbt_test_role_3 -``` - -### Materialization inheritance! - -Via a community contribution from the folks at Layer.ai, [dbt-core#5348](https://github.com/dbt-labs/dbt-core/pull/5348) enables materializations to be inherited from parent adapters in much the same was as macros are dispatched. - -this is a big deal for folks who are inheriting adapters, e.g. as dbt-synapse does with dbt-sqlserver, and for the family of adapters inherit from dbt-spark today. 
- -### New basic tests to implement in adapterland: `BaseDocsGenerate` and `BaseDocsGenReferences` - -[dbt-core#5058](https://github.com/dbt-labs/dbt-core/pull/5058) is another step along [the path of converting all our functional tests](https://github.com/dbt-labs/dbt-core/issues/4788) to the new framework in order to empower adapter maintainers and other contributors to make use of the same tests that the core team uses for their own adapters. Effectively, this test is validates an adapter's ability to correctly generate the catalog that serves as the static backend of a project docs site. -If your adapter does not add extra relation-level metadata (e.g. table size (rows + bytes), last modified timestamp) which is the case by default, then you can follow the same inherit and `pass` pattern to enable your version of `BaseDocsGenerate` and `BaseDocsGenReferences`. However, if you are supplementing the catalog with more metadata, you'll have to: -- Add a method that defines stats for this adapter [e.g. dbt-bigquerys](https://github.com/dbt-labs/dbt-bigquery/blob/main/tests/functional/adapter/expected_stats.py) -- Reimplement the `expected_catalog` fixture, [passing the above into `model_stats` and `seed_stats`](https://github.com/dbt-labs/dbt-bigquery/blob/0212fd621ede4c24929a008de718a7e45bc32cec/tests/functional/adapter/test_basic.py#L68-L81) - - -Example PRs: -- [dbt-bigquery#190](https://github.com/dbt-labs/dbt-bigquery/pull/190) -- [dbt-redshift#116](https://github.com/dbt-labs/dbt-redshift/pull/116/) - -### More python functions now available in the dbt jinja context - -python’s `set` and `zip` , and the most of the `itertools` are available in the dbt-jinja context. Yay! ([dbt-core#5107](https://github.com/dbt-labs/dbt-core/pull/5107 ) and [dbt-core#5140](https://github.com/dbt-labs/dbt-core/pull/5140)). THere's no explicit action needed here, only mentioning in case it enables some jinja simplifications. - -### Slight change to the default seed materialization - -**who:** folks who override the entire seed materialization, and anyone who overrides materializations for small reasons. this is a great example of how the global_project can be modified to reduce boiler plate within adapters. - -**what:** a new macro, [`get_csv_sql()`](https://github.com/dbt-labs/dbt-core/blob/0cacfd0f8898434bf97386742453a4f61378a732/core/dbt/include/global_project/macros/materializations/seeds/helpers.sql#L47-L55), was added to `macros/materializations/seeds/helpers.sql` - -**why** transactions are no longer the default behavior for dbt-snowflake, however, they’re still needed for bundling the seed table creation and insertion. So now we have a new default macro so that dbt-snowflake can implement a version that makes the two statements happen in the same transaction +## For maintainers of adapter plugins -**more info** check out the issue ([dbt-core#5206](https://github.com/dbt-labs/dbt-core/issues/5206)) and PR ([dbt-core#5207](https://github.com/dbt-labs/dbt-core/pull/5207)) +See GitHub discussion [dbt-labs/dbt-core#5468](https://github.com/dbt-labs/dbt-core/discussions/5468) for detailed information ## New and changed documentation -- **[Grants](/reference/resource-configs/grants)**: You should now manage access to the datasets you're producing with dbt by using grants instead of using hooks. If you already use post-hook to apply simple grants, moving to the grants feature will allow you to [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) up your duplicated or boilerplate code. 
+- **[Grants](/reference/resource-configs/grants)** are natively supported in `dbt-core` for the first time. That support extends to all standard materializations, and the most popular adapters. If you already use hooks to apply simple grants, we encourage you to use built-in `grants` to configure your models, seeds, and snapshots instead. This will enable you to [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) up your duplicated or boilerplate code. +- **[dbt-Jinja functions](reference/dbt-jinja-functions)** now include the [`itertools` Python module](dbt-jinja-functions/modules#itertools), as well as the [set](dbt-jinja-functions/set) and [zip](dbt-jinja-functions/zip) functions. +- **[Node selection](node-selection/syntax)** includes a [file selection method](node-selection/methods#the-file-method) (`-s model.sql`), and [yaml selector](node-selection/yaml-selectors) inheritance. +- **[Global configs](global-configs)** now include CLI flag and environment variable settings for [`target-path`](target-path) and [`log-path`](log-path), which can be used to override the values set in `dbt_project.yml`. +- **[Metrics](building-a-dbt-project/metrics)** now support an `expression` type (metrics-on-metrics), as well as a `metric()` function to use when referencing metrics from within models, macros, or `expression`-type metrics. For more information on how to use expression metrics, please reference the [**`dbt_metrics` package**](https://github.com/dbt-labs/dbt_metrics). https://github.com/dbt-labs/docs.getdbt.com/labels/dbt-core%20v1.2 diff --git a/website/docs/guides/migration/versions/07-upgrading-to-v1.1.md b/website/docs/guides/migration/versions/07-upgrading-to-v1.1.md index 740bba6fe4..68daeb9394 100644 --- a/website/docs/guides/migration/versions/07-upgrading-to-v1.1.md +++ b/website/docs/guides/migration/versions/07-upgrading-to-v1.1.md @@ -23,7 +23,7 @@ The manifest schema version will be updated to v5. The only change is to the def ## New and changed documentation -[**Incremental models**](configuring-incremental-models) can now accept a list of multiple columns as their `unique_key`, for models that need a combination of columns to uniquely identify each row. This is supported by the most common data warehouses, for incremental strategies that make use of the `unique_key` config (`merge` and `delete+insert`). +[**Incremental models**](configuring-incremental-models) can now accept a list of multiple columns as their `unique_key`, for models that need a combination of columns to uniquely identify each row. This is supported by the most common data warehouses, for incremental strategies that make use of the `unique_key` config (`merge` and `delete+insert`). [**Generic tests**](resource-properties/tests) can define custom names. This is useful to "prettify" the synthetic name that dbt applies automatically. It's needed to disambiguate the case when the same generic test is defined multiple times with different configurations. diff --git a/website/docs/reference/artifacts/catalog-json.md b/website/docs/reference/artifacts/catalog-json.md index 2de1e1dff3..d5788f6f25 100644 --- a/website/docs/reference/artifacts/catalog-json.md +++ b/website/docs/reference/artifacts/catalog-json.md @@ -6,7 +6,7 @@ _Current schema_: [`v1`](https://schemas.getdbt.com/dbt/catalog/v1.json) _Produced by:_ `dbt docs generate` -This file contains information from your data warehouse about the tables and views produced and defined by the resources in your project.
Today, dbt uses this file to populate metadata, such as column types and statistics, in the [docs site](documentation). +This file contains information from your <Term id="data-warehouse" /> about the tables and views produced and defined by the resources in your project. Today, dbt uses this file to populate metadata, such as column types and statistics, in the [docs site](documentation). ### Top-level keys diff --git a/website/docs/reference/artifacts/dbt-artifacts.md b/website/docs/reference/artifacts/dbt-artifacts.md index 385d7b12ab..a4ab823667 100644 --- a/website/docs/reference/artifacts/dbt-artifacts.md +++ b/website/docs/reference/artifacts/dbt-artifacts.md @@ -43,9 +43,4 @@ In the manifest, the `metadata` may also include: #### Notes: - The structure of dbt artifacts is canonized by [JSON schemas](https://json-schema.org/), which are hosted at **schemas.getdbt.com**. -- As of v0.20.0, the current schema for each artifact is: - - https://schemas.getdbt.com/dbt/manifest/v4.json - - https://schemas.getdbt.com/dbt/run-results/v4.json - - https://schemas.getdbt.com/dbt/catalog/v1.json - - https://schemas.getdbt.com/dbt/sources/v3.json -- Artifact versions may change in any minor version of dbt (`v0.x.0`). Each artifact is versioned independently. +- Artifact versions may change in any minor version of dbt (`v1.x.0`). Each artifact is versioned independently. diff --git a/website/docs/reference/artifacts/manifest-json.md b/website/docs/reference/artifacts/manifest-json.md index e581895b7e..9983426631 100644 --- a/website/docs/reference/artifacts/manifest-json.md +++ b/website/docs/reference/artifacts/manifest-json.md @@ -2,7 +2,7 @@ title: Manifest --- -_Current schema_: [`v5`](https://schemas.getdbt.com/dbt/manifest/v5/index.html) +_Current schema_: [`v6`](https://schemas.getdbt.com/dbt/manifest/v6/index.html) _Produced by:_ - `dbt compile` diff --git a/website/docs/reference/commands/seed.md b/website/docs/reference/commands/seed.md index 3a0227d8cb..e60ceced0d 100644 --- a/website/docs/reference/commands/seed.md +++ b/website/docs/reference/commands/seed.md @@ -10,7 +10,7 @@ id: "seed" -The `dbt seed` command will load `csv` files located in the `seed-paths` directory of your dbt project into your data warehouse. +The `dbt seed` command will load `csv` files located in the `seed-paths` directory of your dbt project into your <Term id="data-warehouse" />. ### Selecting seeds to run diff --git a/website/docs/reference/dbt-classes.md b/website/docs/reference/dbt-classes.md index 16c06e8b38..20e7637ba7 100644 --- a/website/docs/reference/dbt-classes.md +++ b/website/docs/reference/dbt-classes.md @@ -2,7 +2,7 @@ title: "dbt Classes" --- -dbt has a number of classes it uses to represent objects in a data warehouse, parts of a dbt project, and the results of a command. +dbt has a number of classes it uses to represent objects in a <Term id="data-warehouse" />, parts of a dbt project, and the results of a command. These classes are often useful when building advanced dbt models and macros. diff --git a/website/docs/reference/dbt-jinja-functions/dispatch.md b/website/docs/reference/dbt-jinja-functions/dispatch.md index 7690a0711b..5b927e0410 100644 --- a/website/docs/reference/dbt-jinja-functions/dispatch.md +++ b/website/docs/reference/dbt-jinja-functions/dispatch.md @@ -154,7 +154,7 @@ As a package maintainer, this functionality enables users of my package to exten I maintain an internal utility package at my organization, named `my_org_dbt_helpers`.
I use this package to reimplement built-in dbt macros on behalf of all my dbt-using colleagues, who work across a number of dbt projects. -My package can define custom versions of any dispatched global macro I choose, from `generate_schema_name` to `test_unique`. I can define a new default version of that macro (e.g. `default__generate_schema_name`), or custom versions for specific data warehouse adapters (e.g. `spark__generate_schema_name`). +My package can define custom versions of any dispatched global macro I choose, from `generate_schema_name` to `test_unique`. I can define a new default version of that macro (e.g. `default__generate_schema_name`), or custom versions for specific <Term id="data-warehouse" /> adapters (e.g. `spark__generate_schema_name`). Each root project installing my package simply needs to include the [project-level `dispatch` config](project-configs/dispatch-config) that searches my package ahead of `dbt` for the `dbt` global namespace: diff --git a/website/docs/reference/dbt-jinja-functions/env_var.md b/website/docs/reference/dbt-jinja-functions/env_var.md index d86808f3f4..640b919001 100644 --- a/website/docs/reference/dbt-jinja-functions/env_var.md +++ b/website/docs/reference/dbt-jinja-functions/env_var.md @@ -59,7 +59,7 @@ models: For certain configurations, you can use "secret" env vars. Any env var named with the prefix `DBT_ENV_SECRET_` will be: - Available for use in `profiles.yml` + `packages.yml`, via the same `env_var()` function -- Disallowed everywhere else, including `dbt_project.yml` and model SQL, to prevent accidentally writing these secret values to the data warehouse or metadata artifacts +- Disallowed everywhere else, including `dbt_project.yml` and model SQL, to prevent accidentally writing these secret values to the <Term id="data-warehouse" /> or metadata artifacts - Scrubbed from dbt logs and replaced with `*****`, any time its value appears in those logs (even if the env var was not called directly) The primary use case of secret env vars is git access tokens for [private packages](package-management#private-packages). diff --git a/website/docs/reference/dbt-jinja-functions/modules.md b/website/docs/reference/dbt-jinja-functions/modules.md index 6a188ff300..baa8da80f1 100644 --- a/website/docs/reference/dbt-jinja-functions/modules.md +++ b/website/docs/reference/dbt-jinja-functions/modules.md @@ -48,3 +48,46 @@ This variable is a pointer to the Python [re](https://docs.python.org/3/library/ ) -%} {% endif %} ``` + + + +## itertools +This variable is a pointer to the Python [itertools](https://docs.python.org/3/library/itertools.html) module, which includes useful functions for working with iterators (loops, lists, and the like). + +The supported functions are: +- `count` +- `cycle` +- `repeat` +- `accumulate` +- `chain` +- `compress` +- `islice` +- `starmap` +- `tee` +- `zip_longest` +- `product` +- `permutations` +- `combinations` +- `combinations_with_replacement` + +**Usage** + +``` +{%- set A = [1, 2] -%} +{%- set B = ['x', 'y', 'z'] -%} +{%- set AB_cartesian = modules.itertools.product(A, B) -%} + +{%- for item in AB_cartesian %} + {{ item }} +{%- endfor -%} +``` +``` + (1, 'x') + (1, 'y') + (1, 'z') + (2, 'x') + (2, 'y') + (2, 'z') +``` + + diff --git a/website/docs/reference/dbt-jinja-functions/set.md b/website/docs/reference/dbt-jinja-functions/set.md new file mode 100644 index 0000000000..93ada50df3 --- /dev/null +++ b/website/docs/reference/dbt-jinja-functions/set.md @@ -0,0 +1,50 @@ +--- +title: "set" +id: "set" +--- + +### set + +_Not to be confused with the `{% set foo = "bar" ...
%}` expression in Jinja!_ + +The `set` context method can be used to convert any iterable to a sequence of iterable elements that are unique (a set). + +__Args__: +- `value`: The iterable to convert (e.g. a list) +- `default`: A default value to return if the `value` argument is not a valid iterable + +### Usage + +``` +{% set my_list = [1, 2, 2, 3] %} +{% set my_set = set(my_list) %} +{% do log(my_set) %} {# {1, 2, 3} #} +``` + +``` +{% set my_invalid_iterable = 1234 %} +{% set my_set = set(my_invalid_iterable) %} +{% do log(my_set) %} {# None #} +``` + +### try_set + +The `try_set` context method can be used to convert any iterable to a sequence of iterable elements that are unique (a set). The difference from the `set` context method is that the `try_set` method will raise a `TypeError` if the provided value is not a valid iterable and cannot be converted to a set. + +__Args__: +- `value`: The iterable to convert (e.g. a list) + +``` +{% set my_list = [1, 2, 2, 3] %} +{% set my_set = try_set(my_list) %} +{% do log(my_set) %} {# {1, 2, 3} #} +``` + +``` +{% set my_invalid_iterable = 1234 %} +{% set my_set = try_set(my_invalid_iterable) %} +{% do log(my_set) %} + +Compilation Error in ... (...) + 'int' object is not iterable +``` diff --git a/website/docs/reference/dbt-jinja-functions/zip.md b/website/docs/reference/dbt-jinja-functions/zip.md new file mode 100644 index 0000000000..5c54208289 --- /dev/null +++ b/website/docs/reference/dbt-jinja-functions/zip.md @@ -0,0 +1,53 @@ +--- +title: "zip" +id: "zip" +--- + +### zip + +The `zip` context method can be used to return an iterator of tuples, where the i-th tuple contains the i-th element from each of the argument iterables. ([Python docs](https://docs.python.org/3/library/functions.html#zip)) + +__Args__: +- `*args`: Any number of iterables +- `default`: A default value to return if `*args` is not iterable + +### Usage + +``` +{% set my_list_a = [1, 2] %} +{% set my_list_b = ['alice', 'bob'] %} +{% set my_zip = zip(my_list_a, my_list_b) | list %} +{% do log(my_zip) %} {# [(1, 'alice'), (2, 'bob')] #} +``` + +``` +{% set my_list_a = 12 %} +{% set my_list_b = ['alice', 'bob'] %} +{% set my_zip = zip(my_list_a, my_list_b, default = []) | list %} +{% do log(my_zip) %} {# [] #} +``` + +### try_zip + +The `try_zip` context method can be used to return an iterator of tuples, just like `zip`. The difference from the `zip` context method is that the `try_zip` method will raise a `TypeError` if one of the provided values is not a valid iterable. + +__Args__: +- `*args`: Any number of iterables + +``` +{% set my_list_a = [1, 2] %} +{% set my_list_b = ['alice', 'bob'] %} +{% set my_zip = try_zip(my_list_a, my_list_b) | list %} +{% do log(my_zip) %} {# [(1, 'alice'), (2, 'bob')] #} +``` + +``` +{% set my_list_a = 12 %} +{% set my_list_b = ['alice', 'bob'] %} +{% set my_zip = try_zip(my_list_a, my_list_b) %} + +Compilation Error in ... (...) + 'int' object is not iterable +``` diff --git a/website/docs/reference/global-configs.md b/website/docs/reference/global-configs.md index 5e130093bf..af468b63d6 100644 --- a/website/docs/reference/global-configs.md +++ b/website/docs/reference/global-configs.md @@ -5,13 +5,11 @@ id: "global-configs" ## About Global Configs -Global configs enable you to fine-tune how dbt runs projects on your machine—whether your personal laptop, an orchestration tool running remotely, or (in some cases) dbt Cloud.
They differ from [project configs](reference/dbt_project.yml) and [resource configs](reference/configs-and-properties), which tell dbt _what_ to run. +Global configs enable you to fine-tune _how_ dbt runs projects on your machine—whether your personal laptop, an orchestration tool running remotely, or (in some cases) dbt Cloud. In general, they differ from most [project configs](reference/dbt_project.yml) and [resource configs](reference/configs-and-properties), which tell dbt _what_ to run. -Global configs control things like the visual output of logs, the manner in which dbt parses your project, and what to do when dbt finds a version mismatch or a failing model. +Global configs control things like the visual output of logs, the manner in which dbt parses your project, and what to do when dbt finds a version mismatch or a failing model. These configs are "global" because they are available for all dbt commands, and because they can be set for all projects running on the same machine or in the same environment. -These configs are "global" because they are available for all dbt commands, and because they apply across all projects run on the same machine. - -Starting in v1.0, you can set global configs in three places. When all three are set, command line flags take precedence, then environment variables, and last profile configs. +Starting in v1.0, you can set global configs in three places. When all three are set, command line flags take precedence, then environment variables, and last yaml configs (usually `profiles.yml`). ## Command line flags @@ -88,9 +86,9 @@ $ dbt run -## Profile (or user) configurations +## Yaml configurations -You can set profile (or user) configurations in the `config:` block of `profiles.yml`. You would use the profile config to set defaults for all projects running on your local machine. +For most global configurations, you can set "user profile" configurations in the `config:` block of `profiles.yml`. This style of configuration sets default values for all projects using this profile directory—usually, all projects running on your local machine. @@ -103,6 +101,12 @@ config: + + +The exception: Some global configurations are actually set in `dbt_project.yml`, instead of `profiles.yml`, because they control where dbt places logs and artifacts. Those file paths are always relative to the location of `dbt_project.yml`. For more details, see ["Log and target paths"](#log-and-target-paths) below. + + + ### Cache database objects for selected resource @@ -269,6 +273,27 @@ config: + + +### Log and target paths + +By default, dbt will write logs to a directory named `logs/`, and all other artifacts to a directory named `target/`. Both of those directories are located relative to `dbt_project.yml` of the active project—that is, the root directory from which dbt is run. + +Just like other global configs, it is possible to override these values for your environment or invocation by using CLI flags (`--target-path`, `--log-path`) or environment variables (`DBT_TARGET_PATH`, `DBT_LOG_PATH`). + +Unlike the other global configs documented on this page, which can be set in `profiles.yml`, the project paths are configured in `dbt_project.yml`. This is because `profiles.yml` and `dbt_project.yml` are most often located in separate file systems on your machine, and the log and artifact paths are always defined relative to the location of `dbt_project.yml`. 
+ + + +```yaml +[target-path](target-path): "other-target" +[log-path](log-path): "other-logs" +``` + + + + + ### Send anonymous usage stats We want to build the best version of dbt possible, and a crucial part of that is understanding how users work with dbt. To this end, we've added some simple event tracking to dbt (using Snowplow). We do not track credentials, model contents or model names (we consider these private, and frankly none of our business). diff --git a/website/docs/reference/node-selection/methods.md b/website/docs/reference/node-selection/methods.md index 6fdf651452..ce8574277f 100644 --- a/website/docs/reference/node-selection/methods.md +++ b/website/docs/reference/node-selection/methods.md @@ -73,6 +73,19 @@ selectors unambiguous. + + +### The "file" method +The `file` method can be used to select a model by its filename, including the file extension (`.sql`). + +```bash +# These are equivalent +dbt run --select some_model.sql +dbt run --select some_model +``` + + + ### The "package" method New in v0.18.0 diff --git a/website/docs/reference/node-selection/yaml-selectors.md b/website/docs/reference/node-selection/yaml-selectors.md index 0089aa6d13..2856b0299b 100644 --- a/website/docs/reference/node-selection/yaml-selectors.md +++ b/website/docs/reference/node-selection/yaml-selectors.md @@ -261,3 +261,31 @@ selectors: default: "{{ target.name == 'prod' | as_bool }}" definition: ... ``` + + + +### Selector inheritance + +Selectors can reuse and extend definitions from other selectors, via the `selector` method. + +```yml +selectors: + - name: foo_and_bar + definition: + intersection: + - tag: foo + - tag: bar + + - name: foo_bar_less_buzz + definition: + intersection: + # reuse the definition from above + - method: selector + value: foo_and_bar + # with a modification! + - exclude: + - method: tag + value: buzz +``` + + diff --git a/website/docs/reference/project-configs/log-path.md b/website/docs/reference/project-configs/log-path.md index 5dc1bf6980..64b5d73ce8 100644 --- a/website/docs/reference/project-configs/log-path.md +++ b/website/docs/reference/project-configs/log-path.md @@ -16,6 +16,18 @@ Optionally specify a custom directory where dbt will write logs. ## Default By default, dbt will write to the `logs` directory, i.e. `log-path: logs` + +## Configuration + +In the manner of a ["global" config](global-configs), the log path can be set in three places: +1. `--log-path` CLI flag +2. `DBT_LOG_PATH` environment variable +3. `log-path` in `dbt_project.yml` + +The precedence order is: CLI flag > env var > `dbt_project.yml` + + + ## Examples ### Write logs to a subdirectory named `dbt_logs` instead of `logs` diff --git a/website/docs/reference/project-configs/on-run-start-on-run-end.md b/website/docs/reference/project-configs/on-run-start-on-run-end.md index 7456c0fb84..5aad8280fa 100644 --- a/website/docs/reference/project-configs/on-run-start-on-run-end.md +++ b/website/docs/reference/project-configs/on-run-start-on-run-end.md @@ -29,6 +29,15 @@ A SQL statement (or list of SQL statements) to be run at the start, or end, of t * The `on-run-end` hook has additional jinja variables available in the context — check out the [docs](on-run-end-context). ## Examples + + + + + + + + + ### Grant privileges at the end of a run @@ -53,6 +62,8 @@ on-run-end: + + ### Grant privileges on all schemas that dbt uses at the end of a run This leverages the [schemas](schemas) variable that is only available in an `on-run-end` hook. 
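The log-path and target-path configuration sections above list three places these paths can be set, with CLI flags taking precedence over environment variables, and environment variables over `dbt_project.yml`. As a hedged sketch (the directory names are placeholders, and this assumes `--log-path`/`--target-path` are accepted as global flags before the subcommand, in the same position the global-configs page shows for other flags), the three mechanisms look like this:

```bash
# 1. CLI flags: highest precedence (paths shown are placeholders)
dbt --log-path custom_logs --target-path custom_target run

# 2. Environment variables: override dbt_project.yml, but lose to CLI flags
export DBT_LOG_PATH=custom_logs
export DBT_TARGET_PATH=custom_target
dbt run

# 3. dbt_project.yml: lowest precedence
#    log-path: "custom_logs"
#    target-path: "custom_target"
```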
diff --git a/website/docs/reference/project-configs/profile.md b/website/docs/reference/project-configs/profile.md index de11b52092..0a468040f0 100644 --- a/website/docs/reference/project-configs/profile.md +++ b/website/docs/reference/project-configs/profile.md @@ -10,7 +10,7 @@ profile: string ## Definition -The profile your dbt project should use to connect to your data warehouse. +The profile your dbt project should use to connect to your <Term id="data-warehouse" />. * If you are developing in dbt Cloud: This configuration is optional * If you are developing locally: This configuration is required, unless a command-line option (i.e. `--profile`) is supplied. @@ -18,7 +18,7 @@ The profile your dbt project should use to connect to your data warehouse. * [Connecting to your warehouse](dbt-cli/configure-your-profile) ## Recommendation -Often an organization has only one data warehouse, so it is sensible to use your organization's name as a profile name, in `snake_case`. For example: +Often an organization has only one <Term id="data-warehouse" />, so it is sensible to use your organization's name as a profile name, in `snake_case`. For example: * `profile: acme` * `profile: jaffle_shop` diff --git a/website/docs/reference/project-configs/target-path.md b/website/docs/reference/project-configs/target-path.md index 9bbd2c19de..82956d9688 100644 --- a/website/docs/reference/project-configs/target-path.md +++ b/website/docs/reference/project-configs/target-path.md @@ -17,6 +17,18 @@ Optionally specify a custom directory where compiled files (e.g. compiled models ## Default By default, dbt will write compiled files to the `target` directory, i.e. `target-path: target` + +## Configuration + +In the manner of a ["global" config](global-configs), the target path can be set in three places: +1. `--target-path` CLI flag +2. `DBT_TARGET_PATH` environment variable +3. `target-path` in `dbt_project.yml` + +The precedence order is: CLI flag > env var > `dbt_project.yml` + + + ## Examples ### Use a subdirectory named `compiled` for compiled files diff --git a/website/docs/reference/resource-configs/grants.md b/website/docs/reference/resource-configs/grants.md index 3a299e64dd..7291b28a98 100644 --- a/website/docs/reference/resource-configs/grants.md +++ b/website/docs/reference/resource-configs/grants.md @@ -22,7 +22,22 @@ dbt encourages you to use grants as resource configs whenever possible in Core v For more information on hooks, see [Hooks & operations](/building-a-dbt-project/hooks-operations). -You can set grants in `dbt_project.yml` and as a `config` yaml property that applies to the entire dbt project. +## Definition + +You can use the `grants` field to set permissions or grants for a resource. When you run a model, seed a seed, or snapshot a snapshot, dbt will run `grant` and/or `revoke` statements to ensure that the permissions on the database object match the `grants` you have configured on the resource. + +Like all configurations, `grants` will be included in dbt project metadata, including [the manifest artifact](dbt-artifacts/manifest-json). + +### Common syntax + +Grants have two key components: + +* **Privilege:** A right to perform a specific action or set of actions on an object in the database, such as selecting data from a table. +* **Grantees:** One or more recipients of granted privileges. Some platforms also call these "principals." For example, a grantee could be a user, a group of users, a role held by one or more users (Snowflake), or a service account (BigQuery/GCP).
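To tie the two components above together: in dbt's `grants` config, each key is a privilege and each value is the grantee (or list of grantees) receiving it. A minimal sketch in `dbt_project.yml`, reusing the `bi_user` and `reporter` grantees that appear in the examples further down this page, looks like this:

```yml
models:
  +grants:
    select: ['bi_user', 'reporter']   # privilege -> list of grantees
```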
+ +## Configuring grants + +You can configure `grants` in `dbt_project.yml` to apply grants to many resources at once—all models in your project, a package, or a subfolder—and you can also configure `grants` one-by-one for specific resources, in yaml `config:` blocks or right within their `.sql` files. -## Definition +### Grant config inheritance -You can use the `grants` field to set permissions or grants for a resource. These grants will be compiled into the `manifest.json` file complied by dbt. +When you set `grants` for the same model in multiple places, such as in `dbt_project.yml` and in a more-specific `.sql` or `.yml` file, dbt's default behavior replaces the less-specific set of grantees with the more-specific set of grantees. This "merge and clobber" behavior updates each privilege when dbt parses your project. -## Database-specific requirements and notes +For example: -While we try to standardize the terms we use to describe different features, you will always find nuances in different databases. This section outlines some of those database-specific requirements and notes. + -### Common syntax +```yml +models: + +grants: + select: ['user_a', 'user_b'] +``` + + + + + +```sql +{{ config(grants = {'select': ['user_c']}) }} +``` + + + +As a result of this configuration, `specific_model` will be configured to grant the `select` privilege to `user_c` _only_. After you run `specific_model`, that is the only granted privilege you would see in the database, and the only `grant` statement you would find in dbt's logs. + +Let's say we wanted to _add_ `user_c` to the existing list of grantees receiving the `select` privilege on `specific_model`, rather than _replacing_ that list entirely. To accomplish that, we can use the `+` ("addition") symbol, prefixing the name of the privilege: + + + +```sql +{{ config(grants = {'+select': ['user_c']}) }} +``` + + -In our examples, you will find terms like `select` and `another_user` because many databases use these terms, but be aware of the syntax your own database supports: +Now, the model will grant select to `user_a`, `user_b`, AND `user_c`! -* Privileges: A right to perform an action in a database. -* Grantees: A way to manage privileges. Recipients of granted privileges, also called "principals." Grantees can be a user, a group of users, a role held by users (Snowflake), a service account (GCP), and more. +**Notes:** +- This will only take effect for privileges which include the `+` prefix. Each privilege controls that behavior separately. If we were granting other privileges, in addition to `select`, and those privilege names lacked the `+` prefix, they would continue to "clobber" rather than "add" new grantees. +- This use of `+`, controlling clobber vs. add merge behavior, is distinct from the use of `+` in `dbt_project.yml` (shown in the example above) for defining configs with dictionary values. For more information, see [the plus prefix](https://docs.getdbt.com/reference/resource-configs/plus-prefix). +- `grants` is the first config to support a `+` prefix for controlling config merge behavior. Currently, it's the only one. If it proves useful, we may extend this capability to new and existing configs in the future. + +## General examples + +You can grant each permission to a single grantee, or a set of multiple grantees. In this example, we're granting `select` on this model to just `bi_user`, so that it can be queried in our Business Intelligence (BI) tool. 
+ + + +```sql +{{ config(materialized = 'table', grants = { + 'select': 'bi_user' +}) }} +``` + + + +When dbt runs this model for the first time, it will create the table, and then run code like: +```sql +grant select on schema_name.table_model to bi_user; +``` + +In this case, we're creating an incremental model, and granting the `select` privilege to two recipients: `bi_user` and `reporter`. + + + +```sql +{{ config(materialized = 'incremental', grants = { + 'select': ['bi_user', 'reporter'] +}) }} +``` + + + +When dbt runs this model for the first time, it will create the table, and then run code like: +```sql +grant select on schema_name.incremental_model to bi_user, reporter; +``` + +In subsequent runs, dbt will use database-specific SQL to show the grants already on `incremental_model`, and then determine if any `revoke` or `grant` statements are needed. + + +## Database-specific requirements and notes + +While we try to standardize the terms we use to describe different features, you will always find nuances in different databases. This section outlines some of those database-specific requirements and notes. + +In our examples above and below, you will find us referring to a privilege named `select`, and a grantee named `another_user`. Many databases use these or similar terms. Be aware that your database may require different syntax for privileges and grantees; you must configure `grants` in dbt with the appropriate names for both.
-- Use BigQuery-specific grantee and privilege names: - - Use `user:jeremy@dbtlabs.com` (do not use `jerco_user`) - - Use `roles/bigquery.dataViewer` (do not use `select`) +On BigQuery, "privileges" are called "roles," and they take the form `roles/service.roleName`. For instance, instead of granting `select` on a model, you would grant `roles/bigquery.dataViewer`. + +Grantees can be users, groups, service accounts, domains—and each needs to be clearly demarcated as such with a prefix. For instance, to grant access on a model to `someone@yourcompany.com`, you need to specify them as `user:someone@yourcompany.com`. + +We encourage you to read Google's documentation for more context: +- [Understanding GCP roles](https://cloud.google.com/iam/docs/understanding-roles) +- [How to format grantees](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-control-language#user_list) -## BigQuery examples +### BigQuery examples Granting permission using SQL and BigQuery: @@ -169,25 +261,3 @@ models:
- -## General examples - -When granting permissions, you can optimize for single or multiple users. - -Granting a single permission: - -```sql -{{ config(materialized = 'incremental', grants = { - 'select': 'bi' -}) }} - -``` - -Granting multiple users the same permission: - -```sql -{{ config(materialized = 'incremental', grants = { - 'select': ['bi','reporter'] -}) }} - -``` diff --git a/website/docs/reference/resource-configs/plus-prefix.md b/website/docs/reference/resource-configs/plus-prefix.md index 42501f1197..3d143e8ea1 100644 --- a/website/docs/reference/resource-configs/plus-prefix.md +++ b/website/docs/reference/resource-configs/plus-prefix.md @@ -62,3 +62,9 @@ models: Since it doesn't hurt to use the `+` prefix, we recommend you use it whenever adding configs to your `dbt_project.yml` file. + + + +**Note:** This use of the `+` prefix, in `dbt_project.yml`, is distinct from the use of `+` to control config merge behavior (clobber vs. add) in other config settings (specific resource `.yml` and `.sql` files). Currently, the only config which supports `+` for controlling config merge behavior is [`grants`](grants#grant-config-inheritance). + + diff --git a/website/docs/reference/resource-configs/pre-hook-post-hook.md b/website/docs/reference/resource-configs/pre-hook-post-hook.md index 09c5aa2771..feccfc0dac 100644 --- a/website/docs/reference/resource-configs/pre-hook-post-hook.md +++ b/website/docs/reference/resource-configs/pre-hook-post-hook.md @@ -100,9 +100,13 @@ select ... ## Definition -A SQL statement (or list of SQL statements) to be run before or after a model, seed or snapshot is built. +A SQL statement (or list of SQL statements) to be run before or after a model, seed, or snapshot is built. -Pre- and post-hooks can also call macros that return SQL statements. +Pre- and post-hooks can also call macros that return SQL statements. If your macro depends on values available only at execution time, such as using model configurations or `ref()` calls to other resources as inputs, you will need to [wrap your macro call in an extra set of curly braces](dont-nest-your-curlies#an-exception). + +### Why would I use hooks? + +dbt aims to provide all the boilerplate SQL you need (DDL, DML, and DCL) via out-of-the-box functionality, which you can configure quickly and concisely. In some cases, there may be SQL that you want or need to run, specific to functionality in your data platform, which dbt does not (yet) offer as a built-in feature. In those cases, you can write the exact SQL you need, using dbt's compilation context, and pass it into a `pre-` or `post-` hook to run before or after your model, seed, or snapshot. @@ -112,6 +116,52 @@ Pre- and post-hooks can also call macros that return SQL statements. ## Examples + + + + + +### [Redshift] Unload one model to S3 + + + +```sql +{{ config( + post_hook = "unload ('select * from {{ this }}') to 's3://bucket_name/{{ this }}'" +) }} + +select ...
+``` + + + +See: [Redshift docs on `UNLOAD`](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html) + +### [Apache Spark] Analyze tables after creation + + + +```yml + +models: + jaffle_shop: # this is the project name + marts: + finance: + +post-hook: + # this can be a list + - "analyze table {{ this }} compute statistics for all columns" + # or call a macro instead + - "{{ analyze_table() }}" +``` + +See: [Apache Spark docs on `ANALYZE TABLE`](https://spark.apache.org/docs/latest/sql-ref-syntax-aux-analyze-table.html) + + + + + + + + ### Grant privileges on a model @@ -171,6 +221,8 @@ model: + + ### Additional examples We've compiled some more in-depth examples [here](hooks-operations#additional-examples). @@ -192,7 +244,7 @@ There may be occasions where you need to run these hooks _outside_ of a transaction * You want to run a `VACUUM` in a `post-hook`, however this cannot be executed within a transaction ([Redshift docs](https://docs.aws.amazon.com/redshift/latest/dg/r_VACUUM_command.html#r_VACUUM_usage_notes)) * You want to insert a record into an audit at the start of a run, and do not want that statement rolled back if the model creation fails. -To achieve this, you can use one of the following syntaxes: +To achieve this, you can use one of the following syntaxes. (Note: You should NOT use this syntax if using a database where dbt does not use transactions by default, including Snowflake, BigQuery, and Spark/Databricks.) #### Config block: use the `before_begin` and `after_commit` helper macros diff --git a/website/docs/reference/resource-configs/snowflake-configs.md b/website/docs/reference/resource-configs/snowflake-configs.md index d013932cfb..883058d97f 100644 --- a/website/docs/reference/resource-configs/snowflake-configs.md +++ b/website/docs/reference/resource-configs/snowflake-configs.md @@ -10,7 +10,7 @@ To-do: ## Transient tables -Snowflake supports the creation of [transient tables](https://docs.snowflake.net/manuals/user-guide/tables-temp-transient.html). Snowflake does not preserve a history for these tables, which can result in a measurable reduction of your Snowflake storage costs. Note however that transient tables do not participate in Time Travel. Weigh these tradeoffs when deciding whether or not to configure your dbt models as `transient`. **By default, all Snowflake tables created by dbt are `transient`.** +Snowflake supports the creation of [transient tables](https://docs.snowflake.net/manuals/user-guide/tables-temp-transient.html). Snowflake does not preserve a history for these tables, which can result in a measurable reduction of your Snowflake storage costs. Transient tables participate in Time Travel only to a limited degree, with a default retention period of 1 day and no fail-safe period. Weigh these tradeoffs when deciding whether or not to configure your dbt models as `transient`.
**By default, all Snowflake tables created by dbt are `transient`.** ### Configuring transient tables in dbt_project.yml diff --git a/website/docs/reference/resource-configs/teradata-configs.md b/website/docs/reference/resource-configs/teradata-configs.md index d1a65928a9..033dec1d94 100644 --- a/website/docs/reference/resource-configs/teradata-configs.md +++ b/website/docs/reference/resource-configs/teradata-configs.md @@ -208,7 +208,7 @@ As explained in [dbt seeds documentation](https://docs.getdbt.com/docs/building- Since seeds are version controlled, they are best suited to files that contain business-specific logic, for example a list of country codes or user IDs of employees. -Loading CSVs using dbt's seed functionality is not performant for large files. Consider using a different tool to load these CSVs into your data warehouse. +Loading CSVs using dbt's seed functionality is not performant for large files. Consider using a different tool to load these CSVs into your <Term id="data-warehouse" />. ::: diff --git a/website/docs/reference/warehouse-profiles/oracle-profile.md b/website/docs/reference/warehouse-profiles/oracle-profile.md index 9dc21b7ebb..0c9587683b 100644 --- a/website/docs/reference/warehouse-profiles/oracle-profile.md +++ b/website/docs/reference/warehouse-profiles/oracle-profile.md @@ -6,7 +6,7 @@ title: "Oracle Profile" **Maintained by:** Oracle **Source:** [Github](https://github.com/oracle/dbt-oracle) -**Core version:** v1.0.7 +**Core version:** v1.1.1 **dbt Cloud:** Not Supported **dbt Slack channel** [#db-oracle](https://getdbt.slack.com/archives/C01PWH4TXLY) @@ -18,9 +18,54 @@ dbt-oracle can be installed via the Python Package Index (PyPI) using pip pip install dbt-oracle +### Configure the Python driver mode + +:::info +[python-oracledb](https://oracle.github.io/python-oracledb/) is the renamed, major release of Oracle's popular cx_Oracle interface. +::: + +[python-oracledb](https://oracle.github.io/python-oracledb/) makes it optional to install the Oracle Client libraries. +This driver supports two modes: + +1. **Thin mode (preferred)**: The Python process connects directly to the Oracle database. This mode does not need the Oracle Client libraries. +2. **Thick mode**: The Python process links with the Oracle Client libraries. Some advanced Oracle database functionalities (e.g. Advanced Queuing and scrollable cursors) are currently available only via the Oracle Client libraries. + +It is highly recommended to use the **thin** mode as it vastly simplifies installation. You can configure the driver mode using the environment variable `ORA_PYTHON_DRIVER_TYPE`. + +| Driver Mode | Oracle Client libraries required? | Configuration | +|------------------------|-----------------------------------| ------------- | +| Thin | No | `ORA_PYTHON_DRIVER_TYPE=thin`| +| Thick | Yes | `ORA_PYTHON_DRIVER_TYPE=thick` | +| cx_oracle (old driver) | Yes | `ORA_PYTHON_DRIVER_TYPE=cx` | + +The default value of `ORA_PYTHON_DRIVER_TYPE` is `cx`. This might change in the future as more users migrate towards the new python driver. + + + + + + ```bash + export ORA_PYTHON_DRIVER_TYPE=thin + ``` + + + + + + ```bash + export ORA_PYTHON_DRIVER_TYPE=thick + # or + export ORA_PYTHON_DRIVER_TYPE=cx # default + ``` + ### Install Oracle Instant Client libraries -To use dbt-oracle, you will need the [Oracle Instant Client libraries](https://www.oracle.com/database/technologies/instant-client.html) installed. These provide the necessary network connectivity allowing dbt-oracle to access an Oracle Database instance.
+In thick mode or the old cx_oracle mode, you will need the [Oracle Instant Client libraries](https://www.oracle.com/database/technologies/instant-client.html) installed. These provide the necessary network connectivity allowing dbt-oracle to access an Oracle Database instance. Oracle client libraries versions 21, 19, 18, 12, and 11.2 are supported where available on Linux, Windows and macOS (Intel x86). It is recommended to use the latest client possible: Oracle’s standard client-server version interoperability allows connection to both older and newer databases. @@ -134,8 +179,103 @@ Note that Oracle Client versions 21c and 19c are not supported on Windows 7. + + + + +## Configure wallet for Oracle Autonomous Database in Cloud + +dbt can connect to Oracle Autonomous Database (ADB) in Oracle Cloud using either TLS (Transport Layer Security) or mutual TLS (mTLS). TLS and mTLS provide enhanced security for authentication and encryption. +A database username and password is still required for dbt connections which can be configured as explained in the next section [Connecting to Oracle Database](#connecting-to-oracle-database). + + + + + +With TLS, dbt can connect to Oracle ADB without using a wallet. Both Thin and Thick modes of the python-oracledb driver support TLS. + +:::info +In Thick mode, dbt can connect through TLS only when using Oracle Client library versions 19.14 (or later) or 21.5 (or later). +::: + +Refer to Oracle documentation to [connect to an ADB instance using TLS authentication](https://docs.oracle.com/en/cloud/paas/autonomous-database/adbsa/connecting-nodejs-tls.html#GUID-B3809B88-D2FB-4E08-8F9B-65A550F93A07) and the blog post [Easy wallet-less connections to Oracle Autonomous Databases in Python](https://blogs.oracle.com/opal/post/easy-way-to-connect-python-applications-to-oracle-autonomous-databases) to enable TLS for your Oracle ADB instance. + + + + +For mutual TLS connections, a wallet needs be downloaded from the OCI console and the python driver needs to be configured to use it. + +#### Install the Wallet and Network Configuration Files + +From the Oracle Cloud console for the database, download the wallet zip file using the `DB Connection` button. The zip contains the wallet and network configuration files. + +:::warning Note +Keep wallet files in a secure location and share them only with authorized users. +::: + +Unzip the wallet zip file. + + + + +In Thin mode, only two files from the zip are needed: + +- `tnsnames.ora` - Maps net service names used for application connection strings to your database services + +- `ewallet.pem` - Enables SSL/TLS connections in Thin mode. Keep this file secure + +After unzipping the files in a secure directory, set the **TNS_ADMIN** and **WALLET_LOCATION** environment variables to the directory name. + +```bash +export WALLET_LOCATION=/path/to/directory_containing_ewallet.pem +export WALLET_PASSWORD=*** +export TNS_ADMIN=/path/to/directory_containing_tnsnames.ora +``` +Optionally, if `ewallet.pem` file is encrypted using a wallet password, specify the password using environment variable **WALLET_PASSWORD** + + + + +In Thick mode, the following files from the zip are needed: + +- `tnsnames.ora` - Maps net service names used for application connection strings to your database services +- `sqlnet.ora` - Configures Oracle Network settings +- `cwallet.sso` - Enables SSL/TLS connections + +After unzipping the files in a secure directory, set the **TNS_ADMIN** environment variable to that directory name. 
+ +```bash +export TNS_ADMIN=/path/to/directory_containing_tnsnames.ora +``` + +Next, edit the `sqlnet.ora` file to point to the wallet directory. + + + +```text +WALLET_LOCATION = (SOURCE = (METHOD = file) (METHOD_DATA = (DIRECTORY="/path/to/wallet/directory"))) +SSL_SERVER_DN_MATCH=yes +``` + + + + + + + + ## Connecting to Oracle Database Define the following mandatory parameters as environment variables and refer them in the connection profile using [env_var](https://docs.getdbt.com/reference/dbt-jinja-functions/env_var) jinja function. Optionally, you can also define these directly in the `profiles.yml` file, but this is not recommended @@ -149,7 +289,7 @@ export DBT_ORACLE_SCHEMA= Starting with `dbt-oracle==1.0.2`, it is **optional** to set the database name ```bash -export DBT_ORACLE_DATABASE=ga01d78d2ecd5f1_db202112221108 +export DBT_ORACLE_DATABASE=example_db2022adb ``` If database name is not set, adapter will retrieve it using the following query. @@ -161,47 +301,13 @@ SELECT SYS_CONTEXT('userenv', 'DB_NAME') FROM DUAL An Oracle connection profile for dbt can be set using any one of the following methods - - -To connect using the database hostname or IP address, you need to specify the following -- host -- port (1521 or 1522) -- protocol (tcp or tcps) -- service - -```bash -export DBT_ORACLE_HOST=adb.us-ashburn-1.oraclecloud.com -export DBT_ORACLE_SERVICE=ga01d78d2ecd5f1_db202112221108_high.adb.oraclecloud.com -``` - - - -```yaml -dbt_test: - target: "{{ env_var('DBT_TARGET', 'dev') }}" - outputs: - dev: - type: oracle - user: "{{ env_var('DBT_ORACLE_USER') }}" - pass: "{{ env_var('DBT_ORACLE_PASSWORD') }}" - protocol: "tcps" - host: "{{ env_var('DBT_ORACLE_HOST') }}" - port: 1522 - service: "{{ env_var('DBT_ORACLE_SERVICE') }}" - database: "{{ env_var('DBT_ORACLE_DATABASE') }}" - schema: "{{ env_var('DBT_ORACLE_SCHEMA') }}" - threads: 4 -``` - - - The `tnsnames.ora` file is a configuration file that contains network service names mapped to connect descriptors. @@ -210,18 +316,21 @@ The directory location of `tnsnames.ora` file can be specified using `TNS_ADMIN` ```text -net_service_name= - (DESCRIPTION= - (ADDRESS=(PROTOCOL=TCP)(HOST=dbhost.example.com)(PORT=1521)) - (CONNECT_DATA=(SERVICE_NAME=orclpdb1))) +db2022adb_high = (description = ( + address=(protocol=tcps) + (port=1522) + (host=adb.example.oraclecloud.com)) + (connect_data=(service_name=example_high.adb.oraclecloud.com)) + (security=(ssl_server_cert_dn="CN=adb.example.oraclecloud.com, + OU=Oracle BMCS US,O=Oracle Corporation,L=Redwood City,ST=California,C=US"))) ``` -The `net_service_name` can be defined as environment variable and referred in `profiles.yml` +The TNS alias `db2022adb_high` can be defined as environment variable and referred in `profiles.yml` ```bash -export DBT_ORACLE_TNS_NAME=net_service_name +export DBT_ORACLE_TNS_NAME=db2022adb_high ``` @@ -251,7 +360,10 @@ The connection string identifies which database service to connect to. 
It can be - A Net Service Name mapping to a connect descriptor ```bash -export DBT_ORACLE_CONNECT_STRING="(DESCRIPTION=(ADDRESS=(PROTOCOL=TCP)(HOST=dbhost.example.com)(PORT=1521))(CONNECT_DATA=(SERVICE_NAME=orclpdb1)))" +export DBT_ORACLE_CONNECT_STRING="(description=(address=(protocol=tcps)(port=1522) + (host=adb.example.oraclecloud.com))(connect_data=(service_name=example_high.adb.oraclecloud.com)) + (security=(ssl_server_cert_dn=\"CN=adb.example.oraclecloud.com, + OU=Oracle BMCS US,O=Oracle Corporation,L=Redwood City,ST=California,C=US\")))" ``` @@ -272,38 +384,42 @@ dbt_test: - - -## Connecting to Oracle Autonomous Database in Cloud - -To enable connection to Oracle Autonomous Database in Oracle Cloud, a wallet needs be downloaded from the cloud, and cx_Oracle needs to be configured to use it. The wallet gives mutual TLS which provides enhanced security for authentication and encryption. A database username and password is still required for your application connections. - -### Install the Wallet and Network Configuration Files - -From the Oracle Cloud console for the database, download the wallet zip file. It contains the wallet and network configuration files. Note: keep wallet files in a secure location and share them only with authorized users. - -Unzip the wallet zip file. For cx_Oracle, only these files from the zip are needed: + -- `tnsnames.ora` - Maps net service names used for application connection strings to your database services -- `sqlnet.ora` - Configures Oracle Network settings -- `cwallet.sso` - Enables SSL/TLS connections +To connect using the database hostname or IP address, you need to specify the following +- host +- port (1521 or 1522) +- protocol (tcp or tcps) +- service -After downloading the wallet, put the unzipped wallet files in a secure directory and set the TNS_ADMIN environment variable to that directory name. Next, edit the sqlnet.ora file to point to the wallet directory. +```bash +export DBT_ORACLE_HOST=adb.example.oraclecloud.com +export DBT_ORACLE_SERVICE=example_high.adb.oraclecloud.com +``` - + -```text -WALLET_LOCATION = (SOURCE = (METHOD = file) (METHOD_DATA = (DIRECTORY="/path/to/wallet/directory"))) -SSL_SERVER_DN_MATCH=yes +```yaml +dbt_test: + target: "{{ env_var('DBT_TARGET', 'dev') }}" + outputs: + dev: + type: oracle + user: "{{ env_var('DBT_ORACLE_USER') }}" + pass: "{{ env_var('DBT_ORACLE_PASSWORD') }}" + protocol: "tcps" + host: "{{ env_var('DBT_ORACLE_HOST') }}" + port: 1522 + service: "{{ env_var('DBT_ORACLE_SERVICE') }}" + database: "{{ env_var('DBT_ORACLE_DATABASE') }}" + schema: "{{ env_var('DBT_ORACLE_SCHEMA') }}" + threads: 4 ``` - + -:::info TLS v/s mTLS - -If you have enabled TLS connections on your Database instance then dbt can connect using only database username, password and the Oracle Net connect name given in the unzipped tnsnames.ora file. -::: + ## Supported Features diff --git a/website/docs/terms/cte.md b/website/docs/terms/cte.md index 9b2518f4b1..dacf2ce3d3 100644 --- a/website/docs/terms/cte.md +++ b/website/docs/terms/cte.md @@ -158,7 +158,7 @@ A is a nested query that can oftentimes be used in place ## Data warehouse support for CTEs -CTEs are likely to be supported across most, if not all, [modern data warehouses](https://blog.getdbt.com/future-of-the-modern-data-stack/). Please use this table to see more information about using CTEs in your specific data warehouse. +CTEs are likely to be supported across most, if not all, [modern data warehouses](https://blog.getdbt.com/future-of-the-modern-data-stack/). 
Please use this table to see more information about using CTEs in your specific <Term id="data-warehouse" />. | Data Warehouse | Support CTEs? | |---|---| diff --git a/website/docs/terms/data-warehouse.md b/website/docs/terms/data-warehouse.md new file mode 100644 index 0000000000..f26b6ac7d9 --- /dev/null +++ b/website/docs/terms/data-warehouse.md @@ -0,0 +1,84 @@ +--- +id: data-warehouse +title: Data warehouse +displayText: data warehouse +hoverSnippet: A data warehouse is a data management system used for data storage and computing that allows for analytics activities such as transforming and sharing data. +--- + +A data warehouse is a data management system used for data storage and computing that allows for analytics activities such as transforming and sharing data. It helps businesses to capture and store data from external sources. Analytics engineers and data analysts use it to query datasets using SQL, helping to transform them into powerful data models and reports. Data warehouses are the central source of truth for any modern data stack. Data is ingested, transformed, and shared to other tools from the warehouse. + +There are two main types of data warehouses — on-prem warehouses and cloud warehouses. An on-prem data warehouse is a physical location where companies need to maintain hardware and software in order to store data. A cloud data warehouse is available anywhere and doesn’t include a physical location that you need to access. In this arrangement, you pay to use the storage space and compute power that is provided and maintained by another company. + +## History of data warehouses + +While data has been stored throughout history, it wasn’t until the 1980s that technology began to accelerate and the first official data warehouse was created. It was an on-prem warehouse consisting of a lot of computer processing and storage towers, taking up a lot of space. As you can imagine, this caused a lot of problems. It not only took up a lot of physical space, but employees had to maintain the hardware and software of these warehouses. This quickly became expensive and unrealistic for smaller companies without the budget or space. + +When Amazon began scaling their on-prem data warehouses to support their business, they noticed an opportunity to sell compute capacity to other businesses in order to save costs. This is when Redshift, Amazon’s cloud data warehouse product, came to be. Shortly after, other tech giants like Google and Microsoft who were also building data infrastructure followed suit. + +Now, you can be anywhere and access the power of an online warehouse. You no longer need to maintain the infrastructure yourself but can pay a company to do this for you. This is cheaper and allows for faster data capabilities. + + +## Why businesses need data warehouses + +Data warehouses were once unrealistic due to the costs associated with them. Now that cloud warehouses make them available to nearly everyone, they have a plethora of benefits to offer businesses. Cloud warehouses allow for scalability, availability, cost savings, and increased security, all of which are handled by the provider themselves. + +### Scalability + +Data warehouses allow you to scale computing up or down depending on how fast you need your transformations to run and how much you are willing to spend. You can turn computing resources on or off as well in order to save on costs. + +### Availability + +Data warehouses are always available.
While latency may vary based on source and destination locations, your data can be accessed anywhere, at any time. This is ideal for the remote culture that we are currently living in, where anyone can work from anywhere. + +### Cost savings + +Because you no longer need to maintain all of the infrastructure, you can save on costs related to maintenance. Because the data warehouse companies manage so much data, they are able to unlock cost-savings that you wouldn’t be able to otherwise. + +### Security + +Data warehouses offer advanced security features that ensure your data is always secure. They often directly handle certain compliance strategies needed with healthcare and financial data, eliminating the need for you to do this yourself. They also have features such as roles and users which help you control who has access to your data. But we will get into this more later. + +## Potential business use cases + +Businesses can leverage data warehouses for many different reasons. Most of these reasons end up savings time and money for the business, whether directly or indirectly. + +### Consolidating all of your data in one place + +First, a data warehouse acts as a single source of truth for all of your data. Rather than having all of your data spread across different platforms, it is available to you in one place. This allows you to standardize all of your core metrics and data definitions, rather than depending on the metrics calculated by platforms like Google and Facebook. If you find that different metrics don’t align across platforms, a data warehouse acts as a dependable source for the right metric. Rather than relying on external platforms, you now have one that centralizes all of your data. + +Not to mention, you will save your analytics engineer and data analyst a few headaches. They would otherwise have to manually pull needed data from various sources. Not having a single source of truth decreases your data quality, wastes your data team’s precious time, and makes it difficult to combine data from different sources. + +### Ability to control who has access and the type of access they have + +Data warehouses have extensive security features which allow you to control who has access to what. You have the ability to give someone as little or extensive permissions as you wish. Warehouses give you the ability to create users and assign them roles. Each role has its own set of permissions to which databases and tables it can see. Then, you can also choose who is allowed to query those tables or even update and delete them. + +When anyone in your organization can easily access your data, bad things can happen. You risk the potential of important data being deleted, incorrectly edited, or inappropriately accessed. Data warehouses users, roles, policies, and security measures can help ensure data is in the right hands of the right people. + +### Fast reporting + +Because all of your data is located in the same place, it allows for faster reporting compared to pulling data from many different sources. A central location allows for you to quickly access and query millions of rows of data, allowing transformations and reporting to be done much faster. + +## Data platforms that support data warehousing workloads + +| **Data platform** | **Description** | +|---|---| +| Snowflake | Snowflake is a fully-managed platform for data warehousing, data lakes, data engineering, data science, and data application development. 
| +| Databricks | Databricks is a cloud-based collaborative data science, data engineering, and data analytics platform that combines the best of data warehouses and data lakes into a lakehouse architecture. | +| Google BigQuery | Google BigQuery is a serverless, highly scalable data warehouse that comes with a built-in query engine. | +| Amazon Redshift | Amazon Redshift is a fully-managed petabyte-scale cloud-based data warehouse designed for large scale data set storage and analysis. | +| Postgres | PostgreSQL is an advanced, enterprise class open source relational database that supports both SQL (relational) and JSON (non-relational) querying. | + +## Data warehouse vs data lake + +A data lake is a system where you store, process, and query unstructured, semi-structured, and structured data at almost any scale. The main difference between a data warehouse and a data lake is the type and way data is stored. Data warehouses contain structured data that is meant to organize data for analytics use. Data lakes can contain pretty much any kind of data—structured or unstructured—and data is usually left in its raw format until it's ready to use. Compare that to data warehouses, whose primary goal is to be a place for data teams to store both raw and transformed, usable data. + +## Conclusion + +Data warehouses have come a long way [in the last 40 years](https://www.getdbt.com/blog/future-of-the-modern-data-stack/). They have gone from a physical location with huge costs associated with them to a system available to anyone, anywhere, at an affordable cost. They have the power to centralize all of your business’s data, allowing for faster analytics operations, standardized KPIs, and a single source of truth. All businesses need a data warehouse in order to operate quickly and efficiently with data that they can rely on. The question isn’t whether or not you need a data warehouse, but which data warehouse you should choose. Make a list of the key features needed for your business and use that to assess the options above. + +## Additional reading + +- [Operational analytics](https://www.getdbt.com/analytics-engineering/use-cases/operational-analytics/) +- [Glossary: ETL](https://docs.getdbt.com/terms/etl/) +- [Glossary: ELT](https://docs.getdbt.com/terms/elt/) + diff --git a/website/docs/terms/ddl.md b/website/docs/terms/ddl.md index b31ea97235..3a85954d6e 100644 --- a/website/docs/terms/ddl.md +++ b/website/docs/terms/ddl.md @@ -49,7 +49,7 @@ In this example, you have to rename the `last_name` column [in jaffle_shop’s]( ### DROP -The `DROP` command. Probably the most high-stakes DDL statement one can execute. One that should be used with the *utmost* of care. At its core, an executed `DROP` statement will remove that object from the data warehouse. You can drop tables, views, schemas, databases, users, functions, and more. +The `DROP` command. Probably the most high-stakes DDL statement one can execute. One that should be used with the *utmost* of care. At its core, an executed `DROP` statement will remove that object from the <Term id="data-warehouse" />. You can drop tables, views, schemas, databases, users, functions, and more. Some data warehouses such as Snowflake allow you to add restrictions to `DROP` statements to caution you about the impact of dropping a table, view, or schema before it’s actually dropped. In practice, we recommend you never drop raw source tables as they are often your baseline of truth. Your database user also usually needs the correct permissions to drop database objects.
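To make the stakes concrete, here is a minimal, hedged sketch of a cautious `DROP`. It assumes Snowflake-style `IF EXISTS` and `CASCADE | RESTRICT` modifiers, and the table name is purely hypothetical; check your warehouse's documentation for the exact options it supports.

```sql
-- Hypothetical example: drop a scratch table, but only if nothing depends on it.
-- IF EXISTS avoids an error when the table is already gone.
-- RESTRICT (Snowflake) aborts the drop if foreign key references still exist,
-- while CASCADE would drop the table regardless of those references.
DROP TABLE IF EXISTS analytics.dbt_scratch.tmp_orders RESTRICT;
```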
diff --git a/website/docs/terms/dimensional-modeling.md b/website/docs/terms/dimensional-modeling.md new file mode 100644 index 0000000000..b7f0434309 --- /dev/null +++ b/website/docs/terms/dimensional-modeling.md @@ -0,0 +1,154 @@ +--- +id: dimensional-modeling +title: Dimensional modeling +displayText: dimensional modeling +hoverSnippet: Dimensional modeling is a data modeling technique where you break data up into facts and dimensions to organize and describe entities within your data warehouse. +--- + +Dimensional modeling is a data modeling technique where you break data up into “facts” and “dimensions” to organize and describe entities within your data warehouse. The result is a staging layer in the data warehouse that cleans and organizes the data into the business end of the warehouse that is more accessible to data consumers. + +By breaking your data down into clearly defined and organized entities, your consumers can make sense of what that data is, what it’s used for, and how to join it with new or additional data. Ultimately, using dimensional modeling for your data can help create the appropriate layer of models to expose in an end business intelligence (BI) tool. + +There are a few different methodologies for dimensional modeling that have evolved over the years. The big hitters are the Kimball methodology and the Inmon methodology. Ralph Kimball’s work formed much of the foundation for how data teams approached data management and data modeling. Here, we’ll focus on dimensional modeling from Kimball’s perspective—why it exists, where it drives value for teams, and how it’s evolved in recent years. + +## What are we trying to do here? + +Let’s take a step back for a second and ask ourselves: why should you read this glossary page? What are you trying to accomplish with dimensional modeling and data modeling in general? Why have you taken up this rewarding, but challenging career? Why are *you* here? + +This may come as a surprise to you, but we’re not trying to build a top-notch foundation for analytics—we’re actually trying to build a bakery. + +Not the answer you expected? Well, let’s open up our minds a bit and explore this analogy. + +If you run a bakery (and we’d be interested in seeing the data person + baker Venn diagram), you may not realize you’re doing a form of dimensional modeling. What’s the final output from a bakery? It’s that glittering, glass display of delicious-looking cupcakes, cakes, cookies, and everything in between. But a cupcake didn’t just magically appear in the display case! Raw ingredients went through a rigorous process of preparation, mixing, melting, and baking before they got there. + +Just as eating raw flour isn’t that appetizing, neither is deriving insights from raw data since it rarely has a nice structure that makes it poised for analytics. There’s some considerable work that’s needed to organize data and make it usable for business users. + +This is where dimensional modeling comes into play; it’s a method that can help data folks create meaningful entities (cupcakes and cookies) to live inside their [data mart](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) (your glass display) and eventually use for business intelligence purposes (eating said cookies). + +So I guess we take it back—you’re not just trying to build a bakery, you’re also trying to build a top-notch foundation for meaningful analytics. Dimensional modeling can be a method to get you part of the way there. + +## Facts vs.
dimensions + +The ultimate goal of dimensional modeling is to be able to categorize your data into fact or dimension models, making them the key components to understand. So what are these components? + +### Facts + +A fact is a collection of information that typically refers to an action, event, or result of a business process. As such, people typically liken facts to verbs. In terms of a real business, some facts may look like account creations, payments, or emails sent. + +It’s important to note that fact tables act as a historical record of those actions. You should almost never overwrite that data when it needs updating. Instead, you add new data as additional rows onto that table. + +For many businesses, marketing and finance teams need to understand all the touchpoints leading up to a sale or conversion. A fact table for a scenario like this might look like a `fct_account_touchpoints` table: + +| **unique_id** | **touchpoint_id** | **account_id** | **touchpoint_name** | **touchpoint_created_at_utc** | +|---|---|---|---|---| +| 23534 | 34 | 325611 | fall_convention_2020 | 2022-01-30 00:11:26 | +| 12312 | 29 | 325611 | demo_1 | 2022-05-29 01:42:07 | +| 66782 | 67 | 325611 | demo_2 | 2022-06-25 04:10:32 | +| 85311 | 15 | 105697 | fall_convention_2020 | 2022-05-29 06:13:45 | + +Accounts may have many touchpoints and this table acts as a true log of events leading up to an account conversion. + +This table is great and all for helping you understand what might have led to a conversion or account creation, but what if business users need additional context on these accounts or touchpoints? That’s where dimensions come into play. + +### Dimensions +A dimension is a collection of data that describes who or what took action or was affected by the action. Dimensions are typically likened to nouns. They add context to the stored events in fact tables. In terms of a business, some dimensions may look like users, accounts, customers, and invoices. + +A noun can take multiple actions or be affected by multiple actions. It’s important to call out: a noun doesn’t become a new thing whenever it does something. As such, when updating dimension tables, you should overwrite that data instead of duplicating it, like you would in a fact table. + +Following the example from above, a dimension table for this business would look like a `dim_accounts` table with some descriptors: + +| account_id | account_created_at_utc | account_name | account_status | billing_address | +|---|---|---|---|---| +| 325611 | 2022-06-29 12:11:43 | Not a Pyramid Scheme | active | 9999 Snake Oil Rd, Los Angeles, CA | +| 234332 | 2019-01-03 07:34:50 | Charlie’s Angels’ Chocolate Factory | inactive | 123 Wonka Way, Indianapolis, IN | +| 105697 | 2020-12-11 11:50:22 | Baggins Thievery | active | The Shire | + +In this table, each account only has one row. If an account’s name or status were to be updated, new values would overwrite existing records versus appending new rows. + +:::tip Snapshots +For dimension tables you want to keep track of changes to, folks can leverage [dbt snapshots](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots). +::: + +### Facts and dimensions at play with each other +Cool, you think you’ve got some facts and dimensions that can be used to qualify your business. There’s one big consideration left to think about: how do these facts and dimensions interact with each other?
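As a simplified illustration before the diagram below, here is a minimal SQL sketch that joins the two example tables above on `account_id`: the fact table supplies the events and the dimension table supplies the descriptive context. The table and column names come from the examples above; in a real project they would live in whatever schema your warehouse uses.

```sql
-- Count touchpoints per account, enriched with descriptive columns from the dimension.
select
    dim_accounts.account_name,
    dim_accounts.account_status,
    count(fct_account_touchpoints.touchpoint_id) as number_of_touchpoints
from fct_account_touchpoints
inner join dim_accounts
    on fct_account_touchpoints.account_id = dim_accounts.account_id
group by 1, 2
```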
+ +![Image depicting how facts and dimensions join together to create analytics-ready datasets](/img/docs/terms/dimensional-modeling/fact-star.png) + +In the pre-cloud data warehouse era, there were two dominant design options, star schemas and snowflake schemas, that were used to concretely separate out the lines between fact and dimension tables. + +- In a star schema, there’s one central fact table that can join to relevant dimension tables. +- A snowflake schema is simply an extension of a star schema; dimension tables link to other dimension tables, forming a snowflake-esque shape. + +It sounds really nice to have this clean setup with star or snowflake schemas. Almost as if it’s too good to be true (and it very well could be). + +The development of cheap cloud storage, BI tools great at handling joins, the evolution of SQL capabilities, and data analysts with growing skill sets have changed the way data folks used to look at dimensional modeling and star schemas. Wide tables consisting of fact and dimension tables joined together are now a competitive option for data teams. + +Below, we’ll dig more into the design process of dimensional modeling, wide tables, and the beautiful ambiguity of it all. + +## The dimensional modeling design process + +According to the Kimball Group, the official(™) four-step design process is (1) selecting a business process to analyze, (2) declaring the grain, (3) identifying the dimensions, and (4) identifying the facts. That makes dimensional modeling sound really easy, but in reality, it’s packed full of nuance. + +Coming back down to planet Earth, your design process is how you make decisions about: + +- Whether something should be a fact or a dimension +- Whether you should keep fact and dimension tables separate or create wide, joined tables + +This is something that data philosophers and thinkers could debate long after we’re all gone, but let’s explore some of the major questions to hold you over in the meantime. + +### Should this entity be a fact or dimension? + +Time to put on your consultant hat because that dreaded answer is coming: it depends. This is what makes dimensional modeling a challenge! + +Kimball would say that a fact must be numeric. The inconvenient truth is: an entity can be viewed as a fact or a dimension depending on the analysis you are trying to run. + +:::note Birds of a feather +If you ran a clinic, you would probably have a log of appointments by patient. At first, you could think of appointments as facts—they are, after all, events that happen and patients can have multiple appointments—and patients as dimensions. But what if your business team really cared about the appointment data itself—how well it went, when it happened, the duration of the visit? You could, in this scenario, make the case for treating this appointments table as a dimension table. If you cared more about looking at your data at a patient-level, it probably makes sense to keep appointments as facts and patients as dimensions. All this is to say that there’s inherent complexity in dimensional modeling, and it’s up to you to draw those lines and build those models. +::: + +So then, how do you know which is which if there aren’t any hard rules!? Life is a gray area, my friend. Get used to it. + +A general rule of thumb: go with your gut! If something feels like it should be a fact to meet your stakeholders' needs, then it’s a fact. If it feels like a dimension, it’s a dimension. The world is your oyster.
If you find that you made the wrong decision down the road, (it’s usually) no big deal. You can remodel that data. Just remember: you’re not a surgeon. No one will die if you mess up (hopefully). So, just go with what feels right because you’re the expert on your data 👉😎👉 + +Also, this is why we have data teams. Dimensional modeling and data modeling is usually a collaborative effort; working with folks on your team to understand the data and stakeholder wants will ultimately lead to some rad data marts. + +### Should I make a wide table or keep them separate? + +Yet again, it depends. Don’t roll your eyes. Strap in for a quick history lesson because the answer to this harkens back to the very inception of dimensional modeling. + +Back in the day before cloud technology adoption was accessible and prolific, storing data was expensive and joining data was relatively cheap. Dimensional modeling came about as a solution to these issues. Separating collections of data into smaller, individual tables (star schema-esque) made the data cheaper to store and easier to understand. So, individual tables were the thing to do back then. + +Things are different today. Cloud storage costs have gotten really inexpensive. Instead, computing is the primary cost driver. Now, keeping all of your tables separate can be expensive because every time you join those tables, you’re spending usage credits. + +Should you just add everything to one, wide table? No. One table will never rule them all. Knowing whether something should be its own fact table or get added on to an existing table generally comes down to understanding who will be your primary end consumers. + +For end business users who are writing their own SQL, feel comfortable performing joins, or use a tool that joins tables for them, keeping your data as separate fact and dimension tables is pretty on-par. In this setup, these users have the freedom and flexibility to join and explore as they please. + +If your end data consumers are less comfortable with SQL and your BI tool doesn’t handle joins well, you should consider joining several fact and dimension tables into wide tables. Another consideration: these wide, heavily joined tables can tend to wind up pretty specialized and specific to business departments. Would these types of wide tables be helpful for you, your data team, and your business users? Well, that’s for you to unpack. + +## Advantages and disadvantages of dimensional modeling + +The benefits and drawbacks of dimensional modeling are pretty straightforward. Generally, the main advantages can be boiled down to: + +* **More accessibility**: Since the output of good dimensional modeling is a [data mart](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts), the tables created are easier to understand and more accessible to end consumers. +* **More flexibility**: Easy to slice, dice, filter, and view your data in whatever way suits your purpose. +* **Performance**: Fact and dimension models are typically materialized as tables or [incremental models](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models). Since these often form the core understanding of a business, they are queried often. Materializing them as tables allows them to be more performant in downstream BI platforms. + +The disadvantages include: +* **Navigating ambiguity**: You need to rely on your understanding of your data and stakeholder wants to model your data in a comprehensible and useful way. 
What you know about your data and what people really need out of the data are two of the most fundamental and difficult things to understand and balance as a data person. +* **Utility limited by your BI tool**: Some BI tools don’t handle joins well, which can make queries from separated fact and dimension tables painful. Other tools have long query times, which can make querying from ultra-wide tables not fun. + +## Conclusion + +Dimensional data modeling is a data modeling technique that allows you to organize your data into distinct entities that can be mixed and matched in many ways. That can give your stakeholders a lot of flexibility. [While the exact methodologies have changed](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/)—and will continue to, the philosophical principle of having tables that are sources of truth and tables that describe them will continue to be important in the work of analytics engineering practitioners. + + +## Additional Reading + +Dimensional modeling is a tough, complex, and opinionated topic in the data world. Below you’ll find some additional resources that may help you identify the data modeling approach that works best for you, your data team, and your end business users: + + + +* [Modular data modeling techniques](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) +* [Stakeholder-friendly model naming conventions](https://docs.getdbt.com/blog/stakeholder-friendly-model-names/) +* [How we structure our dbt projects guide](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) diff --git a/website/docs/terms/elt.md b/website/docs/terms/elt.md index 764ed544de..bec0ded80c 100644 --- a/website/docs/terms/elt.md +++ b/website/docs/terms/elt.md @@ -5,7 +5,7 @@ displayText: ELT hoverSnippet: Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, loading it into a target data warehouse, and finally transforming it. --- -Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, then loading it into a target data warehouse, and finally transforming it. +Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, then loading it into a target <Term id="data-warehouse" />, and finally transforming it. ELT has emerged as a paradigm for how to manage information flows in a modern data warehouse. This represents a fundamental shift from how data previously was handled when Extract, Transform, Load (ETL) was the data workflow most companies implemented. diff --git a/website/docs/terms/etl.md b/website/docs/terms/etl.md index 9dd07ce31a..a0d39f9521 100644 --- a/website/docs/terms/etl.md +++ b/website/docs/terms/etl.md @@ -5,7 +5,7 @@ displayText: ETL hoverSnippet: Extract, Transform, Load (ETL) is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. --- -ETL, or “Extract, Transform, Load”, is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. In ETL workflows, much of the meaningful data transformation occurs outside this primary pipeline in a downstream business intelligence (BI) platform. +ETL, or “Extract, Transform, Load”, is the process of first extracting data from a data source, transforming it, and then loading it into a target <Term id="data-warehouse" />.
In ETL workflows, much of the meaningful data transformation occurs outside this primary pipeline in a downstream business intelligence (BI) platform. ETL is contrasted with the newer <Term id="elt" /> (Extract, Load, Transform) workflow, where transformation occurs after data has been loaded into the target data warehouse. In many ways, the ETL workflow could have been renamed the ETLT workflow, because a considerable portion of meaningful data transformations happen outside the data pipeline. The same transformations can occur in both ETL and ELT workflows; the primary difference is *when* (inside or outside the primary ETL workflow) and *where* the data is transformed (ETL platform/BI tool/data warehouse). diff --git a/website/docs/terms/materialization.md b/website/docs/terms/materialization.md index 73f17cadc3..ef1242ff26 100644 --- a/website/docs/terms/materialization.md +++ b/website/docs/terms/materialization.md @@ -8,7 +8,7 @@ hoverSnippet: The exact Data Definition Language (DDL) that dbt will use when cr This term would benefit from additional depth and examples. Have knowledge to contribute? [Create a discussion in the docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com/discussions) to begin the process of becoming a glossary contributor! ::: -The exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a data warehouse. It's the manner in which the data is represented, and each of those options is defined either canonically (tables, views, incremental), or bespoke. +The exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a <Term id="data-warehouse" />. It's the manner in which the data is represented, and each of those options is defined either canonically (tables, views, incremental) or bespoke. It is important to consider the downstream impacts of your materialization choice on query run times and macro capabilities. diff --git a/website/docs/terms/primary-key.md b/website/docs/terms/primary-key.md index 41ee8ccf27..02319f7af3 100644 --- a/website/docs/terms/primary-key.md +++ b/website/docs/terms/primary-key.md @@ -16,7 +16,7 @@ At their core, you create and use these row-level unique identifiers to: One of the great things about data modeling is that there are very few rules to it. You have the flexibility to create the models and columns that are applicable to your business and the SQL you use to accomplish that is pretty much up to you and your team. _Having a primary key in each data model is pretty much the one rule you can’t break._ Without primary keys that are tested for non-nullness and uniqueness, duplicate or null records can slip undetected into your data models and cause counts to be incorrect. These two reasons coupled together can create a sense of distrust in the data and data team. -Use this glossary page to understand the importance of primary keys, how natural keys and surrogate keys differ, and how data warehouse support for primary keys varies. +Use this glossary page to understand the importance of primary keys, how natural keys and surrogate keys differ, and how <Term id="data-warehouse" /> support for primary keys varies. ## Types of primary keys diff --git a/website/docs/terms/subquery.md b/website/docs/terms/subquery.md index de94ea947f..12b91bf56f 100644 --- a/website/docs/terms/subquery.md +++ b/website/docs/terms/subquery.md @@ -194,7 +194,7 @@ Again, choosing to use CTEs over subqueries is a personal choice.
It may help to ## Data warehouse support for subqueries -Subqueries are likely to be supported across most, if not all, modern data warehouses. Please use this table to see more information about using subqueries in your specific data warehouse. +Subqueries are likely to be supported across most, if not all, modern data warehouses. Please use this table to see more information about using subqueries in your specific data warehouse. | Data warehouse | Supports subqueries? | |---|---| diff --git a/website/docs/terms/surrogate-key.md b/website/docs/terms/surrogate-key.md index 06e7d28eed..3ab110121d 100644 --- a/website/docs/terms/surrogate-key.md +++ b/website/docs/terms/surrogate-key.md @@ -19,7 +19,7 @@ Primary keys can be established two ways: naturally or derived through the data * A __surrogate key__ is a hashed value of multiple fields in a dataset that create a uniqueness constraint on that dataset. You’ll essentially need to make a surrogate key in every table that lacks a natural key. :::note Note -You may also hear about primary keys being a form of a _constraint_ on a database object. Column constraints are specified in the <Term id="ddl" /> to create or alter a database object. For data warehouses that support the enforcement of primary key constraints, this means that an error would be raised if a field's uniqueness or non-nullness was broken upon an `INSERT` or `UPDATE` statement. Most modern data warehouses don’t support _and_ enforce [primary key constraints](https://docs.getdbt.com/terms/primary-key#Data-warehouse-support-for-primary-keys), so it’s important to have [automated testing](https://docs.getdbt.com/blog/primary-key-testing#how-to-test-primary-keys-with-dbt) in-place to ensure your primary keys are unique and not null. +You may also hear about primary keys being a form of a _constraint_ on a database object. Column constraints are specified in the <Term id="ddl" /> to create or alter a database object. For data warehouses that support the enforcement of primary key constraints, this means that an error would be raised if a field's uniqueness or non-nullness was broken upon an `INSERT` or `UPDATE` statement. Most modern data warehouses don’t support _and_ enforce [primary key constraints](https://docs.getdbt.com/terms/primary-key#Data-warehouse-support-for-primary-keys), so it’s important to have [automated testing](https://docs.getdbt.com/blog/primary-key-testing#how-to-test-primary-keys-with-dbt) in-place to ensure your primary keys are unique and not null. ::: ## How surrogate keys are created diff --git a/website/docs/terms/table.md b/website/docs/terms/table.md index 0aeb9216a4..54a0042fce 100644 --- a/website/docs/terms/table.md +++ b/website/docs/terms/table.md @@ -18,7 +18,7 @@ Here is an example of a table: | 02 | Bilbo | Baggins | bilbo@theshire.co.uk | | 03 | Gandalf | The Grey | greywizard1@gmail.com | -Tables do use storage in your data warehouse. The data can be queried directly because you are directly pulling from the raw data itself. If a particular table was created by underlying data, the table will not be automatically updated. +Tables do use storage in your <Term id="data-warehouse" />. The data can be queried directly because you are directly pulling from the raw data itself. If the underlying data used to build a table changes, the table will not be automatically updated. This table definition applies to most data warehouses, however, there are different flavors of tables for different warehouses. For example, Snowflake has transient and temporary tables that support different features.
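As a hedged illustration of those Snowflake table flavors, here is a minimal sketch; the database, schema, and table names are hypothetical, and the behavior notes in the comments summarize Snowflake's documented defaults rather than anything specific to this project.

```sql
-- A permanent table (the default) participates in Time Travel and Fail-safe.
create table analytics.prod.customers (id integer, name varchar);

-- A transient table skips Fail-safe, trading recoverability for lower storage cost.
create transient table analytics.prod.stg_customers (id integer, name varchar);

-- A temporary table exists only for the duration of the current session.
create temporary table tmp_customers (id integer, name varchar);
```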
diff --git a/website/docs/terms/view.md b/website/docs/terms/view.md index f0a5cd8c49..6cc9ccc9a6 100644 --- a/website/docs/terms/view.md +++ b/website/docs/terms/view.md @@ -2,7 +2,7 @@ id: view title: View displayText: view -hoverSnippet: A view (as opposed to a table) is a defined passthrough SQL query that can be run against a database (or data warehouse). +hoverSnippet: A view (as opposed to a table) is a defined passthrough SQL query that can be run against a database (or <Term id="data-warehouse" />). --- :::important This page could use some love This term would benefit from additional depth and examples. Have knowledge to contribute? [Create a discussion in the docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com/discussions) to begin the process of becoming a glossary contributor! @@ -32,4 +32,4 @@ You shouldn’t expect a view in itself to be your final destination in terms of ## Further reading -- [Best practices guide on choosing table vs view materializations](docs/guides/best-practices#choose-your-materializations-wisely) +- [Best practices guide on choosing table vs view materializations](/guides/best-practices) diff --git a/website/snippets/grants-vs-access-to.md b/website/snippets/grants-vs-access-to.md index c6a80bc5d7..c0c08e8df4 100644 --- a/website/snippets/grants-vs-access-to.md +++ b/website/snippets/grants-vs-access-to.md @@ -1,8 +1,8 @@ :::info Note -The `grants` config is unrelated to the `grant_access_to` config: +The [`grants` config](resource-configs/grants) and the [`grant_access_to` config](bigquery-configs#authorized-views) are distinct. - **`grant_access_to`:** Enables you to set up authorized views. When configured, dbt provides an authorized view access to show partial information from other datasets, without providing end users with full access to those underlying datasets. For more information, see [BigQuery configurations: Authorized views](/reference/resource-configs/bigquery-configs#authorized-views) - **`grants`:** Provides specific permissions to users, groups, or service accounts for managing access to datasets you're producing with dbt. -You can use the two features together: "authorize" a view model with the `grant_access_to` configuration, and then add `grants` to that view model to share its query results (and _only_ its query results) with other users, groups, or service accounts. For more information, see [grants](/reference/resource-configs/grants). +You can use the two features together: "authorize" a view model with the `grant_access_to` configuration, and then add `grants` to that view model to share its query results (and _only_ its query results) with other users, groups, or service accounts. ::: diff --git a/website/snippets/hooks-to-grants.md b/website/snippets/hooks-to-grants.md new file mode 100644 index 0000000000..384b56b8af --- /dev/null +++ b/website/snippets/hooks-to-grants.md @@ -0,0 +1 @@ +In older versions of dbt, the most common use of `post-hook` was to execute `grant` statements, to apply database permissions to models right after creating them. Starting in v1.2, we recommend using the [`grants` resource config](/reference/resource-configs/grants) instead, in order to automatically apply grants when your dbt model runs.
\ No newline at end of file diff --git a/website/static/img/blog/2022-07-12-change-data-capture-metrics/fct-income-dag.png b/website/static/img/blog/2022-07-12-change-data-capture-metrics/fct-income-dag.png new file mode 100644 index 0000000000..861b859156 Binary files /dev/null and b/website/static/img/blog/2022-07-12-change-data-capture-metrics/fct-income-dag.png differ diff --git a/website/static/img/blog/2022-07-12-change-data-capture-metrics/final-dag.png b/website/static/img/blog/2022-07-12-change-data-capture-metrics/final-dag.png new file mode 100644 index 0000000000..37243ff06d Binary files /dev/null and b/website/static/img/blog/2022-07-12-change-data-capture-metrics/final-dag.png differ diff --git a/website/static/img/blog/2022-07-12-change-data-capture-metrics/income-report-versions-dag.png b/website/static/img/blog/2022-07-12-change-data-capture-metrics/income-report-versions-dag.png new file mode 100644 index 0000000000..63eb3400ca Binary files /dev/null and b/website/static/img/blog/2022-07-12-change-data-capture-metrics/income-report-versions-dag.png differ diff --git a/website/static/img/blog/2022-07-12-change-data-capture-metrics/int-income-history-dag.png b/website/static/img/blog/2022-07-12-change-data-capture-metrics/int-income-history-dag.png new file mode 100644 index 0000000000..00684a680a Binary files /dev/null and b/website/static/img/blog/2022-07-12-change-data-capture-metrics/int-income-history-dag.png differ diff --git a/website/static/img/blog/2022-07-12-change-data-capture-metrics/revenue-meme.png b/website/static/img/blog/2022-07-12-change-data-capture-metrics/revenue-meme.png new file mode 100644 index 0000000000..d8b49a7ef1 Binary files /dev/null and b/website/static/img/blog/2022-07-12-change-data-capture-metrics/revenue-meme.png differ diff --git a/website/static/img/blog/2022-07-12-change-data-capture-metrics/snapshots-dag.png b/website/static/img/blog/2022-07-12-change-data-capture-metrics/snapshots-dag.png new file mode 100644 index 0000000000..801331f705 Binary files /dev/null and b/website/static/img/blog/2022-07-12-change-data-capture-metrics/snapshots-dag.png differ diff --git a/website/static/img/blog/authors/grace-goheen.jpeg b/website/static/img/blog/authors/grace-goheen.jpeg new file mode 100644 index 0000000000..06b059b08c Binary files /dev/null and b/website/static/img/blog/authors/grace-goheen.jpeg differ diff --git a/website/static/img/docs/terms/dimensional-modeling/fact-star.png b/website/static/img/docs/terms/dimensional-modeling/fact-star.png new file mode 100644 index 0000000000..22ebed964c Binary files /dev/null and b/website/static/img/docs/terms/dimensional-modeling/fact-star.png differ
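To round out the `grants` discussion in the snippets above, here is a minimal, hypothetical sketch of the v1.2-style approach: declaring grants directly in a model's config block instead of issuing `GRANT` statements from a `post-hook`. The model and role names are placeholders; the shape of the `grants` dictionary follows the grants resource config referenced above.

```sql
-- models/marts/fct_orders.sql (hypothetical model and role names)
-- dbt applies these grants automatically each time the model is (re)built,
-- replacing the older pattern of running GRANT statements in a post-hook.
{{
    config(
        materialized = 'table',
        grants = {'select': ['reporter', 'bi_user']}
    )
}}

select * from {{ ref('stg_orders') }}
```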