diff --git a/cspell.json b/cspell.json index 0afd55220d..24637e5869 100644 --- a/cspell.json +++ b/cspell.json @@ -1,17 +1,151 @@ { - "dictionaries": [ - "custom-dictionary" - ], - // Tell CSpell about your dictionary - "dictionaryDefinitions": [ - { - // The name of the dictionary is used to look it up. - "name": "custom-dictionary", - // Path to the custom word file. Relative to this `cspell.json` file. - "path": "./custom-dictionary.txt", - // Some editor extensions will use `addWords` for adding words to your - // personal dictionary. - "addWords": true - } - ] -} \ No newline at end of file + "version": "0.2", + "language": "en", + "words": [ + "Gruntwork", + "gruntwork", + "GitOps", + "Docusaurus", + "runbooks", + "terraform", + "terragrunt", + "Terragrunt", + "kubernetes", + "kubectl", + "autoscaling", + "monorepo", + "microservices", + "infra", + "DevOps", + "OpenTofu", + "opentofu", + "Tofu", + "prebuilt", + "repo", + "repos", + "serverless", + "cloudformation", + "elasticache", + "cicd", + "gitignore", + "cursorrules", + "npmignore", + "lockfile", + "lockfiles", + "docstrings", + "codebase", + "codebases", + "runbook", + "dockerfile", + "dockerfiles", + "orchestrator", + "orchestrators", + "customizable", + "composability", + "scalability", + "observability", + "linter", + "linters", + "linting", + "Yousif", + "Oreoluwa", + "Agunbiade", + "Yevgeniy", + "Brikman", + "Rahul", + "Vohra", + "Nana", + "Pulumi", + "pulumi", + "Rego", + "Sonatype", + "Pgvector", + "karpenter", + "Karpenter", + "MTTR", + "MTTP", + "inhouse", + "SOURCER", + "mydomain", + "terratest", + "Terratest", + "terratests", + "tailscale", + "sdlc", + "acmecorp", + "envcommon", + "tada", + "codeowners", + "jsonencode", + "chdir", + "rulesets", + "controltower", + "awscli", + "passwordless", + "coolapp", + "acmeco", + "grunty", + "yamldecode", + "mgmt", + "tfstate", + "baselining", + "baselined", + "clickops", + "terrascan", + "steampipe", + "infracost", + "kubergrunt", + "terraformrc", + 
"tfvars", + "fargate", + "keypair", + "mimecast", + "slugified", + "dlist", + "DEPENDENCYID", + "subfolders", + "terrapatch", + "Terrapatch", + "KodeKloud", + "preconfigured", + "projectprefix", + "GOVCLOUD", + "rollouts", + "myvars", + "myfile", + "minamijoyo", + "tfupdate", + "hcledit", + "infrachanges", + "Entra", + "GLMU", + "myprodsa", + "azuread", + "mysa", + "deinterlaced", + "rolename", + "ACCOUNTNAME" + ], + "ignoreWords": [], + "ignorePaths": [ + "node_modules", + "build", + "dist", + ".git", + "*.lockb", + "bun.lockb", + "yarn.lock", + "package-lock.json", + ".cspell.json", + "tsconfig.json", + "*.min.js", + "*.map" + ], + "ignoreRegExpList": [ + "/https?:\\/\\/[^\\s]+/g", + "/```[\\s\\S]*?```/g" + ], + "allowCompoundWords": true, + "dictionaries": ["typescript", "node", "npm", "bash", "markdown"], + "enableFiletypes": ["markdown", "mdx", "typescript", "javascript", "json"] +} diff --git a/custom-dictionary.txt b/custom-dictionary.txt deleted file mode 100644 index 55f207c308..0000000000 --- a/custom-dictionary.txt +++ /dev/null @@ -1,71 +0,0 @@ -terragrunt -Terragrunt -gruntwork -Gruntwork -SOURCER -mydomain -terratest -Terratest -terratests -tailscale -opentofu -sdlc -acmecorp -envcommon -tada -codeowners -jsonencode -chdir -rulesets -controltower -awscli -passwordless -acmecorp -coolapp -acmeco -grunty -yamldecode -pulumi -mgmt -tfstate -baselining -baselined -clickops -terrascan -steampipe -infracost -kubergrunt -terraformrc -tfvars -fargate -karpenter -keypair -mimecast -slugified -dlist -DEPENDENCYID -subfolders -MTTR -terrapatch -Terrapatch -KodeKloud -preconfigured -projectprefix -GOVCLOUD -rollouts -myvars -myfile -gruntwork-io -minamijoyo -tfupdate -hcledit -self-hosting -infrachanges -Entra -GLMU -myprodsa -azuread -mysa -deinterlaced -rolename -ACCOUNTNAME diff --git a/docs/2.0/way/authorship.md b/docs/2.0/way/authorship.md new file mode 100644 index 0000000000..093f3e0e3e --- /dev/null +++ b/docs/2.0/way/authorship.md @@ -0,0 +1,10 @@ +--- 
+sidebar_position: 10 +sidebar_class_name: way-top-level-item +--- + +# Authorship + +The Gruntwork Way was originally authored by [Josh Padnick](https://joshpadnick.com) with input from fellow Grunts Yousif Akbar, Lewis Christie, Oreoluwa Agunbiade, Brian Torres, [Yevgeniy Brikman](https://ybrikman.com) through his book [The Fundamentals of DevOps and Software Delivery](https://www.fundamentals-of-devops.com/), and especially Eben Eliason. + +It is intended to be a living, evolving knowledge set, so it is our hope that if you find something you believe can be improved that you will [open a Pull Request](/2.0/way/intro/share-your-feedback) suggesting it! \ No newline at end of file diff --git a/docs/2.0/way/intro/_category_.json b/docs/2.0/way/intro/_category_.json new file mode 100644 index 0000000000..5d2775d6ef --- /dev/null +++ b/docs/2.0/way/intro/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Introduction", + "position": 1, + "className": "way-top-level-item" +} + diff --git a/docs/2.0/way/intro/how-to-use-this.md b/docs/2.0/way/intro/how-to-use-this.md new file mode 100644 index 0000000000..a7f48a8453 --- /dev/null +++ b/docs/2.0/way/intro/how-to-use-this.md @@ -0,0 +1,30 @@ +--- +title: How To Use This +sidebar_position: 3 +--- + +To get the most out of the Gruntwork Way, start by making sure you understand the big picture. + +## The Gruntwork Way Big Picture + +First, a **developer platform** is the product your platform team builds to balance three essential concerns: enabling developers to move fast ([velocity](/2.0/way/why/velocity)), meeting business needs like security, cost, and compliance ([governance](/2.0/way/why/governance)), and avoiding unsustainable technical debt ([maintainability](/2.0/way/why/maintainability)). 
+ +To build a successful platform, you need to combine three types of building blocks: [patterns](/2.0/way/platform/patterns/overview) (pre-built solutions to common infrastructure problems), [platform components](/2.0/way/platform/components/overview) (the machinery for deploying and managing infrastructure), and [interfaces](/2.0/way/platform/interfaces/overview) (how developers interact with your platform). + +Finally, everything in this framework is based on the [guiding principles](/2.0/way/principles/overview) we've discovered through years of building and refining developer platforms. + +## Using the Gruntwork Way + +Once you understand these foundational concepts, there are two primary ways to use the Gruntwork Way. + +### Option 1: Read it comprehensively + +You can work through this framework from start to finish to build a complete mental model of developer platforms. This approach works well if you're just getting started or want to strengthen your overall understanding. + +### Option 2: Use it as a reference + +Alternatively, you can jump directly to the sections most relevant to your current challenges. This approach works well once you understand the fundamentals (which you just read above!) and need guidance on a specific aspect of your platform. + +## Next + +Alright, let's dive into the framework that helps you build platforms developers actually want to use! diff --git a/docs/2.0/way/intro/share-your-feedback.md b/docs/2.0/way/intro/share-your-feedback.md new file mode 100644 index 0000000000..4e13512370 --- /dev/null +++ b/docs/2.0/way/intro/share-your-feedback.md @@ -0,0 +1,13 @@ +--- +title: Share your feedback +sidebar_position: 4 +--- + +- This framework is useful because it's opinionated. +- Our opinions are based on nearly a decade of experience across hundreds of customers and thousands of customer conversations. +- But maybe your mileage varies, or maybe we got it wrong. If so, tell us! 
+- To give feedback, go to https://github.com/gruntwork-io/docs and create a GitHub Issue or Pull Request. Alternatively click on the "Edit this page" link at the bottom of each page. + +### Next + +Alright, let's dive into the framework that helps you build platforms developers actually want to use! diff --git a/docs/2.0/way/intro/welcome.md b/docs/2.0/way/intro/welcome.md new file mode 100644 index 0000000000..bcef740355 --- /dev/null +++ b/docs/2.0/way/intro/welcome.md @@ -0,0 +1,44 @@ +--- +title: Welcome! +sidebar_position: 1 +--- + +# Welcome! + +The **Gruntwork Way** is Gruntwork's official framework for how to build a world-class developer platform. + +## What's a developer platform? + +Nearly every company on the planet needs software, and when companies write their own software with more than a few developers, the same common needs arise: + +1. Enable developers to move fast ([velocity](/2.0/way/why/velocity)) +2. ...while meeting the other needs of the business like security, cost, and compliance ([governance](/2.0/way/why/governance)) +3. ...without accumulating an unsustainable level of debt ([maintainability](/2.0/way/why/maintainability)) + +A **developer platform** is the product your platform team builds to balance these three concerns. It consists of three essential building blocks: + +- [Patterns](/2.0/way/platform/patterns/overview) - pre-built solutions to common infrastructure problems +- [Platform components](/2.0/way/platform/components/overview) - the machinery for deploying and managing infrastructure +- [Interfaces](/2.0/way/platform/interfaces/overview) - how developers interact with your platform. + +Together, these enable developers to move fast while meeting the needs of the business, and to do so in a way that incurs manageable levels of debt. + +## Why read this framework + +Making sense of developer platforms can be confusing. 
We've found bits and pieces of insights scattered across blog posts, vendor solutions, and industry talks, but none of these presents a holistic way to think about developer platforms. + +This framework will help you build comprehensive clarity. + +Ultimately, that clarity will empower you to build a successful developer platform. + +## Why trust Gruntwork + +We've spent years building expertise on how to deliver software effectively by serving as: + +* **Authors** - We've written leading books and guides, including [Terraform Up & Running](https://www.terraformupandrunning.com/), [Fundamentals of DevOps and Software Delivery](https://www.fundamentals-of-devops.com/?ref=blog-post-series), [The Startup CTO's Handbook](https://github.com/ZachGoldberg/Startup-CTO-Handbook), and [A Comprehensive Guide to Scaling Apps on AWS](https://joshpadnick.com/posts/a-comprehensive-guide-to-scaling-apps-on-aws-part-1/). +* **Makers** - We've created leading open source DevOps tooling like [OpenTofu](https://opentofu.org/) (co-founder), [Terragrunt](https://terragrunt.gruntwork.io/), [Terratest](https://terratest.gruntwork.io/), [Runbooks](https://runbooks.gruntwork.io), [Boilerplate](https://github.com/gruntwork-io/boilerplate), and [Cloud Nuke](https://github.com/gruntwork-io/cloud-nuke). +* **Experts** - As a vendor, we've equipped over 500 companies and thousands of engineers with AWS, DevOps, and Platform Engineering best practices. + +## Next + +Now let's learn more about who we wrote the Gruntwork Way for. diff --git a/docs/2.0/way/intro/who-this-is-for.md b/docs/2.0/way/intro/who-this-is-for.md new file mode 100644 index 0000000000..3204d431d8 --- /dev/null +++ b/docs/2.0/way/intro/who-this-is-for.md @@ -0,0 +1,34 @@ +--- +title: Who This Is For +sidebar_position: 2 +--- + +We wrote The Gruntwork Way for anyone trying to balance velocity, governance, and maintainability in their quest to better deliver software. 
+ +## Roles + +In practice, we expect users in these roles will find this useful: + +- **Platform engineers** building or improving their developer platforms +- **DevOps engineers** looking for best practices and patterns +- **Engineering leaders** evaluating their developer platform and/or cloud strategy + +## Maturity levels + +As you build your developer platform, you will go through a series of maturity stages. + +Actually, to be more precise, you will build maturity separately in your [patterns](/2.0/way/platform/patterns/overview), [platform components](/2.0/way/platform/components/overview), and [interfaces](/2.0/way/platform/interfaces/overview). Your collective maturity level for the "developer platform" is really just the collection of maturity levels for its component parts. + +In general, the component parts track these maturity levels: + +- **Ad hoc** - There is no standard, so your team does things manually with minimal automation. +- **Building** - You've built something, but it's limited. +- **Established** - This component does add value, but with lots of room for improvement. +- **Mature** - This component performs its functions comprehensively and effectively. +- **Self-improving** - This component has a systematic feedback loop that leads to ongoing improvements. + +The Gruntwork Way is especially useful for organizations with many building blocks in the **ad hoc,** **building,** or **established** stages. + +## Next + +Let's close out the introduction by giving you some pointers on how to use this framework! 
diff --git a/docs/2.0/way/platform/_category_.json b/docs/2.0/way/platform/_category_.json new file mode 100644 index 0000000000..6e10d8b5c7 --- /dev/null +++ b/docs/2.0/way/platform/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Building Your Platform", + "position": 4, + "className": "way-top-level-item" +} + diff --git a/docs/2.0/way/platform/components/_category_.json b/docs/2.0/way/platform/components/_category_.json new file mode 100644 index 0000000000..e57a838550 --- /dev/null +++ b/docs/2.0/way/platform/components/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Components", + "position": 3 +} diff --git a/docs/2.0/way/platform/components/deploy/_category_.json b/docs/2.0/way/platform/components/deploy/_category_.json new file mode 100644 index 0000000000..d077ba30bb --- /dev/null +++ b/docs/2.0/way/platform/components/deploy/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Deploy Infrastructure", + "position": 5, + "collapsed": true +} + diff --git a/docs/2.0/way/platform/components/deploy/catalog.md b/docs/2.0/way/platform/components/deploy/catalog.md new file mode 100644 index 0000000000..3a7014d44c --- /dev/null +++ b/docs/2.0/way/platform/components/deploy/catalog.md @@ -0,0 +1,59 @@ +--- +sidebar_position: 1 +title: Catalog +--- + +# Catalog + +**Where you store your organization's patterns** + +## What is the catalog component? + +The catalog is the system of record for all your organization's [infrastructure patterns](/2.0/way/platform/patterns/overview). + +When a member of your organization goes to ask "how do we approach infrastructure problem X around here?", you need a system of record to point them to. Or when you update a pattern, your pattern authors need to know which pattern version is canonical and where to place their updates. That system of record is the **catalog.** + +## Good catalogs + +### Minimum requirements + +To qualify as a catalog, you need only meet one requirement. 
The catalog is: + +- **A system of record.** The latest available information about your patterns lives in the catalog. + +### Effective catalogs + +To be an _effective_ catalog, we need a few more requirements. Effective catalogs are: + +- **Browsable.** Users can browse the available patterns using their preferred [interface](/2.0/way/platform/interfaces/overview). +- **Searchable.** Users can search for exactly the pattern they want. +- **Detailed.** Users can browse detailed information about each pattern. +- **Extensible.** As the catalog maintainer, you can add support for new patterns. +- **Reports pattern usage.** Users can see how often patterns are consumed. +- **Educational.** Users can learn background information necessary to be a savvy consumer of a pattern. +- **Connect authors and consumers.** Users can connect easily with pattern authors, creating a virtuous feedback loop. +- **Connect to self-service.** When a user identifies a pattern they want to instantiate, the catalog directs them right to the applicable [runbook](/2.0/way/platform/components/deploy/runbooks). + +## Catalog options + +You have several options when implementing a catalog. In order from least favorable to most favorable: + +### No system of record + +You simply do not have a system of record. If a user wants to know whether a pattern exists, they verbally ask, but there is no written resource to consult. + +### Documentation site + +You have a documentation site that users visit to learn about what patterns are available, but the patterns themselves are scattered across the organization. + +### Git repository + +All your patterns are collected in one or more git repositories. + +### Catalog UI + +You have a Terminal UI and/or Web UI that enables users to browse the catalog. The underlying system of record may still be a git repository. 
+ +:::info +In our open source tool, Terragrunt, we expose a [terragrunt catalog](https://terragrunt.gruntwork.io/docs/features/catalog/) command that presents a Terminal UI to select the desired module from a specified set of git repos. +::: diff --git a/docs/2.0/way/platform/components/deploy/pipelines.md b/docs/2.0/way/platform/components/deploy/pipelines.md new file mode 100644 index 0000000000..e8e9b8712c --- /dev/null +++ b/docs/2.0/way/platform/components/deploy/pipelines.md @@ -0,0 +1,23 @@ +--- +sidebar_position: 3 +title: Pipelines +--- + +# Pipelines + +**How you deploy infrastructure changes** + +Pipelines are the automated workflows that take infrastructure code and deploy it to your cloud environment. They include the checks, tests, and policies that ensure infrastructure meets your standards before deployment. + +## Why it matters + +Centralized pipelines enforce governance, provide audit trails, and ensure consistent deployment processes across all infrastructure changes. + +## What it enables + +- Automated deployment of infrastructure code +- Policy enforcement before deployment +- Testing and validation in pipeline +- Rollback capabilities when issues arise +- Audit trail of all infrastructure changes + diff --git a/docs/2.0/way/platform/components/deploy/runbooks.md b/docs/2.0/way/platform/components/deploy/runbooks.md new file mode 100644 index 0000000000..7a45e8f145 --- /dev/null +++ b/docs/2.0/way/platform/components/deploy/runbooks.md @@ -0,0 +1,151 @@ +--- +sidebar_position: 2 +title: Runbooks +--- + +# Runbooks + +**How developers configure an infrastructure module instance** + +## What is the Runbooks component? 
+ +### The challenge + +When a developer needs to deploy new infrastructure, per the principle of [define all live infrastructure as pattern instances](/2.0/way/principles/technical-foundations/define-all-live-infrastructure-as-pattern-instances), the developer will need a way to create a new [infrastructure module instance](/2.0/way/platform/patterns/types#infrastructure-module-instances). + +But creating a module instance can be complex! The developer may need to do any or all of the following: + +- Confirm that they meet all pre-requisites to use a given infrastructure module +- Perform other actions needed to meet the pre-requisites (e.g. obtain a new internal API key) +- Write new code that calls the desired [infrastructure module](/2.0/way/platform/patterns/types#infrastructure-modules) (e.g. a Terragrunt unit) +- Compose many infrastructure modules together to achieve a more complex use case +- "Apply" the code they generated +- Validate that a pattern instance was deployed successfully + +The breadth and depth of these requirements impose a heavy burden on the humble developer who wishes only to deploy a pattern so they can get back to their real job of building their app. + +### The solution + +The solution is to make meeting the requirements above easy for the developer by encapsulating all required expertise into an intuitive, streamlined format. + +One format for accomplishing this is the **runbook.** We call this format a runbook because users will go through a sequence of individual steps that, once finished, achieve a configured instance of the desired infrastructure module. 
+ +A first-class Runbook should be able to do all the following: + +- Run pre-flight checks to ensure the user is "ready" to instantiate the module +- Perform "side effects" like requesting an API key, or installing new local tools +- Enable the user to configure the module with custom parameter values +- Generate the relevant code +- Validate that the infrastructure module instance deployed correctly by running post-flight checks + +Runbooks pack a wide collection of functionality into a single format, so we explore some real-world options below. + +### Developer self-service + +A common industry term is "developer self-service." We can now assert a definition of developer self-service using the terms we've introduced so far. + +Specifically, developer self-service is the combination of a **[catalog](/2.0/way/platform/components/deploy/catalog)** of infrastructure modules, plus the ability to use **runbooks,** plus a way to deploy the generated code using a **[pipeline](/2.0/way/platform/components/deploy/pipelines)**. + +## Effective runbooks + +### Minimum requirements + +To qualify as a runbook, you need to at least meet the following requirements. A runbook must: + +- **Expose parameter values.** The user can see a list of the parameter values available. +- **Configure parameter values.** The user can configure the parameter values. +- **Generate code.** The user can instantiate an [infrastructure module](/2.0/way/platform/patterns/types#infrastructure-modules), either by generating code or other means. + +### Effective runbooks + +To be an _effective_ runbook, we need a few more requirements. Effective runbooks are/do: + +#### General + +- **Easily authored.** It must be easy for runbook authors to capture their expertise. +- **Easily launched.** It must be easy for the runbook consumer to begin using the runbook. 
+- **Documented.** The runbook must teach the user about this particular area of subject matter expertise, intermixing documentation and interactivity. +- **Testable.** A runbook must be programmatically testable so that you can continually validate that it functions as expected. +- **Capture feedback.** The runbook must expose a way for runbook consumers to share feedback about their experience and request improvements. + +#### Code execution + +- **Run arbitrary code.** To check pre-requisites, perform "side effects" (like requesting an API key), and validate post-apply correctness, users need to run arbitrary code written in standard programming languages. +- **Real-time.** When users execute arbitrary code to run pre-flight checks, post-flight checks, or achieve side effects, they should see what is happening in real-time. +- **Secure.** The ability to run arbitrary code needs to be paired with a strong security posture that ensures only trusted code is executed. + +#### Configuration + +- **Read external data.** An infrastructure module instance often needs data from other module instances or other sources, so the runbook must be able to "import" these values somehow. +- **Adaptive.** As users fill in configuration values, some configuration options are disabled and new options are revealed. + +## Runbook options + +You have several options when implementing runbooks. In order from least favorable to most favorable: + +### Static documentation + +You can describe in writing how a user would create the code necessary to instantiate an infrastructure module, how they would identify the parameter values, and how to configure those values. + +You _can_ do all of this, but as you'll see shortly, there are better options. + +### Code templates + +You can define a template that generates the code necessary to instantiate an infrastructure module, and then give some mechanism for users to specify parameters. 
+ +For example, the command [terragrunt scaffold](https://terragrunt.gruntwork.io/docs/features/scaffold/) expects a parameter for an OpenTofu/Terraform module URL and will then generate a template that looks like this: + +```hcl +# This is a Terragrunt unit generated by Gruntwork Boilerplate. +terraform { + source = "git::https://github.com/gruntwork-io/terragrunt-infrastructure-modules-example.git//modules/mysql?ref=v0.8.1" +} + +inputs = { + # -------------------------------------------------------------------------------------------------------------------- + # Required input variables + # -------------------------------------------------------------------------------------------------------------------- + + # Type: string + # Description: The AWS region to deploy to (e.g. us-east-1) + aws_region = "" # TODO: fill in value + + # Type: string + # Description: The name of the DB + name = "" # TODO: fill in value + + # Type: string + # Description: The instance class of the DB (e.g. db.t2.micro) + instance_class = "" # TODO: fill in value + + # (... full list of inputs omitted for brevity ...) +} +``` + +This improves on using static documentation because we offer the user a pre-set opinion on how to generate inputs. Rather than telling the user how to write this code, we can now tell them how to generate a template that writes the code. + +Better yet, either the process of generating the code includes a step to configure variables, or the template itself can leave placeholders for users to enter variables (as it does above). + +:::info +Gruntwork maintains an open-source template generator built specially for DevOps and platform engineering called **Gruntwork Boilerplate.** Learn more at [https://github.com/gruntwork-io/boilerplate](https://github.com/gruntwork-io/boilerplate). +::: + +### Infrastructure module UI + +You can build (or buy) a solution that renders a UI for a given infrastructure module. 
The UI lists the available parameter values, and users fill in those values directly through the UI. Ideally, the UI can allow users to fetch values from external data sources such as infrastructure-as-code state, a secrets manager, or a third-party tool. + +The primary limitation of the infrastructure module UI is that, by definition, it is scoped to a single infrastructure module. + +### Infrastructure template UI + +An infrastructure template UI renders a UI not around an infrastructure module, but around a [code template](#code-templates). This makes it more powerful because a code template could generate a single infrastructure module instance, or a collection of infrastructure module instances, or really anything. + +Beyond that, the functionality is the same. The UI lists the available parameters, and users fill in those values directly through the UI. The template then renders based on those input parameters. + +### First-class runbooks + +It's not a common solution, but the best option is to combine an [infrastructure template UI](#infrastructure-template-ui) with the ability to execute arbitrary code, all in a format that's easy to document, test, and author. + +:::info +Gruntwork maintains an open source tool called [Gruntwork Runbooks](https://runbooks.gruntwork.io/) that is based on all the insights and philosophy that we capture here. Check it out! 
+::: diff --git a/docs/2.0/way/platform/components/maintain/_category_.json b/docs/2.0/way/platform/components/maintain/_category_.json new file mode 100644 index 0000000000..53a48e9158 --- /dev/null +++ b/docs/2.0/way/platform/components/maintain/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Maintain Infrastructure", + "position": 7, + "collapsed": true +} + diff --git a/docs/2.0/way/platform/components/maintain/drift-detector.md b/docs/2.0/way/platform/components/maintain/drift-detector.md new file mode 100644 index 0000000000..802927a9d0 --- /dev/null +++ b/docs/2.0/way/platform/components/maintain/drift-detector.md @@ -0,0 +1,22 @@ +--- +sidebar_position: 1 +title: Drift Detector +--- + +# Drift Detector + +**Detect drift between code and cloud** + +Drift detection continuously monitors your infrastructure to identify when deployed resources no longer match their Infrastructure as Code definitions. This helps you maintain the integrity of your infrastructure as code practice. + +## Why it matters + +Drift happens inevitably as teams make manual changes, resources get modified outside IaC, or automation scripts run. Undetected drift means your IaC no longer represents reality, breaking reproducibility and creating hidden risks. + +## What it enables + +- Automated detection of infrastructure drift +- Alerts when drift occurs +- Reports showing what drifted and how +- Ability to reconcile drift automatically or manually + diff --git a/docs/2.0/way/platform/components/maintain/iac-updater.md b/docs/2.0/way/platform/components/maintain/iac-updater.md new file mode 100644 index 0000000000..6911b77c76 --- /dev/null +++ b/docs/2.0/way/platform/components/maintain/iac-updater.md @@ -0,0 +1,22 @@ +--- +sidebar_position: 2 +title: IaC Updater +--- + +# IaC Updater + +**Update out-of-date IaC or tooling versions** + +The IaC updater identifies infrastructure using outdated patterns or tool versions and helps you modernize it. 
This prevents technical debt from accumulating as your patterns and tools evolve. + +## Why it matters + +Infrastructure ages quickly. What was modern six months ago may now be outdated. Without systematic updates, your infrastructure estate becomes increasingly difficult to maintain. + +## What it enables + +- Identification of outdated infrastructure +- Automated or guided update processes +- Migration paths from old to new patterns +- Tracking update progress across your estate + diff --git a/docs/2.0/way/platform/components/maintain/importer.md b/docs/2.0/way/platform/components/maintain/importer.md new file mode 100644 index 0000000000..f61207095e --- /dev/null +++ b/docs/2.0/way/platform/components/maintain/importer.md @@ -0,0 +1,22 @@ +--- +sidebar_position: 4 +title: Importer +--- + +# Importer + +**Import non-codified assets** + +The importer takes infrastructure that exists in the cloud but isn't reflected in your IaC and generates the code to manage it. This helps you bring legacy or manually-created infrastructure under IaC management. + +## Why it matters + +Most organizations have infrastructure that predates their IaC adoption or was created outside normal processes. Without importing this infrastructure, you have blind spots in your infrastructure management. + +## What it enables + +- Discovery of unmanaged cloud resources +- Automated generation of IaC for existing resources +- Gradual migration to infrastructure as code +- Complete inventory of all cloud resources + diff --git a/docs/2.0/way/platform/components/maintain/scorecard.md b/docs/2.0/way/platform/components/maintain/scorecard.md new file mode 100644 index 0000000000..713889cb84 --- /dev/null +++ b/docs/2.0/way/platform/components/maintain/scorecard.md @@ -0,0 +1,22 @@ +--- +sidebar_position: 3 +title: Scorecard +--- + +# Scorecard + +**Assess standards compliance** + +A scorecard evaluates repositories, deployments, or teams against your organization's standards and best practices. 
It provides objective measures of infrastructure quality and compliance. + +## Why it matters + +Standards only matter if you can measure compliance. Scorecards make standards concrete and visible, helping teams understand where they meet expectations and where they need improvement. + +## What it enables + +- Objective measurement of standards compliance +- Identification of high-risk infrastructure +- Benchmarking teams against each other +- Progress tracking on improvement initiatives + diff --git a/docs/2.0/way/platform/components/operate/_category_.json b/docs/2.0/way/platform/components/operate/_category_.json new file mode 100644 index 0000000000..358827c90f --- /dev/null +++ b/docs/2.0/way/platform/components/operate/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Operate Infrastructure", + "position": 6, + "collapsed": true +} + diff --git a/docs/2.0/way/platform/components/operate/dashboards.md b/docs/2.0/way/platform/components/operate/dashboards.md new file mode 100644 index 0000000000..9edb203b4a --- /dev/null +++ b/docs/2.0/way/platform/components/operate/dashboards.md @@ -0,0 +1,22 @@ +--- +sidebar_position: 3 +title: Dashboards +--- + +# Dashboards and Data Visualization + +**Translate data points into insights** + +Dashboards aggregate data from across your infrastructure to provide actionable insights. They help teams understand trends, identify problems, and track progress toward goals. + +## Why it matters + +Raw data is overwhelming. Dashboards transform metrics into insights that drive decision-making and highlight areas needing attention. 
+ +## What it enables + +- Visibility into infrastructure trends over time +- Quick identification of anomalies +- Progress tracking toward goals +- Executive visibility into infrastructure health + diff --git a/docs/2.0/way/platform/components/operate/registry.md b/docs/2.0/way/platform/components/operate/registry.md new file mode 100644 index 0000000000..7818a10a4e --- /dev/null +++ b/docs/2.0/way/platform/components/operate/registry.md @@ -0,0 +1,22 @@ +--- +sidebar_position: 2 +title: Registry +--- + +# Registry + +**Where you see and manage what the business is running** + +The registry maintains a comprehensive record of all infrastructure instances that have been deployed. It tracks the high-level entities that matter to your organization, not just individual cloud resources. + +## Why it matters + +A registry provides the source of truth for what infrastructure exists, who owns it, and how it relates to business objectives. This is essential for understanding your infrastructure estate at scale. + +## What it enables + +- Inventory of all infrastructure instances +- Mapping infrastructure to teams and services +- Tracking infrastructure lifecycle and ownership +- Integration point for other tools and systems + diff --git a/docs/2.0/way/platform/components/operate/unit-browser.md b/docs/2.0/way/platform/components/operate/unit-browser.md new file mode 100644 index 0000000000..948b687b8e --- /dev/null +++ b/docs/2.0/way/platform/components/operate/unit-browser.md @@ -0,0 +1,22 @@ +--- +sidebar_position: 1 +title: Unit Browser +--- + +# Unit Browser + +**How to browse what infrastructure is deployed** + +A unit browser lets you explore your infrastructure at the level of abstraction that matters to your organization. Rather than just listing cloud resources, it shows the meaningful entities your teams care about. 
+ +## Why it matters + +Without a unit browser, understanding what's deployed requires piecing together information from multiple sources and translating low-level cloud resources into high-level concepts. + +## What it enables + +- Quick understanding of deployed infrastructure +- Discovery of infrastructure ownership +- Search and filtering by relevant attributes +- Understanding relationships between infrastructure units + diff --git a/docs/2.0/way/platform/components/overview.md b/docs/2.0/way/platform/components/overview.md new file mode 100644 index 0000000000..e78482e84a --- /dev/null +++ b/docs/2.0/way/platform/components/overview.md @@ -0,0 +1,58 @@ +--- +sidebar_position: 1 +title: Overview +--- + +# Platform Components + +**Platform components are the building blocks you need to create and manage modern infrastructure.** They're the tools and systems that enable developers to deploy patterns, maintain infrastructure health, and meet governance requirements. + +While [patterns](/2.0/way/platform/patterns/overview) define your opinions for solving a given infrastructure problem, platform components provide the machinery that makes deploying, operating, and maintaining that infrastructure efficient and reliable at scale. + +## Your developer platform is built from components + +One of the core ideas of The Gruntwork Way is that you build your developer platform by implementing: + +- [Patterns](/2.0/way/platform/patterns/overview) +- Components (this section!) +- [Interfaces](/2.0/way/platform/interfaces/overview) + +We talk about components individually because that is the most helpful way to think about them. But in practice, your end users may simply see a "developer platform" and the lines between specific components may be blurry. 
+ +For example, a developer might browse your **catalog**, create an infrastructure module instance with a **runbook,** and then deploy the new code with your **pipeline.** That these are three separate components is not meaningful to the developer as long as they can accomplish their goal of deploying new infrastructure. + +But for the platform engineer as the maintainer, it is clarifying to understand that to deploy their new infrastructure, your developer end-user actually needs to make use of three separate but connected components. + +## Component categories + +Platform components organize into three major categories based on their purpose: + +### Deploy Infrastructure + +The components you need to create, modify, and destroy infrastructure. These include: +- **[Catalog](/2.0/way/platform/components/deploy/catalog)** - Where you store your organization's patterns +- **[Runbooks](/2.0/way/platform/components/deploy/runbooks)** - How developers request and provision infrastructure +- **[Pipelines](/2.0/way/platform/components/deploy/pipelines)** - How you deploy infrastructure changes + +### Operate Infrastructure + +The components you need to understand what's running and how it's performing. These include: +- **[Unit Browser](/2.0/way/platform/components/operate/unit-browser)** - Browse what infrastructure is deployed +- **[Registry](/2.0/way/platform/components/operate/registry)** - Track what the business is running +- **[Dashboards](/2.0/way/platform/components/operate/dashboards)** - Visualize infrastructure data and insights + +### Maintain Infrastructure + +The components you need to detect and fix infrastructure debt. 
These include: +- **[Drift Detector](/2.0/way/platform/components/maintain/drift-detector)** - Detect drift between code and cloud +- **[IaC Updater](/2.0/way/platform/components/maintain/iac-updater)** - Update out-of-date IaC or tooling versions +- **[Scorecard](/2.0/way/platform/components/maintain/scorecard)** - Assess standards compliance +- **[Importer](/2.0/way/platform/components/maintain/importer)** - Import non-codified assets + +## Build vs buy + +You can choose to build or buy most of these components. You can build some and buy others. You can also choose to optimize for "best of breed" (the best individual component) or "best of suite" (the best combination of components), or mix and match build and buy to suit your needs. + +## Next + +Now let's learn about each platform component in more depth. \ No newline at end of file diff --git a/docs/2.0/way/platform/interfaces/_category_.json b/docs/2.0/way/platform/interfaces/_category_.json new file mode 100644 index 0000000000..4afaef9da4 --- /dev/null +++ b/docs/2.0/way/platform/interfaces/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Interfaces", + "position": 4 +} + diff --git a/docs/2.0/way/platform/interfaces/overview.md b/docs/2.0/way/platform/interfaces/overview.md new file mode 100644 index 0000000000..49663f3956 --- /dev/null +++ b/docs/2.0/way/platform/interfaces/overview.md @@ -0,0 +1,24 @@ +--- +sidebar_position: 1 +title: Overview +--- + +# Interfaces + +Your developer platform should be accessible through multiple interfaces to meet developers where they work. Different workflows and user personas benefit from different interaction modes, and providing multiple interfaces ensures your platform can serve the diverse needs of your organization. + +## Web UI + +The Web UI works well when developers want to browse the catalog, discover what infrastructure is deployed, or configure patterns through guided self-service workflows. 
It excels at discoverability and reducing cognitive load for teams new to your platform or performing infrequent tasks. + +## CLI + +The CLI works well when developers want to interact with your platform directly from their terminal, where much of their work already happens. It's ideal for power users who need speed, scriptability, and integration into local development workflows and CI/CD pipelines. + +## API + +The API works well when you need programmatic access to your platform's capabilities for integrating with other systems and automating at scale. It's essential for orchestrating multiple platform operations, building custom tooling, and enabling other systems to interact with your infrastructure. + +## AI/MCP + +The AI interface, including support for the Model Context Protocol (MCP), works well when developers want to interact with your platform using natural language. It reduces the learning curve and provides contextual help, making your platform more accessible to developers of all experience levels. \ No newline at end of file diff --git a/docs/2.0/way/platform/overview.md b/docs/2.0/way/platform/overview.md new file mode 100644 index 0000000000..65d05c5684 --- /dev/null +++ b/docs/2.0/way/platform/overview.md @@ -0,0 +1,26 @@ +--- +sidebar_position: 1 +title: Overview +--- + +# Building your platform + +To build a successful developer platform, you need three essential building blocks: + +1. [Patterns](/2.0/way/platform/patterns/overview) - Pre-built solutions to common infrastructure problems +2. [Platform Components](/2.0/way/platform/components/overview) - A core collection of functional capabilities +3. [Interfaces](/2.0/way/platform/interfaces/overview) - How developers interact with your platform + +## How they work together + +To achieve true developer self-service, you need all three building blocks. 
For example, when a developer needs to deploy a new Amazon ECS service: +- They have identified a common infrastructure need (pattern) +- They discover the relevant pattern through the **catalog** (platform component) +- They might use a **Web UI** (interface) to configure it +- The **infrastructure pipeline** (platform component) deploys it +- **Policies** (pattern) verify it meets your internal standards +- The **registry** (platform component) tracks the deployment + +## Next + +Let's dig into the first building block: patterns. \ No newline at end of file diff --git a/docs/2.0/way/platform/patterns/_category_.json b/docs/2.0/way/platform/patterns/_category_.json new file mode 100644 index 0000000000..50980e63f7 --- /dev/null +++ b/docs/2.0/way/platform/patterns/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Patterns", + "position": 2 +} diff --git a/docs/2.0/way/platform/patterns/good-patterns.md b/docs/2.0/way/platform/patterns/good-patterns.md new file mode 100644 index 0000000000..5c6aad6b73 --- /dev/null +++ b/docs/2.0/way/platform/patterns/good-patterns.md @@ -0,0 +1,163 @@ +--- +sidebar_position: 4 +title: Good Patterns +--- + +# Good patterns + +What makes an effective pattern? + +## Minimum requirements + +To qualify as a pattern, you need only meet two requirements. A pattern is: + +- **Reusable.** The pattern is intended to be used by potentially many consumers. +- **Opinionated.** The pattern takes a stand on how something should be done. + +## Effective patterns + +To be an _effective_ pattern, we need a few more requirements. Effective patterns are: + +- **Abstracted.** The consumer of the pattern can use it without comprehensively understanding how it is built. +- **Good by default.** A consumer would need to go out of their way to give the pattern a "bad" configuration. +- **Configurable.** Within the confines of the opinions, the consumer can customize the pattern as needed. +- **Tested.** The pattern has been validated to work as advertised. 
+- **Documented.** The pattern has documentation so that its consumers can understand it. +- **Vetted.** The pattern has been approved for use by a subject matter expert. +- **Maintained.** The pattern is versioned and actively updated to reflect new insights. + +This is admittedly a long list, but it works best as a checklist when building a pattern. + +## Examples + +Let's look at some examples of good patterns. + +### Creating an AWS S3 Bucket + +The [Gruntwork S3 Bucket Module](https://docs.gruntwork.io/reference/services/data-storage/s-3-bucket/) is written for either OpenTofu or Terraform, and exposes a single required variable input, `primary_bucket`, which is the name of the S3 Bucket. A user creating the S3 bucket needs to understand very little of how the module works to use it successfully, so this pattern is **abstracted.** + +The module exposes 50+ variables that can be used to configure the S3 bucket as needed. That is, this pattern is **good by default** but also **configurable.** + +The module is validated with automated tests using [Terratest](https://github.com/gruntwork-io/terratest), so it is **tested.** Specifically, the tests run `tofu apply`, check the logging configuration, upload a file and confirm that it was replicated, and then run `tofu destroy`. + +
+ See the actual test code for this module. + + ```go + import ( + "os" + "strings" + "testing" + + "github.com/gruntwork-io/aws-service-catalog/test" + + awsgo "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/s3" + "github.com/aws/aws-sdk-go/service/s3/s3manager" + "github.com/gruntwork-io/terratest/modules/aws" + "github.com/gruntwork-io/terratest/modules/random" + "github.com/gruntwork-io/terratest/modules/terraform" + test_structure "github.com/gruntwork-io/terratest/modules/test-structure" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + ) + + func TestS3Bucket(t *testing.T) { + t.Parallel() + + // Uncomment the items below to skip certain parts of the test + //os.Setenv("SKIP_setup", "true") + //os.Setenv("SKIP_deploy_terraform", "true") + //os.Setenv("SKIP_validate_access_logs", "true") + //os.Setenv("SKIP_validate_replication", "true") + //os.Setenv("SKIP_cleanup", "true") + + testFolder := "../../examples/for-learning-and-testing/data-stores/s3-bucket" + + defer test_structure.RunTestStage(t, "cleanup", func() { + terraformOptions := test_structure.LoadTerraformOptions(t, testFolder) + terraform.Destroy(t, terraformOptions) + }) + + test_structure.RunTestStage(t, "setup", func() { + primaryRegion := aws.GetRandomRegion(t, test.RegionsForEc2Tests, nil) + // Choose a different region for cross-region replication + replicaRegion := aws.GetRandomRegion(t, test.RegionsForEc2Tests, []string{primaryRegion}) + uniqueID := strings.ToLower(random.UniqueId()) + + test_structure.SaveString(t, testFolder, "primaryRegion", primaryRegion) + test_structure.SaveString(t, testFolder, "replicaRegion", replicaRegion) + test_structure.SaveString(t, testFolder, "uniqueID", uniqueID) + }) + + test_structure.RunTestStage(t, "deploy_terraform", func() { + primaryRegion := test_structure.LoadString(t, testFolder, "primaryRegion") + replicaRegion := test_structure.LoadString(t, testFolder, "replicaRegion") + uniqueID := 
test_structure.LoadString(t, testFolder, "uniqueID") + + terraformOptions := CreateS3BucketTerraformOptions(t, testFolder, uniqueID, primaryRegion, replicaRegion) + + test_structure.SaveTerraformOptions(t, testFolder, terraformOptions) + terraform.InitAndApply(t, terraformOptions) + }) + + test_structure.RunTestStage(t, "validate_access_logs", func() { + terraformOptions := test_structure.LoadTerraformOptions(t, testFolder) + accessLogsBucket := terraform.OutputRequired(t, terraformOptions, "access_logging_bucket_name") + primaryBucket := terraform.OutputRequired(t, terraformOptions, "primary_bucket_name") + primaryRegion := test_structure.LoadString(t, testFolder, "primaryRegion") + + primaryClient := aws.NewS3Client(t, primaryRegion) + + // Since access logs can take a long time to appear in the bucket, we confirm the access logging setup + // not by checking for the existence of logs objects, but by checking the logging configuration to the target + // bucket is properly set. + loggingOutput, err := primaryClient.GetBucketLogging(&s3.GetBucketLoggingInput{ + Bucket: awsgo.String(primaryBucket), + }) + require.NoError(t, err) + assert.Equal(t, accessLogsBucket, awsgo.StringValue(loggingOutput.LoggingEnabled.TargetBucket)) + }) + + test_structure.RunTestStage(t, "validate_replication", func() { + testFilePath := "../fixtures/simple-docker-img/Dockerfile" + testFileKey := "config/Dockerfile" + + terraformOptions := test_structure.LoadTerraformOptions(t, testFolder) + primaryBucket := terraform.OutputRequired(t, terraformOptions, "primary_bucket_name") + primaryRegion := test_structure.LoadString(t, testFolder, "primaryRegion") + + testfile, err := os.Open(testFilePath) + require.NoError(t, err) + defer testfile.Close() + + // To test the replication, we upload a test file to the primary bucket and check the replication status + // of the object immediately following the upload is either PENDING or COMPLETE. 
We do not check that the + // object actually gets replicated to the replica bucket, since this can take a long time. + primaryUploader := aws.NewS3Uploader(t, primaryRegion) + _, err = primaryUploader.Upload(&s3manager.UploadInput{ + Bucket: awsgo.String(primaryBucket), + Key: awsgo.String(testFileKey), + Body: testfile, + }) + require.NoError(t, err) + + primaryClient := aws.NewS3Client(t, primaryRegion) + objectOutput, err := primaryClient.GetObject(&s3.GetObjectInput{ + Bucket: awsgo.String(primaryBucket), + Key: awsgo.String(testFileKey), + }) + require.NoError(t, err) + assert.Contains(t, []string{"PENDING", "COMPLETE"}, awsgo.StringValue(objectOutput.ReplicationStatus)) + }) + } + ``` +
+ +This module has extensive documentation, so it is **documented.** + +It was written by a Gruntwork subject matter expert, and peer reviewed by other Gruntwork subject matter experts, so it is **vetted.** + +Finally, it has undergone years of revisions, so it is **maintained.** + +As you can see, it takes a lot to get a pattern right, but when you do, your consumers can achieve [velocity](/2.0/way/why/velocity), [governance](/2.0/way/why/governance), and [maintainability](/2.0/way/why/maintainability) with relative ease. \ No newline at end of file diff --git a/docs/2.0/way/platform/patterns/infrastructure-modules/_category_.json b/docs/2.0/way/platform/patterns/infrastructure-modules/_category_.json new file mode 100644 index 0000000000..89b661a0bc --- /dev/null +++ b/docs/2.0/way/platform/patterns/infrastructure-modules/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Infrastructure Modules", + "position": 10 +} diff --git a/docs/2.0/way/platform/patterns/infrastructure-modules/authoring.md b/docs/2.0/way/platform/patterns/infrastructure-modules/authoring.md new file mode 100644 index 0000000000..643218a7ab --- /dev/null +++ b/docs/2.0/way/platform/patterns/infrastructure-modules/authoring.md @@ -0,0 +1,151 @@ +--- +sidebar_position: 6 +title: Authoring +--- + +# Authoring Infrastructure Modules + +Infrastructure modules have both _authors_ and _consumers_. In this section, we talk about infrastructure module authors. + +The **infrastructure module author** is the person who creates or defines the infrastructure module. + +## Infrastructure module author lifecycle + +Infrastructure module authors experience the following lifecycle: + +1. **Create a module** - Create the infrastructure module based on a recurring need from stakeholders. +2. **Update a module** - Update the infrastructure module to reflect the latest best practices. +3. **Retire a module** - When the infrastructure module is no longer recommended or needed, gracefully retire it. 
+ +## Creating infrastructure modules + +When infrastructure module authors create a new module, they need to answer two questions: + +1. From where should I source the infrastructure module? +2. How should I implement the infrastructure module? + +### Sourcing infrastructure modules + +Authors have three main options for sourcing infrastructure modules: + +1. **Build in-house** - You build the infrastructure module yourself from scratch. This gives you full control, but also the highest maintenance burden (high control, low convenience). +2. **Use open source infrastructure modules** - You find community-authored infrastructure modules that work for your needs, review them, and vet them as official infrastructure modules for your organization. This gives you low control, but also outsources maintenance to others (low control, high convenience). +3. **Use commercial infrastructure modules** - You purchase a commercial infrastructure module library like the [Gruntwork AWS IaC Library](https://docs.gruntwork.io/2.0/docs/library/concepts/overview). This gives you moderate control (you can engage the vendor if needed) and outsources maintenance to others (moderate control, high convenience). + +### Consuming infrastructure modules + +If you source a module from a third party (open source or commercial), you have three options for how your users can consume it: + +1. **Direct reference** - You decide that users will directly reference the infrastructure module. For example, you choose a commercial infrastructure module and decide your [infrastructure module instances](/2.0/way/platform/patterns/types#infrastructure-module-instances) will directly reference the vendor's GitHub repo. +2. **Wrapper module** - You write an in-house module that calls the third-party module. This gives you the ability to customize the third-party module without having to re-implement or maintain it. Users will now directly access this wrapper module. 
For example, you use an open source AWS Lambda module, but write your own Acme AWS Lambda module that uses the open source AWS Lambda module. +3. **Fork** - You fork the third-party module and maintain your own copy, at which point this is a direct reference, but based on a third-party starting point. Platform engineers are often tempted to do this when a single variable or output is missing, but they can massively underestimate the amount of work required to maintain the module over time. Only do this as an absolute last resort. Try working with the module author to incorporate your desired change first. + +Note that if you build your infrastructure module in-house, the only way to implement your module is by building it as a direct reference module. + +### Two layers of abstraction + +When building infrastructure modules, you deal with two somewhat competing needs: + +- **Developers want convenience.** Application developers often want complex infrastructure modules that "just work" out of the box. +- **Platform engineers want maintainability.** Platform engineers want small, narrowly scoped infrastructure modules that are easy to maintain. + +The way to address this tension is to build infrastructure modules at two different layers of abstraction: + +1. **Low-level modules.** Low-level modules should be narrowly scoped, and optimized for reusability and maintainability. +2. **High-level modules.** High-level modules should be broadly scoped, and optimized for convenience. Importantly, high-level modules should be built out of low-level modules. + +For example, you might have a low-level module that only creates a load balancer. A load balancer on its own isn't terribly useful, but as we'll see shortly, you can use the load balancer as a "building block" to create more complex modules. + +To give an example of a high-level module, suppose that you have application teams who want to create their own Kubernetes clusters. 
In this case, the high-level module we need is a "Kubernetes cluster." But we build that high-level module out of one or more low-level modules like load balancer, K8s control plane, K8s managed workers, container logs, karpenter, ArgoCD, and more. + +The key idea here is that you can _compose_ lower-level modules into one or more higher-level modules. This enables you to reuse a single lower-level module (like one to create an S3 bucket) in many scenarios, and thus lower overall maintenance surface area. It also makes testing easier because you can aggressively test lower-level modules and build more limited tests for higher-level modules. + +Of course, even this approach has its tradeoffs because now any update to a lower-level module has to "propagate" to higher-level modules, which can be cumbersome. Fortunately, there are tools available to mitigate this. For example, [Terragrunt Stacks](https://terragrunt.gruntwork.io/docs/features/stacks/) explicitly defines a higher-level infrastructure module out of lower-level modules, making it far easier to manage individual OpenTofu module version updates. + +:::note +You can see a real-world example of the low-level/high-level split in the [Gruntwork AWS IaC Library](https://docs.gruntwork.io/2.0/docs/library/concepts/overview#two-types-of-modules), which has both "building block modules" (low-level modules) and "service modules" (high-level modules). +::: + +### Testing infrastructure modules + +It's important to write automated tests to validate your infrastructure modules. For example, [Gruntwork Terratest](https://github.com/gruntwork-io/terratest) is a Go library that you can use to automatically apply an OpenTofu/Terraform module, run tests against the deployed infrastructure, and tear down the deployed infrastructure. 
+ +:::note +For more information, see the section on [automated testing](https://books.gruntwork.io/books/fundamentals-of-devops/testing-your-systems#automated_testing) in _Fundamentals of DevOps and Software Delivery: A hands-on guide to deploying and managing software in production_ by our co-founder, Yevgeniy Brikman. +::: + +### Compliant infrastructure modules + +It can be helpful to scan your infrastructure modules against known security policies so that you can assert the _compliance status_ of an infrastructure module. + +In some cases, you can build an infrastructure module so that it can only ever be configured in a compliant way. In other cases, you can only build a "possibly compliant" module, which means that for the right variable inputs, the module will indeed pass a given compliance check. + +Testing modules for compliance can be tricky because you need to identify which compliance standard you care about, identify the relevant rule within that standard that might apply to the module, write automated tests that launch the relevant module infrastructure, and then run the specific compliance checks against the deployed infrastructure. + +This is the gold standard of compliant module checking, but frankly, it's a lot of work! + +### Infrastructure module visualizations + +Many teams like to include a visualization of each infrastructure module so that consumers can see at a glance what architecture it uses and what resources it contains. + +Once built, these visualizations are great. The challenge is in keeping them up to date. + +## Updating infrastructure modules + +Once an infrastructure module is created, it needs to be updated from time to time. That in turn creates some other unique needs. 
+ +### Versioning + +We recommend adopting [semantic versioning](https://semver.org/) for your infrastructure modules, where, given a version number `MAJOR.MINOR.PATCH`, you increment the: + +- `MAJOR` version when you make a breaking change +- `MINOR` version when you add functionality in a backward-compatible manner +- `PATCH` version when you make a backward-compatible bug fix + +But this guideline holds another fundamental tension because you'll also need to balance two somewhat competing concerns: + +- **You want to limit the number of git repos you use.** Infrastructure modules are, by definition, written in code, and so they are stored in git repos. Each git repo involves some degree of overhead to manage, so ideally, you have a limited number of repos used to manage all your infrastructure modules. The ideal here is a single repo containing many modules. +- **You want meaningful new versions.** Every time you make a change and assign a new version to a module update, you ideally want that new git repo tag to refer only to the new module, and not have any "empty" updates. The ideal here is one repo per module so that you never have empty versions. + +But the ideals of "single repo containing many modules" and "one repo per module" are direct contradictions of each other. So what to do? + +In our experience, the better option is to deal with empty versions and fewer git repos. Better yet, you can write tooling to automate nearly all aspects of empty versions (like your [IaC updater](/2.0/way/platform/components/maintain/iac-updater)). There's also tooling available to manage many git repos at once (like our open source tool, [git-xargs](https://github.com/gruntwork-io/git-xargs)), but using these can be cumbersome. That's why we recommend fewer git repos at the expense of empty versions, mitigated by tooling. 
+ +### Propagating version updates + +When you update a low-level infrastructure module, you often need to propagate that update to all the higher-level modules and infrastructure module instances that depend on it. This can become a significant maintenance burden, especially in large organizations with many modules. + +To manage this effectively: + +- **Use automation** - Tools like [Terragrunt's dependency management](https://terragrunt.gruntwork.io/docs/features/keep-your-terraform-code-dry/) or an [IaC updater](/2.0/way/platform/components/maintain/iac-updater) can help automate the process of updating module versions across your infrastructure. +- **Batch updates** - Group related module updates together to reduce the number of change cycles. +- **Validate your updates** - Run tests after you update infrastructure modules or infrastructure module instances to validate that your modules still work as expected. + +## Retiring infrastructure modules + +Retiring infrastructure modules requires careful planning and communication to ensure a smooth transition for consumers. When an infrastructure module is no longer recommended or needed, follow these steps: + +### Detecting when to retire a module + +Infrastructure modules should be scheduled for retirement when ALL of the following are true: + +- **Better alternatives exist** - A newer module or approach provides superior functionality, security, or maintainability, and +- **You cannot easily update the existing module** - The architecture of the module is such that upgrading it to reflect your latest opinions would be akin to creating a new module. + +Separately, you might retire a module if it is simply not used very often. By reducing your surface area, you increase your team's velocity on the projects that are adding impact. + +### Deprecation process + +When you do retire a module, follow a gradual deprecation process: + +1. 
**Announce deprecation** - Clearly communicate to all module consumers that the module is being deprecated, why it's being deprecated, and what the recommended alternative is. +2. **Set a timeline** - Give consumers a reasonable amount of time to migrate (typically 6-12 months for critical infrastructure). +3. **Update documentation** - Add clear deprecation warnings to the module's README and any relevant documentation. +4. **Provide migration guidance** - Create detailed migration guides showing consumers how to move from the deprecated module to the recommended alternative. +5. **Stop accepting new features** - Only accept critical bug fixes and security patches during the deprecation period. +6. **Monitor usage** - Track which teams are still using the deprecated module and reach out proactively to help them migrate. +7. **Remove the module** - Once usage drops to zero (or the timeline expires), archive or remove the module from your infrastructure module catalog. + +## Next + +You've made it through a lot of material! Now that your infrastructure module has been created, let's look at the infrastructure module consumer's experience. \ No newline at end of file diff --git a/docs/2.0/way/platform/patterns/infrastructure-modules/categories.md b/docs/2.0/way/platform/patterns/infrastructure-modules/categories.md new file mode 100644 index 0000000000..8a97fa7c18 --- /dev/null +++ b/docs/2.0/way/platform/patterns/infrastructure-modules/categories.md @@ -0,0 +1,77 @@ +--- +sidebar_position: 4 +title: Categories +--- + +# Infrastructure Module Categories + +Earlier, we [defined](/2.0/way/platform/patterns/overview) patterns as _pre-built opinionated solutions to common infrastructure problems._ So what exactly are those common infrastructure problems? + +Based on years of experience, we've identified the following **categories** of infrastructure modules. Each category has one or more **subject matter expertise topics (SME topics)**. 
Each SME topic has one or more infrastructure modules. + +For example, the Cloud foundations category has a networking SME topic, which for AWS has modules for VPC peering, VPC flow logs, transit gateway, and more. + +Let's look at all the categories now. + +## Cloud foundations + +Cloud foundations includes everything you need to get your cloud provider (e.g. AWS, Azure) environment up and running. This includes account configuration, account baselines, networking configuration, and observability foundations. + +SME topics for AWS include: + +- Networking + - Subnets, route tables, security groups, etc. +- Organizational baselines* + - Service Control Policies, Backup Policies, IAM role definitions, etc. +- Account baselines* + - Baseline security service configurations +- Organization observability + - Alerts, Logs, Metrics + +_*We're using AWS terms here, but there are GCP and Azure equivalents related to projects, subscriptions, etc._ + +## Running apps + +Running apps refers to how you deploy and operate your applications. This covers container orchestration, serverless platforms, and traditional server-based apps. + +SME topics include: + +- K8s +- Amazon ECS +- Serverless +- Amazon EC2 +- Secrets management +- App observability +- App CI/CD + +## Storing data + +Storing data is all about how you store and manage data. It includes relational databases, key-value stores, file storage, queues, streams, and data processing pipelines. + +SME topics include: + +- Relational databases (Amazon RDS, Amazon Aurora) +- Key-value store (Amazon ElastiCache) +- File storage (AWS S3, AWS Glacier) +- Queues & streams (Amazon Kinesis, Amazon SQS) +- Time-series databases (Tiger Data, InfluxDB) + +_Examples: AWS RDS databases, Amazon S3 buckets, AWS DynamoDB tables, Amazon Kinesis streams_ + +## AI + +AI includes any patterns needed to enable AI workloads and integrations. 
+ +SME topics include: + +- AI model access (Azure OpenAI, Amazon Bedrock) +- Data platform configuration (Databricks, Amazon SageMaker) +- Data lakes (Snowflake, AWS Glue, Amazon Athena, Amazon Redshift) +- Vector databases (Pgvector, Pinecone, Amazon OpenSearch) + +This covers AI model access, data platforms for AI, specialized compute resources, and AI observability. + +## Other categories, SME topics, and modules + +The above list is what we, Gruntwork, have encountered, but naturally your organization's needs will vary slightly. + diff --git a/docs/2.0/way/platform/patterns/infrastructure-modules/overview.md b/docs/2.0/way/platform/patterns/infrastructure-modules/overview.md new file mode 100644 index 0000000000..f3bd4442b6 --- /dev/null +++ b/docs/2.0/way/platform/patterns/infrastructure-modules/overview.md @@ -0,0 +1,11 @@ +--- +sidebar_position: 1 +title: Overview +--- + +Earlier, we talked about the many different [types of patterns](/2.0/way/platform/patterns/types). One of those types, the **infrastructure module**, is by far the most common, so let's discuss it in more detail here. + +In this section, we'll cover: + +- Common [categories](/2.0/way/platform/patterns/infrastructure-modules/categories) of infrastructure modules +- Best practices for [authoring](/2.0/way/platform/patterns/infrastructure-modules/authoring) infrastructure modules \ No newline at end of file diff --git a/docs/2.0/way/platform/patterns/overview.md b/docs/2.0/way/platform/patterns/overview.md new file mode 100644 index 0000000000..5ffbf35bd8 --- /dev/null +++ b/docs/2.0/way/platform/patterns/overview.md @@ -0,0 +1,56 @@ +--- +sidebar_position: 1 +title: Overview +--- + +# Patterns + +## What are patterns?
+ +**Patterns are pre-built, opinionated solutions to common infrastructure problems.** + +That definition might sound a little abstract, so here are some real-world examples of patterns: + +- OpenTofu/Terraform modules +- Terragrunt stack definitions +- OPA policies +- Helm charts + +The common theme among all patterns is that when a user goes to consume them, they are pre-built, they reflect the opinion of a subject matter expert, and they solve a common infrastructure problem. +Now let's go back to the definition of a pattern so we can understand that more deeply. +### Common infrastructure problems + +_Patterns are pre-built, opinionated solutions to **common infrastructure problems.**_ + +Any DevOps challenge that requires subject matter expertise can be captured as a pattern. For example, all of the following are common DevOps problems that require some degree of subject matter expertise to solve: + +- Building a standard network configuration +- Creating an Azure blob storage configuration +- Launching a Spring Boot app based on a starter template +- Launching an AWS Strands agent from a starter template +- Asserting a repeatable process for handling database recovery +- Creating an EKS service +- Creating a `conftest` policy that forbids public S3 buckets +- Defining a reusable Terragrunt unit to deploy a Lambda function +- Defining a Terragrunt Stack to deploy a Lambda-based application, including observability + +It is said that DevOps is "broad and shallow," and as you can see in the list above, patterns can indeed span a wide range of areas. + +We talk about patterns as a first-class concept because _creating standardized solutions to common problems is one of the most impactful investments you can make in velocity, governance, and maintainability._ + +In general, when you see a pattern that is likely to recur, it's worth building some kind of reusable solution for it. 
+ +### Pre-built, opinionated solutions + +_Patterns are **pre-built, opinionated solutions** to common infrastructure problems._ + + +A pattern is useful because it's _pre-built._ That is, when a developer goes to, say, launch an Azure Blob Container, they can move faster if their organization has already built a way to configure an Azure Blob Container. Now the developer can apply that pattern by entering some parameter values (e.g. container name, metadata, whether to enable versioning, etc.) without having to understand how the entire pattern works. Better yet, they can trust that they're automatically doing it "the right way." + +A pattern is _opinionated_ because the mere act of writing it means you are asserting an opinion on how Azure Blob Containers should be configured at your organization. For example, maybe you always require blob versioning. That is an opinion and anyone who uses your Azure Blob Container pattern will always inherit that opinion. + +Because patterns are opinionated, _who_ authored the pattern becomes important. This is because you only want to use patterns you _trust_. As a counterexample, would you blindly trust a network configuration pattern that ChatGPT generated for you? Hopefully not. But you would presumably trust such a pattern produced by your internal networking team. + +## Next + +Now that you understand what a pattern is, let's look at the different types of patterns that exist in practice. \ No newline at end of file diff --git a/docs/2.0/way/platform/patterns/types.md b/docs/2.0/way/platform/patterns/types.md new file mode 100644 index 0000000000..fabce08136 --- /dev/null +++ b/docs/2.0/way/platform/patterns/types.md @@ -0,0 +1,80 @@ +--- +sidebar_position: 2 +title: Types +--- + +# Pattern Types + +Patterns come in a few different flavors. Let's look at them now. + +## Infrastructure modules + +Infrastructure modules are reusable, parameterized code that define infrastructure resources and their configuration. 
+ +The key point is that a module declares _what will actually be deployed_, and typically allows some customizability by exposing variables or parameters. + +Examples: +- OpenTofu/Terraform modules +- CloudFormation templates +- Pulumi component resources +- Helm charts + +:::info +Infrastructure modules are by far the most common type of pattern. +::: + +## Infrastructure module instances + +Infrastructure module instances are _instances_ of [infrastructure modules](#infrastructure-modules) that include guidance on how the module should be configured and deployed. + +Examples: +- [Terragrunt units](https://terragrunt.gruntwork.io/docs/features/units/) (instantiates an OpenTofu/Terraform module) +- [Terragrunt stacks](https://terragrunt.gruntwork.io/docs/features/stacks/) (instantiates one or more Terragrunt units) +- OpenTofu code that calls an OpenTofu module +- CloudFormation stack (instantiates a CloudFormation template) + +## Policies + +Policies are automated rules that enforce governance, compliance, and security requirements for your infrastructure. + +You can evaluate either live infrastructure, a plan for creating live infrastructure (e.g. `terragrunt plan` output), or an [infrastructure module](#infrastructure-modules) to see if it complies with a given policy. + +Examples: +- OPA/Rego/`conftest` policies +- AWS Config rules +- AWS Service Control Policies (SCPs) + +## Templates + +Templates are predefined project structures that provide customized starting points for new projects, services, or components. They scaffold out directory structures, configuration files, and boilerplate code based on parameterized inputs. + +Templates can be used to generate anything, but are most commonly used to generate [infrastructure module instances](#infrastructure-module-instances), sometimes for complex use cases. 
+ +Examples: +- [Gruntwork Boilerplate](https://github.com/gruntwork-io/boilerplate) templates +- Cookiecutter templates +- Yeoman generators + +## Runbooks + +Runbooks are step-by-step (and sometimes interactive) operational procedures for infrastructure tasks such as incident response, guided code generation, troubleshooting, and recovery. + +Runbooks can be especially well-suited to generating code based on [templates](#templates). + +Examples: +- [Gruntwork Runbooks](https://runbooks.gruntwork.io/) +- Jupyter Notebooks +- Ansible Playbooks + +## Other + +The common theme among pattern types is that, per our definition of a pattern, they are all pre-built, opinionated solutions to common infrastructure problems. + +Notably these problems exist at different levels of abstraction! For example, a **runbook** might use a **template** to generate an **infrastructure module instance,** which in turn instantiates an **infrastructure module.** Likewise, a **policy** can operate on any level of this hierarchy. + +Perhaps in the future we'll have other pattern types that can be listed here. + +## Next + +Next, let's learn about where you store patterns. + diff --git a/docs/2.0/way/platform/patterns/where-to-store-them.md b/docs/2.0/way/platform/patterns/where-to-store-them.md new file mode 100644 index 0000000000..b027a37acc --- /dev/null +++ b/docs/2.0/way/platform/patterns/where-to-store-them.md @@ -0,0 +1,14 @@ +--- +sidebar_position: 3 +title: Where To Store Them +--- + +Once you define a pattern, you need somewhere to store it. This is the core purpose of the **[catalog](/2.0/way/platform/components/deploy/catalog)** platform component. + +## Examples + +You can learn more about the Catalog in the link above, but here are some quick examples of how you could implement a Catalog: + +- **Git repository.** The simplest possible catalog is a git repo, with separate folders for each pattern type.
+- **Software artifact repository.** Tools like [Artifactory](https://jfrog.com/artifactory/) or [Sonatype Nexus Repository](https://www.sonatype.com/products/sonatype-nexus-repository) can _store_ all the different pattern types, but they often have limited _functionality_ with respect to these infrastructure-as-code pattern types. +- **Native IaC catalogs.** The ideal solution is to build a purpose-built catalog for IaC pattern types that handles each pattern in a first-class way. diff --git a/docs/2.0/way/principles/_category_.json b/docs/2.0/way/principles/_category_.json new file mode 100644 index 0000000000..0b79bca9da --- /dev/null +++ b/docs/2.0/way/principles/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Principles", + "position": 3, + "className": "way-top-level-item" +} diff --git a/docs/2.0/way/principles/core-philosophy/_category_.json b/docs/2.0/way/principles/core-philosophy/_category_.json new file mode 100644 index 0000000000..bfd89c8589 --- /dev/null +++ b/docs/2.0/way/principles/core-philosophy/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Core Philosophy", + "position": 2 +} + diff --git a/docs/2.0/way/principles/core-philosophy/the-platform-is-a-balancing-act.md b/docs/2.0/way/principles/core-philosophy/the-platform-is-a-balancing-act.md new file mode 100644 index 0000000000..0e518aafe6 --- /dev/null +++ b/docs/2.0/way/principles/core-philosophy/the-platform-is-a-balancing-act.md @@ -0,0 +1,19 @@ +--- +sidebar_position: 2 +title: The platform is a balancing act +--- + +# The platform is a balancing act + +Every platform team juggles three competing concerns: + +- [Velocity](/2.0/way/why/velocity): How fast can developers ship? +- [Governance](/2.0/way/why/governance): Are we compliant, secure, and cost-effective? +- [Maintainability](/2.0/way/why/maintainability): Can we sustain this long-term? + +Early-stage companies usually prioritize velocity over everything else ("move fast and break things"). 
As companies mature, governance becomes critical as regulatory requirements kick in, security incidents become a bigger concern, or cloud costs spiral out of control. Eventually, technical debt accumulates and maintainability demands attention. + +The key insight is that there's no universal "right" balance. A 10-person startup and a 1,000-person enterprise need fundamentally different trade-offs. + +The platform team's job is to consciously choose the right balance for your organization's current stage, then actively adjust as your needs evolve. This requires honest conversations with stakeholders about trade-offs and regular reassessment of priorities. + diff --git a/docs/2.0/way/principles/core-philosophy/your-developer-platform-is-a-product.md b/docs/2.0/way/principles/core-philosophy/your-developer-platform-is-a-product.md new file mode 100644 index 0000000000..5e7fb7154d --- /dev/null +++ b/docs/2.0/way/principles/core-philosophy/your-developer-platform-is-a-product.md @@ -0,0 +1,18 @@ +--- +sidebar_position: 1 +title: Your developer platform is a product +--- + +# Your developer platform is a product + +You can think of your platform team as a _small startup_. Your _revenue_ comes from your budget. Your _expenses_ are your team members, plus whatever resources you consume like office space, cloud spend, or vendor tools. + +You can think of your developer platform as a _product_ whose job is to balance the [three fundamental concerns](/2.0/way/why/overview): + +- Products are built with an _ideal customer profile_ (ICP) in mind, and your ICP is your developer end-users. +- Products need to be _marketed_ to build awareness and drive adoption, and you need to teach your developers about your developer platform and make a compelling case for why they should use it. +- Products have _competitive alternatives_; you compete most of all with developers who choose to do it themselves (DIY), and you need to highlight the clear advantages over DIY to end users.
+- Products need _data_ to tell you about usage and friction points. You need insight into how your developer platform is being used and where it can be improved. + +Finally, product teams _stay close to the customer_ through continual feedback sessions and by always learning what their customers need. You'll need to do this, too. + diff --git a/docs/2.0/way/principles/developer-experience/_category_.json b/docs/2.0/way/principles/developer-experience/_category_.json new file mode 100644 index 0000000000..2a6cc9b9ed --- /dev/null +++ b/docs/2.0/way/principles/developer-experience/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Developer Experience", + "position": 3 +} + diff --git a/docs/2.0/way/principles/developer-experience/enable-developer-self-service.md b/docs/2.0/way/principles/developer-experience/enable-developer-self-service.md new file mode 100644 index 0000000000..dd497af63d --- /dev/null +++ b/docs/2.0/way/principles/developer-experience/enable-developer-self-service.md @@ -0,0 +1,19 @@ +--- +sidebar_position: 2 +title: Enable developer self-service +--- + +# Enable developer self-service + +The ideal velocity for application developers is that they get the infrastructure they need, when they need it, without human gatekeepers. This is known as **developer self-service.** + +To achieve developer self-service, application developers need: + +- A [catalog](/2.0/way/platform/components/deploy/catalog) of patterns to choose from +- A way to [configure a pattern](/2.0/way/platform/components/deploy/runbooks) +- A way to [deploy the pattern](/2.0/way/platform/components/deploy/pipelines) +- A way to [monitor the pattern instance](/2.0/way/platform/components/operate/registry) +- A way to [update or delete the pattern instance](/2.0/way/platform/components/operate/unit-browser) + +As you can see, developer self-service is highly valuable, but also hard to get right.
+ diff --git a/docs/2.0/way/principles/developer-experience/make-good-practices-the-default.md b/docs/2.0/way/principles/developer-experience/make-good-practices-the-default.md new file mode 100644 index 0000000000..4b074c56de --- /dev/null +++ b/docs/2.0/way/principles/developer-experience/make-good-practices-the-default.md @@ -0,0 +1,11 @@ +--- +sidebar_position: 3 +title: Make good practices the default +--- + +# Make good practices the default + +Developers want to focus on their application, not compliance, reliability, cost management, or any other number of governance and maintainability concerns. So they generally _won't_ invest in these areas unless they are forced to. + +But as a platform engineer, you can build in compliance, security, and standard tool adoption directly into patterns and platform components. Then when developers consume those patterns, they get governance and maintainability "for free." + diff --git a/docs/2.0/way/principles/developer-experience/pre-built-patterns-are-your-leverage.md b/docs/2.0/way/principles/developer-experience/pre-built-patterns-are-your-leverage.md new file mode 100644 index 0000000000..c98e98600d --- /dev/null +++ b/docs/2.0/way/principles/developer-experience/pre-built-patterns-are-your-leverage.md @@ -0,0 +1,23 @@ +--- +sidebar_position: 1 +title: Pre-built patterns are your leverage +--- + +# Pre-built patterns are your leverage + +A small platform team can support hundreds or thousands of developers by offering a collection of pre-written, opinionated solutions to common DevOps problems, better known as **patterns.** + +Any reusable opinion that can be expressed as code can qualify as a pattern. For example, one popular way to implement patterns among platform teams is to author OpenTofu/Terraform modules. + +:::info +See more examples of [Patterns](/2.0/way/platform/patterns/overview). +::: + +A pattern has both an _author_ and a _consumer._ The author is a DevOps subject matter expert (SME) of some kind. 
For example, a pattern author might be an SME in deploying Amazon RDS as a production-grade database. The author "codifies" all her expertise on deployment, monitoring, disaster recovery, gotchas and anything else that might be relevant into a reusable pattern. + +The consumer is someone who has a problem that is solved by a given pattern. The consumer browses available patterns, selects the right one and then "consumes" the expertise of the SME by applying the pattern. A consumer who chooses a vetted pattern will typically get much higher velocity (no need to re-invent the wheel!) while getting governance and maintainability "for free." + +It can be helpful to think of patterns as "mini products" within your developer platform. Each one should solve a real need, be easy to use, and get better over time based on feedback. When prioritizing patterns, start with the DevOps problems that your customers (app developers) request the most. + +Patterns can be authored inhouse, or "vendored" from open source or commercial pattern libraries (like [Gruntwork IaC Library](https://docs.gruntwork.io/2.0/docs/library/concepts/overview)). + diff --git a/docs/2.0/way/principles/developer-experience/shift-left.md b/docs/2.0/way/principles/developer-experience/shift-left.md new file mode 100644 index 0000000000..2fa0612ef0 --- /dev/null +++ b/docs/2.0/way/principles/developer-experience/shift-left.md @@ -0,0 +1,19 @@ +--- +sidebar_position: 4 +title: Shift left +--- + +# Shift left + +:::info +The concept of "shift left" was introduced all the way back in 2001 by Larry Smith in his article [Shift-Left Testing](https://jacobfilipp.com/DrDobbs/articles/DDJ/2001/0109/0109e/0109e.htm). It has since evolved to include a broader range of activities, including security, compliance, and maintainability and is now considered a core DevOps principle. 
+::: + +In traditional software development, validation checks happen late: security reviews just before deploying to production, cost analysis after the bill arrives, compliance audits months after deployment. By the time problems surface, the code is written, the architecture is set, and changes require rework across multiple systems, making validation failures expensive to fix. + +"Shifting left" means moving feedback and decision-making as early in the development process as possible. Instead of discovering a policy violation during a pre-production review, you catch it during code review. Or better yet, catch it while the developer is still writing the code, when the context is fresh and the fix is trivial. + +This principle applies to everything: security scanning in CI/CD pipelines, cost estimation when resources are provisioned, policy validation before infrastructure is deployed. In general, the earlier you catch issues, the cheaper they are to fix and the less they disrupt flow. + +Modern tools make shifting left practical. IDE plugins can validate infrastructure code as it's written. Git pre-commit hooks can run security scans before code is pushed. Components like your [IaC Pipeline](/2.0/way/platform/components/deploy/pipelines) can enforce policies before deployment. Collectively, these tools give developers immediate, actionable feedback when they're in the best position to act on it. + diff --git a/docs/2.0/way/principles/failure-modes.md b/docs/2.0/way/principles/failure-modes.md new file mode 100644 index 0000000000..eaf418c3e3 --- /dev/null +++ b/docs/2.0/way/principles/failure-modes.md @@ -0,0 +1,67 @@ +--- +sidebar_position: 10 +toc_max_heading_level: 2 +--- + +# Failure modes + +If we identified the right principles, then a developer platform "failure" means there was some violation of the principles. 
In that spirit, let's talk about common failure modes and how we interpret them: + +## You built a platform that no one adopts + +### What went wrong +You violated the principle that [your developer platform is a product](/2.0/way/principles/core-philosophy/your-developer-platform-is-a-product). + +Successful products need active marketing, customer feedback, and product-market fit. When platform teams build in isolation without understanding their users' needs, they create solutions nobody wants to use. As a result, developers will turn to alternatives like ClickOps, custom scripts, or one-off tools. + +### How to avoid it + +Treat developers as customers. Market your platform to make them aware of what you've built. Measure satisfaction through surveys and metrics—especially asking "how would you feel if you could no longer use the platform?" and noting the number who would be "very disappointed." Put in place regular feedback loops and use insights to drive your roadmap. Instrument everything to understand adoption patterns and pain points. + +## You stumbled into "DevOps bankruptcy" + +### What went wrong + +You ignored [maintainability](/2.0/way/why/maintainability) concerns until technical debt overwhelmed your ability to operate. + +DevOps bankruptcy happens when teams prioritize velocity at all costs without addressing the inevitable accumulation of debt: drift between code and reality, non-codified resources created through the console, outdated patterns, and tool sprawl. Eventually, the maintenance burden becomes so high that teams can barely keep systems running, let alone ship new features. Unplanned work crowds out planned work entirely. + +### How to avoid it + +Consciously prioritize maintainability. Track key metrics like infrastructure drift rate, IaC coverage, and up-to-date coverage. Allocate dedicated time for debt reduction—treat it as an investment, not overhead. Implement automated drift detection and remediation.
+ +Build feedback loops to catch problems early. Remember that continuous improvement isn't a side project—it's part of your platform's operating system. Zero debt is unrealistic, but the bill of unmanaged debt will eventually come due. + +## You're stuck in endless debates about tooling + +### What went wrong + +You didn't establish [centralized standards](/2.0/way/principles/governance-maintainability/offer-golden-paths) soon enough and your platform's surface area has become unmanageable. + +Without clear standards, every team wants to choose their own tools for the same problems. You end up in endless debates: Terraform vs. OpenTofu vs. Pulumi, Kubernetes vs. ECS, GitHub Actions vs. GitLab CI. Each discussion consumes time and energy, and when teams each make different choices, your platform team must support an ever-growing matrix of tools. This hampers your ability to provide pre-built patterns, self-service, and pipeline automation. + +### How to avoid it + +Apply the principle of centralized standards, decentralized execution. Make opinionated choices about tooling and patterns at the center, then build ["golden paths"](/2.0/way/principles/governance-maintainability/offer-golden-paths) for these standards. Show developer teams that when they use official standards, they get a large collection of supported tooling. + +Optionally permit teams to deviate from standards with the knowledge that they must support non-standard tools themselves. + +## You reinvented solutions that already exist + +### What went wrong + +You (or your developer teams) didn't leverage [pre-built patterns](/2.0/way/principles/developer-experience/pre-built-patterns-are-your-leverage) when populating your [catalog](/2.0/way/platform/patterns/overview), so you effectively reimplemented the same solutions from scratch. 
+ +When developers need to deploy a Kubernetes service, set up a database, or configure observability, they shouldn't have to figure it out from first principles every time. Likewise, when platform engineers go to populate their catalog, they shouldn't have to reimplement everything from scratch every time. + +Without vetted, reusable patterns, every team wastes time solving the same problems in slightly different ways. This saps velocity and multiplies your maintenance burden. Worse, developers building from scratch often make security and reliability mistakes that could have been prevented. + +### How to avoid it + +[Pre-built patterns are your leverage](/2.0/way/principles/developer-experience/pre-built-patterns-are-your-leverage). + +First, as platform engineers, look for pre-built, battle-tested patterns wherever possible. These could be open source modules, or commercial module catalogs like the [Gruntwork AWS IaC Library](https://docs.gruntwork.io/2.0/docs/library/concepts/overview). + +Second, for developers, provide vetted infrastructure and application patterns that developers can adopt instead of reinventing. When you enable developers to deploy proven solutions, you achieve velocity, governance, and maintainability simultaneously. + +Combine these patterns with [self-service](/2.0/way/principles/developer-experience/enable-developer-self-service) so developers can provision infrastructure on demand without waiting for approvals. Build compliance and security into these patterns so [good practices become the easy default](/2.0/way/principles/developer-experience/make-good-practices-the-default).
\ No newline at end of file diff --git a/docs/2.0/way/principles/governance-maintainability/_category_.json b/docs/2.0/way/principles/governance-maintainability/_category_.json new file mode 100644 index 0000000000..4927802c25 --- /dev/null +++ b/docs/2.0/way/principles/governance-maintainability/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Governance & Maintainability", + "position": 5 +} + diff --git a/docs/2.0/way/principles/governance-maintainability/guardrails-over-gates.md b/docs/2.0/way/principles/governance-maintainability/guardrails-over-gates.md new file mode 100644 index 0000000000..4d3c40cb00 --- /dev/null +++ b/docs/2.0/way/principles/governance-maintainability/guardrails-over-gates.md @@ -0,0 +1,25 @@ +--- +sidebar_position: 1 +title: Guardrails over gates +--- + +# Guardrails over gates + +Manual approval processes don't scale. As your organization grows, gatekeepers become bottlenecks, and developers learn to work around them. + +While manual approval processes may not scale, they do have one big advantage: a human can render an on-the-spot judgement about whether a given operation or status meets the organization's governance needs. Essentially, manual approvals guarantee [governance](/2.0/way/why/governance) but demand a heavy price on [velocity](/2.0/way/why/velocity). + +But we don't want to view governance and velocity as opposite ends of a spectrum. Rather, we want both good governance _and_ high velocity! + +Enter the guardrail. A **guardrail** is any automated rule or policy that checks infrastructure operations or status against your standards. When you create a guardrail, you are reducing that manual human judgment down to a narrowly scoped rule that systematically renders an opinion on a well-defined situation. + +For example, most network experts agree that a virtual machine should not allow SSH access from any arbitrary IP address. This is a human judgement and we can codify it as a _policy_. 
If we're dealing with AWS, that policy might declare that any AWS security group on an EC2 Instance must not allow port 22 from `0.0.0.0/0` (all IP addresses). We can apply that policy either to live infrastructure by scanning the current state of a security group, or we can [shift left](/2.0/way/principles/developer-experience/shift-left) and apply that policy to a _plan_ (e.g. from `tofu plan`) to create such a security group, or even to _in-progress code_ in an IDE that configures such a security group. + +The challenge with guardrails is that they're a pain to write. And once again, we are reminded that [the platform is a balancing act](/2.0/way/principles/core-philosophy/the-platform-is-a-balancing-act). Guardrails take more time upfront to create. But then they remove the need for manual approvals, and thus increase velocity without compromising governance. + +Guardrails are a form of [pattern](/2.0/way/platform/patterns/overview). For example, an OPA policy that follows the [conftest](https://www.conftest.dev/) convention is one popular way to codify rules. + +As we discussed above, guardrails can be applied at various stages of the development process, but one especially important place to apply them is in your [IaC pipeline](/2.0/way/platform/components/deploy/pipelines). + +Invest in building guardrails early, as they're the key to scaling both governance and velocity together. 
+ diff --git a/docs/2.0/way/principles/governance-maintainability/offer-golden-paths.md b/docs/2.0/way/principles/governance-maintainability/offer-golden-paths.md new file mode 100644 index 0000000000..4cd9775ddf --- /dev/null +++ b/docs/2.0/way/principles/governance-maintainability/offer-golden-paths.md @@ -0,0 +1,19 @@ +--- +sidebar_position: 2 +title: Offer "golden paths" +--- + +# Offer "golden paths" + +Platform engineers face a fundamental tension: you want _consistency_ because it means a smaller surface area to maintain and more confidence around governance, but application developers want _flexibility_ because autonomy is both motivating and fast. + +What's the right balance? The answer lies in offering **golden paths,** which are well-supported, pre-built ways to accomplish the most common infrastructure problems that developers face. To solve a given problem, application developers can choose to use the golden path approach and enjoy a pre-built solution, ongoing maintenance, and support from the platform team, or they can go their own way knowing that development, support, and maintenance will now be their responsibility. + +The golden paths concept was [first coined by Spotify](https://engineering.atspotify.com/2020/08/how-we-use-golden-paths-to-solve-fragmentation-in-our-software-ecosystem) back in 2020 and remains popular today. It works because it aligns incentives. Platform engineers want to see their work used and need to make their patterns compelling enough to drive adoption. Application developers seek to minimize their responsibilities so they can focus on building core features and not infrastructure. + +Once again, we see the need to [treat your platform as a product](/2.0/way/principles/core-philosophy/your-developer-platform-is-a-product). In this case, you are "competing" against the application developer's next best alternative of "I'll just do it myself." 
Sometimes your developer platform will lose because it's actually not the right fit for a particular use case, and that's okay. But once you create a golden path for a given use case, it should win most of the time. + +How a golden path wins out is a nuanced topic. As former Spotify Staff Agile Coach [Jason Yip points out](https://jchyip.medium.com/my-critique-of-the-spotify-model-part-1-197d335ef7af), teams need four things to make good decisions (like choosing the golden path option!) as an autonomous team: competence to evaluate technical tradeoffs, clarity on the full business context, exposure to what's working well elsewhere in the organization, and a cultural orientation to act in the best interest of the company. + +In practice, golden paths are implementations of [patterns](/2.0/way/platform/patterns/overview). They might be written as OpenTofu/Terraform modules, Terragrunt Stack definitions, [Runbooks](https://github.com/gruntwork-io/runbooks) or any other valid pattern technology. Whatever form they take, they need to be easy to discover, easy to use, work as expected, and actually meet developers' needs. + diff --git a/docs/2.0/way/principles/overview.md b/docs/2.0/way/principles/overview.md new file mode 100644 index 0000000000..808c1d64bd --- /dev/null +++ b/docs/2.0/way/principles/overview.md @@ -0,0 +1,41 @@ +--- +sidebar_position: 1 +title: Overview +--- + +# Developer Platform Principles + +We have discovered a set of guiding principles that collectively shape our approach to building developer platforms. 
We organize them into four categories: + +## Core Philosophy + +These principles shape the foundational mindset for building developer platforms: + +- [Your developer platform is a product](/2.0/way/principles/core-philosophy/your-developer-platform-is-a-product) +- [The platform is a balancing act](/2.0/way/principles/core-philosophy/the-platform-is-a-balancing-act) + +## Developer Experience + +These principles focus on making developers productive and happy: + +- [Pre-built patterns are your leverage](/2.0/way/principles/developer-experience/pre-built-patterns-are-your-leverage) +- [Enable developer self-service](/2.0/way/principles/developer-experience/enable-developer-self-service) +- [Make good practices the default](/2.0/way/principles/developer-experience/make-good-practices-the-default) +- [Shift left](/2.0/way/principles/developer-experience/shift-left) + +## Technical Foundations + +These principles establish the architectural patterns that make platforms reliable, scalable, and maintainable: + +- [Build platform components](/2.0/way/principles/technical-foundations/build-platform-components) +- [Represent all patterns as code](/2.0/way/principles/technical-foundations/represent-all-patterns-as-code) +- [Define all live infrastructure as pattern instances](/2.0/way/principles/technical-foundations/define-all-live-infrastructure-as-pattern-instances) +- [Embrace immutable infrastructure](/2.0/way/principles/technical-foundations/embrace-immutable-infrastructure) +- [Use GitOps](/2.0/way/principles/technical-foundations/use-gitops) + +## Governance & Maintainability + +These principles address how to meet your governance and maintainability needs as the platform scales: + +- [Guardrails over gates](/2.0/way/principles/governance-maintainability/guardrails-over-gates) +- [Offer "golden paths"](/2.0/way/principles/governance-maintainability/offer-golden-paths) diff --git a/docs/2.0/way/principles/technical-foundations/_category_.json 
b/docs/2.0/way/principles/technical-foundations/_category_.json new file mode 100644 index 0000000000..9a2e2d29c6 --- /dev/null +++ b/docs/2.0/way/principles/technical-foundations/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Technical Foundations", + "position": 4 +} + diff --git a/docs/2.0/way/principles/technical-foundations/build-platform-components.md b/docs/2.0/way/principles/technical-foundations/build-platform-components.md new file mode 100644 index 0000000000..51b3529132 --- /dev/null +++ b/docs/2.0/way/principles/technical-foundations/build-platform-components.md @@ -0,0 +1,15 @@ +--- +sidebar_position: 1 +title: Build platform components +--- + +# Build platform components + +The best way to build a developer platform is to build out a collection of discrete **platform components.** + +A platform component is a narrowly scoped collection of functionality focused on solving one well-defined problem. For example, the [Pipelines component](/2.0/way/platform/components/deploy/pipelines) is focused specifically on deploying infrastructure changes. It is not aware of or concerned with how those infrastructure changes are generated, only that it deploys infrastructure changes in a way that meets your organization's needs. + +The real power of platform components comes from combining them together. For example, catalog, runbooks, and pipelines collectively enable a complete developer self-service experience. Or combine drift detector and pipelines to detect and automatically remediate drift. Or combine scorecard and unit browser to show the status of each repo and all infrastructure units across your entire infrastructure. + +Platform components generally fall into three major categories: Deploy, Operate, and Maintain. In addition, every component plays some role in achieving velocity, governance, and maintainability. 
+ diff --git a/docs/2.0/way/principles/technical-foundations/define-all-live-infrastructure-as-pattern-instances.md b/docs/2.0/way/principles/technical-foundations/define-all-live-infrastructure-as-pattern-instances.md new file mode 100644 index 0000000000..e6db6e8866 --- /dev/null +++ b/docs/2.0/way/principles/technical-foundations/define-all-live-infrastructure-as-pattern-instances.md @@ -0,0 +1,26 @@ +--- +sidebar_position: 3 +title: Define all live infrastructure as pattern instances +--- + +# Define all live infrastructure as pattern instances + +To actually make use of patterns in our infrastructure we need to: + +1. Define a pattern +2. Deploy an _instance_ of the pattern + +For example, suppose you have an expert opinion on how AWS Lambda functions should be deployed that includes always launching them in a VPC, optionally including Lambda shared layers, and limiting the number of runtime engines to certain versions of Python and Go. You might represent these opinions as an OpenTofu module. You expose configuration options as OpenTofu variables. When you make updates to your opinion, you release new versions of your OpenTofu module. + +But critically, defining a pattern is different from launching an instance of that pattern into live infrastructure. To do that, we need a different construct. + +Continuing our example, to actually launch an instance of the pattern, we could use a [Terragrunt unit](https://terragrunt.gruntwork.io/docs/features/units/), an [OpenTofu workspace](https://opentofu.org/docs/language/state/workspaces/), or one-off OpenTofu code that references the OpenTofu module. In each case, the code references the original pattern (in this case the OpenTofu module) and specifies a set of configuration values relevant to this instance of the pattern. + +Zooming back out to a general principle, you should aim to deploy 100% of your live infrastructure as instances of patterns. 
+ +This way, every piece of your infrastructure follows a vetted, well-understood pattern. When you need to update best practices -- say, to add new security requirements or optimize costs -- you update the pattern once and systematically roll out the change to all instances. Your infrastructure becomes more consistent, easier to understand, and significantly more maintainable at scale. Instead of having to track down and update dozens of bespoke configurations, you update the pattern and its instances follow suit. + +Of course, there are times when you might have a pattern that is trivial to the point of deploying a single resource. Should even that trivial code be represented as a reusable pattern? Yes, it should. In the future, you may want to expand the sophistication of that pattern. Or you may want to track all instances of that trivial pattern. Or you may want to replace a deprecated resource. Or you may want to define a [Terragrunt Stack](https://terragrunt.gruntwork.io/docs/features/stacks/) that forces you to deploy only instances of patterns. + +Once again, [the platform is a balancing act](/2.0/way/principles/core-philosophy/the-platform-is-a-balancing-act) and it will take more overhead to launch all infrastructure as instances of a pattern, but the long-term maintainability and governance gains are well worth it. 
+ diff --git a/docs/2.0/way/principles/technical-foundations/embrace-immutable-infrastructure.md b/docs/2.0/way/principles/technical-foundations/embrace-immutable-infrastructure.md new file mode 100644 index 0000000000..f1929f5b61 --- /dev/null +++ b/docs/2.0/way/principles/technical-foundations/embrace-immutable-infrastructure.md @@ -0,0 +1,19 @@ +--- +sidebar_position: 4 +title: Embrace immutable infrastructure +--- + +# Embrace immutable infrastructure + +It is perhaps a brutal metaphor, but many DevOps practitioners capture the idea of immutable infrastructure by recommending that we treat our infrastructure resources such as servers and databases "like cattle, not pets." The idea is that instead of carefully nurturing individual servers with unique configurations and histories, you should treat them as interchangeable and replaceable. When something needs to change, you don't modify the existing resource—you replace it entirely with a new one. + +You can think of "pet" infrastructure management like editing a document: you SSH into a server, install an update, modify a configuration file, and restart a service. Over time, each environment accumulates unique changes. Production has patches that staging doesn't. Servers that were provisioned at different times have different configurations. Troubleshooting requires understanding each system's individual history. + +Immutable infrastructure takes a different approach. Rather than modifying running resources, you replace them with new versions. Instead of SSHing into a server to patch it, you build a new server image with the patch and replace the old server. Instead of updating a Lambda function in place, you deploy a new version and cut over to it. 
+ +For example, instead of manually configuring a server, the immutable approach involves building an Amazon Machine Image (AMI) using a tool like [Packer](https://github.com/hashicorp/packer) or [EC2 Image Builder](https://aws.amazon.com/image-builder/) to capture your base server configuration as code. You then make sure that any running EC2 instance is using the latest version of your AMI, and that future server configuration changes are reflected in a new AMI version (or a new set of instructions to execute after the EC2 instance launches). + +Embracing immutable infrastructure gives you consistency across environments because production, staging, and development all run identical builds of your infrastructure, just with different configuration parameters. It also makes configuration rollbacks straightforward because you can easily switch back to the previous version. + +As we saw when you [represent all patterns as code](/2.0/way/principles/technical-foundations/represent-all-patterns-as-code), it _is_ more work upfront to create immutable infrastructure than to manually configure a resource. And just as before, [the platform is a balancing act](/2.0/way/principles/core-philosophy/the-platform-is-a-balancing-act), so sometimes you may not have time to create a fully immutable artifact. But once again, not working with immutable artifacts is a form of debt, and investing in creating them upfront will enable better velocity, governance and maintainability over time. 
+ diff --git a/docs/2.0/way/principles/technical-foundations/represent-all-patterns-as-code.md b/docs/2.0/way/principles/technical-foundations/represent-all-patterns-as-code.md new file mode 100644 index 0000000000..c3ebd0d20e --- /dev/null +++ b/docs/2.0/way/principles/technical-foundations/represent-all-patterns-as-code.md @@ -0,0 +1,54 @@ +--- +sidebar_position: 2 +title: Represent all patterns as code +--- + +# Represent all patterns as code + +We talked earlier about how [pre-built patterns are your leverage](/2.0/way/principles/developer-experience/pre-built-patterns-are-your-leverage). To realize that principle in practice, those patterns should be represented as code. + +Naturally, there are many technologies and tools to choose from when codifying a pattern. Some examples of patterns-as-code are: + +- OpenTofu modules, CloudFormation modules, or Pulumi Component Resources +- OPA policies +- Helm charts +- Terragrunt unit or stack definitions +- Gruntwork Runbooks +- Gruntwork Boilerplate templates + +The common theme here is that an expert -- the "pattern author" -- codified their knowledge and experience into one format or another. + +To better understand why this is so important, let's look at what happens when you _don't_ follow it. When patterns are not represented as code, users deploy infrastructure through a series of clicks in the cloud console ("ClickOps"), or perhaps with a collection of manually typed commands. + +This approach leads to many issues: + +1. **Changes are invisible.** It's not clear who did the original configuration, and it's hard to track or know about any updates to that configuration. +2. **Reproducibility is impossible.** You can't reliably recreate the configuration because there's no artifact that represents the set of steps that happened. +3. **Knowledge is siloed.** Only the person who clicked through the console or ran the manual commands understands how it works. +4. 
**Not vetted.** Without code review and automated testing, patterns haven't been validated against your organization's standards and requirements. +5. **Not documented.** Manual processes often lack documentation, and when documentation does exist, it quickly becomes outdated and diverges from reality. +6. **Maintainability is harder.** As new best practices emerge, manually accessing each bespoke instance of a pattern and updating it to reflect the latest best practices is often prohibitively painful or just plain not worth doing because the value-to-cost ratio is unfavorable. + +Around 2014, these issues reached an inflection point and the idea of infrastructure-as-code was then a major paradigm shift. More than a decade later, we can now generalize the idea of representing all infrastructure as code to the idea that you should **represent all patterns as code,** and then [define all live infrastructure as pattern instances](/2.0/way/principles/technical-foundations/define-all-live-infrastructure-as-pattern-instances). + +Representing patterns as code brings many benefits. To start, when anything is captured as code, it can be stored in version control. That means that every change is now tracked, attributed, and can be reverted if needed. Code can be peer reviewed, and automated tests can run to immediately validate the newly committed code. + +More generally, when a pattern is represented as code, what were liabilities above now become key advantages: + +1. **Changes are visible.** Every modification is tracked in version control with full attribution and history, making it clear who changed what and why. +2. **Reproducibility is guaranteed.** The code artifact can reliably recreate the same configuration anywhere, anytime. +3. **Knowledge is shared.** The pattern is documented in code that anyone on the team can read, understand, and contribute to. +4. 
**Vetted through process.** Code review and automated testing validate patterns against your organization's standards before they're approved. +5. **Self-documenting.** In addition to any supplemental written documentation, the code itself serves as living documentation that stays up-to-date because it _is_ the implementation. +6. **Maintainability at scale.** Updates to best practices can be made once in code and rolled out systematically across all instances of the pattern. + +:::info +Patterns are a fundamental concept of developer platforms. Learn much more about them in the [Patterns section](/2.0/way/platform/patterns/overview). +::: + +Of course, writing all patterns as code does take more time. Sometimes in emergency situations, engineers might need to optimize for speed over maintainability and resort to ClickOps. Or perhaps the team is not clear on how to represent patterns as code but has urgent deadlines to hit. + +These decisions remind us that [the platform is a balancing act](/2.0/way/principles/core-philosophy/the-platform-is-a-balancing-act). There is nothing inherently "wrong" about optimizing for velocity over maintainability in a given moment. But it's important to note that every pattern not represented as code does represent [debt](/2.0/way/why/maintainability#what-it-covers) in the form of non-codified assets. + +So in general, if it can be code, it should be code. And if you don't have time to make it code, incur the debt and solve your immediate problem, but don't forget to pay off that debt soon after by representing the pattern as code. 
+ diff --git a/docs/2.0/way/principles/technical-foundations/use-gitops.md b/docs/2.0/way/principles/technical-foundations/use-gitops.md new file mode 100644 index 0000000000..0fdf102f87 --- /dev/null +++ b/docs/2.0/way/principles/technical-foundations/use-gitops.md @@ -0,0 +1,17 @@ +--- +sidebar_position: 5 +title: Use GitOps +--- + +# Use GitOps + +GitOps is the natural evolution of infrastructure as code. If your infrastructure is defined in Git, why not use Git as the source of truth for what should be running? In a GitOps model, the desired state lives in version control, and automated systems continuously reconcile the actual state to match it. + +This means all infrastructure changes follow a disciplined workflow: commit to Git, get peer review, run automated checks, then merge. Once merged, automation deploys the changes. Git becomes the single point of control and the complete audit trail. + +The benefits of GitOps are substantial. Every change is visible and attributed because you always know who changed what and why. Rollbacks are more straightforward because you have the option to revert a commit. Compliance is built-in because the Git history is your audit log. And the workflow is familiar because developers already know how to use Git, pull/merge requests, and code review. + +:::info +GitOps is closely related to the [pipelines component](/2.0/way/platform/components/deploy/pipelines). +::: + diff --git a/docs/2.0/way/resources/_category_.json b/docs/2.0/way/resources/_category_.json new file mode 100644 index 0000000000..da789d904e --- /dev/null +++ b/docs/2.0/way/resources/_category_.json @@ -0,0 +1,10 @@ +{ + "label": "Resources", + "position": 8, + "className": "way-top-level-item", + "link": { + "type": "generated-index", + "description": "Reference materials and definitions for The Gruntwork Way." 
+ } +} + diff --git a/docs/2.0/way/resources/glossary.md b/docs/2.0/way/resources/glossary.md new file mode 100644 index 0000000000..cbef1c7248 --- /dev/null +++ b/docs/2.0/way/resources/glossary.md @@ -0,0 +1,41 @@ +--- +sidebar_position: 2 +--- + +# Glossary + +Here's the Gruntwork take on the most common terms we encounter in Platform Engineering and DevOps: + +### Developer Platform + +A Developer Platform is the collection of tools, methods, and services used to enable developers to quickly deploy the infrastructure they need using a standardized approach. It abstracts operational complexity so application teams can focus on building business value. + +### DevOps Engineer + +A DevOps engineer works within application teams to automate software delivery and manage the infrastructure needed to run their applications. They handle both building applications and operating the underlying systems. + +### Platform Engineer + +A Platform engineer builds internal developer platforms that provide standardized, self-service infrastructure tools across the organization. They abstract operational complexity so application teams can focus on building business value. + +### DevOps vs. 
Platform Engineer + +| | DevOps Engineer | Platform Engineer | +|--------|----------------|-------------------| +| **Scope** | Works within application teams | Works across the organization | +| **Focus** | Building and running specific applications | Building platforms that enable other teams | +| **Cognitive Load** | Carries operational burden for their applications | Reduces operational burden for all teams | +| **Output** | Ships applications and features | Ships platforms and tooling | +| **Primary Goal** | Make their team ship faster | Make all teams ship faster with less complexity | + +### DevOps bankruptcy + +When an organization's infrastructure anti-patterns pile up, it may reach a breaking point we describe as **DevOps bankruptcy**: a point at which starting fresh is often the better option because correcting the anti-patterns in its current infrastructure paradigm would be too costly. + +### Unit + +Any company's infrastructure is made up of many component parts. We call the most basic component part a **unit of infrastructure** or just **unit** for short. For example, we consider one instance of one OpenTofu/Terraform module to be one "unit." In fact, our open source IaC orchestrator [Terragrunt](https://terragrunt.gruntwork.io) uses this exact terminology! + +### Stack + +Companies often need to combine their [units](#unit) into common, repeated patterns. We call an opinionated combination of units a **stack.** \ No newline at end of file diff --git a/docs/2.0/way/resources/sources.md b/docs/2.0/way/resources/sources.md new file mode 100644 index 0000000000..928c0be717 --- /dev/null +++ b/docs/2.0/way/resources/sources.md @@ -0,0 +1,56 @@ +--- +sidebar_position: 1 +--- + +# Sources + +In writing the Gruntwork Way, we drew heavily on our own internal experience and discussions. We also built on the existing body of knowledge and insights developed by previous generations of developer platform practitioners. 
+ +Wherever possible, we have directly cited the original source inline. In addition, we found the following resources especially useful in sharpening our general thinking. + +## Methods + +### DORA Metrics + +The DevOps Research and Assessment (DORA) metrics provide data-driven insights into software delivery performance. These metrics inspired many of the performance indicators we use throughout this guide, especially lead time, deployment frequency, change failure rate, and recovery time. + +- **Link:** [DORA](https://dora.dev/) + +## Books + +### Fundamentals of DevOps and Software Delivery + +This book provides a comprehensive overview of DevOps principles and offers a deeper dive into many of the infrastructure patterns. + +- **Link:** [Official book website](https://www.fundamentals-of-devops.com/) +- **Link:** [Read it free](https://books.gruntwork.io/books/fundamentals-of-devops) +- **Author:** Yevgeniy Brikman + +## Videos + +### What is Platform Engineering and How It Fits into DevOps and Cloud World + +In this video, the author, Nana, does a great job explaining how we got from DevOps to Platform Engineering, and how best to adopt your internal developer platform. + +- **Link:** [Watch the video](https://www.youtube.com/watch?v=ghzsBm8vOms) +- **Author:** Nana from [Techworld with Nana](https://www.techworld-with-nana.com/) + +## Posts + +### How We Use Golden Paths to Solve Fragmentation in Our Software Ecosystem + +- **Link:** [Read the article](https://engineering.atspotify.com/2020/08/how-we-use-golden-paths-to-solve-fragmentation-in-our-software-ecosystem) +- **Author:** Spotify Engineering + +Today, "golden path" is a popular term among platform engineers. This is the 2020 blog post from Spotify in which the term was first coined. + +### My Critique of "the Spotify Model": Part 1 + +Jason Yip is a former Staff Agile Coach at Spotify. 
In this 2-part series, he reviews how the very popular [Spotify Squads model](https://www.youtube.com/watch?v=Yvfz4HGtoPc) played out over the years, what it got right, and where it needed improvements. + +- **Link:** [Read the article](https://jchyip.medium.com/my-critique-of-the-spotify-model-part-1-197d335ef7af) +- **Author:** Jason Yip + +## Proper attribution + +We are committed to properly crediting the ideas and work of others. If you notice any content that you believe should include attribution or where our citation doesn't adequately acknowledge the original source, please let us know and we'll address it promptly. \ No newline at end of file diff --git a/docs/2.0/way/why/_category_.json b/docs/2.0/way/why/_category_.json new file mode 100644 index 0000000000..39321e2bb7 --- /dev/null +++ b/docs/2.0/way/why/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Why Developer Platforms?", + "position": 2, + "className": "way-top-level-item" +} diff --git a/docs/2.0/way/why/governance.mdx b/docs/2.0/way/why/governance.mdx new file mode 100644 index 0000000000..f07c84b453 --- /dev/null +++ b/docs/2.0/way/why/governance.mdx @@ -0,0 +1,126 @@ +--- +sidebar_position: 3 +--- + +# Governance + +## What it is + +**Governance is the set of steps you take to protect your customers and your company.** It's the safety net that allows you to move fast while maintaining control over your infrastructure estate. + +While compliance frameworks like SOC 2 focus broadly on governance, your developer platform should be focused on how to _automate_ governance using policies and guardrails. Ultimately, the goal is to make non-compliance hard to do rather than easy to do. + +## What it covers + +Governance applies across several key business requirements. Whenever you deploy any form of new infrastructure or application, most companies have to ask themselves a few key questions: + +- **Cost effectiveness** - Can you afford the infrastructure you're deploying? 
+- **Compliance** - Does it satisfy your legal and regulatory obligations? +- **Security** - Will it protect your customers and data? +- **Standards adherence** - Does it follow your organization's patterns and policies? + +## How to improve it + +Achieving effective governance comes from putting in place the right core tools and automations. We believe the following are the most important: + +### Use IaC pipelines + +When 100% of infrastructure changes all deploy through the same centralized pipeline, you have a mechanism to enforce the same workflows and governance processes. + +_Related: [Pipelines component](/2.0/way/platform/components/deploy/pipelines)_ + +### Automate policy enforcement + +When your governance policies are captured as code, your infrastructure pipelines can automatically enforce them. This prevents human error and ensures consistency. + +_Related: [Guardrails over gates principle](/2.0/way/principles/governance-maintainability/guardrails-over-gates), [Pipelines component](/2.0/way/platform/components/deploy/pipelines)_ + +### Use pre-built patterns + +Developers don't just need "infrastructure," they need a specific set of [infrastructure patterns](/2.0/way/platform/patterns/overview) such as deploying a K8s service, launching a database, or connecting to an LLM in an authorized way. When you provide developers with pre-built implementations of these patterns that already meet your governance requirements, compliance becomes the easy default choice rather than an afterthought. + +_Related: [Offer golden paths principle](/2.0/way/principles/governance-maintainability/offer-golden-paths), [Patterns concept](/2.0/way/platform/patterns/overview), [Catalog component](/2.0/way/platform/components/deploy/catalog)_ + +### Offer developer self-service + +Most teams think of offering developers a self-service way to do infrastructure primarily as an investment in [velocity](/2.0/way/why/velocity) and this is true! 
But when you lower the "pain" of deploying new infrastructure the right way, developers are far more likely to adopt your pre-approved patterns, so developer self-service is actually a key enabler of effective pre-built patterns, which are themselves an enabler of governance. + +_Related: [Enable developer self-service principle](/2.0/way/principles/developer-experience/enable-developer-self-service), [Runbooks component](/2.0/way/platform/components/deploy/runbooks)_ + +### Enable unit-level oversight + +The best way to monitor infrastructure is to be able to view its status at its lowest level, the [unit](/2.0/way/resources/glossary#unit), and then by [stack](/2.0/way/resources/glossary#stack), repo, and ultimately the infrastructure as a whole. With this filtering mechanism, you can view how either an individual unit or the entire infrastructure is faring against your governance standards. + +For example, you want to see not just how overall infrastructure spending is trending, but which units have increased in cost the most, or which stacks are suffering the most compliance issues. + +_Related: [Operate Infrastructure components](/2.0/way/platform/components/overview#operate-infrastructure)_ + +### Leverage specialized tooling + +Some elements of governance are sufficiently complex to warrant having a dedicated solution to monitor and remediate them. For example, most companies benefit from a security platform like [Snyk](https://snyk.io/) or [Wiz](https://www.wiz.io/), or from dedicated financial oversight tools like [Finout](https://www.finout.io/) or [Infracost](https://www.infracost.io/). + +More generally, dedicated tools are especially useful for: + +- Security +- Cost management +- Observability +- Compliance + +And each of these categories has numerous vendors available. + +## How to measure it + +As we've seen, governance breaks down into a discrete set of needs such as security, cost management, etc. 
For each governance need, it's important to understand both: + +1. The overall state of the need (state metric) +2. How effective you are at fixing issues that arise for the need (flow metric) + +This highlights an important point: good governance is not just about having a positive moment-in-time posture, but also about how quickly you can respond to issues in a complex, fast-moving world of many demands. + +At the same time, governance has a very large surface area and you can easily overwhelm yourself with metrics. So it's important to focus on the critical few metrics that drive the most insight. + +Let's look at those now, though your own mileage may vary. + +### Need: Compliance + +**State Metric: Infrastructure compliance rate** + +Your infrastructure compliance rate measures what percentage of your existing infrastructure units currently meet your policy requirements. + +**Flow Metric: Mean time to remediation (MTTR)** + +Your mean time to remediation (MTTR) measures how long a non-compliance issue takes to be resolved. + +### Need: Security + +**State Metric: Critical vulnerability coverage** + +Your critical vulnerability coverage measures what percentage of critical and high-severity security findings have been remediated across your infrastructure estate. This gives you a snapshot of your current security posture and exposure to known threats. + +**Flow Metric: Mean time to patch (MTTP)** + +Your mean time to patch (MTTP) measures how long it takes from when a security vulnerability is identified to when it's patched across all affected infrastructure. This metric reveals how responsive your security remediation processes are and whether you can meet SLAs for critical vulnerabilities (e.g., patching critical CVEs within 7 days). + +### Need: Cost Management + +**State Metric: Infrastructure cost efficiency ratio** + +Your infrastructure cost efficiency ratio measures your actual infrastructure spend against your budgeted or forecasted costs. 
For example, if you budgeted $100K/month but spent $120K, your efficiency ratio is 83%. This metric helps identify cost overruns and enables tracking trends over time. + +**Flow Metric: Cost anomaly response time** + +Your cost anomaly response time measures how long it takes from when a significant cost spike or waste is detected to when corrective action is taken. This reveals the effectiveness of your cost monitoring and your team's ability to quickly address unexpected spending. + +### Need: Observability + +**State Metric: Service observability coverage** + +Your service observability coverage measures what percentage of your production services have complete observability instrumentation (logs, metrics, traces, and alerting). This reveals blind spots in your monitoring and helps ensure you can detect and diagnose issues across your entire infrastructure estate. + +**Flow Metric: None** + +The nature of observability is to be able to detect issues as they happen, so there is no flow metric for this need. + +## Next + +Good governance gives you the confidence to move fast, but to maintain that over time, you need to focus on keeping your infrastructure estate healthy and manageable. Let's learn more about that now. \ No newline at end of file diff --git a/docs/2.0/way/why/maintainability.md b/docs/2.0/way/why/maintainability.md new file mode 100644 index 0000000000..79d7bc684e --- /dev/null +++ b/docs/2.0/way/why/maintainability.md @@ -0,0 +1,101 @@ +--- +sidebar_position: 4 +--- + +# Maintainability + +## What it is + +**Maintainability is your ability to spend time on planned work.** + +Your ability to focus on _planned work_ depends on how often you are forced to deal with _unplanned work_. If you spend too much time on unplanned work, you will not be able to achieve high levels of velocity. + +## What it covers + +Maintainability covers all forms of "infrastructure debt." + +### The debt analogy + +In the financial world, debt means we owe someone _money_. 
The more debt we have, the more we pay in interest. If debt grows faster than we can service it, we risk _financial bankruptcy_. + +In the infrastructure world, debt takes on a variety of forms: + +- **Drift** - Your infrastructure code doesn't match what's actually deployed in your cloud +- **Non-codified assets** - Resources exist in your cloud but aren't represented in code +- **Outdated IaC** - You're using old patterns or tools when better options exist +- **Non-standardization** - Your organization solves the same problem in many different ways + +We pay off infrastructure debt with _resources_, mainly time, focus, and money. + +The more infrastructure debt we accumulate, the more resources we must allocate just to maintain the status quo. If we can't allocate enough resources to both maintain current systems and achieve our velocity goals, we risk _DevOps bankruptcy_, a state where the infrastructure becomes unmaintainable and requires fundamental restructuring. + +## How to improve it + +Identify the sources of debt, and for each one, put in place processes and tooling that systematically and proactively address the debt. + +### Automated drift detection + +**Debt source:** Drift + +**Prevention and remediation:** Automatically detect drift on a scheduled basis and propose how to resolve it. For example, open a pull request each week that identifies drift and resolves it when merged. + +_Related: [Drift detector component](/2.0/way/platform/components/maintain/drift-detector)_ + +### Streamlined resource imports + +**Debt source:** Non-codified assets + +**Prevention and remediation:** Use tooling that can discover unmanaged resources and generate the necessary code to bring them under IaC management. 
+ +_Related: [Importer component](/2.0/way/platform/components/maintain/importer)_ + +### Automated IaC updates + +**Debt source:** Outdated IaC + +**Prevention and remediation:** Automate the process of updating your IaC to use the latest approved versions of tools, modules, and patterns. Track available updates and provide automated pull requests that upgrade dependencies while running tests to ensure compatibility. + +_Related: [IaC updater component](/2.0/way/platform/components/maintain/iac-updater)_ + +### Infrastructure estate insights + +**Debt source:** Non-standardization + +**Prevention and remediation:** Provide visibility into each repo, environment, and unit to identify where teams do not adhere to your standards. Make this information easy to discover, both for the platform engineers and application teams. + +_Related: [Scorecard component](/2.0/way/platform/components/maintain/scorecard)_ + +## How to measure it + +As we've seen, maintainability breaks down into specific sources of debt. For each debt source, focus on the critical metric that drives the most insight. + +Let's look at those now, though your own mileage may vary. + +### Debt source: Drift + +Measure drift by tracking the **drift rate,** which is the percentage of your IaC resources that have drifted from their codified state. + +### Debt source: Non-codified assets + +Measure non-codified assets by tracking the **IaC coverage rate,** which is the percentage of your cloud resources that are managed with Infrastructure as Code. + +### Debt source: Outdated IaC + +Measure outdated IaC by tracking the **up-to-date coverage rate,** which is the percentage of your deployed infrastructure that uses the latest versions of your approved tools and patterns. + +### Debt source: Non-standardization + +You can break standardization down into a discrete set of categories such as: + +- **Tooling choices:** IaC tool, CI/CD tool, etc. 
+- **Tooling configuration:** IaC patterns, CI/CD configuration, etc. +- **Component Use:** Catalog, Runbooks, etc. +- **Governance Status:** Static analysis, Security, cost management, policies, etc. + +You can evaluate how well these standards are applied at the repo, environment, or [unit](/2.0/way/resources/glossary#unit) level. You can measure each standard as either a binary value (complies / does not comply) or range value (e.g. 0 to 10). + +For example, you could assess whether a given unit uses Terragrunt, which might be your standard IaC orchestrator. You could then ask how many units in a given environment or repo use Terragrunt. + +## Next + +You've now covered the three fundamental concerns! Now it's time to talk about how we build a developer platform to address them. Let's start by covering the principles of such a platform. \ No newline at end of file diff --git a/docs/2.0/way/why/overview.md b/docs/2.0/way/why/overview.md new file mode 100644 index 0000000000..32c50837c3 --- /dev/null +++ b/docs/2.0/way/why/overview.md @@ -0,0 +1,66 @@ +--- +sidebar_position: 1 +title: Overview +--- + +# Why Developer Platforms? + +When companies write their own software with more than a few developers, they need to find a way to balance **three fundamental concerns**. + +### 1. Velocity + +**Velocity is how quickly you ship changes to production.** + +Software teams add value by shipping new features that drive business impact. This applies to both: + +- **Applications** - Launching new customer-facing or internal features +- **Infrastructure** - Launching or updating infrastructure to support applications + + +### 2. Governance + +**Governance is how you protect your business and customers.** + +Every time you ship a change, you need to answer critical questions like: + +- **Is it cost-effective?** Can we afford to run this? +- **Is it compliant?** Does it satisfy our legal obligations? +- **Is it secure?** Will it protect our customers' data? 
+- **Is it reliable?** Will this work as expected? + +### 3. Maintainability + +**Maintainability is your ability to spend time on planned work.** + +Your ability to focus on *planned work* depends on how often you are forced to deal with *unplanned work*. Unplanned work arises as a result of various forms of *debt*, such as: + +- **Drift** - Your infrastructure code does not match what's live in your cloud +- **Non-codified assets** - You have resources in your live cloud that are not represented in infrastructure code +- **Outdated IaC** - You are using old patterns or tools when newer and better options exist +- **Non-standardization** - Your organization solves the same problem in many different ways + +Achieving maintainability is about identifying the different types of debt, knowing how to address each of them, and deciding which forms to prioritize for improvement. + +## The Balancing Act + +Unfortunately, **you can't maximize all three concerns simultaneously**. + +- Push too hard on velocity, and you'll compromise governance and accumulate debt +- Focus only on governance, and you'll slow development to a crawl +- Obsess over maintainability, and you'll never ship anything new + +The art of platform engineering is finding the right balance among velocity, governance, and maintainability for your organization at this moment in time, and adjusting that balance as your needs evolve. + +## Building a developer platform + +In [principles](/2.0/way/principles/overview), we share a set of principles we embrace when it comes to designing a developer platform. + +In [building blocks](/2.0/way/platform/overview), we describe the three major elements of a developer platform: + +1. Patterns +2. Platform Components +3. Interfaces + +## Next steps + +Let's learn more about the three fundamental concerns, starting with velocity. 
\ No newline at end of file diff --git a/docs/2.0/way/why/velocity.md b/docs/2.0/way/why/velocity.md new file mode 100644 index 0000000000..380e7d675b --- /dev/null +++ b/docs/2.0/way/why/velocity.md @@ -0,0 +1,129 @@ +--- +sidebar_position: 2 +--- + +# Velocity + +## What it is + +**Velocity is how quickly you ship changes to production.** It's the primary driver of business value—the faster teams can ship new features and updates, the more quickly your organization can respond to market needs and customer feedback. + +## What it covers + +Velocity applies across two key areas: + +- **Application velocity** - How rapidly you launch new customer-facing or internal features +- **Infrastructure velocity** - How quickly you provision, modify, or destroy infrastructure resources + + +## How to improve it + +Achieving high velocity comes from building the right core abstractions and automations. We believe the following are the most important: + +### Offer pre-built infrastructure patterns + +Provide vetted, reusable solutions for common infrastructure patterns. Make it easy for developers to adopt these pre-built implementations rather than reinventing them. + +_Related: [Patterns concept](/2.0/way/platform/patterns/overview), [Catalog component](/2.0/way/platform/components/deploy/catalog)_ + +### Offer pre-built app patterns + +Provide application templates that enable developers to quickly bootstrap new services. + +_Related: [Patterns concept](/2.0/way/platform/patterns/overview), [Catalog component](/2.0/way/platform/components/deploy/catalog)_ + +### Offer developer self-service + +Developers need a convenient way to browse and configure the available patterns without waiting for a human to approve their request. 
+ +_Related: [Enable developer self-service principle](/2.0/way/principles/developer-experience/enable-developer-self-service), [Runbooks component](/2.0/way/platform/components/deploy/runbooks)_ + +### Automate IaC pipelines + +Once a pattern is configured, developers need a streamlined way to deploy it. + +_Related: [Pipelines component](/2.0/way/platform/components/deploy/pipelines)_ + +### Set up guardrails + +Velocity requires safety. To give developers the confidence to deploy quickly, you need guardrails that prevent them from doing the wrong thing. + +_Related: [guardrails over gates principle](/2.0/way/principles/governance-maintainability/guardrails-over-gates), [Pipelines component](/2.0/way/platform/components/deploy/pipelines)_ + +### Set and measure internal standards + +To have pre-built patterns, self-service, and pipeline automation, you need centralized standards to keep your platform team's surface area manageable. + +_Related: [offer golden paths principle](/2.0/way/principles/governance-maintainability/offer-golden-paths)_ + +### Platform team velocity + +The items above are focused mostly on _developer velocity_, but of course the ability of the _platform team_ to quickly ship is itself a driver of velocity! Doing that effectively is a separate discussion and outside the scope of this framework. + +## How to measure it + +How do you know how your team is performing at velocity? While there are many possible ways to measure it, we've found the most success with the following metrics: + +### 1. Lead time + +What you really care about is how quickly you can ship new ideas. So it doesn't take much analysis to conclude that one of the best measures of velocity is **lead time,** which is how long it takes a change to go from committed to deployed. + +### 2. Deployment frequency + +The famous four [DORA metrics](https://dora.dev/guides/dora-metrics-four-keys/) proved (through lots of survey data) that teams with good lead times also deploy often. 
This is because frequent deploys lead to: + +- A smaller blast radius when things go wrong +- Easier debugging (fewer changes to investigate) +- More practice with your deployment process +- Better feedback loops + +In short, deployment frequency is a strong indicator of many best practices. For this reason, we recommend **deployment frequency,** or how often you deploy to production, as the second way to measure velocity. + +### 3. Developer satisfaction + +The clock starts ticking on lead time the moment you commit code to source control, but what about everything required to enable you to commit code in the first place? For example, security reviews, new cloud pattern development, or approval times could all be an issue. + + + +Because there's a wide range of possibilities here, we fall back to something more fundamental: customer satisfaction. When you [treat the developer platform as a product](/2.0/way/principles/core-philosophy/your-developer-platform-is-a-product), you need to know how happy your users are. You can ask qualitatively by just chatting with them, but to get the hard metrics, it's better to do user surveys. + +:::note + +There are many ways to gauge user satisfaction. But if it's a product we're building, then why not ask questions that reveal product-market fit? Inspired by [Rahul Vohra](https://review.firstround.com/how-superhuman-built-an-engine-to-find-product-market-fit/), one effective way to do that is to ask users “how would you feel if you could no longer use the developer platform?” and measure the percent who answer “very disappointed.” +::: + + + +
+The Impact Of World-Class Software Delivery + +:::note + +This is an excerpt from the preface of [The Fundamentals of DevOps and Software Delivery](https://www.fundamentals-of-devops.com/?ref=gruntwork-way) by Yevgeniy Brikman. + +::: + +The vast majority of developers have never had the opportunity to see what world-class software delivery looks like first hand. If you’re one of them, you’ll be astonished by the gap between companies with world-class software delivery processes and everyone else. It’s not a 1.1x or 1.5x improvement: it’s 10x, 100x, or more. + +**Table P-1** shows the difference between elite performers and low performers at the four key _DevOps Research and Assessment (DORA)_ metrics, which are a quick way to assess the performance of a software development team: + +**Table P-1. DORA metrics performance from the _2024 State of DevOps Report_** + +| Metric | Description | Elite vs low performers | +|---------------------|----------------------------------------------------------------------|-------------------------------| +| Deployment frequency| How often you deploy to production | 182× more often | +| Lead time | How long it takes a change to go from committed to deployed | 127× faster | +| Change failure rate | How often deployments cause failures that need immediate remediation | 8× lower | +| Recovery time | How long it takes to recover from a failed deployment | 2293× faster | + +These are _**staggering**_ differences. To put them into perspective, we’re talking the difference between: + +- Deploying once per month versus many times per day. +- Deployment processes that take 36 hours versus 5 minutes. +- Two out of three deployments causing problems versus one out of twenty. + +
+ +## Next + +When done well, velocity improvements actually enhance security, compliance, and maintainability by making good practices the easy default choice. So let's read about how to put those in place next. \ No newline at end of file diff --git a/docusaurus.config.js b/docusaurus.config.js index 02db0d29c2..a0abc49906 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -258,6 +258,11 @@ async function createConfig() { label: "Reference", docId: "2.0/reference/index", }, + { + type: "doc", + label: "Way", + docId: "2.0/way/intro/welcome", + }, { type: "doc", label: "Release Notes", diff --git a/package.json b/package.json index 30b87180b3..621fce736c 100644 --- a/package.json +++ b/package.json @@ -57,6 +57,7 @@ "@docusaurus/tsconfig": "3.7.0", "@docusaurus/types": "^3.7.0", "cspell": "^8.19.4", + "husky": "^9.1.7", "jest": "^27.4.7", "onchange": "^7.1.0", "ts-node": "^10.7.0", diff --git a/sidebars.js b/sidebars.js index f6a4e7a627..c7821979ec 100644 --- a/sidebars.js +++ b/sidebars.js @@ -14,6 +14,7 @@ const infrastructurePipelinesSidebars = require("./sidebars/infrastructure-pipel const ecsDeployRunnerSidebars = require("./sidebars/ecs-deploy-runner.js") const docsSidebar = require("./sidebars/docs.js") const referenceSidebar = require("./sidebars/reference.js") +const waySidebar = require("./sidebars/way.js") // @ts-check @@ -24,6 +25,7 @@ const sidebars = { ecsDeployRunnerSidebars, docsSidebar, referenceSidebar, + waySidebar, } module.exports = sidebars diff --git a/sidebars/way.js b/sidebars/way.js new file mode 100644 index 0000000000..c631f3902c --- /dev/null +++ b/sidebars/way.js @@ -0,0 +1,9 @@ +const sidebar = [ + { + type: "autogenerated", + dirName: "2.0/way", + }, +] + +module.exports = sidebar + diff --git a/src/css/custom.css b/src/css/custom.css index 7df917f44e..65a0f5b3a5 100644 --- a/src/css/custom.css +++ b/src/css/custom.css @@ -740,3 +740,9 @@ html[data-theme="dark"] .customizable-value span::after { background-color: 
var(--ifm-color-primary); border-color: var(--ifm-color-primary); } + +/* Bold top-level sidebar items in the Way section only */ +.way-top-level-item > div > a.menu__link, +.way-top-level-item > a.menu__link { + font-weight: bold; +} diff --git a/yarn.lock b/yarn.lock index 81c7b64bf0..e4d802569c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6387,20 +6387,10 @@ caniuse-api@^3.0.0: lodash.memoize "^4.1.2" lodash.uniq "^4.5.0" -caniuse-lite@^1.0.0, caniuse-lite@^1.0.30001646, caniuse-lite@^1.0.30001669: - version "1.0.30001676" - resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001676.tgz#fe133d41fe74af8f7cc93b8a714c3e86a86e6f04" - integrity sha512-Qz6zwGCiPghQXGJvgQAem79esjitvJ+CxSbSQkW9H/UX5hg8XM88d4lp2W+MEQ81j+Hip58Il+jGVdazk1z9cw== - -caniuse-lite@^1.0.30001616: - version "1.0.30001677" - resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001677.tgz#27c2e2c637e007cfa864a16f7dfe7cde66b38b5f" - integrity sha512-fmfjsOlJUpMWu+mAAtZZZHz7UEwsUxIIvu1TJfO1HqFQvB/B+ii0xr9B5HpbZY/mC4XZ8SvjHJqtAY6pDPQEog== - -caniuse-lite@^1.0.30001688: - version "1.0.30001697" - resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001697.tgz#040bbbb54463c4b4b3377c716b34a322d16e6fc7" - integrity sha512-GwNPlWJin8E+d7Gxq96jxM6w0w+VFeyyXRsjU58emtkYqnbwHqXm5uT2uCmO0RQE9htWknOP4xtBlLmM/gWxvQ== +caniuse-lite@^1.0.0, caniuse-lite@^1.0.30001616, caniuse-lite@^1.0.30001646, caniuse-lite@^1.0.30001669, caniuse-lite@^1.0.30001688: + version "1.0.30001754" + resolved "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001754.tgz" + integrity sha512-x6OeBXueoAceOmotzx3PO4Zpt4rzpeIFsSr6AAePTZxSkXiYDUmpypEl7e2+8NCd9bD7bXjqyef8CJYPC1jfxg== ccount@^1.0.0, ccount@^1.0.3: version "1.1.0" @@ -9841,6 +9831,11 @@ human-signals@^2.1.0: resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0" integrity 
sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw== +husky@^9.1.7: + version "9.1.7" + resolved "https://registry.yarnpkg.com/husky/-/husky-9.1.7.tgz#d46a38035d101b46a70456a850ff4201344c0b2d" + integrity sha512-5gs5ytaNjBrh5Ow3zrvdUUY+0VxIuWVL4i9irt6friV+BqdCfmV11CQTWMiBYWHbXhco+J1kHfTOUkePhCDvMA== + iconv-lite@0.4.24: version "0.4.24" resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.4.24.tgz#2022b4b25fbddc21d2f524974a474aafe733908b"