From 9cee127ab0d0f632aad094296ebb01bb1d48fb2b Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 19 May 2026 16:00:15 +0200 Subject: [PATCH 01/10] Make secure token storage the default storage mode U2M tokens for the databricks-cli auth type now write to the OS-native keyring by default. Users who need the previous file-backed cache can opt back via DATABRICKS_AUTH_STORAGE=plaintext or auth_storage = plaintext under [__settings__] in .databrickscfg; the env var takes precedence. The login-time keyring probe and fallback (already on main) activate with this change. Co-authored-by: Isaac --- NEXT_CHANGELOG.md | 2 ++ .../auth/describe/u2m-json-output/output.txt | 4 ++-- .../describe/u2m-plaintext-default/test.toml | 3 --- .../out.test.toml | 0 .../output.txt | 4 ++-- .../script | 0 .../auth/describe/u2m-secure-default/test.toml | 11 +++++++++++ acceptance/script.prepare | 6 ++++++ cmd/auth/describe_test.go | 14 +++++++------- libs/auth/storage/cache.go | 18 +++--------------- libs/auth/storage/cache_test.go | 6 +++--- libs/auth/storage/mode.go | 18 +++++++++++------- libs/auth/storage/mode_test.go | 16 ++++++++-------- 13 files changed, 55 insertions(+), 47 deletions(-) delete mode 100644 acceptance/cmd/auth/describe/u2m-plaintext-default/test.toml rename acceptance/cmd/auth/describe/{u2m-plaintext-default => u2m-secure-default}/out.test.toml (100%) rename acceptance/cmd/auth/describe/{u2m-plaintext-default => u2m-secure-default}/output.txt (78%) rename acceptance/cmd/auth/describe/{u2m-plaintext-default => u2m-secure-default}/script (100%) create mode 100644 acceptance/cmd/auth/describe/u2m-secure-default/test.toml diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index d85528c1e59..c33fa1ea56f 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -4,6 +4,8 @@ ### Notable Changes +* Breaking change: U2M (`auth_type = databricks-cli`) tokens are now stored in the OS-native secure store by default (Keychain on macOS, Credential Manager on Windows, Secret Service on Linux) instead of `~/.databricks/token-cache.json`. After upgrading, run `databricks auth login` once per profile to re-authenticate; cached tokens from older versions are not migrated. To keep the previous file-backed storage, set `DATABRICKS_AUTH_STORAGE=plaintext` or add `auth_storage = plaintext` under `[__settings__]` in `~/.databrickscfg` (the env var takes precedence over the config setting), then re-run `databricks auth login`. + ### CLI * Added `databricks aitools` command group for installing Databricks skills into your coding agents (Claude Code, Cursor, Codex CLI, OpenCode, GitHub Copilot, Antigravity). Skills are fetched from [github.com/databricks/databricks-agent-skills](https://github.com/databricks/databricks-agent-skills) and either symlinked into each agent's skills directory or copied into the current project. Use `databricks aitools install` to set up, `update` to pull newer versions, `list` to see what's available, and `uninstall` to remove them. diff --git a/acceptance/cmd/auth/describe/u2m-json-output/output.txt b/acceptance/cmd/auth/describe/u2m-json-output/output.txt index 7e2ac070cbc..bd5e6108735 100644 --- a/acceptance/cmd/auth/describe/u2m-json-output/output.txt +++ b/acceptance/cmd/auth/describe/u2m-json-output/output.txt @@ -2,7 +2,7 @@ >>> [CLI] auth describe --profile u2m-profile --output json Warn: [hostmetadata] failed to fetch host metadata for https://u2m-profile.databricks.test, will skip for 1m0s { - "mode": "plaintext", - "location": "~/.databricks/token-cache.json", + "mode": "secure", + "location": "OS keyring (service: databricks-cli)", "source": "default" } diff --git a/acceptance/cmd/auth/describe/u2m-plaintext-default/test.toml b/acceptance/cmd/auth/describe/u2m-plaintext-default/test.toml deleted file mode 100644 index 36c0e7e237b..00000000000 --- a/acceptance/cmd/auth/describe/u2m-plaintext-default/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -Ignore = [ - "home" -] diff --git a/acceptance/cmd/auth/describe/u2m-plaintext-default/out.test.toml b/acceptance/cmd/auth/describe/u2m-secure-default/out.test.toml similarity index 100% rename from acceptance/cmd/auth/describe/u2m-plaintext-default/out.test.toml rename to acceptance/cmd/auth/describe/u2m-secure-default/out.test.toml diff --git a/acceptance/cmd/auth/describe/u2m-plaintext-default/output.txt b/acceptance/cmd/auth/describe/u2m-secure-default/output.txt similarity index 78% rename from acceptance/cmd/auth/describe/u2m-plaintext-default/output.txt rename to acceptance/cmd/auth/describe/u2m-secure-default/output.txt index 981244ff8d9..de16f4551db 100644 --- a/acceptance/cmd/auth/describe/u2m-plaintext-default/output.txt +++ b/acceptance/cmd/auth/describe/u2m-secure-default/output.txt @@ -1,8 +1,8 @@ >>> [CLI] auth describe --profile u2m-profile Warn: [hostmetadata] failed to fetch host metadata for https://u2m-profile.databricks.test, will skip for 1m0s -Unable to authenticate: error getting token: cache: token not found -Token storage: plaintext, ~/.databricks/token-cache.json (from default) +Unable to authenticate: error getting token: [KEYRING_LOOKUP_ERROR] +Token storage: secure, OS keyring (service: databricks-cli) (from default) ----- Current configuration: ✓ host: https://u2m-profile.databricks.test (from ./home/.databrickscfg config file) diff --git a/acceptance/cmd/auth/describe/u2m-plaintext-default/script b/acceptance/cmd/auth/describe/u2m-secure-default/script similarity index 100% rename from acceptance/cmd/auth/describe/u2m-plaintext-default/script rename to acceptance/cmd/auth/describe/u2m-secure-default/script diff --git a/acceptance/cmd/auth/describe/u2m-secure-default/test.toml b/acceptance/cmd/auth/describe/u2m-secure-default/test.toml new file mode 100644 index 00000000000..784352740f5 --- /dev/null +++ b/acceptance/cmd/auth/describe/u2m-secure-default/test.toml @@ -0,0 +1,11 @@ +Ignore = [ + "home" +] + +# Normalize the platform-specific keyring lookup error. macOS returns +# cache.ErrNotFound for a clean miss; Linux without D-Bus returns a +# backend-specific error. The point of this test is the resolved storage +# mode, not the lookup outcome. +[[Repls]] +Old = 'Unable to authenticate: error getting token: .*' +New = 'Unable to authenticate: error getting token: [KEYRING_LOOKUP_ERROR]' diff --git a/acceptance/script.prepare b/acceptance/script.prepare index b158ca3c74e..b33141f9493 100644 --- a/acceptance/script.prepare +++ b/acceptance/script.prepare @@ -1,3 +1,9 @@ +# Force plaintext token storage so acceptance tests exercise the file-backed +# cache rather than the OS keyring, which is not reliably reachable in CI. +# Tests that want to exercise the secure path or the resolver default unset +# or override this in their own script.prepare or script. +export DATABRICKS_AUTH_STORAGE=plaintext + errcode() { # Temporarily disable 'set -e' to prevent the script from exiting on error set +e diff --git a/cmd/auth/describe_test.go b/cmd/auth/describe_test.go index 528decf1022..72fee3486bc 100644 --- a/cmd/auth/describe_test.go +++ b/cmd/auth/describe_test.go @@ -240,21 +240,21 @@ func TestResolveTokenStorageInfo(t *testing.T) { want: nil, }, { - name: "databricks-cli with default plaintext", + name: "databricks-cli with default secure", authType: authTypeDatabricksCLI, want: &tokenStorageInfo{ - Mode: "plaintext", - Location: plaintextLocation, + Mode: "secure", + Location: secureLocation, Source: "default", }, }, { - name: "databricks-cli with secure from env", + name: "databricks-cli with plaintext from env", authType: authTypeDatabricksCLI, - envValue: "secure", + envValue: "plaintext", want: &tokenStorageInfo{ - Mode: "secure", - Location: secureLocation, + Mode: "plaintext", + Location: plaintextLocation, Source: "DATABRICKS_AUTH_STORAGE environment variable", }, }, diff --git a/libs/auth/storage/cache.go b/libs/auth/storage/cache.go index d3a5fa85d29..0e3f311dede 100644 --- a/libs/auth/storage/cache.go +++ b/libs/auth/storage/cache.go @@ -59,10 +59,6 @@ func ResolveCache(ctx context.Context, override StorageMode) (cache.TokenCache, // The timeout is ambiguous (locked vs hung); a misdiagnosis fails // the final Store rather than silently downgrading to plaintext. // -// Rules 1 and 2 are dormant today: the resolver default is plaintext, so -// (mode=Secure, explicit=false) is unreachable. They activate when the -// default flips to secure at GA. -// // Login-specific. Read paths (auth token, bundle commands) keep the original // keyring error so they don't silently mint plaintext copies of tokens that // were stored in the keyring on another machine. @@ -120,12 +116,6 @@ func resolveCacheForLoginWith(ctx context.Context, override StorageMode, f cache // resolved mode and whether the user explicitly asked for it. Split out so // tests can drive the (mode, explicit) input space directly without depending // on whatever the resolver's default mode happens to be at any point in time. -// -// Pin-on-success across modes (locking in the first working behavior to -// insulate users from keyring flakiness) is intentionally not implemented -// here. It lands at GA alongside the default flip; pinning before the -// flip would freeze every default user into plaintext and make the flip a -// no-op for them. func applyLoginFallback(ctx context.Context, mode StorageMode, explicit bool, f cacheFactories) (cache.TokenCache, StorageMode, error) { switch mode { case StorageModePlaintext: @@ -168,11 +158,9 @@ func applyLoginFallback(ctx context.Context, mode StorageMode, explicit bool, f // in .databrickscfg so subsequent commands skip the (slow/blocking) keyring // probe and route straight to the file cache. // -// Only called on the (mode=Secure, explicit=false) probe-failure branch. That -// branch is unreachable today (resolver default is plaintext), so this is -// dormant infrastructure: it activates when the default flips to secure -// at GA and lets default-on-broken-keyring users avoid a 3s probe on every -// command. +// Only called on the (mode=Secure, explicit=false) probe-failure branch. +// Persisting the fallback lets default-on-broken-keyring users avoid a 3s +// probe on every command. func persistPlaintextFallback(ctx context.Context) error { configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") return databrickscfg.SetConfiguredAuthStorage(ctx, string(StorageModePlaintext), configPath) diff --git a/libs/auth/storage/cache_test.go b/libs/auth/storage/cache_test.go index db059299dc3..21bb487f963 100644 --- a/libs/auth/storage/cache_test.go +++ b/libs/auth/storage/cache_test.go @@ -41,15 +41,15 @@ func hermetic(t *testing.T) { t.Setenv("DATABRICKS_CONFIG_FILE", filepath.Join(t.TempDir(), "databrickscfg")) } -func TestResolveCache_DefaultsToPlaintextFile(t *testing.T) { +func TestResolveCache_DefaultsToSecureKeyring(t *testing.T) { hermetic(t) ctx := t.Context() got, mode, err := resolveCacheWith(ctx, "", fakeFactories(t)) require.NoError(t, err) - assert.Equal(t, StorageModePlaintext, mode) - assert.Equal(t, "file", got.(stubCache).source) + assert.Equal(t, StorageModeSecure, mode) + assert.Equal(t, "keyring", got.(stubCache).source) } func TestResolveCache_OverrideSecureUsesKeyring(t *testing.T) { diff --git a/libs/auth/storage/mode.go b/libs/auth/storage/mode.go index 6dd3c6e5dff..60c285f51ad 100644 --- a/libs/auth/storage/mode.go +++ b/libs/auth/storage/mode.go @@ -1,9 +1,10 @@ // Package storage selects and constructs the CLI's U2M token storage backend. // -// Two modes are supported. Plaintext writes to ~/.databricks/token-cache.json -// with host-key dual-write for older Go SDK versions (v0.61-v0.103); it is the -// resolver default. Secure writes to the OS-native keyring under the profile -// cache key only; it is opt-in pre-GA and slated to become the default at GA. +// Two modes are supported. Secure writes to the OS-native keyring under the +// profile cache key only; it is the resolver default. Plaintext writes to +// ~/.databricks/token-cache.json with host-key dual-write for older Go SDK +// versions (v0.61-v0.103); it is the opt-in fallback for environments where +// the OS keyring is not available. package storage import ( @@ -26,12 +27,15 @@ const ( // StorageModePlaintext writes tokens to ~/.databricks/token-cache.json // and mirrors each token under the legacy host-based cache key for - // older Go SDK versions (v0.61-v0.103). This is the resolver default. + // older Go SDK versions (v0.61-v0.103). Opt-in via DATABRICKS_AUTH_STORAGE + // or [__settings__].auth_storage for environments where the OS keyring + // is not available. StorageModePlaintext StorageMode = "plaintext" // StorageModeSecure writes tokens to the OS-native secure store // (macOS Keychain, Windows Credential Manager, Linux Secret Service) // under the profile cache key only. No host-key entry is written. + // This is the resolver default. StorageModeSecure StorageMode = "secure" ) @@ -101,7 +105,7 @@ func ParseMode(raw string) StorageMode { // 1. override (typically from a command-level flag such as --secure-storage). // 2. DATABRICKS_AUTH_STORAGE env var. // 3. [__settings__].auth_storage in .databrickscfg. -// 4. StorageModePlaintext. +// 4. StorageModeSecure. // // StorageModeUnknown as override means "no flag set; fall through." The // override is trusted to be a valid StorageMode: callers that parse user @@ -138,7 +142,7 @@ func ResolveStorageModeWithSource(ctx context.Context, override StorageMode) (St return mode, StorageSourceConfig, err } - return StorageModePlaintext, StorageSourceDefault, nil + return StorageModeSecure, StorageSourceDefault, nil } func parseFromSource(raw, source string) (StorageMode, error) { diff --git a/libs/auth/storage/mode_test.go b/libs/auth/storage/mode_test.go index 4c6cf0e1614..431cc903801 100644 --- a/libs/auth/storage/mode_test.go +++ b/libs/auth/storage/mode_test.go @@ -41,7 +41,7 @@ func TestResolveStorageMode(t *testing.T) { }{ { name: "default when nothing is set", - want: StorageModePlaintext, + want: StorageModeSecure, }, { name: "override wins over env and config", @@ -63,8 +63,8 @@ func TestResolveStorageMode(t *testing.T) { }, { name: "config sets mode when env and override unset", - configBody: "[__settings__]\nauth_storage = secure\n", - want: StorageModeSecure, + configBody: "[__settings__]\nauth_storage = plaintext\n", + want: StorageModePlaintext, }, { name: "env value is case-insensitive and trimmed", @@ -141,19 +141,19 @@ func TestResolveStorageModeWithSource(t *testing.T) { }{ { name: "default", - wantMode: StorageModePlaintext, + wantMode: StorageModeSecure, wantSource: StorageSourceDefault, }, { name: "override", - override: StorageModeSecure, - wantMode: StorageModeSecure, + override: StorageModePlaintext, + wantMode: StorageModePlaintext, wantSource: StorageSourceOverride, }, { name: "env", - envValue: "secure", - wantMode: StorageModeSecure, + envValue: "plaintext", + wantMode: StorageModePlaintext, wantSource: StorageSourceEnvVar, }, { From be527b05fe2758b5ea69e35fe70df0dda7a637f7 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 19 May 2026 16:22:20 +0200 Subject: [PATCH 02/10] Pin secure mode after first successful login When the resolver returns secure from the default (no env, no config), login now writes auth_storage = secure to [__settings__] after the keyring Store succeeds. Subsequent invocations see source=Config, so the explicit-secure branch of applyLoginFallback surfaces a transient keyring probe failure as an error instead of silently demoting the user to plaintext. Without this pin, a working secure-storage user could get stranded on the file cache after a single flaky probe. No-op when mode is plaintext (silent fallback already happened) or when the user already chose a mode explicitly. Persistence failures are logged at debug and never block login. Co-authored-by: Isaac --- cmd/auth/login.go | 7 ++- cmd/auth/token.go | 1 + libs/auth/storage/cache.go | 32 +++++++++++++ libs/auth/storage/cache_test.go | 81 +++++++++++++++++++++++++++++++++ 4 files changed, 120 insertions(+), 1 deletion(-) diff --git a/cmd/auth/login.go b/cmd/auth/login.go index d45a77806fd..415db7df67b 100644 --- a/cmd/auth/login.go +++ b/cmd/auth/login.go @@ -295,7 +295,11 @@ a new profile is created. return err } // At this point, an OAuth token has been successfully minted and stored - // in the CLI cache. The rest of the command focuses on: + // in the CLI cache. Pin the resolved storage mode so a transient + // keyring failure on a future login can no longer silently demote a + // working secure-storage user to plaintext. + storage.PinSecureMode(ctx, mode) + // The rest of the command focuses on: // 1. Workspace selection for SPOG hosts (best-effort); // 2. Configuring cluster and serverless; // 3. Saving the profile. @@ -633,6 +637,7 @@ func discoveryLogin(ctx context.Context, in discoveryLoginInputs) error { if err := persistentAuth.Challenge(); err != nil { return discoveryErr("login via login.databricks.com failed", err) } + storage.PinSecureMode(ctx, in.mode) discoveredHost := arg.GetDiscoveredHost() if discoveredHost == "" { diff --git a/cmd/auth/token.go b/cmd/auth/token.go index d7c25ecece3..26fd0a7b047 100644 --- a/cmd/auth/token.go +++ b/cmd/auth/token.go @@ -433,6 +433,7 @@ func runInlineLogin(ctx context.Context, profiler profile.Profiler, tokenCache c if err = persistentAuth.Challenge(); err != nil { return "", nil, err } + storage.PinSecureMode(ctx, mode) clearKeys := oauthLoginClearKeys() clearKeys = append(clearKeys, databrickscfg.ExperimentalIsUnifiedHostKey) diff --git a/libs/auth/storage/cache.go b/libs/auth/storage/cache.go index 0e3f311dede..891747f6e6a 100644 --- a/libs/auth/storage/cache.go +++ b/libs/auth/storage/cache.go @@ -165,3 +165,35 @@ func persistPlaintextFallback(ctx context.Context) error { configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") return databrickscfg.SetConfiguredAuthStorage(ctx, string(StorageModePlaintext), configPath) } + +// PinSecureMode persists auth_storage = secure to [__settings__] when the +// user is currently on the secure-from-default path: mode resolved to secure +// without an explicit override, env var, or config setting. +// +// Call this after a successful keyring write (post-Challenge in login). Once +// pinned, subsequent invocations see source=Config and the explicit-secure +// branch of applyLoginFallback returns an error instead of silently demoting +// to plaintext, so a transient keyring probe failure cannot strand a working +// user on the file cache. +// +// No-op when mode is not secure (e.g. silent plaintext fallback already +// happened) or when the user already chose a mode explicitly. Persisting +// failures are logged at debug; pinning is best-effort and must not block +// login. +func PinSecureMode(ctx context.Context, mode StorageMode) { + if mode != StorageModeSecure { + return + } + _, source, err := ResolveStorageModeWithSource(ctx, "") + if err != nil { + log.Debugf(ctx, "pin secure mode: resolve: %v", err) + return + } + if source.Explicit() { + return + } + configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") + if err := databrickscfg.SetConfiguredAuthStorage(ctx, string(StorageModeSecure), configPath); err != nil { + log.Debugf(ctx, "pin secure mode: persist: %v", err) + } +} diff --git a/libs/auth/storage/cache_test.go b/libs/auth/storage/cache_test.go index 21bb487f963..e28136b5274 100644 --- a/libs/auth/storage/cache_test.go +++ b/libs/auth/storage/cache_test.go @@ -272,6 +272,87 @@ func TestApplyLoginFallback_ProbeTimeout_StaysOnKeyring(t *testing.T) { } } +func TestPinSecureMode(t *testing.T) { + cases := []struct { + name string + mode StorageMode + envValue string + configBody string + wantWritten string + wantSkipMsg string + }{ + { + name: "secure from default persists secure", + mode: StorageModeSecure, + wantWritten: "secure", + }, + { + name: "plaintext mode is a no-op", + mode: StorageModePlaintext, + wantWritten: "", + }, + { + name: "secure from env is a no-op", + mode: StorageModeSecure, + envValue: "secure", + wantWritten: "", + }, + { + name: "secure from config is a no-op (already pinned)", + mode: StorageModeSecure, + configBody: "[__settings__]\nauth_storage = secure\n", + wantWritten: "secure", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + hermetic(t) + ctx := t.Context() + configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") + if tc.configBody != "" { + require.NoError(t, os.WriteFile(configPath, []byte(tc.configBody), 0o600)) + } + if tc.envValue != "" { + ctx = env.Set(ctx, EnvVar, tc.envValue) + } + + PinSecureMode(ctx, tc.mode) + + got, err := databrickscfg.GetConfiguredAuthStorage(ctx, configPath) + require.NoError(t, err) + assert.Equal(t, tc.wantWritten, got) + }) + } +} + +func TestPinSecureMode_IsIdempotent(t *testing.T) { + hermetic(t) + ctx := t.Context() + configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") + + PinSecureMode(ctx, StorageModeSecure) + first, err := databrickscfg.GetConfiguredAuthStorage(ctx, configPath) + require.NoError(t, err) + require.Equal(t, "secure", first) + + // Second call should see source=Config and skip the write. + PinSecureMode(ctx, StorageModeSecure) + second, err := databrickscfg.GetConfiguredAuthStorage(ctx, configPath) + require.NoError(t, err) + assert.Equal(t, "secure", second) +} + +func TestPinSecureMode_PersistFailureIsSwallowed(t *testing.T) { + hermetic(t) + ctx := t.Context() + // Point DATABRICKS_CONFIG_FILE at an unwritable path so SetConfiguredAuthStorage fails. + t.Setenv("DATABRICKS_CONFIG_FILE", filepath.Join(t.TempDir(), "no-such-dir", ".databrickscfg")) + + // Must not panic or block; failures are logged at debug. + PinSecureMode(ctx, StorageModeSecure) +} + func TestWrapForOAuthArgument(t *testing.T) { const ( host = "https://example.com" From a7eeb47a4272c78493a17538cc8af0264775625a Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 19 May 2026 16:51:32 +0200 Subject: [PATCH 03/10] Address self-review nits - login.go: move PinSecureMode call out of the existing comment block so the "At this point... / The rest of the command focuses on" narration stays together - cache.go: trim PinSecureMode doc comment and acknowledge that concurrent logins racing the write is benign because both write the same value - cache_test.go: drop the unused wantSkipMsg struct field; strengthen TestPinSecureMode_PersistFailureIsSwallowed to assert no file was written (and that the underlying os.OpenFile failure is the real trigger) - u2m-secure-default test.toml: rephrase the fixture comment to keep internal Go API names out of test config Co-authored-by: Isaac --- .../describe/u2m-secure-default/test.toml | 8 ++++---- cmd/auth/login.go | 11 +++++----- libs/auth/storage/cache.go | 20 ++++++++----------- libs/auth/storage/cache_test.go | 13 +++++++++--- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/acceptance/cmd/auth/describe/u2m-secure-default/test.toml b/acceptance/cmd/auth/describe/u2m-secure-default/test.toml index 784352740f5..d0d5090e6a5 100644 --- a/acceptance/cmd/auth/describe/u2m-secure-default/test.toml +++ b/acceptance/cmd/auth/describe/u2m-secure-default/test.toml @@ -2,10 +2,10 @@ Ignore = [ "home" ] -# Normalize the platform-specific keyring lookup error. macOS returns -# cache.ErrNotFound for a clean miss; Linux without D-Bus returns a -# backend-specific error. The point of this test is the resolved storage -# mode, not the lookup outcome. +# This test runs against the real OS keyring at Lookup time (no writes). +# macOS produces a clean miss; Linux without a usable D-Bus session bus +# produces a backend error. Normalize both so the assertion stays on the +# resolved storage mode, not the lookup outcome. [[Repls]] Old = 'Unable to authenticate: error getting token: .*' New = 'Unable to authenticate: error getting token: [KEYRING_LOOKUP_ERROR]' diff --git a/cmd/auth/login.go b/cmd/auth/login.go index 415db7df67b..3a290c05e97 100644 --- a/cmd/auth/login.go +++ b/cmd/auth/login.go @@ -294,12 +294,13 @@ a new profile is created. if err = persistentAuth.Challenge(); err != nil { return err } - // At this point, an OAuth token has been successfully minted and stored - // in the CLI cache. Pin the resolved storage mode so a transient - // keyring failure on a future login can no longer silently demote a - // working secure-storage user to plaintext. + // Lock secure mode in after a successful keyring write so a later + // transient keyring probe failure cannot silently demote this user + // to plaintext. storage.PinSecureMode(ctx, mode) - // The rest of the command focuses on: + + // At this point, an OAuth token has been successfully minted and stored + // in the CLI cache. The rest of the command focuses on: // 1. Workspace selection for SPOG hosts (best-effort); // 2. Configuring cluster and serverless; // 3. Saving the profile. diff --git a/libs/auth/storage/cache.go b/libs/auth/storage/cache.go index 891747f6e6a..8b146379588 100644 --- a/libs/auth/storage/cache.go +++ b/libs/auth/storage/cache.go @@ -167,19 +167,15 @@ func persistPlaintextFallback(ctx context.Context) error { } // PinSecureMode persists auth_storage = secure to [__settings__] when the -// user is currently on the secure-from-default path: mode resolved to secure -// without an explicit override, env var, or config setting. +// user is currently on the secure-from-default path. Once pinned, subsequent +// invocations see source=Config (explicit), so applyLoginFallback returns an +// error on a transient keyring probe failure instead of silently demoting +// the user to plaintext. // -// Call this after a successful keyring write (post-Challenge in login). Once -// pinned, subsequent invocations see source=Config and the explicit-secure -// branch of applyLoginFallback returns an error instead of silently demoting -// to plaintext, so a transient keyring probe failure cannot strand a working -// user on the file cache. -// -// No-op when mode is not secure (e.g. silent plaintext fallback already -// happened) or when the user already chose a mode explicitly. Persisting -// failures are logged at debug; pinning is best-effort and must not block -// login. +// No-op when mode is not secure or when the user already chose a mode +// explicitly. Best-effort: persistence failures are logged at debug and +// never block login. Concurrent logins racing this write is benign because +// both write the same value. func PinSecureMode(ctx context.Context, mode StorageMode) { if mode != StorageModeSecure { return diff --git a/libs/auth/storage/cache_test.go b/libs/auth/storage/cache_test.go index e28136b5274..32979c00c7b 100644 --- a/libs/auth/storage/cache_test.go +++ b/libs/auth/storage/cache_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "io/fs" "os" "path/filepath" "testing" @@ -279,7 +280,6 @@ func TestPinSecureMode(t *testing.T) { envValue string configBody string wantWritten string - wantSkipMsg string }{ { name: "secure from default persists secure", @@ -346,11 +346,18 @@ func TestPinSecureMode_IsIdempotent(t *testing.T) { func TestPinSecureMode_PersistFailureIsSwallowed(t *testing.T) { hermetic(t) ctx := t.Context() - // Point DATABRICKS_CONFIG_FILE at an unwritable path so SetConfiguredAuthStorage fails. - t.Setenv("DATABRICKS_CONFIG_FILE", filepath.Join(t.TempDir(), "no-such-dir", ".databrickscfg")) + // Point DATABRICKS_CONFIG_FILE at a path whose parent does not exist. + // loadOrCreateConfigFile does not mkdir, so the underlying os.OpenFile + // fails and SetConfiguredAuthStorage returns an error. + configPath := filepath.Join(t.TempDir(), "no-such-dir", ".databrickscfg") + t.Setenv("DATABRICKS_CONFIG_FILE", configPath) // Must not panic or block; failures are logged at debug. PinSecureMode(ctx, StorageModeSecure) + + // The persist failure must not have produced any file. + _, err := os.Stat(configPath) + assert.ErrorIs(t, err, fs.ErrNotExist, "no file should have been written") } func TestWrapForOAuthArgument(t *testing.T) { From 1561bfaa9ff0cb2bac1742e161ae071c03aebe5b Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 19 May 2026 17:10:35 +0200 Subject: [PATCH 04/10] Fix linux CI: don't mutate the .databrickscfg fixture during login tests After the default flipped to secure, any test that runs the login command on linux (no D-Bus) hits applyLoginFallback, which silently persists auth_storage = plaintext to whatever DATABRICKS_CONFIG_FILE points at. TestProfileHostCompatibleViaCobra points it at the checked- in cmd/auth/testdata/.databrickscfg fixture, so the test run leaves a dirty working tree and CI's `git diff --exit-code` step fails. Two changes: 1. Move ResolveCacheForLogin in login.go to run after input validation (cluster/serverless mutex + positional-arg check) rather than before. Trivially-invalid commands now fail without probing the keyring, so TestLoginRejectsPositionalArgWithHostFlag / WithProfileFlag no longer hit applyLoginFallback. The "resolve before browser step" property the original comment cared about is preserved: cache resolution still happens before NewPersistentAuth and Challenge. 2. Force plaintext via DATABRICKS_AUTH_STORAGE in TestProfileHostCompatibleViaCobra, which legitimately passes all input validation and reaches the resolver. The test is about flag compatibility, not storage mode; pinning it to plaintext keeps it hermetic. Co-authored-by: Isaac --- cmd/auth/auth_test.go | 5 +++++ cmd/auth/login.go | 19 ++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/cmd/auth/auth_test.go b/cmd/auth/auth_test.go index fc7e5d533d4..13aa916a906 100644 --- a/cmd/auth/auth_test.go +++ b/cmd/auth/auth_test.go @@ -4,6 +4,7 @@ import ( "testing" "github.com/databricks/cli/cmd/root" + "github.com/databricks/cli/libs/auth/storage" "github.com/databricks/cli/libs/cmdctx" "github.com/databricks/cli/libs/databrickscfg/profile" "github.com/stretchr/testify/assert" @@ -114,6 +115,10 @@ func TestProfileHostConflictTokenViaCobra(t *testing.T) { // NOT with a conflict error). func TestProfileHostCompatibleViaCobra(t *testing.T) { t.Setenv("DATABRICKS_CONFIG_FILE", "./testdata/.databrickscfg") + // Force plaintext so the storage resolver does not probe the OS keyring + // and silently persist auth_storage = plaintext to the checked-in fixture + // on CI runners without a usable keyring. + t.Setenv(storage.EnvVar, "plaintext") ctx := cmdctx.GenerateExecId(t.Context()) cli := root.New(ctx) diff --git a/cmd/auth/login.go b/cmd/auth/login.go index 3a290c05e97..4491b422106 100644 --- a/cmd/auth/login.go +++ b/cmd/auth/login.go @@ -144,15 +144,6 @@ a new profile is created. ctx := cmd.Context() profileName := cmd.Flag("profile").Value.String() - // Resolve the cache before the browser step so an unavailable - // keyring surfaces here rather than after OAuth. The probe also - // triggers the OS unlock prompt, which the user can answer during - // OAuth. - tokenCache, mode, err := storage.ResolveCacheForLogin(ctx, "") - if err != nil { - return err - } - // Cluster and Serverless are mutually exclusive. if configureCluster && configureServerless { return errors.New("please either configure serverless or cluster, not both") @@ -178,6 +169,16 @@ a new profile is created. } } + // Resolve the cache before the browser step so an unavailable + // keyring surfaces here rather than after OAuth. The probe also + // triggers the OS unlock prompt, which the user can answer during + // OAuth. Run after input validation so trivially-invalid commands + // fail without probing. + tokenCache, mode, err := storage.ResolveCacheForLogin(ctx, "") + if err != nil { + return err + } + // When interactive and nothing was specified, show a picker that lets // the user re-login to an existing profile, create a new one, or enter // a host URL. With no profiles configured the picker still shows the From ba6668215a0c1f7f52482ae04dffd5f166720cb3 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 20 May 2026 11:35:14 +0200 Subject: [PATCH 05/10] Wrap unreachable-keyring errors on the read path Before: a databricks-cli auth command on a system without a usable OS keyring (Linux without D-Bus, headless containers, certain SSH sessions) surfaced the raw backend error, typically something like "Cannot autolaunch D-Bus without X11 $DISPLAY". The user had no way to tell that switching storage modes would fix it. After: keyringCache.Lookup wraps non-ErrNotFound errors with actionable guidance pointing at DATABRICKS_AUTH_STORAGE=plaintext and `databricks auth login` (which auto-falls back to plaintext when the keyring is unreachable). ErrNotFound passes through unchanged because a clean miss is the "run auth login" signal, not an unreachability problem. Store is left alone: the login Challenge path is already handled by applyLoginFallback with mode-aware messaging, so wrapping Store would duplicate the hint inside "secure storage was requested but the OS keyring is not reachable: ...". Co-authored-by: Isaac --- libs/auth/storage/keyring.go | 20 +++++++++++++++++++- libs/auth/storage/keyring_test.go | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/libs/auth/storage/keyring.go b/libs/auth/storage/keyring.go index 8ce87525b35..f79fa6a628d 100644 --- a/libs/auth/storage/keyring.go +++ b/libs/auth/storage/keyring.go @@ -149,7 +149,7 @@ func (k *keyringCache) Lookup(key string) (*oauth2.Token, error) { return nil, cache.ErrNotFound } if err != nil { - return nil, err + return nil, wrapKeyringUnreachable(err) } var entry keyringEntry @@ -159,6 +159,24 @@ func (k *keyringCache) Lookup(key string) (*oauth2.Token, error) { return entry.Token, nil } +// wrapKeyringUnreachable wraps a non-ErrNotFound keyring error with +// actionable guidance for users whose system has no usable keyring +// backend (Linux without a Secret Service / D-Bus session bus, headless +// containers, certain SSH sessions). Surfaces on the read path, where +// the resolver does not silently fall back to plaintext: a missing +// token might actually be reachable from the keyring on another machine, +// so we surface the unreachability instead of minting a fresh plaintext +// copy. +// +// ErrNotFound passes through unchanged because a clean miss is not an +// availability problem. +func wrapKeyringUnreachable(err error) error { + if err == nil || errors.Is(err, keyring.ErrNotFound) { + return err + } + return fmt.Errorf("OS keyring unreachable: %w (set DATABRICKS_AUTH_STORAGE=plaintext or run `databricks auth login` to use file-based token storage)", err) +} + // Compile-time confirmation that keyringCache satisfies the SDK interface. var _ cache.TokenCache = (*keyringCache)(nil) diff --git a/libs/auth/storage/keyring_test.go b/libs/auth/storage/keyring_test.go index f2b933ee42b..e200bc4b856 100644 --- a/libs/auth/storage/keyring_test.go +++ b/libs/auth/storage/keyring_test.go @@ -137,6 +137,25 @@ func TestKeyringCache_Lookup_PropagatesOtherErrors(t *testing.T) { _, err := c.Lookup("my-profile") require.Error(t, err) assert.ErrorIs(t, err, boom) + // The non-ErrNotFound path wraps the backend error with actionable + // guidance so users on systems without a usable keyring backend + // (e.g. headless Linux) know what to do. + assert.Contains(t, err.Error(), "OS keyring unreachable") + assert.Contains(t, err.Error(), "DATABRICKS_AUTH_STORAGE=plaintext") + assert.Contains(t, err.Error(), "databricks auth login") +} + +// ErrNotFound has to pass through unwrapped because callers branch on it +// (cache.ErrNotFound is the "no token, please log in" signal). Wrapping it +// with the unreachability hint would mislead the user. +func TestKeyringCache_Lookup_NotFoundIsNotWrapped(t *testing.T) { + backend := newFakeBackend() + c := newTestCache(backend) + + _, err := c.Lookup("nope") + require.Error(t, err) + assert.ErrorIs(t, err, cache.ErrNotFound) + assert.NotContains(t, err.Error(), "OS keyring unreachable") } func TestKeyringCache_Lookup_CorruptedJSONReturnsError(t *testing.T) { From 8a8ce50d1b4ff13ae471f05897ec12dff2f7a236 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 20 May 2026 12:08:41 +0200 Subject: [PATCH 06/10] Surface 'run auth login' hint when no cached credentials exist Before: the SDK's cache.ErrNotFound rendered as "cache: token not found" at the CLI surface. Pre-flip, plaintext users who never ran auth login or were missing a profile saw that message. Post-flip, every user with an existing plaintext token sees it too until they re-login, since the secure resolver does not read token-cache.json. After: ResolveCache and ResolveCacheForLogin wrap their result in a notFoundHintCache. When Lookup returns ErrNotFound: - If mode=secure and ~/.databricks/token-cache.json exists with entries, the error reads "stored credentials from older CLI versions are no longer used; run `databricks auth login` to sign in again, or set DATABRICKS_AUTH_STORAGE=plaintext to keep using the file cache". This is the upgrade scenario from the CLI Secure Storage Troubleshooting doc. - Otherwise the error reads "no cached credentials; run `databricks auth login` to sign in". A custom notFoundHint type backs both messages so they replace the SDK's "token not found" string rather than getting appended to it; the SDK's outer "cache: %w" wrap then produces clean output like "cache: no cached credentials; run `databricks auth login` to sign in" instead of "cache: : token not found". errors.Is(err, cache.ErrNotFound) continues to return true via the Unwrap method, so the SDK's existing branches on ErrNotFound still fire. The wrap lives in the exported ResolveCache / ResolveCacheForLogin, not in the internal resolveCacheWith / resolveCacheForLoginWith test helpers, so existing unit tests that type-assert on the inner cache type still work. Co-authored-by: Isaac --- .../describe/u2m-plaintext-config/output.txt | 2 +- .../describe/u2m-plaintext-env/output.txt | 2 +- libs/auth/storage/cache.go | 12 +- libs/auth/storage/not_found_hint.go | 93 +++++++++++ libs/auth/storage/not_found_hint_test.go | 149 ++++++++++++++++++ 5 files changed, 254 insertions(+), 4 deletions(-) create mode 100644 libs/auth/storage/not_found_hint.go create mode 100644 libs/auth/storage/not_found_hint_test.go diff --git a/acceptance/cmd/auth/describe/u2m-plaintext-config/output.txt b/acceptance/cmd/auth/describe/u2m-plaintext-config/output.txt index d1e522be97b..85c9edca654 100644 --- a/acceptance/cmd/auth/describe/u2m-plaintext-config/output.txt +++ b/acceptance/cmd/auth/describe/u2m-plaintext-config/output.txt @@ -1,7 +1,7 @@ >>> [CLI] auth describe --profile u2m-profile Warn: [hostmetadata] failed to fetch host metadata for https://u2m-profile.databricks.test, will skip for 1m0s -Unable to authenticate: error getting token: cache: token not found +Unable to authenticate: error getting token: cache: no cached credentials; run `databricks auth login` to sign in Token storage: plaintext, ~/.databricks/token-cache.json (from auth_storage in [__settings__] section of [TEST_TMP_DIR]/home/.databrickscfg) ----- Current configuration: diff --git a/acceptance/cmd/auth/describe/u2m-plaintext-env/output.txt b/acceptance/cmd/auth/describe/u2m-plaintext-env/output.txt index edfdc0a1939..10d1896b040 100644 --- a/acceptance/cmd/auth/describe/u2m-plaintext-env/output.txt +++ b/acceptance/cmd/auth/describe/u2m-plaintext-env/output.txt @@ -1,7 +1,7 @@ >>> [CLI] auth describe --profile u2m-profile Warn: [hostmetadata] failed to fetch host metadata for https://u2m-profile.databricks.test, will skip for 1m0s -Unable to authenticate: error getting token: cache: token not found +Unable to authenticate: error getting token: cache: no cached credentials; run `databricks auth login` to sign in Token storage: plaintext, ~/.databricks/token-cache.json (from DATABRICKS_AUTH_STORAGE environment variable) ----- Current configuration: diff --git a/libs/auth/storage/cache.go b/libs/auth/storage/cache.go index 8b146379588..370f2ae25cd 100644 --- a/libs/auth/storage/cache.go +++ b/libs/auth/storage/cache.go @@ -41,7 +41,11 @@ func defaultCacheFactories() cacheFactories { // through u2m.WithTokenCache, otherwise the SDK defaults to the file cache // and splits the user's tokens across two backends. func ResolveCache(ctx context.Context, override StorageMode) (cache.TokenCache, StorageMode, error) { - return resolveCacheWith(ctx, override, defaultCacheFactories()) + inner, mode, err := resolveCacheWith(ctx, override, defaultCacheFactories()) + if err != nil { + return nil, "", err + } + return withNotFoundHint(ctx, inner, mode), mode, nil } // ResolveCacheForLogin resolves the cache like ResolveCache with extra rules @@ -63,7 +67,11 @@ func ResolveCache(ctx context.Context, override StorageMode) (cache.TokenCache, // keyring error so they don't silently mint plaintext copies of tokens that // were stored in the keyring on another machine. func ResolveCacheForLogin(ctx context.Context, override StorageMode) (cache.TokenCache, StorageMode, error) { - return resolveCacheForLoginWith(ctx, override, defaultCacheFactories()) + inner, mode, err := resolveCacheForLoginWith(ctx, override, defaultCacheFactories()) + if err != nil { + return nil, "", err + } + return withNotFoundHint(ctx, inner, mode), mode, nil } // WrapForOAuthArgument wraps tokenCache so SDK-side writes (Challenge, refresh) diff --git a/libs/auth/storage/not_found_hint.go b/libs/auth/storage/not_found_hint.go new file mode 100644 index 00000000000..28ebd1e60b2 --- /dev/null +++ b/libs/auth/storage/not_found_hint.go @@ -0,0 +1,93 @@ +package storage + +import ( + "context" + "encoding/json" + "errors" + "os" + "path/filepath" + + "github.com/databricks/cli/libs/env" + "github.com/databricks/databricks-sdk-go/credentials/u2m/cache" + "golang.org/x/oauth2" +) + +// notFoundHintCache wraps a TokenCache so Lookup returns ErrNotFound with +// a hint pointing the user at `databricks auth login`. When mode is secure +// and the legacy file-backed cache has entries, the hint uses the upgrade- +// specific copy so users who logged in with an older CLI version know why +// their cached credentials are no longer being read. +// +// errors.Is(err, cache.ErrNotFound) continues to return true because the +// wrap uses %w; the SDK's branches on ErrNotFound still fire. +// +// Store is delegated unchanged; only Lookup needs the message polish. +type notFoundHintCache struct { + inner cache.TokenCache + mode StorageMode + legacyCachePath string +} + +func (c *notFoundHintCache) Store(key string, t *oauth2.Token) error { + return c.inner.Store(key, t) +} + +func (c *notFoundHintCache) Lookup(key string) (*oauth2.Token, error) { + tok, err := c.inner.Lookup(key) + if err == nil || !errors.Is(err, cache.ErrNotFound) { + return tok, err + } + if c.mode == StorageModeSecure && legacyCacheHasTokens(c.legacyCachePath) { + return nil, ¬FoundHint{msg: "stored credentials from older CLI versions are no longer used; run `databricks auth login` to sign in again, or set DATABRICKS_AUTH_STORAGE=plaintext to keep using the file cache"} + } + return nil, ¬FoundHint{msg: "no cached credentials; run `databricks auth login` to sign in"} +} + +// notFoundHint replaces cache.ErrNotFound's terse "token not found" string +// with an actionable message while still satisfying errors.Is(err, +// cache.ErrNotFound). The SDK's loadToken wraps every cache error with +// "cache: %w", and fmt.Errorf("...: %w", ErrNotFound) would tack the +// original "token not found" onto the end of our hint, producing +// "cache: : token not found". A custom type lets us own the +// rendered message while still unwrapping to ErrNotFound for callers +// that branch on it. +type notFoundHint struct { + msg string +} + +func (e *notFoundHint) Error() string { return e.msg } +func (e *notFoundHint) Unwrap() error { return cache.ErrNotFound } + +// withNotFoundHint wraps inner so ErrNotFound from Lookup carries an +// actionable hint. The legacy file path is resolved up front (where ctx +// is available) so Lookup can do its check without needing a context. +// +// Resolution failures for the home directory are not fatal: an empty +// legacyCachePath simply disables the upgrade-specific message, which +// falls back to the generic "run auth login" hint. +func withNotFoundHint(ctx context.Context, inner cache.TokenCache, mode StorageMode) cache.TokenCache { + var legacyCachePath string + if home, err := env.UserHomeDir(ctx); err == nil { + legacyCachePath = filepath.Join(home, tokenCacheFilePath) + } + return ¬FoundHintCache{inner: inner, mode: mode, legacyCachePath: legacyCachePath} +} + +// legacyCacheHasTokens reports whether the file at path is a valid token +// cache with at least one entry. Best-effort and read-only: any I/O or +// parse error returns false so we never claim "you have legacy tokens" +// when we cannot actually tell. +func legacyCacheHasTokens(path string) bool { + if path == "" { + return false + } + raw, err := os.ReadFile(path) + if err != nil { + return false + } + var f tokenCacheFile + if err := json.Unmarshal(raw, &f); err != nil { + return false + } + return len(f.Tokens) > 0 +} diff --git a/libs/auth/storage/not_found_hint_test.go b/libs/auth/storage/not_found_hint_test.go new file mode 100644 index 00000000000..53c46fd5b1d --- /dev/null +++ b/libs/auth/storage/not_found_hint_test.go @@ -0,0 +1,149 @@ +package storage + +import ( + "encoding/json" + "errors" + "os" + "path/filepath" + "testing" + + "github.com/databricks/databricks-sdk-go/credentials/u2m/cache" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/oauth2" +) + +// missingCache always returns ErrNotFound on Lookup. Lets us drive the +// wrapper without going through the real file or keyring cache. +type missingCache struct{} + +func (missingCache) Store(string, *oauth2.Token) error { return nil } +func (missingCache) Lookup(string) (*oauth2.Token, error) { return nil, cache.ErrNotFound } + +// foundCache always returns a token. Used to confirm the wrapper passes +// successful lookups through unchanged. +type foundCache struct{ tok *oauth2.Token } + +func (c foundCache) Store(string, *oauth2.Token) error { return nil } +func (c foundCache) Lookup(string) (*oauth2.Token, error) { return c.tok, nil } + +// boomCache returns a non-ErrNotFound error. The wrapper must not add a +// "run auth login" hint here; the error is about something else. +type boomCache struct{ err error } + +func (c boomCache) Store(string, *oauth2.Token) error { return nil } +func (c boomCache) Lookup(string) (*oauth2.Token, error) { return nil, c.err } + +func writeLegacyCache(t *testing.T, path string, hasEntries bool) { + t.Helper() + require.NoError(t, os.MkdirAll(filepath.Dir(path), 0o700)) + tokens := map[string]*oauth2.Token{} + if hasEntries { + tokens["my-profile"] = &oauth2.Token{AccessToken: "abc"} + } + body, err := json.Marshal(tokenCacheFile{Version: tokenCacheVersion, Tokens: tokens}) + require.NoError(t, err) + require.NoError(t, os.WriteFile(path, body, 0o600)) +} + +func TestNotFoundHintCache_SecureWithLegacyEntries_UsesUpgradeMessage(t *testing.T) { + tmp := t.TempDir() + legacyPath := filepath.Join(tmp, tokenCacheFilePath) + writeLegacyCache(t, legacyPath, true) + + c := ¬FoundHintCache{inner: missingCache{}, mode: StorageModeSecure, legacyCachePath: legacyPath} + _, err := c.Lookup("anything") + require.Error(t, err) + assert.ErrorIs(t, err, cache.ErrNotFound) + assert.Contains(t, err.Error(), "stored credentials from older CLI versions") + assert.Contains(t, err.Error(), "databricks auth login") + assert.Contains(t, err.Error(), "DATABRICKS_AUTH_STORAGE=plaintext") +} + +func TestNotFoundHintCache_SecureWithEmptyLegacyFile_UsesGenericMessage(t *testing.T) { + tmp := t.TempDir() + legacyPath := filepath.Join(tmp, tokenCacheFilePath) + writeLegacyCache(t, legacyPath, false) + + c := ¬FoundHintCache{inner: missingCache{}, mode: StorageModeSecure, legacyCachePath: legacyPath} + _, err := c.Lookup("anything") + require.Error(t, err) + assert.ErrorIs(t, err, cache.ErrNotFound) + assert.Contains(t, err.Error(), "no cached credentials") + assert.NotContains(t, err.Error(), "stored credentials from older CLI versions") +} + +func TestNotFoundHintCache_SecureNoLegacyFile_UsesGenericMessage(t *testing.T) { + c := ¬FoundHintCache{inner: missingCache{}, mode: StorageModeSecure, legacyCachePath: filepath.Join(t.TempDir(), "missing.json")} + _, err := c.Lookup("anything") + require.Error(t, err) + assert.ErrorIs(t, err, cache.ErrNotFound) + assert.Contains(t, err.Error(), "no cached credentials") +} + +func TestNotFoundHintCache_Plaintext_AlwaysGenericMessage(t *testing.T) { + tmp := t.TempDir() + legacyPath := filepath.Join(tmp, tokenCacheFilePath) + writeLegacyCache(t, legacyPath, true) + + // Even with a populated legacy file present, plaintext mode reads from + // that same file, so the upgrade copy would be misleading. + c := ¬FoundHintCache{inner: missingCache{}, mode: StorageModePlaintext, legacyCachePath: legacyPath} + _, err := c.Lookup("anything") + require.Error(t, err) + assert.ErrorIs(t, err, cache.ErrNotFound) + assert.Contains(t, err.Error(), "no cached credentials") + assert.NotContains(t, err.Error(), "stored credentials from older CLI versions") +} + +func TestNotFoundHintCache_NonErrNotFound_PassesThrough(t *testing.T) { + boom := errors.New("backend blew up") + c := ¬FoundHintCache{inner: boomCache{err: boom}, mode: StorageModeSecure, legacyCachePath: ""} + _, err := c.Lookup("anything") + require.Error(t, err) + assert.ErrorIs(t, err, boom) + assert.NotContains(t, err.Error(), "no cached credentials") +} + +func TestNotFoundHintCache_SuccessfulLookupUnchanged(t *testing.T) { + tok := &oauth2.Token{AccessToken: "abc"} + c := ¬FoundHintCache{inner: foundCache{tok: tok}, mode: StorageModeSecure, legacyCachePath: ""} + got, err := c.Lookup("anything") + require.NoError(t, err) + assert.Equal(t, tok, got) +} + +func TestNotFoundHintCache_StoreIsDelegated(t *testing.T) { + c := ¬FoundHintCache{inner: missingCache{}, mode: StorageModeSecure, legacyCachePath: ""} + require.NoError(t, c.Store("k", &oauth2.Token{AccessToken: "abc"})) +} + +func TestLegacyCacheHasTokens(t *testing.T) { + tmp := t.TempDir() + + t.Run("empty path returns false", func(t *testing.T) { + assert.False(t, legacyCacheHasTokens("")) + }) + + t.Run("missing file returns false", func(t *testing.T) { + assert.False(t, legacyCacheHasTokens(filepath.Join(tmp, "missing.json"))) + }) + + t.Run("garbage file returns false", func(t *testing.T) { + p := filepath.Join(tmp, "garbage.json") + require.NoError(t, os.WriteFile(p, []byte("not json"), 0o600)) + assert.False(t, legacyCacheHasTokens(p)) + }) + + t.Run("empty token map returns false", func(t *testing.T) { + p := filepath.Join(tmp, "empty.json") + writeLegacyCache(t, p, false) + assert.False(t, legacyCacheHasTokens(p)) + }) + + t.Run("populated token map returns true", func(t *testing.T) { + p := filepath.Join(tmp, "populated.json") + writeLegacyCache(t, p, true) + assert.True(t, legacyCacheHasTokens(p)) + }) +} From 03c3d022e0809dcaa399fb8f1ec4c051871fffac Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 20 May 2026 12:15:13 +0200 Subject: [PATCH 07/10] Address PR review comments from Pieter 1. NEXT_CHANGELOG: drop "U2M" jargon in favor of "OAuth tokens for interactive logins (auth_type = databricks-cli)". 2. TestProfileHostCompatibleViaCobra: copy the testdata fixture into a t.TempDir() and point DATABRICKS_CONFIG_FILE there, instead of forcing DATABRICKS_AUTH_STORAGE=plaintext to suppress the silent plaintext-fallback write. The temp-file approach is more hermetic and removes the question Pieter raised about whether the env var alters cfg behavior. The storage import in auth_test.go is dropped. 3. persistPlaintextFallback: log internally at debug instead of returning the error for the caller to log. Same shape as PinSecureMode, which was already doing this. Call site in applyLoginFallback drops the err check. 4. PinSecureMode: pass StorageModeUnknown (the named "no override selected" constant) instead of "" to ResolveStorageModeWithSource. Co-authored-by: Isaac --- NEXT_CHANGELOG.md | 2 +- cmd/auth/auth_test.go | 18 +++++++++++------- libs/auth/storage/cache.go | 18 +++++++++--------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 32cfb74a43f..087a737059d 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -4,7 +4,7 @@ ### Notable Changes -* Breaking change: U2M (`auth_type = databricks-cli`) tokens are now stored in the OS-native secure store by default (Keychain on macOS, Credential Manager on Windows, Secret Service on Linux) instead of `~/.databricks/token-cache.json`. After upgrading, run `databricks auth login` once per profile to re-authenticate; cached tokens from older versions are not migrated. To keep the previous file-backed storage, set `DATABRICKS_AUTH_STORAGE=plaintext` or add `auth_storage = plaintext` under `[__settings__]` in `~/.databrickscfg` (the env var takes precedence over the config setting), then re-run `databricks auth login`. +* Breaking change: OAuth tokens for interactive logins (`auth_type = databricks-cli`) are now stored in the OS-native secure store by default (Keychain on macOS, Credential Manager on Windows, Secret Service on Linux) instead of `~/.databricks/token-cache.json`. After upgrading, run `databricks auth login` once per profile to re-authenticate; cached tokens from older versions are not migrated. To keep the previous file-backed storage, set `DATABRICKS_AUTH_STORAGE=plaintext` or add `auth_storage = plaintext` under `[__settings__]` in `~/.databrickscfg` (the env var takes precedence over the config setting), then re-run `databricks auth login`. ### CLI diff --git a/cmd/auth/auth_test.go b/cmd/auth/auth_test.go index 13aa916a906..9dbf04fc664 100644 --- a/cmd/auth/auth_test.go +++ b/cmd/auth/auth_test.go @@ -1,10 +1,11 @@ package auth import ( + "os" + "path/filepath" "testing" "github.com/databricks/cli/cmd/root" - "github.com/databricks/cli/libs/auth/storage" "github.com/databricks/cli/libs/cmdctx" "github.com/databricks/cli/libs/databrickscfg/profile" "github.com/stretchr/testify/assert" @@ -114,11 +115,14 @@ func TestProfileHostConflictTokenViaCobra(t *testing.T) { // pass the conflict check (the command will fail later for other reasons, but // NOT with a conflict error). func TestProfileHostCompatibleViaCobra(t *testing.T) { - t.Setenv("DATABRICKS_CONFIG_FILE", "./testdata/.databrickscfg") - // Force plaintext so the storage resolver does not probe the OS keyring - // and silently persist auth_storage = plaintext to the checked-in fixture - // on CI runners without a usable keyring. - t.Setenv(storage.EnvVar, "plaintext") + // Copy the fixture into a temp directory so the auth login flow's writes + // (e.g. silent plaintext-fallback persistence on CI runners without a + // usable keyring) cannot dirty the checked-in fixture. + configPath := filepath.Join(t.TempDir(), ".databrickscfg") + fixture, err := os.ReadFile("./testdata/.databrickscfg") + require.NoError(t, err) + require.NoError(t, os.WriteFile(configPath, fixture, 0o600)) + t.Setenv("DATABRICKS_CONFIG_FILE", configPath) ctx := cmdctx.GenerateExecId(t.Context()) cli := root.New(ctx) @@ -130,7 +134,7 @@ func TestProfileHostCompatibleViaCobra(t *testing.T) { "--host", "https://www.host1.test", }) - _, err := cli.ExecuteContextC(ctx) + _, err = cli.ExecuteContextC(ctx) // The command may fail for other reasons (no browser, non-interactive, etc.) // but it should NOT fail with a conflict error. if err != nil { diff --git a/libs/auth/storage/cache.go b/libs/auth/storage/cache.go index 370f2ae25cd..0d68ea58831 100644 --- a/libs/auth/storage/cache.go +++ b/libs/auth/storage/cache.go @@ -151,9 +151,7 @@ func applyLoginFallback(ctx context.Context, mode StorageMode, explicit bool, f if fileErr != nil { return nil, "", fmt.Errorf("open file token cache: %w", fileErr) } - if err := persistPlaintextFallback(ctx); err != nil { - log.Debugf(ctx, "persisting auth_storage fallback failed: %v", err) - } + persistPlaintextFallback(ctx) return fileCache, StorageModePlaintext, nil } return f.newKeyring(), mode, nil @@ -167,11 +165,13 @@ func applyLoginFallback(ctx context.Context, mode StorageMode, explicit bool, f // probe and route straight to the file cache. // // Only called on the (mode=Secure, explicit=false) probe-failure branch. -// Persisting the fallback lets default-on-broken-keyring users avoid a 3s -// probe on every command. -func persistPlaintextFallback(ctx context.Context) error { +// Best-effort: persistence failures are logged at debug and never block +// login. +func persistPlaintextFallback(ctx context.Context) { configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") - return databrickscfg.SetConfiguredAuthStorage(ctx, string(StorageModePlaintext), configPath) + if err := databrickscfg.SetConfiguredAuthStorage(ctx, string(StorageModePlaintext), configPath); err != nil { + log.Debugf(ctx, "persist auth_storage=plaintext fallback failed: %v", err) + } } // PinSecureMode persists auth_storage = secure to [__settings__] when the @@ -188,7 +188,7 @@ func PinSecureMode(ctx context.Context, mode StorageMode) { if mode != StorageModeSecure { return } - _, source, err := ResolveStorageModeWithSource(ctx, "") + _, source, err := ResolveStorageModeWithSource(ctx, StorageModeUnknown) if err != nil { log.Debugf(ctx, "pin secure mode: resolve: %v", err) return @@ -198,6 +198,6 @@ func PinSecureMode(ctx context.Context, mode StorageMode) { } configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") if err := databrickscfg.SetConfiguredAuthStorage(ctx, string(StorageModeSecure), configPath); err != nil { - log.Debugf(ctx, "pin secure mode: persist: %v", err) + log.Debugf(ctx, "persist auth_storage=secure pin failed: %v", err) } } From a15feae13e1b7d94611528e8bb7e4d80dd195d5c Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 20 May 2026 12:46:23 +0200 Subject: [PATCH 08/10] Fall back to file cache on read when keyring is unreachable Mirrors the login-path fallback for the read path. When the resolver returns (mode=Secure, source=Default) and the keyring probes as definitively unreachable, ResolveCache now returns the file cache instead of the keyring. This means users upgrading on systems without a usable keyring (e.g. Linux containers without a D-Bus session bus) keep reading from their pre-upgrade token-cache.json without any manual configuration. Two behaviours intentionally differ from the login-path fallback: 1. The probe is read-only (ProbeKeyringRead does a Get on a non-existent account name, treating keyring.ErrNotFound as "reachable"). The login-path probe still uses the write+delete cycle because login needs to validate write capability before committing tokens. 2. The read-path fallback does NOT persist auth_storage = plaintext to [__settings__]. A read failure could be transient (e.g. a momentarily broken D-Bus connection) and we don't have strong enough evidence to pin the user out of the keyring permanently. Pinning stays the responsibility of the login command, which has the stronger write-probe signal and an explicit user action. Timeouts in the read probe stay on the keyring (same logic as the login path): a locked keyring being unlocked is the most common timeout case, and a misdiagnosed hang fails the actual Lookup, which is preferable to silently routing reads to a different backend. Explicit-secure callers (env var or config) get the keyring cache regardless of probe result, so an explicit "I want secure" request surfaces the unreachability instead of being silently downgraded. Co-authored-by: Isaac --- NEXT_CHANGELOG.md | 2 +- libs/auth/storage/cache.go | 85 ++++++++++++++++++--- libs/auth/storage/cache_test.go | 121 ++++++++++++++++++++++++++++-- libs/auth/storage/keyring.go | 32 ++++++++ libs/auth/storage/keyring_test.go | 56 ++++++++++++++ 5 files changed, 280 insertions(+), 16 deletions(-) diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 087a737059d..39ed2bc07af 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -4,7 +4,7 @@ ### Notable Changes -* Breaking change: OAuth tokens for interactive logins (`auth_type = databricks-cli`) are now stored in the OS-native secure store by default (Keychain on macOS, Credential Manager on Windows, Secret Service on Linux) instead of `~/.databricks/token-cache.json`. After upgrading, run `databricks auth login` once per profile to re-authenticate; cached tokens from older versions are not migrated. To keep the previous file-backed storage, set `DATABRICKS_AUTH_STORAGE=plaintext` or add `auth_storage = plaintext` under `[__settings__]` in `~/.databrickscfg` (the env var takes precedence over the config setting), then re-run `databricks auth login`. +* Breaking change: OAuth tokens for interactive logins (`auth_type = databricks-cli`) are now stored in the OS-native secure store by default (Keychain on macOS, Credential Manager on Windows, Secret Service on Linux) instead of `~/.databricks/token-cache.json`. After upgrading, run `databricks auth login` once per profile to re-authenticate; cached tokens from older versions are not migrated. To keep the previous file-backed storage, set `DATABRICKS_AUTH_STORAGE=plaintext` or add `auth_storage = plaintext` under `[__settings__]` in `~/.databrickscfg` (the env var takes precedence over the config setting), then re-run `databricks auth login`. On systems where the OS keyring is not reachable (e.g. Linux containers without a D-Bus session bus), the CLI transparently falls back to the file cache when reading tokens so legacy `token-cache.json` entries remain accessible without manual configuration. ### CLI diff --git a/libs/auth/storage/cache.go b/libs/auth/storage/cache.go index 0d68ea58831..b449a5aabf1 100644 --- a/libs/auth/storage/cache.go +++ b/libs/auth/storage/cache.go @@ -16,17 +16,19 @@ import ( // so unit tests can inject stubs without hitting the real OS keyring or // filesystem. Production code uses defaultCacheFactories(). type cacheFactories struct { - newFile func(context.Context) (cache.TokenCache, error) - newKeyring func() cache.TokenCache - probeKeyring func() error + newFile func(context.Context) (cache.TokenCache, error) + newKeyring func() cache.TokenCache + probeKeyring func() error + probeKeyringRead func() error } // defaultCacheFactories returns the production factory set. func defaultCacheFactories() cacheFactories { return cacheFactories{ - newFile: func(ctx context.Context) (cache.TokenCache, error) { return NewFileTokenCache(ctx) }, - newKeyring: NewKeyringCache, - probeKeyring: ProbeKeyring, + newFile: func(ctx context.Context) (cache.TokenCache, error) { return NewFileTokenCache(ctx) }, + newKeyring: NewKeyringCache, + probeKeyring: ProbeKeyring, + probeKeyringRead: ProbeKeyringRead, } } @@ -37,11 +39,18 @@ func defaultCacheFactories() cacheFactories { // override is usually the command-level flag value. Pass "" when the command // has no flag; precedence then falls through to env -> config -> default. // +// When the resolver returns (mode=Secure, source=Default) and the OS +// keyring is definitively unreachable (a non-timeout probe error), reads +// fall back to the plaintext file cache so post-upgrade users with legacy +// token-cache.json entries are not stranded. Unlike the login path, this +// fallback does not persist auth_storage = plaintext to [__settings__]; +// pinning happens only on successful login. +// // Every CLI code path that calls u2m.NewPersistentAuth must route the result // through u2m.WithTokenCache, otherwise the SDK defaults to the file cache // and splits the user's tokens across two backends. func ResolveCache(ctx context.Context, override StorageMode) (cache.TokenCache, StorageMode, error) { - inner, mode, err := resolveCacheWith(ctx, override, defaultCacheFactories()) + inner, mode, err := resolveCacheForReadWith(ctx, override, defaultCacheFactories()) if err != nil { return nil, "", err } @@ -89,8 +98,11 @@ func WrapForOAuthArgument(tokenCache cache.TokenCache, mode StorageMode, arg u2m return NewDualWritingTokenCache(tokenCache, arg) } -// resolveCacheWith is the pure form of ResolveCache. It takes the factory -// set as a parameter so tests can inject stubs. +// resolveCacheWith is the pure form of ResolveCache without the read-path +// fallback. Takes the factory set as a parameter so tests can inject stubs. +// Used directly by ResolveCacheForLogin (which has its own fallback rules) +// and indirectly by ResolveCache (which adds the read-path fallback in +// resolveCacheForReadWith). func resolveCacheWith(ctx context.Context, override StorageMode, f cacheFactories) (cache.TokenCache, StorageMode, error) { mode, err := ResolveStorageMode(ctx, override) if err != nil { @@ -110,6 +122,61 @@ func resolveCacheWith(ctx context.Context, override StorageMode, f cacheFactorie } } +// resolveCacheForReadWith is the pure form of ResolveCache. It applies the +// read-path fallback: when mode is secure-from-default and the keyring +// probes as definitively unavailable, return the file cache instead. +// Timeouts keep the keyring (could be transient). +func resolveCacheForReadWith(ctx context.Context, override StorageMode, f cacheFactories) (cache.TokenCache, StorageMode, error) { + mode, source, err := ResolveStorageModeWithSource(ctx, override) + if err != nil { + return nil, "", err + } + return applyReadFallback(ctx, mode, source.Explicit(), f) +} + +// applyReadFallback realizes the read-path fallback. Mirrors +// applyLoginFallback but: +// +// - Uses a read-only probe (ProbeKeyringRead) so calls do not write to +// the keyring on every CLI invocation. +// - Does not persist auth_storage = plaintext. Pinning happens only on +// successful login, where the write-probe gives us stronger evidence +// that the keyring is truly unavailable on this machine. +// +// Explicit secure is honored: callers who asked for secure get the keyring +// cache even if the probe fails, so the actual Lookup error surfaces the +// unreachability instead of silently using a different backend. +func applyReadFallback(ctx context.Context, mode StorageMode, explicit bool, f cacheFactories) (cache.TokenCache, StorageMode, error) { + switch mode { + case StorageModePlaintext: + c, err := f.newFile(ctx) + if err != nil { + return nil, "", fmt.Errorf("open file token cache: %w", err) + } + return c, mode, nil + case StorageModeSecure: + if explicit { + return f.newKeyring(), mode, nil + } + if probeErr := f.probeKeyringRead(); probeErr != nil { + var timeoutErr *TimeoutError + if errors.As(probeErr, &timeoutErr) { + log.Debugf(ctx, "keyring read probe timed out (%v); staying on keyring", probeErr) + return f.newKeyring(), mode, nil + } + log.Debugf(ctx, "secure storage unavailable on read path (%v), using file cache", probeErr) + fileCache, fileErr := f.newFile(ctx) + if fileErr != nil { + return nil, "", fmt.Errorf("open file token cache: %w", fileErr) + } + return fileCache, StorageModePlaintext, nil + } + return f.newKeyring(), mode, nil + default: + return nil, "", fmt.Errorf("unsupported storage mode %q", string(mode)) + } +} + // resolveCacheForLoginWith is the pure form of ResolveCacheForLogin. It takes // the factory set as a parameter so tests can inject stubs. func resolveCacheForLoginWith(ctx context.Context, override StorageMode, f cacheFactories) (cache.TokenCache, StorageMode, error) { diff --git a/libs/auth/storage/cache_test.go b/libs/auth/storage/cache_test.go index 32979c00c7b..209186a5453 100644 --- a/libs/auth/storage/cache_test.go +++ b/libs/auth/storage/cache_test.go @@ -28,9 +28,10 @@ func (stubCache) Lookup(string) (*oauth2.Token, error) { return nil, cache.ErrNo func fakeFactories(t *testing.T) cacheFactories { t.Helper() return cacheFactories{ - newFile: func(context.Context) (cache.TokenCache, error) { return stubCache{source: "file"}, nil }, - newKeyring: func() cache.TokenCache { return stubCache{source: "keyring"} }, - probeKeyring: func() error { return nil }, + newFile: func(context.Context) (cache.TokenCache, error) { return stubCache{source: "file"}, nil }, + newKeyring: func() cache.TokenCache { return stubCache{source: "keyring"} }, + probeKeyring: func() error { return nil }, + probeKeyringRead: func() error { return nil }, } } @@ -111,9 +112,10 @@ func TestResolveCache_FileFactoryErrorPropagates(t *testing.T) { ctx := t.Context() boom := errors.New("disk full") factories := cacheFactories{ - newFile: func(context.Context) (cache.TokenCache, error) { return nil, boom }, - newKeyring: func() cache.TokenCache { return stubCache{source: "keyring"} }, - probeKeyring: func() error { return nil }, + newFile: func(context.Context) (cache.TokenCache, error) { return nil, boom }, + newKeyring: func() cache.TokenCache { return stubCache{source: "keyring"} }, + probeKeyring: func() error { return nil }, + probeKeyringRead: func() error { return nil }, } _, _, err := resolveCacheWith(ctx, StorageModePlaintext, factories) @@ -122,6 +124,113 @@ func TestResolveCache_FileFactoryErrorPropagates(t *testing.T) { assert.ErrorIs(t, err, boom) } +// applyReadFallback mirrors applyLoginFallback's logic on the read path: +// keyring is probed read-only, definitive failures fall through to the file +// cache so legacy plaintext tokens stay reachable, timeouts stay on the +// keyring, and explicit-secure is honored even when the probe fails. + +func TestApplyReadFallback_PlaintextSkipsProbe(t *testing.T) { + hermetic(t) + ctx := t.Context() + probed := false + f := fakeFactories(t) + f.probeKeyringRead = func() error { + probed = true + return nil + } + + got, mode, err := applyReadFallback(ctx, StorageModePlaintext, false, f) + + require.NoError(t, err) + assert.Equal(t, StorageModePlaintext, mode) + assert.Equal(t, "file", got.(stubCache).source) + assert.False(t, probed, "probe must not run when mode is already plaintext") +} + +func TestApplyReadFallback_ExplicitSecureSkipsProbe(t *testing.T) { + hermetic(t) + ctx := t.Context() + probed := false + f := fakeFactories(t) + f.probeKeyringRead = func() error { + probed = true + return errors.New("unreachable") + } + + got, mode, err := applyReadFallback(ctx, StorageModeSecure, true, f) + + require.NoError(t, err) + assert.Equal(t, StorageModeSecure, mode) + assert.Equal(t, "keyring", got.(stubCache).source) + assert.False(t, probed, "probe must not run when user is explicit about secure mode") +} + +func TestApplyReadFallback_DefaultSecure_ProbeOK_UsesKeyring(t *testing.T) { + hermetic(t) + ctx := t.Context() + + got, mode, err := applyReadFallback(ctx, StorageModeSecure, false, fakeFactories(t)) + + require.NoError(t, err) + assert.Equal(t, StorageModeSecure, mode) + assert.Equal(t, "keyring", got.(stubCache).source) +} + +func TestApplyReadFallback_DefaultSecure_ProbeFail_FallsBack(t *testing.T) { + hermetic(t) + ctx := t.Context() + configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") + + f := fakeFactories(t) + f.probeKeyringRead = func() error { return errors.New("no keyring") } + + got, mode, err := applyReadFallback(ctx, StorageModeSecure, false, f) + + require.NoError(t, err) + assert.Equal(t, StorageModePlaintext, mode) + assert.Equal(t, "file", got.(stubCache).source) + + // Read-path fallback must NOT pin: pinning is reserved for login, + // where the write-probe gives stronger evidence of unavailability. + persisted, gerr := databrickscfg.GetConfiguredAuthStorage(ctx, configPath) + require.NoError(t, gerr) + assert.Equal(t, "", persisted, "read-path fallback must not persist auth_storage") +} + +// A timeout could mean a locked keyring that will work once the user unlocks +// it. Stay on the keyring so the actual Lookup surfaces the real outcome +// rather than silently routing reads to the file cache. +func TestApplyReadFallback_DefaultSecure_ProbeTimeout_StaysOnKeyring(t *testing.T) { + cases := []struct { + name string + probeErr error + }{ + {"bare TimeoutError", &TimeoutError{Op: "get"}}, + {"wrapped TimeoutError", fmt.Errorf("get: %w", &TimeoutError{Op: "get"})}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + hermetic(t) + ctx := t.Context() + configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") + + f := fakeFactories(t) + f.probeKeyringRead = func() error { return tc.probeErr } + + got, mode, err := applyReadFallback(ctx, StorageModeSecure, false, f) + + require.NoError(t, err) + assert.Equal(t, StorageModeSecure, mode) + assert.Equal(t, "keyring", got.(stubCache).source) + + persisted, gerr := databrickscfg.GetConfiguredAuthStorage(ctx, configPath) + require.NoError(t, gerr) + assert.Equal(t, "", persisted, "probe timeout must not persist anything") + }) + } +} + func TestResolveCacheForLogin_PlaintextSkipsProbe(t *testing.T) { hermetic(t) ctx := t.Context() diff --git a/libs/auth/storage/keyring.go b/libs/auth/storage/keyring.go index f79fa6a628d..e0099ebe6bd 100644 --- a/libs/auth/storage/keyring.go +++ b/libs/auth/storage/keyring.go @@ -92,6 +92,9 @@ func NewKeyringCache() cache.TokenCache { // cycle within the standard timeout. *TimeoutError means the keyring // was unresponsive (locked or hung, indistinguishable here); other // errors are definitive failures. +// +// Used by the login path, where we want to validate both read and write +// capability before committing to the keyring backend. func ProbeKeyring() error { return probeWithBackend(zalandoBackend{}, defaultKeyringTimeout) } @@ -113,6 +116,35 @@ func probeWithBackend(backend keyringBackend, timeout time.Duration) error { return nil } +// ProbeKeyringRead returns nil if the OS keyring accepted a Get for a +// non-existent account within the standard timeout (i.e. the backend is +// reachable and responded with keyring.ErrNotFound). *TimeoutError means +// the keyring was unresponsive; other errors are definitive failures. +// +// Used by the read path so probing does not write to the keyring. A +// successful probe is indistinguishable from the user not having an +// entry for that probe account; we treat both as "reachable". +func ProbeKeyringRead() error { + return probeReadWithBackend(zalandoBackend{}, defaultKeyringTimeout) +} + +func probeReadWithBackend(backend keyringBackend, timeout time.Duration) error { + c := &keyringCache{ + backend: backend, + timeout: timeout, + keyringSvcName: keyringServiceName, + } + account := keyringProbeAccountPrefix + uuid.NewString() + err := c.withTimeout("get", func() error { + _, gerr := c.backend.Get(c.keyringSvcName, account) + return gerr + }) + if errors.Is(err, keyring.ErrNotFound) { + return nil + } + return err +} + // Store stores t under key. Nil t deletes the entry; deleting a missing // entry is not an error. func (k *keyringCache) Store(key string, t *oauth2.Token) error { diff --git a/libs/auth/storage/keyring_test.go b/libs/auth/storage/keyring_test.go index e200bc4b856..868e3ef5d3c 100644 --- a/libs/auth/storage/keyring_test.go +++ b/libs/auth/storage/keyring_test.go @@ -296,3 +296,59 @@ func TestProbeKeyring(t *testing.T) { }) } } + +func TestProbeKeyringRead(t *testing.T) { + boom := errors.New("backend boom") + cases := []struct { + name string + getErr error + getBlock bool + timeout time.Duration + wantErr error + wantTimeout bool + }{ + { + // keyring.ErrNotFound is the success signal: the backend + // responded that no entry exists for our probe account, + // which means it is reachable. + name: "ErrNotFound counts as reachable", + getErr: keyring.ErrNotFound, + timeout: 100 * time.Millisecond, + }, + { + name: "other backend error propagates", + getErr: boom, + timeout: 100 * time.Millisecond, + wantErr: boom, + }, + { + name: "get times out", + getBlock: true, + timeout: 50 * time.Millisecond, + wantTimeout: true, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + backend := newFakeBackend() + backend.getErr = tc.getErr + backend.getBlock = tc.getBlock + + err := probeReadWithBackend(backend, tc.timeout) + + switch { + case tc.wantErr != nil: + require.Error(t, err) + assert.ErrorIs(t, err, tc.wantErr) + case tc.wantTimeout: + require.Error(t, err) + var timeoutErr *TimeoutError + assert.ErrorAs(t, err, &timeoutErr) + default: + require.NoError(t, err) + assert.Empty(t, backend.items, "read probe must not write to the keyring") + } + }) + } +} From a01ac025e775ca15216c8af7ab5c6e61814d69e5 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 20 May 2026 13:10:13 +0200 Subject: [PATCH 09/10] Address codex review on PinSecureMode Two findings: 1. The persist failure was logged at debug, which silently weakens the stated pin-on-success guarantee: login succeeds, the keyring write succeeds, but auth_storage = secure is never persisted. A later default-secure read with a transient keyring probe failure would then fall back to plaintext, undoing the protection the pin was supposed to provide. Promote to log.Warnf so the user sees the failure during login and can investigate (file permissions, etc.). Login is still not blocked: pinning is best-effort. 2. PinSecureMode re-resolved the storage mode with StorageModeUnknown, so a caller-supplied override was invisible to the source check. The function's doc said "No-op when the user already chose a mode explicitly", but only env and config were detected; an override secure mode would still pin auth_storage = secure to config, silently turning an ephemeral per-invocation choice into a persistent one. No caller passes a non-empty override today, so the bug was dormant, but fix it before someone wires up a flag and notices the behavior. PinSecureMode now takes an override StorageMode parameter, and the three call sites (main login, discoveryLogin, runInlineLogin) pass storage.StorageModeUnknown explicitly. New table-driven case covers the override path. Co-authored-by: Isaac --- cmd/auth/login.go | 4 ++-- cmd/auth/token.go | 2 +- libs/auth/storage/cache.go | 18 ++++++++++++------ libs/auth/storage/cache_test.go | 21 ++++++++++++++++----- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/cmd/auth/login.go b/cmd/auth/login.go index 4491b422106..5ba9729d1ba 100644 --- a/cmd/auth/login.go +++ b/cmd/auth/login.go @@ -298,7 +298,7 @@ a new profile is created. // Lock secure mode in after a successful keyring write so a later // transient keyring probe failure cannot silently demote this user // to plaintext. - storage.PinSecureMode(ctx, mode) + storage.PinSecureMode(ctx, mode, storage.StorageModeUnknown) // At this point, an OAuth token has been successfully minted and stored // in the CLI cache. The rest of the command focuses on: @@ -639,7 +639,7 @@ func discoveryLogin(ctx context.Context, in discoveryLoginInputs) error { if err := persistentAuth.Challenge(); err != nil { return discoveryErr("login via login.databricks.com failed", err) } - storage.PinSecureMode(ctx, in.mode) + storage.PinSecureMode(ctx, in.mode, storage.StorageModeUnknown) discoveredHost := arg.GetDiscoveredHost() if discoveredHost == "" { diff --git a/cmd/auth/token.go b/cmd/auth/token.go index 26fd0a7b047..d25a1765efc 100644 --- a/cmd/auth/token.go +++ b/cmd/auth/token.go @@ -433,7 +433,7 @@ func runInlineLogin(ctx context.Context, profiler profile.Profiler, tokenCache c if err = persistentAuth.Challenge(); err != nil { return "", nil, err } - storage.PinSecureMode(ctx, mode) + storage.PinSecureMode(ctx, mode, storage.StorageModeUnknown) clearKeys := oauthLoginClearKeys() clearKeys = append(clearKeys, databrickscfg.ExperimentalIsUnifiedHostKey) diff --git a/libs/auth/storage/cache.go b/libs/auth/storage/cache.go index b449a5aabf1..c3d1fc43dcf 100644 --- a/libs/auth/storage/cache.go +++ b/libs/auth/storage/cache.go @@ -248,14 +248,20 @@ func persistPlaintextFallback(ctx context.Context) { // the user to plaintext. // // No-op when mode is not secure or when the user already chose a mode -// explicitly. Best-effort: persistence failures are logged at debug and -// never block login. Concurrent logins racing this write is benign because -// both write the same value. -func PinSecureMode(ctx context.Context, mode StorageMode) { +// explicitly via override, env var, or config. override must be the same +// value the caller passed to ResolveCacheForLogin so the source check sees +// the caller's intent rather than re-resolving without it. +// +// Persistence failures are logged at warn: they do not block login, but +// the user should know the pin did not happen, since a later transient +// keyring failure could then silently route a default-secure user to +// plaintext. Concurrent logins racing this write is benign because both +// write the same value. +func PinSecureMode(ctx context.Context, mode, override StorageMode) { if mode != StorageModeSecure { return } - _, source, err := ResolveStorageModeWithSource(ctx, StorageModeUnknown) + _, source, err := ResolveStorageModeWithSource(ctx, override) if err != nil { log.Debugf(ctx, "pin secure mode: resolve: %v", err) return @@ -265,6 +271,6 @@ func PinSecureMode(ctx context.Context, mode StorageMode) { } configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") if err := databrickscfg.SetConfiguredAuthStorage(ctx, string(StorageModeSecure), configPath); err != nil { - log.Debugf(ctx, "persist auth_storage=secure pin failed: %v", err) + log.Warnf(ctx, "could not persist auth_storage=secure to %s: %v. Future commands may need DATABRICKS_AUTH_STORAGE=secure to keep using the OS keyring.", configPath, err) } } diff --git a/libs/auth/storage/cache_test.go b/libs/auth/storage/cache_test.go index 209186a5453..52044eef986 100644 --- a/libs/auth/storage/cache_test.go +++ b/libs/auth/storage/cache_test.go @@ -386,6 +386,7 @@ func TestPinSecureMode(t *testing.T) { cases := []struct { name string mode StorageMode + override StorageMode envValue string configBody string wantWritten string @@ -412,6 +413,16 @@ func TestPinSecureMode(t *testing.T) { configBody: "[__settings__]\nauth_storage = secure\n", wantWritten: "secure", }, + { + // The override signal is per-invocation, so persisting it to + // config would silently turn an ephemeral choice into a + // persistent one. Honor the caller's explicit override by + // no-op'ing the pin. + name: "secure from override is a no-op", + mode: StorageModeSecure, + override: StorageModeSecure, + wantWritten: "", + }, } for _, tc := range cases { @@ -426,7 +437,7 @@ func TestPinSecureMode(t *testing.T) { ctx = env.Set(ctx, EnvVar, tc.envValue) } - PinSecureMode(ctx, tc.mode) + PinSecureMode(ctx, tc.mode, tc.override) got, err := databrickscfg.GetConfiguredAuthStorage(ctx, configPath) require.NoError(t, err) @@ -440,13 +451,13 @@ func TestPinSecureMode_IsIdempotent(t *testing.T) { ctx := t.Context() configPath := env.Get(ctx, "DATABRICKS_CONFIG_FILE") - PinSecureMode(ctx, StorageModeSecure) + PinSecureMode(ctx, StorageModeSecure, StorageModeUnknown) first, err := databrickscfg.GetConfiguredAuthStorage(ctx, configPath) require.NoError(t, err) require.Equal(t, "secure", first) // Second call should see source=Config and skip the write. - PinSecureMode(ctx, StorageModeSecure) + PinSecureMode(ctx, StorageModeSecure, StorageModeUnknown) second, err := databrickscfg.GetConfiguredAuthStorage(ctx, configPath) require.NoError(t, err) assert.Equal(t, "secure", second) @@ -461,8 +472,8 @@ func TestPinSecureMode_PersistFailureIsSwallowed(t *testing.T) { configPath := filepath.Join(t.TempDir(), "no-such-dir", ".databrickscfg") t.Setenv("DATABRICKS_CONFIG_FILE", configPath) - // Must not panic or block; failures are logged at debug. - PinSecureMode(ctx, StorageModeSecure) + // Must not panic or block; failure surfaces in the warn log. + PinSecureMode(ctx, StorageModeSecure, StorageModeUnknown) // The persist failure must not have produced any file. _, err := os.Stat(configPath) From 9baf02b9a0e8bea72644e61f4a9dfb6ea6fc849c Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 20 May 2026 13:53:24 +0200 Subject: [PATCH 10/10] Surface notFoundHint message in auth token error path token.go's loadToken rewrites cache.ErrNotFound to the constant "cache: databricks OAuth is not configured for this host" so older SDK versions (pre-v0.77.0) can substring-match the error and fall through to other auth providers. That rewrite, written before the secure-storage default, silently threw away the notFoundHintCache message a default-secure user would see post-upgrade: instead of the actionable "stored credentials from older CLI versions are no longer used; run auth login or set DATABRICKS_AUTH_STORAGE=plaintext" copy, they got the generic "Try logging in again ... If this fails, please report this issue" trailer, which steers expected upgrade behavior toward bug reports. Add storage.HintForNotFound to extract the notFoundHint message from an error chain, plus storage.NewNotFoundHint as a test factory. token.go now keeps the SDK-compat substring AND appends the hint when present, and skips the helpfulError trailer in that case. Co-authored-by: Isaac --- cmd/auth/token.go | 16 +++++++- cmd/auth/token_test.go | 48 ++++++++++++++++++++++++ libs/auth/storage/not_found_hint.go | 27 +++++++++++++ libs/auth/storage/not_found_hint_test.go | 31 +++++++++++++++ 4 files changed, 120 insertions(+), 2 deletions(-) diff --git a/cmd/auth/token.go b/cmd/auth/token.go index d25a1765efc..e789346e0fa 100644 --- a/cmd/auth/token.go +++ b/cmd/auth/token.go @@ -285,10 +285,22 @@ func loadToken(ctx context.Context, args loadTokenArgs) (*oauth2.Token, error) { // // Older SDK versions check for a particular substring to determine if // the OAuth authentication type can fall through or if it is a real error. - // This means we need to keep this error message constant for backwards compatibility. + // This means we need to keep "databricks OAuth is not configured for + // this host" present in the error for backwards compatibility. // // This is captured in an acceptance test under "cmd/auth/token". - err = errors.New("cache: databricks OAuth is not configured for this host") + const compatSubstring = "databricks OAuth is not configured for this host" + // When storage's notFoundHintCache wrapped the ErrNotFound with + // an actionable hint (e.g. the post-upgrade "stored credentials + // from older CLI versions are no longer used; run `databricks + // auth login`..." copy), surface it instead of the generic + // "Try logging in again with ... If this fails, please report + // this issue" trailer. The hint is more specific and avoids + // users reporting expected post-upgrade behavior as a bug. + if hint := storage.HintForNotFound(err); hint != "" { + return nil, fmt.Errorf("cache: %s. %s", compatSubstring, hint) + } + err = errors.New("cache: " + compatSubstring) } if rewritten, rewrittenErr := auth.RewriteAuthError(ctx, args.authArguments.Host, args.authArguments.AccountID, args.profileName, err); rewritten { return nil, rewrittenErr diff --git a/cmd/auth/token_test.go b/cmd/auth/token_test.go index f3b3cdd8c0c..40ca29f588b 100644 --- a/cmd/auth/token_test.go +++ b/cmd/auth/token_test.go @@ -10,15 +10,34 @@ import ( "time" "github.com/databricks/cli/libs/auth" + "github.com/databricks/cli/libs/auth/storage" "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/databrickscfg/profile" "github.com/databricks/cli/libs/env" "github.com/databricks/databricks-sdk-go/credentials/u2m" + "github.com/databricks/databricks-sdk-go/credentials/u2m/cache" "github.com/databricks/databricks-sdk-go/httpclient/fixtures" "github.com/stretchr/testify/assert" "golang.org/x/oauth2" ) +// upgradeHintTokenCache returns a notFoundHint-wrapped ErrNotFound on +// Lookup, mirroring what storage.notFoundHintCache produces in +// production when ~/.databricks/token-cache.json has entries and the +// resolver picked secure mode by default. Used by TestToken_loadToken +// to verify that auth token surfaces the upgrade-specific hint instead +// of dropping it for the SDK-compat constant string. +type upgradeHintTokenCache struct{} + +func (upgradeHintTokenCache) Store(string, *oauth2.Token) error { return nil } +func (upgradeHintTokenCache) Lookup(string) (*oauth2.Token, error) { + return nil, storage.NewNotFoundHint( + "stored credentials from older CLI versions are no longer used; run `databricks auth login` to sign in again, or set DATABRICKS_AUTH_STORAGE=plaintext to keep using the file cache", + ) +} + +var _ cache.TokenCache = upgradeHintTokenCache{} + type failOnCallTransport struct{} func (failOnCallTransport) RoundTrip(*http.Request) (*http.Response, error) { @@ -408,6 +427,35 @@ func TestToken_loadToken(t *testing.T) { "Try logging in again with `databricks auth login --host https://nonexistent.cloud.databricks.com` before retrying. " + "If this fails, please report this issue to the Databricks CLI maintainers at https://github.com/databricks/cli/issues/new", }, + { + // Regression test: when notFoundHintCache wraps ErrNotFound + // with the upgrade copy (post-upgrade default-secure user + // with a populated token-cache.json), `auth token` must + // surface that hint instead of dropping it for the SDK-compat + // constant string. The combined message keeps the + // "OAuth is not configured for this host" substring older + // SDK versions look for and skips the generic "Try logging + // in again ... If this fails, please report this issue" + // trailer, which would mislead users into reporting expected + // post-upgrade behavior. + name: "ErrNotFound carrying upgrade hint surfaces it", + args: loadTokenArgs{ + authArguments: &auth.AuthArguments{}, + profileName: "", + args: []string{"nonexistent.cloud.databricks.com"}, + tokenTimeout: 1 * time.Hour, + profiler: profiler, + tokenCache: upgradeHintTokenCache{}, + persistentAuthOpts: []u2m.PersistentAuthOption{ + u2m.WithTokenCache(upgradeHintTokenCache{}), + u2m.WithOAuthEndpointSupplier(&MockApiClient{}), + }, + }, + wantErr: "cache: databricks OAuth is not configured for this host. " + + "stored credentials from older CLI versions are no longer used; " + + "run `databricks auth login` to sign in again, " + + "or set DATABRICKS_AUTH_STORAGE=plaintext to keep using the file cache", + }, { name: "errors with clear message for non-host non-profile positional arg", args: loadTokenArgs{ diff --git a/libs/auth/storage/not_found_hint.go b/libs/auth/storage/not_found_hint.go index 28ebd1e60b2..d86615535b5 100644 --- a/libs/auth/storage/not_found_hint.go +++ b/libs/auth/storage/not_found_hint.go @@ -58,6 +58,33 @@ type notFoundHint struct { func (e *notFoundHint) Error() string { return e.msg } func (e *notFoundHint) Unwrap() error { return cache.ErrNotFound } +// HintForNotFound extracts the actionable hint message from an error +// chain produced by notFoundHintCache. Returns the empty string if the +// chain does not contain a notFoundHint (e.g. an unwrapped +// cache.ErrNotFound from a plain TokenCache). +// +// Used by call sites like `auth token` that rewrite the SDK error for +// backwards-compatibility (the "databricks OAuth is not configured for +// this host" substring is load-bearing for older SDK fall-through +// logic) but want to surface the actionable hint to the user instead of +// dropping it. +func HintForNotFound(err error) string { + var hint *notFoundHint + if errors.As(err, &hint) { + return hint.msg + } + return "" +} + +// NewNotFoundHint returns an error that renders as msg but unwraps to +// cache.ErrNotFound, mirroring what notFoundHintCache produces in +// production. Exported so tests in other packages (e.g. cmd/auth) can +// construct a hint-wrapped error without going through the full +// resolver setup. +func NewNotFoundHint(msg string) error { + return ¬FoundHint{msg: msg} +} + // withNotFoundHint wraps inner so ErrNotFound from Lookup carries an // actionable hint. The legacy file path is resolved up front (where ctx // is available) so Lookup can do its check without needing a context. diff --git a/libs/auth/storage/not_found_hint_test.go b/libs/auth/storage/not_found_hint_test.go index 53c46fd5b1d..b262aeb2d9f 100644 --- a/libs/auth/storage/not_found_hint_test.go +++ b/libs/auth/storage/not_found_hint_test.go @@ -3,6 +3,7 @@ package storage import ( "encoding/json" "errors" + "fmt" "os" "path/filepath" "testing" @@ -118,6 +119,36 @@ func TestNotFoundHintCache_StoreIsDelegated(t *testing.T) { require.NoError(t, c.Store("k", &oauth2.Token{AccessToken: "abc"})) } +func TestHintForNotFound(t *testing.T) { + t.Run("nil returns empty", func(t *testing.T) { + assert.Empty(t, HintForNotFound(nil)) + }) + + t.Run("plain ErrNotFound returns empty", func(t *testing.T) { + // An unwrapped cache.ErrNotFound carries no hint, so the caller + // (e.g. `auth token`) falls back to its default error path. + assert.Empty(t, HintForNotFound(cache.ErrNotFound)) + }) + + t.Run("unrelated error returns empty", func(t *testing.T) { + assert.Empty(t, HintForNotFound(errors.New("something else"))) + }) + + t.Run("notFoundHint returns the hint message", func(t *testing.T) { + h := ¬FoundHint{msg: "do the thing"} + assert.Equal(t, "do the thing", HintForNotFound(h)) + }) + + t.Run("notFoundHint behind an fmt.Errorf wrap returns the hint", func(t *testing.T) { + // The SDK wraps every cache error with `cache: %w`, so the hint + // is one Unwrap away when it surfaces in callers. errors.As must + // still find it. + h := ¬FoundHint{msg: "do the thing"} + wrapped := fmt.Errorf("cache: %w", h) + assert.Equal(t, "do the thing", HintForNotFound(wrapped)) + }) +} + func TestLegacyCacheHasTokens(t *testing.T) { tmp := t.TempDir()