diff --git a/cmd/entire/cli/dispatch_tui.go b/cmd/entire/cli/dispatch_tui.go index 54c5cd88d3..b2773efc17 100644 --- a/cmd/entire/cli/dispatch_tui.go +++ b/cmd/entire/cli/dispatch_tui.go @@ -50,6 +50,8 @@ type dispatchProgram interface { Run() (tea.Model, error) } +// newDispatchProgram is overridden by tests via assignment. Tests that mutate +// it cannot use t.Parallel() — they would race each other's factory. var newDispatchProgram = func(model tea.Model, outW io.Writer, altScreen bool) dispatchProgram { options := []tea.ProgramOption{tea.WithOutput(outW)} if altScreen { diff --git a/cmd/entire/cli/dispatch_tui_test.go b/cmd/entire/cli/dispatch_tui_test.go index e23ad8e226..6a759ae49e 100644 --- a/cmd/entire/cli/dispatch_tui_test.go +++ b/cmd/entire/cli/dispatch_tui_test.go @@ -25,8 +25,6 @@ func (p fakeDispatchProgram) Run() (tea.Model, error) { } func TestDefaultRunInteractiveDispatch_DoesNotUseAltScreen(t *testing.T) { - t.Parallel() - oldProgramFactory := newDispatchProgram newDispatchProgram = func(model tea.Model, _ io.Writer, altScreen bool) dispatchProgram { if altScreen { @@ -73,8 +71,6 @@ func TestDispatchStatusModel_ViewRendersInlineCard(t *testing.T) { } func TestDefaultRunInteractiveDispatch_ClearsLoadingCardBeforeReturn(t *testing.T) { - t.Parallel() - oldProgramFactory := newDispatchProgram newDispatchProgram = func(model tea.Model, _ io.Writer, _ bool) dispatchProgram { return fakeDispatchProgram{model: model} diff --git a/cmd/entire/cli/logging/logger.go b/cmd/entire/cli/logging/logger.go index 8c68ac3753..05e01e1cb4 100644 --- a/cmd/entire/cli/logging/logger.go +++ b/cmd/entire/cli/logging/logger.go @@ -174,25 +174,6 @@ func resetLogger() { } } -// getLogger returns the current logger, or a default stderr logger if not initialized. 
-func getLogger() *slog.Logger { - mu.RLock() - defer mu.RUnlock() - - if logger == nil { - // Return default stderr logger - return slog.Default() - } - return logger -} - -// getSessionID returns the current session ID (thread-safe). -func getSessionID() string { - mu.RLock() - defer mu.RUnlock() - return currentSessionID -} - // createLogger creates a JSON logger writing to the given writer at the specified level. func createLogger(w io.Writer, level slog.Level) *slog.Logger { opts := &slog.HandlerOptions{ @@ -273,14 +254,23 @@ func LogDuration(ctx context.Context, level slog.Level, msg string, start time.T } // log is the internal logging function that extracts context values and logs. +// +// The read lock is held across l.Log so Init/Close cannot close logBufWriter +// mid-write; do not shrink the lock scope to a snapshot pattern. func log(ctx context.Context, level slog.Level, msg string, attrs ...any) { - l := getLogger() + mu.RLock() + defer mu.RUnlock() + + l := logger + if l == nil { + l = slog.Default() + } + globalSessionID := currentSessionID // Build attributes slice with session ID first (if set) var allAttrs []any // Add session ID from Init() if set (always first for consistency) - globalSessionID := getSessionID() if globalSessionID != "" { allAttrs = append(allAttrs, slog.String("session_id", globalSessionID)) } diff --git a/cmd/entire/cli/logging/logger_test.go b/cmd/entire/cli/logging/logger_test.go index 57ded72335..7ef0dbd387 100644 --- a/cmd/entire/cli/logging/logger_test.go +++ b/cmd/entire/cli/logging/logger_test.go @@ -9,6 +9,7 @@ import ( "os/exec" "path/filepath" "strings" + "sync" "testing" "time" ) @@ -536,6 +537,66 @@ func TestLogging_ContextSessionID_WhenNoGlobalSet(t *testing.T) { resetLogger() } +func TestLogging_ConcurrentInitAndLog(t *testing.T) { + tmpDir := t.TempDir() + t.Chdir(tmpDir) + initGitRepo(t, tmpDir) + + if err := Init(context.Background(), ""); err != nil { + t.Fatalf("Init() error = %v", err) + } + defer Close() 
+ + const ( + logGoroutines = 8 + initGoroutines = 4 + closeGoroutines = 2 + iterations = 200 + ) + + var wg sync.WaitGroup + start := make(chan struct{}) + + for i := range logGoroutines { + wg.Add(1) + go func(worker int) { + defer wg.Done() + <-start + for j := range iterations { + Info(context.Background(), "concurrent log", slog.Int("worker", worker), slog.Int("iteration", j)) + } + }(i) + } + + for range initGoroutines { + wg.Add(1) + go func() { + defer wg.Done() + <-start + for range iterations { + if err := Init(context.Background(), ""); err != nil { + t.Errorf("Init() error = %v", err) + return + } + } + }() + } + + for range closeGoroutines { + wg.Add(1) + go func() { + defer wg.Done() + <-start + for range iterations { + Close() + } + }() + } + + close(start) + wg.Wait() +} + func TestInit_RejectsInvalidSessionIDs(t *testing.T) { tests := []struct { name string diff --git a/docs/security-and-privacy.md b/docs/security-and-privacy.md index 9886ffe93f..11f8dd1d04 100644 --- a/docs/security-and-privacy.md +++ b/docs/security-and-privacy.md @@ -16,10 +16,13 @@ If your repository is **public**, this data is visible to the entire internet. ### What Entire redacts automatically -Entire automatically scans transcript and metadata content before writing it to the `entire/checkpoints/v1` branch. Two detection methods run during condensation: +Entire automatically scans transcript and metadata content before writing it to the `entire/checkpoints/v1` branch. Five secret detection methods run during condensation: 1. **Entropy scoring** — Identifies high-entropy strings (Shannon entropy > 4.5) that look like randomly generated secrets, even if they don't match a known pattern. 2. **Pattern matching** — Uses [Betterleaks](https://github.com/betterleaks/betterleaks) built-in rules to detect known secret formats. +3. **Credentialed URI detection** — Redacts URLs with embedded passwords, such as `scheme://user:password@host`. +4. 
**Database connection-string detection** — Redacts JDBC URLs, database URLs (`postgres://`, `mysql://`, `mariadb://`, `mongodb://`, `redis://`) that carry a password in a query parameter, Postgres-style keyword DSNs, and SQL Server/ODBC-style semicolon-delimited connection strings, whenever they contain a real (non-placeholder) password. +5. **Bounded credential value detection** — Redacts password-like config values such as `DB_PASSWORD=...` and `PGPASSWORD=...` while preserving the surrounding key. Detected secrets are replaced with `REDACTED` before the data is ever written to a git object. This is **always on** and cannot be disabled. @@ -35,10 +38,12 @@ If your AI sessions will touch sensitive data: ### Secrets (always on) -Betterleaks pattern matching covers cloud providers (AWS, GCP, Azure), version control platforms (GitHub, GitLab, Bitbucket), payment processors (Stripe, Square), communication tools (Slack, Discord, Twilio), private key blocks (RSA, DSA, EC, PGP), and generic credentials (bearer tokens, basic auth, JWTs). Entire also redacts database connection strings and other credentialed URLs containing `://user:password@host`. Entropy scoring catches secrets that don't match any known pattern. +Betterleaks pattern matching covers cloud providers (AWS, GCP, Azure), version control platforms (GitHub, GitLab, Bitbucket), payment processors (Stripe, Square), communication tools (Slack, Discord, Twilio), private key blocks (RSA, DSA, EC, PGP), and generic credentials (bearer tokens, basic auth, JWTs). Dedicated credentialed URI detection covers URLs that embed passwords. Additional database connection-string detection covers DB DSNs and query-parameter passwords not reliably covered by generic secret rules. Entropy scoring catches secrets that don't match any known pattern. All detected secrets are replaced with `REDACTED`. +To reduce over-redaction, Entire preserves structural transcript fields such as IDs and paths, ignores common placeholder values, and redacts only credential values for bounded key/value forms.
When a connection string contains a real (non-placeholder) password, it is redacted as a unit because partial fragments can still expose sensitive material; connection strings whose passwords are placeholders (e.g. `${DB_PASSWORD}`) are left intact. + ## Limitations - **Best-effort.** Novel or low-entropy secrets (short passwords, predictable tokens) may not be caught. diff --git a/redact/redact.go b/redact/redact.go index d4119db311..8747d1d9e4 100644 --- a/redact/redact.go +++ b/redact/redact.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "math" + "net/url" "regexp" "sort" "strings" @@ -24,6 +25,22 @@ var secretPattern = regexp.MustCompile(`[A-Za-z0-9+_=-]{10,}`) // moderate entropy and are not reliably covered by vendor-specific scanners. var credentialedURIPattern = regexp.MustCompile(`(?i)\b[a-z][a-z0-9+.-]{1,31}://[^\s/?#@"'` + "`" + `<>:]*:[^\s/?#@"'` + "`" + `<>]+@[^\s"'` + "`" + `<>]+`) +var ( + jdbcPattern = regexp.MustCompile(`(?i)\bjdbc:[^\s"'<>` + "`" + `]+`) + databaseURLPattern = regexp.MustCompile(`(?i)\b(?:postgres(?:ql)?|mysql|mariadb|mongodb(?:\+srv)?|redis)://[^\s"'<>` + "`" + `]+`) + keywordDSNPattern = regexp.MustCompile(`(?i)\b[a-z_][a-z0-9_]*=(?:"[^"]*"|'[^']*'|[^\s"']+)(?:\s+[a-z_][a-z0-9_]*=(?:"[^"]*"|'[^']*'|[^\s"']+)){2,}`) + semicolonConnPattern = regexp.MustCompile(`(?i)\b[a-z][a-z0-9 _-]*=(?:\{[^}]*\}|[^=;"'\s]+)(?:;[a-z][a-z0-9 _-]*=(?:\{[^}]*\}|[^=;"'\s]+)){2,}`) + credentialValuePattern = regexp.MustCompile(`(?i)(?:^|[^A-Za-z0-9])((?:db|database|pg|postgres|postgresql|mysql|mariadb|redis|mongo|mongodb|sqlserver|mssql|jdbc)[_-]?(?:password|passwd|pwd)|pgpassword|mysql_pwd|redis_password|mongo_password|mongodb_password)\s*=\s*("[^"]*"|'[^']*'|[^\s,;&]+)`) + + keywordHostPattern = regexp.MustCompile(`(?i)(?:^|\s)host=`) + keywordUserPattern = regexp.MustCompile(`(?i)(?:^|\s)user=`) + semicolonServerPattern = regexp.MustCompile(`(?i)(?:^|;)\s*(?:server|data source|datasource|addr|address|network address)\s*=`) + semicolonUserPattern = 
regexp.MustCompile(`(?i)(?:^|;)\s*(?:user id|userid|user|uid)\s*=`) + passwordAssignmentRegex = regexp.MustCompile(`(?i)(?:^|[?&;\s])(?:password|pwd)=("[^"]*"|'[^']*'|[^&;\s"']+)`) + credentialJSONKeyRegex = regexp.MustCompile(`(?i)^(?:(?:db|database|pg|postgres|postgresql|mysql|mariadb|redis|mongo|mongodb|sqlserver|mssql|jdbc)[_-]?(?:password|passwd|pwd)|pgpassword|mysql_pwd|redis_password|mongo_password|mongodb_password)$`) + genericPasswordKeyRegex = regexp.MustCompile(`(?i)^(?:password|passwd|pwd)$`) +) + // entropyThreshold is the minimum Shannon entropy for a string to be considered // a secret. 4.5 was chosen through trial and error: high enough to avoid false // positives on common words and identifiers, low enough to catch typical API keys @@ -33,6 +50,17 @@ const entropyThreshold = 4.5 // RedactedPlaceholder is the replacement text used for redacted secrets. const RedactedPlaceholder = "REDACTED" +// redactedPlaceholderForms holds the lowercase variants of RedactedPlaceholder +// used to recognize already-redacted values (so we don't double-redact). +var redactedPlaceholderForms = func() map[string]struct{} { + lower := strings.ToLower(RedactedPlaceholder) + return map[string]struct{}{ + lower: {}, + "[" + lower + "]": {}, + "<" + lower + ">": {}, + } +}() + // RedactedBytes represents transcript data that has been through secret // redaction. Consumers that require pre-redacted input (e.g., compact.Compact, // checkpoint stores) accept this type to enforce the contract at compile time. 
@@ -88,11 +116,31 @@ type taggedRegion struct { label string } +type jsonReplacement struct { + key string + original string + redacted string +} + +type connectionStringRule struct { + pattern *regexp.Regexp + hasSecret func(string) bool +} + +var connectionStringRules = []connectionStringRule{ + {pattern: jdbcPattern, hasSecret: hasJDBCPassword}, + {pattern: databaseURLPattern, hasSecret: hasDatabaseURLSecret}, + {pattern: keywordDSNPattern, hasSecret: hasKeywordDSNPassword}, + {pattern: semicolonConnPattern, hasSecret: hasSemicolonConnectionPassword}, +} + // String replaces secrets and PII in s using layered detection: // 1. Entropy-based: high-entropy alphanumeric sequences (threshold 4.5) // 2. Pattern-based: betterleaks regex rules (260+ known secret formats) // 3. Credentialed URIs: URLs containing userinfo passwords -// 4. PII detection: email, phone, address patterns (only when configured via ConfigurePII) +// 4. Database connection strings: JDBC, keyword DSNs, and semicolon strings +// 5. Bounded credential key/value pairs: DB_PASSWORD=... +// 6. PII detection: email, phone, address patterns (only when configured via ConfigurePII) // A string is redacted if ANY method flags it. func String(s string) string { var regions []taggedRegion @@ -146,7 +194,13 @@ func String(s string) string { regions = append(regions, taggedRegion{region: region{loc[0], loc[1]}}) } - // 4. PII detection (opt-in — only runs when configured). + // 4. Database and connection-string detection (secrets — always on). + regions = append(regions, detectConnectionStrings(s)...) + + // 5. Bounded credential key/value detection (secrets — always on). + regions = append(regions, detectCredentialValues(s)...) + + // 6. PII detection (opt-in — only runs when configured). regions = append(regions, detectPII(getPIIConfig(), s)...) 
if len(regions) == 0 { @@ -187,6 +241,176 @@ func String(s string) string { return b.String() } +func detectConnectionStrings(s string) []taggedRegion { + if !strings.ContainsRune(s, '=') { + return nil + } + var regions []taggedRegion + for _, rule := range connectionStringRules { + regions = append(regions, detectConnectionStringRule(s, rule)...) + } + return regions +} + +func detectConnectionStringRule(s string, rule connectionStringRule) []taggedRegion { + var regions []taggedRegion + for _, loc := range rule.pattern.FindAllStringIndex(s, -1) { + start, end := loc[0], trimConnectionStringEnd(s, loc[0], loc[1]) + if start >= end { + continue + } + if rule.hasSecret(s[start:end]) { + regions = append(regions, taggedRegion{region: region{start, end}}) + } + } + return regions +} + +func trimConnectionStringEnd(s string, start, end int) int { + for end > start { + switch s[end-1] { + case '.', ',', ';', ':', '!', '?', ')', ']': + end-- + default: + return end + } + } + return end +} + +func hasJDBCPassword(candidate string) bool { + if !strings.HasPrefix(strings.ToLower(candidate), "jdbc:") { + return false + } + return hasNonPlaceholderPasswordAssignment(candidate) +} + +func hasDatabaseURLSecret(candidate string) bool { + u, err := url.Parse(candidate) + if err != nil || u.Scheme == "" || u.Host == "" { + return false + } + for key, values := range u.Query() { + if !isPasswordQueryKey(key) { + continue + } + for _, value := range values { + if hasNonPlaceholderPasswordValue(value) { + return true + } + } + } + return false +} + +func isPasswordQueryKey(key string) bool { + return strings.EqualFold(key, "password") || strings.EqualFold(key, "pwd") +} + +func hasKeywordDSNPassword(candidate string) bool { + return keywordHostPattern.MatchString(candidate) && + keywordUserPattern.MatchString(candidate) && + hasNonPlaceholderPasswordAssignment(candidate) +} + +func hasSemicolonConnectionPassword(candidate string) bool { + return 
semicolonServerPattern.MatchString(candidate) && + semicolonUserPattern.MatchString(candidate) && + hasNonPlaceholderPasswordAssignment(candidate) +} + +func detectCredentialValues(s string) []taggedRegion { + var regions []taggedRegion + for _, loc := range credentialValuePattern.FindAllStringSubmatchIndex(s, -1) { + if len(loc) < 6 || loc[4] < 0 || loc[5] < 0 { + continue + } + start, end := unquoteRange(s, loc[4], loc[5]) + if hasNonPlaceholderPasswordValue(s[start:end]) { + regions = append(regions, taggedRegion{region: region{start, end}}) + } + } + return regions +} + +func unquoteRange(s string, start, end int) (int, int) { + if end-start < 2 { + return start, end + } + first, last := s[start], s[end-1] + if (first == '"' && last == '"') || (first == '\'' && last == '\'') { + return start + 1, end - 1 + } + return start, end +} + +func hasNonPlaceholderPasswordAssignment(candidate string) bool { + for _, loc := range passwordAssignmentRegex.FindAllStringSubmatchIndex(candidate, -1) { + if len(loc) >= 4 && loc[2] >= 0 && loc[3] >= 0 { + start, end := unquoteRange(candidate, loc[2], loc[3]) + if hasNonPlaceholderPasswordValue(candidate[start:end]) { + return true + } + } + } + return false +} + +func hasNonPlaceholderPasswordValue(value string) bool { + return value != "" && !isPlaceholderSecretValue(value) +} + +func isPlaceholderSecretValue(value string) bool { + normalized := strings.ToLower(strings.TrimSpace(value)) + normalized = strings.Trim(normalized, `"'`) + if normalized == "" { + return true + } + if strings.HasPrefix(normalized, "${") && strings.HasSuffix(normalized, "}") { + return true + } + if _, ok := redactedPlaceholderForms[normalized]; ok { + return true + } + switch normalized { + case "xxx", "xxxx", "changeme", "example": + return true + default: + return false + } +} + +func isCredentialJSONSecretKey(key string, credentialContext bool) bool { + normalized := normalizeCredentialJSONKey(key) + if 
credentialJSONKeyRegex.MatchString(normalized) { + return true + } + return credentialContext && genericPasswordKeyRegex.MatchString(normalized) +} + +func isCredentialJSONObject(obj map[string]any) bool { + var hasHost, hasUser bool + for key := range obj { + switch normalizeCredentialJSONKey(key) { + case "host", "hostname", "server", "addr", "address", "datasource", "data_source": + hasHost = true + case "user", "username", "userid", "user_id", "uid": + hasUser = true + } + if hasHost && hasUser { + return true + } + } + return false +} + +func normalizeCredentialJSONKey(key string) string { + key = strings.ToLower(strings.TrimSpace(key)) + key = strings.ReplaceAll(key, "-", "_") + key = strings.ReplaceAll(key, " ", "_") + return key +} + // Bytes is a convenience wrapper around String for []byte content. func Bytes(b []byte) []byte { s := string(b) @@ -268,24 +492,79 @@ func JSONLContent(content string) (string, error) { // applyJSONReplacements applies collected (original, redacted) string pairs // to the raw JSON text, replacing JSON-encoded originals with their redacted forms. // Returns s unchanged if repls is empty. 
-func applyJSONReplacements(s string, repls [][2]string) (string, error) { +func applyJSONReplacements(s string, repls []jsonReplacement) (string, error) { if len(repls) == 0 { return s, nil } for _, r := range repls { - origJSON, err := jsonEncodeString(r[0]) + origJSON, err := jsonEncodeString(r.original) + if err != nil { + return "", err + } + replJSON, err := jsonEncodeString(r.redacted) if err != nil { return "", err } - replJSON, err := jsonEncodeString(r[1]) + if r.key == "" { + s = strings.ReplaceAll(s, origJSON, replJSON) + continue + } + keyJSON, err := jsonEncodeString(r.key) if err != nil { return "", err } - s = strings.ReplaceAll(s, origJSON, replJSON) + s = replaceKeyedJSONValue(s, keyJSON, origJSON, replJSON) } return s, nil } +// replaceKeyedJSONValue replaces every occurrence of origJSON that follows +// keyJSON + optional whitespace + ':' + optional whitespace. Restricts +// substitution to value positions so a key's own redacted text is not +// rewritten when it collides with another field's value. +func replaceKeyedJSONValue(s, keyJSON, origJSON, replJSON string) string { + if !strings.Contains(s, keyJSON) { + return s + } + var b strings.Builder + b.Grow(len(s)) + i := 0 + for i < len(s) { + j := strings.Index(s[i:], keyJSON) + if j < 0 { + b.WriteString(s[i:]) + break + } + keyEnd := i + j + len(keyJSON) + b.WriteString(s[i : i+j]) + b.WriteString(keyJSON) + p := keyEnd + for p < len(s) && isJSONWhitespace(s[p]) { + p++ + } + if p >= len(s) || s[p] != ':' { + i = keyEnd + continue + } + p++ + for p < len(s) && isJSONWhitespace(s[p]) { + p++ + } + if p+len(origJSON) <= len(s) && s[p:p+len(origJSON)] == origJSON { + b.WriteString(s[keyEnd:p]) + b.WriteString(replJSON) + i = p + len(origJSON) + continue + } + i = keyEnd + } + return b.String() +} + +func isJSONWhitespace(c byte) bool { + return c == ' ' || c == '\t' || c == '\n' || c == '\r' +} + // isSingleJSONValue returns true if the decoder has reached EOF (no more // top-level values). 
This distinguishes a single JSON value (e.g., pretty-printed // object) from JSONL (multiple concatenated values). We attempt a second Decode @@ -296,37 +575,44 @@ func isSingleJSONValue(dec *json.Decoder) bool { return dec.Decode(&discard) == io.EOF } -// collectJSONLReplacements walks a parsed JSON value and collects unique -// (original, redacted) string pairs for values that need redaction. -func collectJSONLReplacements(v any) [][2]string { +// collectJSONLReplacements walks a parsed JSON value and collects unique string +// replacements for values that need redaction. +func collectJSONLReplacements(v any) []jsonReplacement { seen := make(map[string]bool) - var repls [][2]string - var walk func(v any) - walk = func(v any) { + var repls []jsonReplacement + var walk func(key string, credentialContext bool, v any) + walk = func(key string, credentialContext bool, v any) { switch val := v.(type) { case map[string]any: if shouldSkipJSONLObject(val) { return } + childCredentialContext := credentialContext || isCredentialJSONObject(val) for k, child := range val { if shouldSkipJSONLField(k) { continue } - walk(child) + walk(k, childCredentialContext, child) } case []any: for _, child := range val { - walk(child) + walk("", credentialContext, child) } case string: redacted := String(val) - if redacted != val && !seen[val] { - seen[val] = true - repls = append(repls, [2]string{val, redacted}) + if redacted == val && isCredentialJSONSecretKey(key, credentialContext) && hasNonPlaceholderPasswordValue(val) { + redacted = RedactedPlaceholder + } + if redacted != val { + seenKey := key + "\x00" + val + if !seen[seenKey] { + seen[seenKey] = true + repls = append(repls, jsonReplacement{key: key, original: val, redacted: redacted}) + } } } } - walk(v) + walk("", false, v) return repls } diff --git a/redact/redact_test.go b/redact/redact_test.go index 5e5cecfbe9..94a8da2376 100644 --- a/redact/redact_test.go +++ b/redact/redact_test.go @@ -28,6 +28,25 @@ func 
openSSHPrivateKeyMarker(kind string) string { return "-----" + kind + " " + "OPEN" + "SSH" + " " + "PRIVATE" + " KEY-----" } +type stringRedactionCase struct { + name string + input string + want string +} + +func assertStringRedactionCases(t *testing.T, tests []stringRedactionCase) { + t.Helper() + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := String(tt.input) + if got != tt.want { + t.Errorf("String(%q) = %q, want %q", tt.input, got, tt.want) + } + }) + } +} + func TestBytes_NoSecrets(t *testing.T) { input := []byte("hello world, this is normal text") result := Bytes(input) @@ -183,7 +202,7 @@ func TestCollectJSONLReplacements_Succeeds(t *testing.T) { } repls := collectJSONLReplacements(obj) // expect one replacement for high-entropy secret - want := [][2]string{{"token=" + highEntropySecret, "REDACTED"}} + want := []jsonReplacement{{key: "content", original: "token=" + highEntropySecret, redacted: "REDACTED"}} if !slices.Equal(repls, want) { t.Errorf("got %q, want %q", repls, want) } @@ -251,8 +270,25 @@ func TestShouldSkipJSONLField_RedactionBehavior(t *testing.T) { if len(repls) != 1 { t.Fatalf("expected 1 replacement, got %d", len(repls)) } - if repls[0][0] != highEntropySecret { - t.Errorf("expected replacement for secret in content field, got %q", repls[0][0]) + if repls[0].original != highEntropySecret { + t.Errorf("expected replacement for secret in content field, got %q", repls[0].original) + } +} + +func TestJSONLContent_SkippedFieldValueCollision(t *testing.T) { + t.Parallel() + input := `{"session_id":"` + highEntropySecret + `","content":"` + highEntropySecret + `"}` + + result, err := JSONLContent(input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if !strings.Contains(result, `"session_id":"`+highEntropySecret+`"`) { + t.Fatalf("expected skipped session_id to be preserved, got: %s", result) + } + if !strings.Contains(result, `"content":"REDACTED"`) { + t.Fatalf("expected content field 
to be redacted, got: %s", result) } } @@ -355,6 +391,193 @@ func TestString_CredentialedURIs(t *testing.T) { } } +func TestString_DatabaseConnectionStringRedaction(t *testing.T) { + t.Parallel() + assertStringRedactionCases(t, []stringRedactionCase{ + { + name: "postgres keyword DSN", + input: `dsn="host=db.example.com port=5432 user=svc password=secret dbname=app sslmode=require"`, + want: `dsn="REDACTED"`, + }, + { + name: "postgres keyword DSN different order", + input: "password=secret sslmode=require user=svc host=db.example.com dbname=app", + want: "REDACTED", + }, + { + name: "sql server connection string", + input: "conn=Server=tcp:db.example.com,1433;Database=app;User Id=svc;Password=secret;Encrypt=true", + want: "conn=REDACTED", + }, + { + name: "odbc connection string", + input: "conn=Driver={ODBC Driver 18 for SQL Server};Server=db;UID=svc;PWD=secret;Database=app", + want: "conn=REDACTED", + }, + { + name: "jdbc query password", + input: "jdbc:postgresql://db.example.com:5432/app?user=svc&password=secret&ssl=true", + want: "REDACTED", + }, + { + name: "postgres URL query password without userinfo", + input: "DATABASE_URL=postgresql://db.example.com:5432/app?user=svc&password=secret&sslmode=require", + want: "DATABASE_URL=REDACTED", + }, + { + name: "postgres URL query password is case-insensitive", + input: "DATABASE_URL=postgresql://db.example.com:5432/app?user=svc&Password=secret&sslmode=require", + want: "DATABASE_URL=REDACTED", + }, + { + name: "mongodb URL query password without userinfo", + input: "MONGO_URL=mongodb://cluster0.example.mongodb.net/app?authSource=admin&username=svc&password=secret", + want: "MONGO_URL=REDACTED", + }, + { + name: "mongodb srv URL query password without userinfo", + input: "MONGO_URL=mongodb+srv://cluster0.example.mongodb.net/app?authSource=admin&username=svc&password=secret", + want: "MONGO_URL=REDACTED", + }, + { + name: "placeholder password in database URL query is preserved", + input: 
"DATABASE_URL=postgresql://db.example.com/app?user=svc&password=${DB_PASSWORD}", + want: "DATABASE_URL=postgresql://db.example.com/app?user=svc&password=${DB_PASSWORD}", + }, + { + name: "jdbc semicolon password", + input: "jdbc:sqlserver://db.example.com:1433;databaseName=app;user=svc;password=secret;encrypt=true", + want: "REDACTED", + }, + }) +} + +func TestDatabaseConnectionStringRuleScope(t *testing.T) { + t.Parallel() + tests := []struct { + name string + candidate string + hasSecret func(string) bool + want bool + }{ + { + name: "database URL query password is in scope", + candidate: "postgresql://db.example.com:5432/app?user=svc&password=secret&sslmode=require", + hasSecret: hasDatabaseURLSecret, + want: true, + }, + { + name: "database URL userinfo password is handled by credentialed URI detection", + candidate: "postgresql://svc:secret@db.example.com:5432/app", + hasSecret: hasDatabaseURLSecret, + want: false, + }, + { + name: "JDBC query password is in scope", + candidate: "jdbc:postgresql://db.example.com:5432/app?user=svc&password=secret", + hasSecret: hasJDBCPassword, + want: true, + }, + { + name: "JDBC userinfo password is handled by credentialed URI detection", + candidate: "jdbc:postgresql://svc:secret@db.example.com:5432/app", + hasSecret: hasJDBCPassword, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := tt.hasSecret(tt.candidate) + if got != tt.want { + t.Errorf("hasSecret(%q) = %v, want %v", tt.candidate, got, tt.want) + } + }) + } +} + +func TestString_BoundedCredentialValueRedaction(t *testing.T) { + t.Parallel() + assertStringRedactionCases(t, []stringRedactionCase{ + { + name: "db password env var", + input: "DB_PASSWORD=secret123", + want: "DB_PASSWORD=REDACTED", + }, + { + name: "postgres password env var", + input: "PGPASSWORD='secret123'", + want: "PGPASSWORD='REDACTED'", + }, + { + name: "redis password env var", + input: `REDIS_PASSWORD="secret123"`, + want: 
`REDIS_PASSWORD="REDACTED"`, + }, + { + name: "lowercase database password", + input: "database_password=secret123", + want: "database_password=REDACTED", + }, + { + name: "prefixed db password env var", + input: "APP_DB_PASSWORD=secret123", + want: "APP_DB_PASSWORD=REDACTED", + }, + { + name: "prefixed mysql password env var", + input: "PROD_MYSQL_PWD=secret123", + want: "PROD_MYSQL_PWD=REDACTED", + }, + }) +} + +func TestString_BoundedCredentialValueOverRedactionGuards(t *testing.T) { + t.Parallel() + assertStringRedactionCases(t, []stringRedactionCase{ + { + name: "placeholder env var is preserved", + input: "DB_PASSWORD=${DB_PASSWORD}", + want: "DB_PASSWORD=${DB_PASSWORD}", + }, + { + name: "already redacted value is preserved", + input: "DB_PASSWORD=REDACTED", + want: "DB_PASSWORD=REDACTED", + }, + { + name: "prose about password is preserved", + input: "the password field should be rotated regularly", + want: "the password field should be rotated regularly", + }, + { + name: "generic key is preserved", + input: "key=not-a-secret-setting", + want: "key=not-a-secret-setting", + }, + { + name: "shell pwd is preserved", + input: "PWD=/workspace/project", + want: "PWD=/workspace/project", + }, + { + name: "standalone password assignment is preserved", + input: "password=not-a-secret-setting", + want: "password=not-a-secret-setting", + }, + { + name: "password reset query parameter is preserved", + input: "https://example.com/?password_reset=true", + want: "https://example.com/?password_reset=true", + }, + { + name: "generic https password query is preserved", + input: "https://example.com/callback?user=svc&password=not-a-db-credential&debug=true", + want: "https://example.com/callback?user=svc&password=not-a-db-credential&debug=true", + }, + }) +} + func TestString_OpenSSHPrivateKeyBlock(t *testing.T) { input := "key:\n" + fakeOpenSSHPrivateKey + "\nend" want := "key:\nREDACTED\nend" @@ -402,6 +625,76 @@ func TestJSONLContent_OpenSSHPrivateKeyBlock(t *testing.T) { 
} } +func TestJSONLContent_DatabaseCredentialRedaction(t *testing.T) { + t.Parallel() + input := `{"type":"assistant","message":"dsn host=db.example.com user=svc password=secret dbname=app and env DB_PASSWORD=secret123","session_id":"ses_37273a1fdffegpYbwUTqEkPsQ0","file_path":"/tmp/TestE2E_ExistingFiles/controller.go"}` + + result, err := JSONLContent(input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + for _, leaked := range []string{"password=secret", "DB_PASSWORD=secret123"} { + if strings.Contains(result, leaked) { + t.Fatalf("expected %q to be redacted, got: %s", leaked, result) + } + } + for _, preserved := range []string{"ses_37273a1fdffegpYbwUTqEkPsQ0", "/tmp/TestE2E_ExistingFiles/controller.go"} { + if !strings.Contains(result, preserved) { + t.Fatalf("expected structural value %q to be preserved, got: %s", preserved, result) + } + } +} + +func TestJSONLContent_StructuredCredentialFieldsRedacted(t *testing.T) { + t.Parallel() + input := `{"type":"assistant","env":{"DB_PASSWORD":"correct-horse-db","REDIS_PASSWORD":"${REDIS_PASSWORD}","note":"correct-horse-db"},"db":{"password":"correct-horse-db","host":"db.example.com","user":"svc"},"session_id":"ses_37273a1fdffegpYbwUTqEkPsQ0"}` + + result, err := JSONLContent(input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + for _, leaked := range []string{`"DB_PASSWORD":"correct-horse-db"`, `"password":"correct-horse-db"`} { + if strings.Contains(result, leaked) { + t.Fatalf("expected structured credential field %q to be redacted, got: %s", leaked, result) + } + } + for _, preserved := range []string{ + `"DB_PASSWORD":"REDACTED"`, + `"REDIS_PASSWORD":"${REDIS_PASSWORD}"`, + `"password":"REDACTED"`, + `"host":"db.example.com"`, + `"user":"svc"`, + `"note":"correct-horse-db"`, + "ses_37273a1fdffegpYbwUTqEkPsQ0", + } { + if !strings.Contains(result, preserved) { + t.Fatalf("expected %q to be preserved, got: %s", preserved, result) + } + } +} + +func 
TestJSONLContent_NormalizedCredentialKeysRedacted(t *testing.T) { + t.Parallel() + input := `{"type":"assistant","env":{"DB Password":"correct-horse-db","note":"correct-horse-db"},"session_id":"ses_37273a1fdffegpYbwUTqEkPsQ0"}` + + result, err := JSONLContent(input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + for _, preserved := range []string{ + `"DB Password":"REDACTED"`, + `"note":"correct-horse-db"`, + "ses_37273a1fdffegpYbwUTqEkPsQ0", + } { + if !strings.Contains(result, preserved) { + t.Fatalf("expected %q to be preserved, got: %s", preserved, result) + } + } + if strings.Contains(result, `"DB Password":"correct-horse-db"`) { + t.Fatalf("expected normalized credential key to be redacted, got: %s", result) + } +} + func TestShouldSkipJSONLObject(t *testing.T) { tests := []struct { name string @@ -458,7 +751,7 @@ func TestShouldSkipJSONLObject_RedactionBehavior(t *testing.T) { repls := collectJSONLReplacements(obj) // expect no replacements, it's an image which is skipped. - var wantRepls [][2]string + var wantRepls []jsonReplacement if !slices.Equal(repls, wantRepls) { t.Errorf("got %q, want %q", repls, wantRepls) } @@ -469,7 +762,7 @@ func TestShouldSkipJSONLObject_RedactionBehavior(t *testing.T) { "content": highEntropySecret, } repls2 := collectJSONLReplacements(obj2) - wantRepls2 := [][2]string{{highEntropySecret, "REDACTED"}} + wantRepls2 := []jsonReplacement{{key: "content", original: highEntropySecret, redacted: "REDACTED"}} if !slices.Equal(repls2, wantRepls2) { t.Errorf("got %q, want %q", repls2, wantRepls2) }