From 652cf0deb20c892d4d8e7d5eae5c7c2cd5309c69 Mon Sep 17 00:00:00 2001 From: Dean Sheather Date: Wed, 6 May 2026 17:46:35 +0000 Subject: [PATCH 1/2] tshttpproxy: degrade gracefully when WPAD is unavailable On Windows, tailscaled invokes WinHttpGetProxyForUrl on every outbound HTTP request to discover a proxy via WPAD (DHCP option 252 / DNS A wpad.). On networks where WPAD is not configured this call blocks for up to 5s and then returns ERROR_WINHTTP_AUTODETECTION_FAILED or ERROR_WINHTTP_UNABLE_TO_DOWNLOAD_SCRIPT. The previous 10s negative-cache window was repeatedly torn down by InvalidateCache() on every netmon link change, so control / DERP / log uploads spent most of their time waiting on dead WPAD probes. This change: * Bumps the negative-cache backoff from 10s to 5 minutes for autodetection-failed / script-download-failed, and to 30s for the overall context timeout. InvalidateCache() still resets it on link changes, so we re-probe WPAD whenever the network actually changes. * Stops returning a stale cachedProxy.val on context-deadline timeout. A previous network's PAC-resolved proxy is the wrong answer when roaming to a network without WPAD; return (nil, nil) so the caller dials direct. * Maps every unmapped WinHTTP error to (nil, nil) plus a backoff instead of returning the raw error. The package-level ProxyFromEnvironment already discards sysProxyFromEnv errors, but making the contract explicit at the platform layer prevents future regressions and makes the intent obvious in the source. WPAD continues to work unchanged on networks where it is configured; this change only affects how we degrade when it is not. Adds a cross-platform regression test that injects a failing sysProxyFromEnv hook and asserts ProxyFromEnvironment returns (nil, nil), respects an active noProxyUntil backoff, and re-probes after InvalidateCache(). References: tailscale/tailscale#17055, tailscale/tailscale#10215 --- net/tshttpproxy/tshttpproxy_test.go | 76 ++++++++++++++++++++++++++ net/tshttpproxy/tshttpproxy_windows.go | 54 +++++++++++++++--- 2 files changed, 123 insertions(+), 7 deletions(-) diff --git a/net/tshttpproxy/tshttpproxy_test.go b/net/tshttpproxy/tshttpproxy_test.go index 5b9362d2c094d..d16162b5d36d7 100644 --- a/net/tshttpproxy/tshttpproxy_test.go +++ b/net/tshttpproxy/tshttpproxy_test.go @@ -4,6 +4,7 @@ package tshttpproxy import ( + "errors" "net/http" "net/url" "os" @@ -205,3 +206,78 @@ func TestSetSelfProxy(t *testing.T) { }) } } + +// TestProxyFromEnvironment_sysProxyError verifies that when the +// platform-specific proxy lookup (e.g. Windows WPAD via WinHTTP) returns +// an error, ProxyFromEnvironment surfaces (nil, nil) to the caller so +// http.Transport falls back to a direct connection rather than failing +// the request. This is the cross-platform glue side of the +// coder/tailscale fix for tailscale/tailscale#17055 / #10215. +func TestProxyFromEnvironment_sysProxyError(t *testing.T) { + // Make sure no env-var proxy interferes with the fallthrough. + for _, k := range []string{"HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY", "http_proxy", "https_proxy", "no_proxy"} { + if v, ok := os.LookupEnv(k); ok { + os.Unsetenv(k) + t.Cleanup(func() { os.Setenv(k, v) }) + } + } + // Reset the package-level config/proxyFunc cache so the env-var + // changes above take effect. + mu.Lock() + config = nil + proxyFunc = nil + noProxyUntil = time.Time{} + mu.Unlock() + t.Cleanup(func() { + mu.Lock() + config = nil + proxyFunc = nil + noProxyUntil = time.Time{} + mu.Unlock() + }) + + prev := sysProxyFromEnv + t.Cleanup(func() { sysProxyFromEnv = prev }) + + var calls int + wantErr := errors.New("synthetic WPAD failure") + sysProxyFromEnv = func(*http.Request) (*url.URL, error) { + calls++ + return nil, wantErr + } + + req := &http.Request{URL: must.Get(url.Parse("https://example.com/"))} + got, err := ProxyFromEnvironment(req) + if err != nil { + t.Fatalf("ProxyFromEnvironment returned error %v; want nil so transport dials direct", err) + } + if got != nil { + t.Fatalf("ProxyFromEnvironment = %v; want nil", got) + } + if calls != 1 { + t.Fatalf("sysProxyFromEnv was called %d times; want 1", calls) + } + + // Simulate the platform layer applying a backoff after a failure. + // While the backoff is active, ProxyFromEnvironment must not + // re-invoke the platform hook. + setNoProxyUntil(time.Minute) + got, err = ProxyFromEnvironment(req) + if err != nil || got != nil { + t.Fatalf("during backoff: got (%v, %v); want (nil, nil)", got, err) + } + if calls != 1 { + t.Fatalf("sysProxyFromEnv was called %d times during backoff; want 1", calls) + } + + // InvalidateCache (e.g. on a netmon link change) must clear the + // backoff and allow a fresh platform lookup. + InvalidateCache() + got, err = ProxyFromEnvironment(req) + if err != nil || got != nil { + t.Fatalf("after InvalidateCache: got (%v, %v); want (nil, nil)", got, err) + } + if calls != 2 { + t.Fatalf("after InvalidateCache: sysProxyFromEnv calls = %d; want 2", calls) + } +} diff --git a/net/tshttpproxy/tshttpproxy_windows.go b/net/tshttpproxy/tshttpproxy_windows.go index 06a1f5ae445d0..ab6d84d790c23 100644 --- a/net/tshttpproxy/tshttpproxy_windows.go +++ b/net/tshttpproxy/tshttpproxy_windows.go @@ -52,6 +52,25 @@ var ( metricErrOther = clientmetric.NewCounter("winhttp_proxy_err_other") ) +// WPAD backoffs. +// +// Coder fork: previously these were 10s, which combined with +// InvalidateCache() being called on every netmon link change meant a +// host with no WPAD server (DHCP option 252 unset, no wpad. A +// record) would re-issue a 5-second blocking WinHttpGetProxyForUrl call +// for nearly every outbound HTTP request. Bumping the negative-cache +// duration to several minutes lets normal traffic flow while still +// re-trying WPAD periodically and on every link change (the latter via +// InvalidateCache, which zeros noProxyUntil). +// +// See tailscale/tailscale#17055 and tailscale/tailscale#10215. +const ( + wpadAutodetectFailedBackoff = 5 * time.Minute + wpadDownloadFailedBackoff = 5 * time.Minute + wpadTimeoutBackoff = 30 * time.Second + wpadUnknownErrorBackoff = 5 * time.Minute +) + func proxyFromWinHTTPOrCache(req *http.Request) (*url.URL, error) { if req.URL == nil { return nil, nil @@ -86,13 +105,27 @@ func proxyFromWinHTTPOrCache(req *http.Request) (*url.URL, error) { } // See https://docs.microsoft.com/en-us/windows/win32/winhttp/error-messages + // + // Coder fork: every error path below now returns (nil, nil) and + // applies a longer negative-cache backoff. The package-level + // ProxyFromEnvironment already discards sysProxyFromEnv errors, + // so propagating one here was effectively dead code — but + // returning nil explicitly documents the contract and keeps us + // honest if that wrapper ever changes. The real bug being fixed + // is the backoff durations being far too short: with the + // previous 10s, a host with no WPAD server (DHCP option 252 + // unset, no wpad. A record) would re-issue a 5s blocking + // WinHttpGetProxyForUrl on nearly every outbound HTTP request, + // because netmon link changes call InvalidateCache() and reset + // the shield constantly. See tailscale/tailscale#17055 and + // tailscale/tailscale#10215. const ( ERROR_WINHTTP_AUTODETECTION_FAILED = 12180 ERROR_WINHTTP_UNABLE_TO_DOWNLOAD_SCRIPT = 12167 ) if err == syscall.Errno(ERROR_WINHTTP_AUTODETECTION_FAILED) { metricErrDetectionFailed.Add(1) - setNoProxyUntil(10 * time.Second) + setNoProxyUntil(wpadAutodetectFailedBackoff) return nil, nil } if err == windows.ERROR_INVALID_PARAMETER { @@ -106,17 +139,24 @@ func proxyFromWinHTTPOrCache(req *http.Request) (*url.URL, error) { proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): %v/%#v", urlStr, err, err) if err == syscall.Errno(ERROR_WINHTTP_UNABLE_TO_DOWNLOAD_SCRIPT) { metricErrDownloadScript.Add(1) - setNoProxyUntil(10 * time.Second) + setNoProxyUntil(wpadDownloadFailedBackoff) return nil, nil } metricErrOther.Add(1) - return nil, err + // Coder fork: do not propagate unmapped WinHTTP errors. Treat + // them as "no proxy discovered" so the caller dials direct. + setNoProxyUntil(wpadUnknownErrorBackoff) + return nil, nil case <-ctx.Done(): metricErrTimeout.Add(1) - cachedProxy.Lock() - defer cachedProxy.Unlock() - proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): timeout; using cached proxy %v", urlStr, cachedProxy.val) - return cachedProxy.val, nil + // Coder fork: previously returned cachedProxy.val, which + // stranded the client on a stale proxy from a previous + // network. Return no-proxy and apply a short backoff so + // in-flight WinHTTP probes don't pile up on every request + // while WPAD is slow. + setNoProxyUntil(wpadTimeoutBackoff) + proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): timeout; falling back to direct", urlStr) + return nil, nil } } From cf8ada465df82f072207ef6cdd38469975f0967a Mon Sep 17 00:00:00 2001 From: Dean Sheather Date: Fri, 8 May 2026 15:43:13 +0000 Subject: [PATCH 2/2] tshttpproxy: address deep-review findings on PR #118 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary of changes in response to review: * Add an InvalidateCache epoch counter (tshttpproxy.go) and snapshot it at the start of every WPAD probe. The Windows error/timeout branches only commit negative-cache state when the epoch hasn't advanced, so a stale probe from a previous network can no longer stomp noProxyUntil and lock out WPAD on a freshly-changed network for 5 minutes. * Restore an epoch-gated cachedProxy fallback on context-deadline timeout. When the cached proxy was discovered under the current epoch and a probe times out, return the cached value instead of dialing direct — this keeps users on networks that legitimately require a proxy from silently bypassing it during transient WinHTTP slowness. When the cache is stale or empty, we still fall through to direct with a short backoff, which addresses the original 'stranded on a stale proxy after roaming' concern. * Make ProxyFromEnvironment's contract explicit: it now drops sysProxyFromEnv errors via an explicit '_' instead of relying on variable shadowing. This was load-bearing for the Windows fix; a future refactor could have silently re-propagated WinHTTP errors and broken outbound traffic on WPAD-less hosts. * Refactor proxyFromWinHTTPOrCache to take an injectable winHTTPLookup package var and add Windows-targeted tests covering: success path (with epoch stamp), autodetect-failed, unmapped error, timeout with no cached proxy, timeout with fresh cached proxy, timeout with stale-epoch cached proxy, and an explicit regression test for the 'stale probe stomps fresh InvalidateCache' race. * Rename backoff constants for consistency (wpadTimeoutBackoff -> wpadTimeoutFailedBackoff, etc.), fold the ERROR_INVALID_PARAMETER 1-hour value into the named-constant block, and drop the unmapped-error backoff from 5 minutes to 1 minute so a one-off transient WinHTTP error doesn't suppress autodetection for longer than necessary. * Trim the duplicate Coder fork: block comment inside the function body; the rationale lives once on the const block now. * Rename the cross-platform test's misleading wantErr to syntheticErr (the value is intentionally swallowed, not asserted on). --- net/tshttpproxy/tshttpproxy.go | 25 +- net/tshttpproxy/tshttpproxy_test.go | 4 +- net/tshttpproxy/tshttpproxy_windows.go | 104 ++++--- net/tshttpproxy/tshttpproxy_windows_test.go | 312 ++++++++++++++++++++ 4 files changed, 402 insertions(+), 43 deletions(-) create mode 100644 net/tshttpproxy/tshttpproxy_windows_test.go diff --git a/net/tshttpproxy/tshttpproxy.go b/net/tshttpproxy/tshttpproxy.go index 24b0050e96425..ff282e2d38e2c 100644 --- a/net/tshttpproxy/tshttpproxy.go +++ b/net/tshttpproxy/tshttpproxy.go @@ -28,11 +28,25 @@ func InvalidateCache() { mu.Lock() defer mu.Unlock() noProxyUntil = time.Time{} + epoch++ +} + +// CurrentEpoch returns a counter that increments on every InvalidateCache +// call. Platform-specific sysProxyFromEnv implementations can snapshot the +// epoch at the start of an in-flight probe and check it again before +// committing state derived from that probe (e.g. a negative-cache backoff), +// so that a probe issued on one network does not stomp the next network's +// state after a link change. +func CurrentEpoch() uint64 { + mu.Lock() + defer mu.Unlock() + return epoch } var ( mu sync.Mutex noProxyUntil time.Time // if non-zero, time at which ProxyFromEnvironment should check again + epoch uint64 // incremented on every InvalidateCache config *httpproxy.Config // used to create proxyFunc proxyFunc func(*url.URL) (*url.URL, error) ) @@ -120,6 +134,11 @@ var sysProxyFromEnv func(*http.Request) (*url.URL, error) // ProxyFromEnvironment is like the standard library's http.ProxyFromEnvironment // but additionally does OS-specific proxy lookups if the environment variables // alone don't specify a proxy. +// +// Errors from the platform-specific sysProxyFromEnv hook (e.g. WinHTTP/WPAD +// on Windows) are intentionally dropped: a failed system-proxy lookup must +// degrade to "no proxy, dial direct" rather than fail the outbound request. +// Errors from the env-var-derived proxy are propagated as before. func ProxyFromEnvironment(req *http.Request) (*url.URL, error) { localProxyFunc := getProxyFunc() u, err := localProxyFunc(req.URL) @@ -135,9 +154,9 @@ func ProxyFromEnvironment(req *http.Request) (*url.URL, error) { } if sysProxyFromEnv != nil { - u, err := sysProxyFromEnv(req) - if u != nil && err == nil { - return u, nil + // Drop the sys error explicitly; see func docs. + if su, serr := sysProxyFromEnv(req); su != nil && serr == nil { + return su, nil } } diff --git a/net/tshttpproxy/tshttpproxy_test.go b/net/tshttpproxy/tshttpproxy_test.go index d16162b5d36d7..d9e73be22bfa1 100644 --- a/net/tshttpproxy/tshttpproxy_test.go +++ b/net/tshttpproxy/tshttpproxy_test.go @@ -240,10 +240,10 @@ func TestProxyFromEnvironment_sysProxyError(t *testing.T) { t.Cleanup(func() { sysProxyFromEnv = prev }) var calls int - wantErr := errors.New("synthetic WPAD failure") + syntheticErr := errors.New("synthetic WPAD failure") sysProxyFromEnv = func(*http.Request) (*url.URL, error) { calls++ - return nil, wantErr + return nil, syntheticErr } req := &http.Request{URL: must.Get(url.Parse("https://example.com/"))} diff --git a/net/tshttpproxy/tshttpproxy_windows.go b/net/tshttpproxy/tshttpproxy_windows.go index ab6d84d790c23..c5d58e5035107 100644 --- a/net/tshttpproxy/tshttpproxy_windows.go +++ b/net/tshttpproxy/tshttpproxy_windows.go @@ -31,9 +31,16 @@ func init() { sysAuthHeader = sysAuthHeaderWindows } +// cachedProxy holds the most recent successful WPAD result and the +// InvalidateCache epoch it was discovered under. The timeout branch of +// proxyFromWinHTTPOrCache reads val as a fallback when an in-flight +// probe blows past the per-request deadline, but only when epoch +// matches the current one so we never strand the client on a proxy +// from a previous network. var cachedProxy struct { sync.Mutex - val *url.URL + val *url.URL + epoch uint64 } // proxyErrorf is a rate-limited logger specifically for errors asking @@ -52,31 +59,45 @@ var ( metricErrOther = clientmetric.NewCounter("winhttp_proxy_err_other") ) -// WPAD backoffs. +// WPAD negative-cache backoffs. // // Coder fork: previously these were 10s, which combined with // InvalidateCache() being called on every netmon link change meant a // host with no WPAD server (DHCP option 252 unset, no wpad. A // record) would re-issue a 5-second blocking WinHttpGetProxyForUrl call // for nearly every outbound HTTP request. Bumping the negative-cache -// duration to several minutes lets normal traffic flow while still -// re-trying WPAD periodically and on every link change (the latter via -// InvalidateCache, which zeros noProxyUntil). +// duration lets normal traffic flow while still re-trying WPAD on +// every link change (via InvalidateCache, which zeros noProxyUntil). // // See tailscale/tailscale#17055 and tailscale/tailscale#10215. const ( - wpadAutodetectFailedBackoff = 5 * time.Minute - wpadDownloadFailedBackoff = 5 * time.Minute - wpadTimeoutBackoff = 30 * time.Second - wpadUnknownErrorBackoff = 5 * time.Minute + wpadAutodetectFailedBackoff = 5 * time.Minute + wpadDownloadFailedBackoff = 5 * time.Minute + wpadTimeoutFailedBackoff = 30 * time.Second + wpadUnknownErrorFailedBackoff = 1 * time.Minute + // wpadInvalidParameterFailedBackoff is intentionally long: this + // error is only seen on Windows 8.1 and is not transient. See + // tailscale/tailscale#879. + wpadInvalidParameterFailedBackoff = 1 * time.Hour ) +// winHTTPLookup performs the underlying WPAD lookup. It is a package +// variable so tests can swap in a fake without exercising winhttp.dll. +var winHTTPLookup = proxyFromWinHTTP + func proxyFromWinHTTPOrCache(req *http.Request) (*url.URL, error) { if req.URL == nil { return nil, nil } urlStr := req.URL.String() + // Snapshot the InvalidateCache epoch before launching the probe. + // If a netmon link change increments the epoch while the probe is + // in flight, we must not commit any state derived from this probe + // (negative-cache backoff, cached-proxy fallback) onto the new + // network — that probe's answer is for the old network only. + startEpoch := CurrentEpoch() + ctx, cancel := context.WithTimeout(req.Context(), 5*time.Second) defer cancel() @@ -86,10 +107,21 @@ func proxyFromWinHTTPOrCache(req *http.Request) (*url.URL, error) { } resc := make(chan result, 1) go func() { - proxy, err := proxyFromWinHTTP(ctx, urlStr) + proxy, err := winHTTPLookup(ctx, urlStr) resc <- result{proxy, err} }() + // setBackoff applies a negative-cache backoff only if the + // InvalidateCache epoch hasn't advanced since the probe started. + // Stale probes from a previous network must not gate fresh + // lookups on the current one. + setBackoff := func(d time.Duration) { + if CurrentEpoch() != startEpoch { + return + } + setNoProxyUntil(d) + } + select { case res := <-resc: err := res.err @@ -101,60 +133,56 @@ func proxyFromWinHTTPOrCache(req *http.Request) (*url.URL, error) { log.Printf("tshttpproxy: winhttp: updating cached proxy setting from %v to %v", was, now) } cachedProxy.val = res.proxy + cachedProxy.epoch = startEpoch return res.proxy, nil } // See https://docs.microsoft.com/en-us/windows/win32/winhttp/error-messages - // - // Coder fork: every error path below now returns (nil, nil) and - // applies a longer negative-cache backoff. The package-level - // ProxyFromEnvironment already discards sysProxyFromEnv errors, - // so propagating one here was effectively dead code — but - // returning nil explicitly documents the contract and keeps us - // honest if that wrapper ever changes. The real bug being fixed - // is the backoff durations being far too short: with the - // previous 10s, a host with no WPAD server (DHCP option 252 - // unset, no wpad. A record) would re-issue a 5s blocking - // WinHttpGetProxyForUrl on nearly every outbound HTTP request, - // because netmon link changes call InvalidateCache() and reset - // the shield constantly. See tailscale/tailscale#17055 and - // tailscale/tailscale#10215. const ( ERROR_WINHTTP_AUTODETECTION_FAILED = 12180 ERROR_WINHTTP_UNABLE_TO_DOWNLOAD_SCRIPT = 12167 ) if err == syscall.Errno(ERROR_WINHTTP_AUTODETECTION_FAILED) { metricErrDetectionFailed.Add(1) - setNoProxyUntil(wpadAutodetectFailedBackoff) + setBackoff(wpadAutodetectFailedBackoff) return nil, nil } if err == windows.ERROR_INVALID_PARAMETER { metricErrInvalidParameters.Add(1) - // Seen on Windows 8.1. (https://github.com/tailscale/tailscale/issues/879) - // TODO(bradfitz): figure this out. - setNoProxyUntil(time.Hour) + setBackoff(wpadInvalidParameterFailedBackoff) proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): ERROR_INVALID_PARAMETER [unexpected]", urlStr) return nil, nil } proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): %v/%#v", urlStr, err, err) if err == syscall.Errno(ERROR_WINHTTP_UNABLE_TO_DOWNLOAD_SCRIPT) { metricErrDownloadScript.Add(1) - setNoProxyUntil(wpadDownloadFailedBackoff) + setBackoff(wpadDownloadFailedBackoff) return nil, nil } metricErrOther.Add(1) - // Coder fork: do not propagate unmapped WinHTTP errors. Treat - // them as "no proxy discovered" so the caller dials direct. - setNoProxyUntil(wpadUnknownErrorBackoff) + // Coder fork: every error path here returns (nil, nil) so the + // caller dials direct rather than failing the request. See the + // package-level ProxyFromEnvironment doc and the const block + // rationale above. + setBackoff(wpadUnknownErrorFailedBackoff) return nil, nil case <-ctx.Done(): metricErrTimeout.Add(1) - // Coder fork: previously returned cachedProxy.val, which - // stranded the client on a stale proxy from a previous - // network. Return no-proxy and apply a short backoff so - // in-flight WinHTTP probes don't pile up on every request - // while WPAD is slow. - setNoProxyUntil(wpadTimeoutBackoff) + // Coder fork: prefer the most recent successful WPAD result, + // but only if it was discovered under the current epoch. A + // cached proxy from a previous network would route this + // request through a now-unreachable host. When the cache is + // stale or empty, fall through to direct and apply a short + // backoff so in-flight WinHTTP probes don't pile up. + cachedProxy.Lock() + fallback := cachedProxy.val + fallbackEpoch := cachedProxy.epoch + cachedProxy.Unlock() + if fallback != nil && fallbackEpoch == startEpoch { + proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): timeout; using cached proxy %v", urlStr, fallback) + return fallback, nil + } + setBackoff(wpadTimeoutFailedBackoff) proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): timeout; falling back to direct", urlStr) return nil, nil } diff --git a/net/tshttpproxy/tshttpproxy_windows_test.go b/net/tshttpproxy/tshttpproxy_windows_test.go new file mode 100644 index 0000000000000..48e3b9674586f --- /dev/null +++ b/net/tshttpproxy/tshttpproxy_windows_test.go @@ -0,0 +1,312 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package tshttpproxy + +import ( + "context" + "net/http" + "net/url" + "sync" + "syscall" + "testing" + "time" + + "tailscale.com/util/must" +) + +// withCleanState resets package globals shared across these Windows +// tests and registers cleanup to put them back when the test exits. +func withCleanState(t *testing.T) { + t.Helper() + + mu.Lock() + prevNoProxyUntil := noProxyUntil + prevEpoch := epoch + noProxyUntil = time.Time{} + mu.Unlock() + + cachedProxy.Lock() + prevCachedVal := cachedProxy.val + prevCachedEpoch := cachedProxy.epoch + cachedProxy.val = nil + cachedProxy.epoch = 0 + cachedProxy.Unlock() + + prevLookup := winHTTPLookup + t.Cleanup(func() { + winHTTPLookup = prevLookup + mu.Lock() + noProxyUntil = prevNoProxyUntil + epoch = prevEpoch + mu.Unlock() + cachedProxy.Lock() + cachedProxy.val = prevCachedVal + cachedProxy.epoch = prevCachedEpoch + cachedProxy.Unlock() + }) +} + +func mustParseURL(t *testing.T, s string) *url.URL { + t.Helper() + return must.Get(url.Parse(s)) +} + +func newReq(t *testing.T) *http.Request { + t.Helper() + return &http.Request{URL: mustParseURL(t, "https://example.com/")} +} + +// TestProxyFromWinHTTPOrCache_AutodetectFailed verifies that the +// "WPAD not configured" error path returns no proxy and applies the +// long autodetect backoff. This is the core "host has no WPAD" case. +func TestProxyFromWinHTTPOrCache_AutodetectFailed(t *testing.T) { + withCleanState(t) + + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + // 12180 = ERROR_WINHTTP_AUTODETECTION_FAILED + return nil, syscall.Errno(12180) + } + + got, err := proxyFromWinHTTPOrCache(newReq(t)) + if err != nil { + t.Fatalf("got err %v; want nil", err) + } + if got != nil { + t.Fatalf("got proxy %v; want nil", got) + } + + mu.Lock() + until := noProxyUntil + mu.Unlock() + if d := time.Until(until); d < 4*time.Minute || d > 6*time.Minute { + t.Fatalf("noProxyUntil set to ~%v from now; want ~%v", d, wpadAutodetectFailedBackoff) + } +} + +// TestProxyFromWinHTTPOrCache_UnknownError verifies that an unmapped +// WinHTTP error doesn't propagate to the caller (would fail the +// outbound HTTP request) and applies the shorter unknown-error backoff. +func TestProxyFromWinHTTPOrCache_UnknownError(t *testing.T) { + withCleanState(t) + + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + return nil, syscall.Errno(99999) // not one we map + } + + got, err := proxyFromWinHTTPOrCache(newReq(t)) + if err != nil { + t.Fatalf("got err %v; want nil so caller dials direct", err) + } + if got != nil { + t.Fatalf("got proxy %v; want nil", got) + } + + mu.Lock() + until := noProxyUntil + mu.Unlock() + if d := time.Until(until); d < 30*time.Second || d > 90*time.Second { + t.Fatalf("noProxyUntil set to ~%v from now; want ~%v", d, wpadUnknownErrorFailedBackoff) + } +} + +// TestProxyFromWinHTTPOrCache_TimeoutNoCached verifies that on probe +// timeout with no cached proxy, the function returns no proxy and +// applies the timeout backoff — i.e. dials direct rather than failing +// the request. +func TestProxyFromWinHTTPOrCache_TimeoutNoCached(t *testing.T) { + withCleanState(t) + + // Block until the parent's 5s context fires. We trim the test by + // shortening that wait via context.WithTimeout in the parent — we + // can't override here, so we just block long enough that the + // parent gives up first. + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + <-ctx.Done() + return nil, ctx.Err() + } + + // We need the request's context to be cancelled before the + // internal 5s timeout to keep this test fast. Use a request + // context with a 50ms deadline so ctx.Done() fires quickly. + reqCtx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + req := (&http.Request{URL: mustParseURL(t, "https://example.com/")}).WithContext(reqCtx) + + got, err := proxyFromWinHTTPOrCache(req) + if err != nil { + t.Fatalf("got err %v; want nil", err) + } + if got != nil { + t.Fatalf("got proxy %v; want nil (no cached fallback under fresh epoch)", got) + } + + mu.Lock() + until := noProxyUntil + mu.Unlock() + if d := time.Until(until); d < 10*time.Second || d > 60*time.Second { + t.Fatalf("noProxyUntil set to ~%v from now; want ~%v", d, wpadTimeoutFailedBackoff) + } +} + +// TestProxyFromWinHTTPOrCache_TimeoutFreshCached verifies that on +// probe timeout, a cached proxy from the *current* epoch is returned +// as a fallback. This protects users on networks that legitimately +// require a proxy when WinHTTP transiently exceeds the per-request +// deadline. +func TestProxyFromWinHTTPOrCache_TimeoutFreshCached(t *testing.T) { + withCleanState(t) + + cached := mustParseURL(t, "http://corp-proxy.example:3128") + cachedProxy.Lock() + cachedProxy.val = cached + cachedProxy.epoch = CurrentEpoch() + cachedProxy.Unlock() + + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + <-ctx.Done() + return nil, ctx.Err() + } + + reqCtx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + req := (&http.Request{URL: mustParseURL(t, "https://example.com/")}).WithContext(reqCtx) + + got, err := proxyFromWinHTTPOrCache(req) + if err != nil { + t.Fatalf("got err %v; want nil", err) + } + if got == nil || got.String() != cached.String() { + t.Fatalf("got %v; want cached fallback %v", got, cached) + } +} + +// TestProxyFromWinHTTPOrCache_TimeoutStaleCached verifies that a +// cached proxy from a *previous* epoch (a network we have since +// roamed off of) is NOT returned on timeout, so we never strand the +// client on an old corp proxy after a link change. +func TestProxyFromWinHTTPOrCache_TimeoutStaleCached(t *testing.T) { + withCleanState(t) + + staleProxy := mustParseURL(t, "http://old-corp-proxy.example:3128") + cachedProxy.Lock() + cachedProxy.val = staleProxy + cachedProxy.epoch = CurrentEpoch() // current... but we'll bump + cachedProxy.Unlock() + + // Simulate a link change: the cached proxy's epoch is now stale. + InvalidateCache() + + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + <-ctx.Done() + return nil, ctx.Err() + } + + reqCtx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + req := (&http.Request{URL: mustParseURL(t, "https://example.com/")}).WithContext(reqCtx) + + got, err := proxyFromWinHTTPOrCache(req) + if err != nil { + t.Fatalf("got err %v; want nil", err) + } + if got != nil { + t.Fatalf("got %v; want nil (stale cached proxy must not be returned after epoch advance)", got) + } +} + +// TestProxyFromWinHTTPOrCache_EpochStompProtection is the regression +// test for the deep-review P1: an in-flight WPAD probe must not stomp +// noProxyUntil after InvalidateCache() has advanced the epoch. Pre-fix, +// a probe issued on the previous network could return ~5s later with +// ERROR_WINHTTP_AUTODETECTION_FAILED and lock out WPAD on the new +// network for 5 full minutes. +func TestProxyFromWinHTTPOrCache_EpochStompProtection(t *testing.T) { + withCleanState(t) + + // Coordinate the probe: it blocks until we release it, then + // returns AUTODETECTION_FAILED as if it ran on the old network. + release := make(chan struct{}) + var lookupStarted sync.WaitGroup + lookupStarted.Add(1) + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + lookupStarted.Done() + <-release + return nil, syscall.Errno(12180) // ERROR_WINHTTP_AUTODETECTION_FAILED + } + + // Spawn the probe on a goroutine. proxyFromWinHTTPOrCache will + // wait on the probe (ctx timeout 5s). + type result struct { + u *url.URL + err error + } + resc := make(chan result, 1) + go func() { + u, err := proxyFromWinHTTPOrCache(newReq(t)) + resc <- result{u, err} + }() + + // Wait for the probe to start (so startEpoch has been captured). + lookupStarted.Wait() + + // Simulate a link change: bump the epoch. + InvalidateCache() + + // Now release the probe. It will return an autodetect-failed + // error from the OLD network. setBackoff must be a no-op because + // the epoch has advanced. + close(release) + + res := <-resc + if res.err != nil { + t.Fatalf("got err %v; want nil", res.err) + } + if res.u != nil { + t.Fatalf("got proxy %v; want nil", res.u) + } + + // Crucial assertion: noProxyUntil must NOT be in the future. If + // the stale probe stomped it, every request on the new network + // would skip WPAD for 5 minutes. + mu.Lock() + until := noProxyUntil + mu.Unlock() + if !until.IsZero() && time.Until(until) > 0 { + t.Fatalf("stale probe stomped noProxyUntil = %v (in %v); should be zero or past", + until, time.Until(until)) + } +} + +// TestProxyFromWinHTTPOrCache_Success verifies the happy path: a +// successful WPAD lookup returns the proxy and stamps the cache with +// the current epoch. +func TestProxyFromWinHTTPOrCache_Success(t *testing.T) { + withCleanState(t) + + want := mustParseURL(t, "http://corp-proxy.example:3128") + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + return want, nil + } + + got, err := proxyFromWinHTTPOrCache(newReq(t)) + if err != nil { + t.Fatalf("got err %v; want nil", err) + } + if got == nil || got.String() != want.String() { + t.Fatalf("got %v; want %v", got, want) + } + + cachedProxy.Lock() + cv := cachedProxy.val + ce := cachedProxy.epoch + cachedProxy.Unlock() + if cv == nil || cv.String() != want.String() { + t.Fatalf("cachedProxy.val = %v; want %v", cv, want) + } + if ce != CurrentEpoch() { + t.Fatalf("cachedProxy.epoch = %d; want current epoch %d", ce, CurrentEpoch()) + } +} + +