diff --git a/net/tshttpproxy/tshttpproxy.go b/net/tshttpproxy/tshttpproxy.go index 24b0050e96425..ff282e2d38e2c 100644 --- a/net/tshttpproxy/tshttpproxy.go +++ b/net/tshttpproxy/tshttpproxy.go @@ -28,11 +28,25 @@ func InvalidateCache() { mu.Lock() defer mu.Unlock() noProxyUntil = time.Time{} + epoch++ +} + +// CurrentEpoch returns a counter that increments on every InvalidateCache +// call. Platform-specific sysProxyFromEnv implementations can snapshot the +// epoch at the start of an in-flight probe and check it again before +// committing state derived from that probe (e.g. a negative-cache backoff), +// so that a probe issued on one network does not stomp the next network's +// state after a link change. +func CurrentEpoch() uint64 { + mu.Lock() + defer mu.Unlock() + return epoch } var ( mu sync.Mutex noProxyUntil time.Time // if non-zero, time at which ProxyFromEnvironment should check again + epoch uint64 // incremented on every InvalidateCache config *httpproxy.Config // used to create proxyFunc proxyFunc func(*url.URL) (*url.URL, error) ) @@ -120,6 +134,11 @@ var sysProxyFromEnv func(*http.Request) (*url.URL, error) // ProxyFromEnvironment is like the standard library's http.ProxyFromEnvironment // but additionally does OS-specific proxy lookups if the environment variables // alone don't specify a proxy. +// +// Errors from the platform-specific sysProxyFromEnv hook (e.g. WinHTTP/WPAD +// on Windows) are intentionally dropped: a failed system-proxy lookup must +// degrade to "no proxy, dial direct" rather than fail the outbound request. +// Errors from the env-var-derived proxy are propagated as before. func ProxyFromEnvironment(req *http.Request) (*url.URL, error) { localProxyFunc := getProxyFunc() u, err := localProxyFunc(req.URL) @@ -135,9 +154,9 @@ func ProxyFromEnvironment(req *http.Request) (*url.URL, error) { } if sysProxyFromEnv != nil { - u, err := sysProxyFromEnv(req) - if u != nil && err == nil { - return u, nil + // Drop the sys error explicitly; see func docs. + if su, serr := sysProxyFromEnv(req); su != nil && serr == nil { + return su, nil } } diff --git a/net/tshttpproxy/tshttpproxy_test.go b/net/tshttpproxy/tshttpproxy_test.go index 5b9362d2c094d..d9e73be22bfa1 100644 --- a/net/tshttpproxy/tshttpproxy_test.go +++ b/net/tshttpproxy/tshttpproxy_test.go @@ -4,6 +4,7 @@ package tshttpproxy import ( + "errors" "net/http" "net/url" "os" @@ -205,3 +206,78 @@ func TestSetSelfProxy(t *testing.T) { }) } } + +// TestProxyFromEnvironment_sysProxyError verifies that when the +// platform-specific proxy lookup (e.g. Windows WPAD via WinHTTP) returns +// an error, ProxyFromEnvironment surfaces (nil, nil) to the caller so +// http.Transport falls back to a direct connection rather than failing +// the request. This is the cross-platform glue side of the +// coder/tailscale fix for tailscale/tailscale#17055 / #10215. +func TestProxyFromEnvironment_sysProxyError(t *testing.T) { + // Make sure no env-var proxy interferes with the fallthrough. + for _, k := range []string{"HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY", "http_proxy", "https_proxy", "no_proxy"} { + if v, ok := os.LookupEnv(k); ok { + os.Unsetenv(k) + t.Cleanup(func() { os.Setenv(k, v) }) + } + } + // Reset the package-level config/proxyFunc cache so the env-var + // changes above take effect. + mu.Lock() + config = nil + proxyFunc = nil + noProxyUntil = time.Time{} + mu.Unlock() + t.Cleanup(func() { + mu.Lock() + config = nil + proxyFunc = nil + noProxyUntil = time.Time{} + mu.Unlock() + }) + + prev := sysProxyFromEnv + t.Cleanup(func() { sysProxyFromEnv = prev }) + + var calls int + syntheticErr := errors.New("synthetic WPAD failure") + sysProxyFromEnv = func(*http.Request) (*url.URL, error) { + calls++ + return nil, syntheticErr + } + + req := &http.Request{URL: must.Get(url.Parse("https://example.com/"))} + got, err := ProxyFromEnvironment(req) + if err != nil { + t.Fatalf("ProxyFromEnvironment returned error %v; want nil so transport dials direct", err) + } + if got != nil { + t.Fatalf("ProxyFromEnvironment = %v; want nil", got) + } + if calls != 1 { + t.Fatalf("sysProxyFromEnv was called %d times; want 1", calls) + } + + // Simulate the platform layer applying a backoff after a failure. + // While the backoff is active, ProxyFromEnvironment must not + // re-invoke the platform hook. + setNoProxyUntil(time.Minute) + got, err = ProxyFromEnvironment(req) + if err != nil || got != nil { + t.Fatalf("during backoff: got (%v, %v); want (nil, nil)", got, err) + } + if calls != 1 { + t.Fatalf("sysProxyFromEnv was called %d times during backoff; want 1", calls) + } + + // InvalidateCache (e.g. on a netmon link change) must clear the + // backoff and allow a fresh platform lookup. + InvalidateCache() + got, err = ProxyFromEnvironment(req) + if err != nil || got != nil { + t.Fatalf("after InvalidateCache: got (%v, %v); want (nil, nil)", got, err) + } + if calls != 2 { + t.Fatalf("after InvalidateCache: sysProxyFromEnv calls = %d; want 2", calls) + } +} diff --git a/net/tshttpproxy/tshttpproxy_windows.go b/net/tshttpproxy/tshttpproxy_windows.go index 06a1f5ae445d0..c5d58e5035107 100644 --- a/net/tshttpproxy/tshttpproxy_windows.go +++ b/net/tshttpproxy/tshttpproxy_windows.go @@ -31,9 +31,16 @@ func init() { sysAuthHeader = sysAuthHeaderWindows } +// cachedProxy holds the most recent successful WPAD result and the +// InvalidateCache epoch it was discovered under. The timeout branch of +// proxyFromWinHTTPOrCache reads val as a fallback when an in-flight +// probe blows past the per-request deadline, but only when epoch +// matches the current one so we never strand the client on a proxy +// from a previous network. var cachedProxy struct { sync.Mutex - val *url.URL + val *url.URL + epoch uint64 } // proxyErrorf is a rate-limited logger specifically for errors asking @@ -52,12 +59,45 @@ var ( metricErrOther = clientmetric.NewCounter("winhttp_proxy_err_other") ) +// WPAD negative-cache backoffs. +// +// Coder fork: previously these were 10s, which combined with +// InvalidateCache() being called on every netmon link change meant a +// host with no WPAD server (DHCP option 252 unset, no wpad. A +// record) would re-issue a 5-second blocking WinHttpGetProxyForUrl call +// for nearly every outbound HTTP request. Bumping the negative-cache +// duration lets normal traffic flow while still re-trying WPAD on +// every link change (via InvalidateCache, which zeros noProxyUntil). +// +// See tailscale/tailscale#17055 and tailscale/tailscale#10215. +const ( + wpadAutodetectFailedBackoff = 5 * time.Minute + wpadDownloadFailedBackoff = 5 * time.Minute + wpadTimeoutFailedBackoff = 30 * time.Second + wpadUnknownErrorFailedBackoff = 1 * time.Minute + // wpadInvalidParameterFailedBackoff is intentionally long: this + // error is only seen on Windows 8.1 and is not transient. See + // tailscale/tailscale#879. + wpadInvalidParameterFailedBackoff = 1 * time.Hour +) + +// winHTTPLookup performs the underlying WPAD lookup. It is a package +// variable so tests can swap in a fake without exercising winhttp.dll. +var winHTTPLookup = proxyFromWinHTTP + func proxyFromWinHTTPOrCache(req *http.Request) (*url.URL, error) { if req.URL == nil { return nil, nil } urlStr := req.URL.String() + // Snapshot the InvalidateCache epoch before launching the probe. + // If a netmon link change increments the epoch while the probe is + // in flight, we must not commit any state derived from this probe + // (negative-cache backoff, cached-proxy fallback) onto the new + // network — that probe's answer is for the old network only. + startEpoch := CurrentEpoch() + ctx, cancel := context.WithTimeout(req.Context(), 5*time.Second) defer cancel() @@ -67,10 +107,21 @@ func proxyFromWinHTTPOrCache(req *http.Request) (*url.URL, error) { } resc := make(chan result, 1) go func() { - proxy, err := proxyFromWinHTTP(ctx, urlStr) + proxy, err := winHTTPLookup(ctx, urlStr) resc <- result{proxy, err} }() + // setBackoff applies a negative-cache backoff only if the + // InvalidateCache epoch hasn't advanced since the probe started. + // Stale probes from a previous network must not gate fresh + // lookups on the current one. + setBackoff := func(d time.Duration) { + if CurrentEpoch() != startEpoch { + return + } + setNoProxyUntil(d) + } + select { case res := <-resc: err := res.err @@ -82,6 +133,7 @@ func proxyFromWinHTTPOrCache(req *http.Request) (*url.URL, error) { log.Printf("tshttpproxy: winhttp: updating cached proxy setting from %v to %v", was, now) } cachedProxy.val = res.proxy + cachedProxy.epoch = startEpoch return res.proxy, nil } @@ -92,31 +144,47 @@ func proxyFromWinHTTPOrCache(req *http.Request) (*url.URL, error) { ) if err == syscall.Errno(ERROR_WINHTTP_AUTODETECTION_FAILED) { metricErrDetectionFailed.Add(1) - setNoProxyUntil(10 * time.Second) + setBackoff(wpadAutodetectFailedBackoff) return nil, nil } if err == windows.ERROR_INVALID_PARAMETER { metricErrInvalidParameters.Add(1) - // Seen on Windows 8.1. (https://github.com/tailscale/tailscale/issues/879) - // TODO(bradfitz): figure this out. - setNoProxyUntil(time.Hour) + setBackoff(wpadInvalidParameterFailedBackoff) proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): ERROR_INVALID_PARAMETER [unexpected]", urlStr) return nil, nil } proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): %v/%#v", urlStr, err, err) if err == syscall.Errno(ERROR_WINHTTP_UNABLE_TO_DOWNLOAD_SCRIPT) { metricErrDownloadScript.Add(1) - setNoProxyUntil(10 * time.Second) + setBackoff(wpadDownloadFailedBackoff) return nil, nil } metricErrOther.Add(1) - return nil, err + // Coder fork: every error path here returns (nil, nil) so the + // caller dials direct rather than failing the request. See the + // package-level ProxyFromEnvironment doc and the const block + // rationale above. + setBackoff(wpadUnknownErrorFailedBackoff) + return nil, nil case <-ctx.Done(): metricErrTimeout.Add(1) + // Coder fork: prefer the most recent successful WPAD result, + // but only if it was discovered under the current epoch. A + // cached proxy from a previous network would route this + // request through a now-unreachable host. When the cache is + // stale or empty, fall through to direct and apply a short + // backoff so in-flight WinHTTP probes don't pile up. cachedProxy.Lock() - defer cachedProxy.Unlock() - proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): timeout; using cached proxy %v", urlStr, cachedProxy.val) - return cachedProxy.val, nil + fallback := cachedProxy.val + fallbackEpoch := cachedProxy.epoch + cachedProxy.Unlock() + if fallback != nil && fallbackEpoch == startEpoch { + proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): timeout; using cached proxy %v", urlStr, fallback) + return fallback, nil + } + setBackoff(wpadTimeoutFailedBackoff) + proxyErrorf("tshttpproxy: winhttp: GetProxyForURL(%q): timeout; falling back to direct", urlStr) + return nil, nil } } diff --git a/net/tshttpproxy/tshttpproxy_windows_test.go b/net/tshttpproxy/tshttpproxy_windows_test.go new file mode 100644 index 0000000000000..48e3b9674586f --- /dev/null +++ b/net/tshttpproxy/tshttpproxy_windows_test.go @@ -0,0 +1,312 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package tshttpproxy + +import ( + "context" + "net/http" + "net/url" + "sync" + "syscall" + "testing" + "time" + + "tailscale.com/util/must" +) + +// withCleanState resets package globals shared across these Windows +// tests and registers cleanup to put them back when the test exits. +func withCleanState(t *testing.T) { + t.Helper() + + mu.Lock() + prevNoProxyUntil := noProxyUntil + prevEpoch := epoch + noProxyUntil = time.Time{} + mu.Unlock() + + cachedProxy.Lock() + prevCachedVal := cachedProxy.val + prevCachedEpoch := cachedProxy.epoch + cachedProxy.val = nil + cachedProxy.epoch = 0 + cachedProxy.Unlock() + + prevLookup := winHTTPLookup + t.Cleanup(func() { + winHTTPLookup = prevLookup + mu.Lock() + noProxyUntil = prevNoProxyUntil + epoch = prevEpoch + mu.Unlock() + cachedProxy.Lock() + cachedProxy.val = prevCachedVal + cachedProxy.epoch = prevCachedEpoch + cachedProxy.Unlock() + }) +} + +func mustParseURL(t *testing.T, s string) *url.URL { + t.Helper() + return must.Get(url.Parse(s)) +} + +func newReq(t *testing.T) *http.Request { + t.Helper() + return &http.Request{URL: mustParseURL(t, "https://example.com/")} +} + +// TestProxyFromWinHTTPOrCache_AutodetectFailed verifies that the +// "WPAD not configured" error path returns no proxy and applies the +// long autodetect backoff. This is the core "host has no WPAD" case. +func TestProxyFromWinHTTPOrCache_AutodetectFailed(t *testing.T) { + withCleanState(t) + + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + // 12180 = ERROR_WINHTTP_AUTODETECTION_FAILED + return nil, syscall.Errno(12180) + } + + got, err := proxyFromWinHTTPOrCache(newReq(t)) + if err != nil { + t.Fatalf("got err %v; want nil", err) + } + if got != nil { + t.Fatalf("got proxy %v; want nil", got) + } + + mu.Lock() + until := noProxyUntil + mu.Unlock() + if d := time.Until(until); d < 4*time.Minute || d > 6*time.Minute { + t.Fatalf("noProxyUntil set to ~%v from now; want ~%v", d, wpadAutodetectFailedBackoff) + } +} + +// TestProxyFromWinHTTPOrCache_UnknownError verifies that an unmapped +// WinHTTP error doesn't propagate to the caller (would fail the +// outbound HTTP request) and applies the shorter unknown-error backoff. +func TestProxyFromWinHTTPOrCache_UnknownError(t *testing.T) { + withCleanState(t) + + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + return nil, syscall.Errno(99999) // not one we map + } + + got, err := proxyFromWinHTTPOrCache(newReq(t)) + if err != nil { + t.Fatalf("got err %v; want nil so caller dials direct", err) + } + if got != nil { + t.Fatalf("got proxy %v; want nil", got) + } + + mu.Lock() + until := noProxyUntil + mu.Unlock() + if d := time.Until(until); d < 30*time.Second || d > 90*time.Second { + t.Fatalf("noProxyUntil set to ~%v from now; want ~%v", d, wpadUnknownErrorFailedBackoff) + } +} + +// TestProxyFromWinHTTPOrCache_TimeoutNoCached verifies that on probe +// timeout with no cached proxy, the function returns no proxy and +// applies the timeout backoff — i.e. dials direct rather than failing +// the request. +func TestProxyFromWinHTTPOrCache_TimeoutNoCached(t *testing.T) { + withCleanState(t) + + // Block until the parent's 5s context fires. We trim the test by + // shortening that wait via context.WithTimeout in the parent — we + // can't override here, so we just block long enough that the + // parent gives up first. + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + <-ctx.Done() + return nil, ctx.Err() + } + + // We need the request's context to be cancelled before the + // internal 5s timeout to keep this test fast. Use a request + // context with a 50ms deadline so ctx.Done() fires quickly. + reqCtx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + req := (&http.Request{URL: mustParseURL(t, "https://example.com/")}).WithContext(reqCtx) + + got, err := proxyFromWinHTTPOrCache(req) + if err != nil { + t.Fatalf("got err %v; want nil", err) + } + if got != nil { + t.Fatalf("got proxy %v; want nil (no cached fallback under fresh epoch)", got) + } + + mu.Lock() + until := noProxyUntil + mu.Unlock() + if d := time.Until(until); d < 10*time.Second || d > 60*time.Second { + t.Fatalf("noProxyUntil set to ~%v from now; want ~%v", d, wpadTimeoutFailedBackoff) + } +} + +// TestProxyFromWinHTTPOrCache_TimeoutFreshCached verifies that on +// probe timeout, a cached proxy from the *current* epoch is returned +// as a fallback. This protects users on networks that legitimately +// require a proxy when WinHTTP transiently exceeds the per-request +// deadline. +func TestProxyFromWinHTTPOrCache_TimeoutFreshCached(t *testing.T) { + withCleanState(t) + + cached := mustParseURL(t, "http://corp-proxy.example:3128") + cachedProxy.Lock() + cachedProxy.val = cached + cachedProxy.epoch = CurrentEpoch() + cachedProxy.Unlock() + + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + <-ctx.Done() + return nil, ctx.Err() + } + + reqCtx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + req := (&http.Request{URL: mustParseURL(t, "https://example.com/")}).WithContext(reqCtx) + + got, err := proxyFromWinHTTPOrCache(req) + if err != nil { + t.Fatalf("got err %v; want nil", err) + } + if got == nil || got.String() != cached.String() { + t.Fatalf("got %v; want cached fallback %v", got, cached) + } +} + +// TestProxyFromWinHTTPOrCache_TimeoutStaleCached verifies that a +// cached proxy from a *previous* epoch (a network we have since +// roamed off of) is NOT returned on timeout, so we never strand the +// client on an old corp proxy after a link change. +func TestProxyFromWinHTTPOrCache_TimeoutStaleCached(t *testing.T) { + withCleanState(t) + + staleProxy := mustParseURL(t, "http://old-corp-proxy.example:3128") + cachedProxy.Lock() + cachedProxy.val = staleProxy + cachedProxy.epoch = CurrentEpoch() // current... but we'll bump + cachedProxy.Unlock() + + // Simulate a link change: the cached proxy's epoch is now stale. + InvalidateCache() + + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + <-ctx.Done() + return nil, ctx.Err() + } + + reqCtx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + req := (&http.Request{URL: mustParseURL(t, "https://example.com/")}).WithContext(reqCtx) + + got, err := proxyFromWinHTTPOrCache(req) + if err != nil { + t.Fatalf("got err %v; want nil", err) + } + if got != nil { + t.Fatalf("got %v; want nil (stale cached proxy must not be returned after epoch advance)", got) + } +} + +// TestProxyFromWinHTTPOrCache_EpochStompProtection is the regression +// test for the deep-review P1: an in-flight WPAD probe must not stomp +// noProxyUntil after InvalidateCache() has advanced the epoch. Pre-fix, +// a probe issued on the previous network could return ~5s later with +// ERROR_WINHTTP_AUTODETECTION_FAILED and lock out WPAD on the new +// network for 5 full minutes. +func TestProxyFromWinHTTPOrCache_EpochStompProtection(t *testing.T) { + withCleanState(t) + + // Coordinate the probe: it blocks until we release it, then + // returns AUTODETECTION_FAILED as if it ran on the old network. + release := make(chan struct{}) + var lookupStarted sync.WaitGroup + lookupStarted.Add(1) + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + lookupStarted.Done() + <-release + return nil, syscall.Errno(12180) // ERROR_WINHTTP_AUTODETECTION_FAILED + } + + // Spawn the probe on a goroutine. proxyFromWinHTTPOrCache will + // wait on the probe (ctx timeout 5s). + type result struct { + u *url.URL + err error + } + resc := make(chan result, 1) + go func() { + u, err := proxyFromWinHTTPOrCache(newReq(t)) + resc <- result{u, err} + }() + + // Wait for the probe to start (so startEpoch has been captured). + lookupStarted.Wait() + + // Simulate a link change: bump the epoch. + InvalidateCache() + + // Now release the probe. It will return an autodetect-failed + // error from the OLD network. setBackoff must be a no-op because + // the epoch has advanced. + close(release) + + res := <-resc + if res.err != nil { + t.Fatalf("got err %v; want nil", res.err) + } + if res.u != nil { + t.Fatalf("got proxy %v; want nil", res.u) + } + + // Crucial assertion: noProxyUntil must NOT be in the future. If + // the stale probe stomped it, every request on the new network + // would skip WPAD for 5 minutes. + mu.Lock() + until := noProxyUntil + mu.Unlock() + if !until.IsZero() && time.Until(until) > 0 { + t.Fatalf("stale probe stomped noProxyUntil = %v (in %v); should be zero or past", + until, time.Until(until)) + } +} + +// TestProxyFromWinHTTPOrCache_Success verifies the happy path: a +// successful WPAD lookup returns the proxy and stamps the cache with +// the current epoch. +func TestProxyFromWinHTTPOrCache_Success(t *testing.T) { + withCleanState(t) + + want := mustParseURL(t, "http://corp-proxy.example:3128") + winHTTPLookup = func(ctx context.Context, urlStr string) (*url.URL, error) { + return want, nil + } + + got, err := proxyFromWinHTTPOrCache(newReq(t)) + if err != nil { + t.Fatalf("got err %v; want nil", err) + } + if got == nil || got.String() != want.String() { + t.Fatalf("got %v; want %v", got, want) + } + + cachedProxy.Lock() + cv := cachedProxy.val + ce := cachedProxy.epoch + cachedProxy.Unlock() + if cv == nil || cv.String() != want.String() { + t.Fatalf("cachedProxy.val = %v; want %v", cv, want) + } + if ce != CurrentEpoch() { + t.Fatalf("cachedProxy.epoch = %d; want current epoch %d", ce, CurrentEpoch()) + } +} + +