Skip to content

Commit

Permalink
Merge #44022
Browse files Browse the repository at this point in the history
44022: server,sql,pgwire: let client conns timeout on unavailable clusters r=ajwerner a=knz

Fixes #44012.
Informs #43974 and #30887.

Release note (general change): CockroachDB will now report a timeout
error when a client attempts to connect via SQL or the web UI and some
system ranges are unavailable. The previous behavior was to wait
indefinitely. The timeout is configurable via the cluster setting
`server.user_login.timeout` and is set to 10 seconds by default.  The
value 0 means "indefinitely" can can be used to restore the pre-20.1
behavior.

Co-authored-by: Raphael 'kena' Poss <knz@thaumogen.net>
  • Loading branch information
craig[bot] and knz committed Jan 17, 2020
2 parents f745367 + 04fb169 commit 13af75f
Show file tree
Hide file tree
Showing 3 changed files with 296 additions and 12 deletions.
1 change: 1 addition & 0 deletions docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
<tr><td><code>server.shutdown.drain_wait</code></td><td>duration</td><td><code>0s</code></td><td>the amount of time a server waits in an unready state before proceeding with the rest of the shutdown process</td></tr>
<tr><td><code>server.shutdown.query_wait</code></td><td>duration</td><td><code>10s</code></td><td>the server will wait for at least this amount of time for active queries to finish</td></tr>
<tr><td><code>server.time_until_store_dead</code></td><td>duration</td><td><code>5m0s</code></td><td>the time after which if there is no new gossiped information about a store, it is considered dead</td></tr>
<tr><td><code>server.user_login.timeout</code></td><td>duration</td><td><code>10s</code></td><td>timeout after which client authentication times out if some system range is unavailable (0 = no timeout)</td></tr>
<tr><td><code>server.web_session_timeout</code></td><td>duration</td><td><code>168h0m0s</code></td><td>the duration that a newly created web session will be valid</td></tr>
<tr><td><code>sql.defaults.default_int_size</code></td><td>integer</td><td><code>8</code></td><td>the size, in bytes, of an INT type</td></tr>
<tr><td><code>sql.defaults.results_buffer.size</code></td><td>byte size</td><td><code>16 KiB</code></td><td>default size of the buffer that accumulates results for a statement or a batch of statements before they are sent to the client. This can be overridden on an individual connection with the 'results_buffer_size' parameter. Note that auto-retries generally only happen while no results have been delivered to the client, so reducing this size can increase the number of retriable errors a client receives. On the other hand, increasing the buffer size can increase the delay until the client receives the first result row. Updating the setting only affects new connections. Setting to 0 disables any buffering.</td></tr>
Expand Down
80 changes: 68 additions & 12 deletions pkg/sql/user.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,38 +12,94 @@ package sql

import (
"context"
"time"

"github.com/cockroachdb/cockroach/pkg/security"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
)

// GetUserHashedPassword returns the hashedPassword for the given username if
// found in system.users.
//
// The function is tolerant of unavailable clusters (or unavailable
// system.user) as follows:
//
// - if the user is root and system.users is unavailable, the function
// reports that the user exists but has no password. (Reminder: "no
// password" means that they cannot use password authentication and
// thus must use certificates instead, or an existing web session
// cookie when using the web UI.)
//
// This special case exists so that root can still log in into an
// unavailable cluster. This is currently required for proper
// function of `cockroach node status` which uses SQL.
// Ideally we would not need SQL to enquire the status of
// clusters and this special case would not need to exist.
//
// - if the user is another user than root, then the function fails
// after a timeout instead of blocking. The timeout is configurable
// via a cluster setting.
//
func GetUserHashedPassword(
ctx context.Context, ie *InternalExecutor, metrics *MemoryMetrics, username string,
) (bool, []byte, error) {
) (exists bool, hashedPassword []byte, err error) {
normalizedUsername := tree.Name(username).Normalize()
isRoot := normalizedUsername == security.RootUser
// Always return no password for the root user, even if someone manually inserts one.
if normalizedUsername == security.RootUser {
if isRoot {
return true, nil, nil
}

const getHashedPassword = `SELECT "hashedPassword" FROM system.users ` +
`WHERE username=$1 AND "isRole" = false`
values, err := ie.QueryRow(
ctx, "get-hashed-pwd", nil /* txn */, getHashedPassword, normalizedUsername)
if err != nil {
return false, nil, errors.Wrapf(err, "error looking up user %s", normalizedUsername)
// We may be operating with a timeout.
timeout := userLoginTimeout.Get(&ie.s.cfg.Settings.SV)
runFn := func(fn func(ctx context.Context) error) error { return fn(ctx) }
if timeout != 0 {
runFn = func(fn func(ctx context.Context) error) error {
return contextutil.RunWithTimeout(ctx, "get-hashed-pwd-timeout", timeout, fn)
}
}
if values == nil {
return false, nil, nil

// Perform the lookup with a timeout.
err = runFn(func(ctx context.Context) error {
const getHashedPassword = `SELECT "hashedPassword" FROM system.users ` +
`WHERE username=$1 AND "isRole" = false`
values, err := ie.QueryRow(
ctx, "get-hashed-pwd", nil /* txn */, getHashedPassword, normalizedUsername)
if err != nil {
return errors.Wrapf(err, "error looking up user %s", normalizedUsername)
}
if values != nil {
exists = true
hashedPassword = []byte(*(values[0].(*tree.DBytes)))
}
return nil
})

if errors.Is(err, context.DeadlineExceeded) {
// Failed to retrieve the user account.
log.Warningf(ctx, "user lookup for %q failed: %v", username, err)
// As a special case, if root is logging in we know the user
// account exists; we just report it has no password so that it
// can only log in using certs or GSS.
if isRoot {
return true, nil, nil
}
err = errors.HandledWithMessage(err, "timeout while retrieving user account")
}
hashedPassword := []byte(*(values[0].(*tree.DBytes)))
return true, hashedPassword, nil
return exists, hashedPassword, err
}

var userLoginTimeout = settings.RegisterPublicNonNegativeDurationSetting(
"server.user_login.timeout",
"timeout after which client authentication times out if some system range is unavailable (0 = no timeout)",
10*time.Second,
)

// The map value is true if the map key is a role, false if it is a user.
func (p *planner) GetAllUsersAndRoles(ctx context.Context) (map[string]bool, error) {
query := `SELECT username,"isRole" FROM system.users`
Expand Down
227 changes: 227 additions & 0 deletions pkg/sql/user_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sql_test

import (
"context"
"errors"
"fmt"
"net/url"
"os"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/security"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/sql"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/jackc/pgx"
"github.com/jackc/pgx/pgtype"
)

// TestGetUserHashedPasswordTimeout verifies that user login attempts
// fail with a suitable timeout when some system range(s) are
// unavailable.
//
// To achieve this it creates a 2-node cluster, moves all ranges
// from node 1 to node 2, then stops node 2, then attempts
// to connect to node 1.
func TestGetUserHashedPasswordTimeout(t *testing.T) {
defer leaktest.AfterTest(t)()

if testing.Short() {
t.Skip("short flag")
}

ctx := context.Background()

// Two nodes, two localities. We need the localities to pin the
// ranges to a particular node velow.
tc := testcluster.StartTestCluster(t, 2,
base.TestClusterArgs{
ServerArgsPerNode: map[int]base.TestServerArgs{
0: {Locality: roachpb.Locality{Tiers: []roachpb.Tier{{Key: "region", Value: "r1"}}}},
1: {Locality: roachpb.Locality{Tiers: []roachpb.Tier{{Key: "region", Value: "r2"}}}},
},
})
defer tc.Stopper().Stop(ctx)

// Note: we are using fmt.Fprintln here and below instead of t.Logf
// so as to ensure that these messages are properly interleaved with
// the server logs in case of failure.
fmt.Fprintln(os.Stderr, "-- moving system ranges to 2nd node --")

// Disable replication (num replicas = 1) and move all ranges to r2.
if err := tc.Server(0).(*server.TestServer).Server.RunLocalSQL(ctx,
func(ctx context.Context, ie *sql.InternalExecutor) error {
rows, err := ie.Query(ctx, "get-zones", nil,
"SELECT target FROM crdb_internal.zones")
if err != nil {
return err
}
for _, row := range rows {
zone := string(*row[0].(*tree.DString))
if _, err := ie.Exec(ctx, "set-zone", nil,
fmt.Sprintf("ALTER %s CONFIGURE ZONE USING num_replicas = 1, constraints = '[+region=r2]'", zone)); err != nil {
return err
}
}
return nil
}); err != nil {
t.Fatal(err)
}

// Wait for ranges to move to node 2.
testutils.SucceedsSoon(t, func() error {
// Kick the replication queues, to speed up the rebalancing process.
for i := 0; i < tc.NumServers(); i++ {
if err := tc.Server(i).GetStores().(*storage.Stores).VisitStores(func(s *storage.Store) error {
return s.ForceReplicationScanAndProcess()
}); err != nil {
t.Fatal(err)
}
}
row := tc.ServerConn(0).QueryRow(`SELECT min(unnest(replicas)) FROM crdb_internal.ranges`)
var nodeID int
if err := row.Scan(&nodeID); err != nil {
t.Fatal(err)
}
if nodeID != 2 {
return errors.New("not rebalanced, still waiting")
}
return nil
})

// Make a user that must use a password to authenticate.
// Default privileges on defaultdb are needed to run simple queries.
if _, err := tc.ServerConn(0).Exec(`
CREATE USER foo WITH PASSWORD 'testabc';
GRANT ALL ON DATABASE defaultdb TO foo`); err != nil {
t.Fatal(err)
}

// We'll attempt connections on gateway node 0.
userURL, cleanupFn := sqlutils.PGUrlWithOptionalClientCerts(t,
tc.Server(0).ServingSQLAddr(), t.Name(), url.UserPassword("foo", "testabc"), false /* withClientCerts */)
defer cleanupFn()
rootURL, rootCleanupFn := sqlutils.PGUrl(t,
tc.Server(0).ServingSQLAddr(), t.Name(), url.User(security.RootUser))
defer rootCleanupFn()

// Override the timeout built into pgx so we are only subject to
// what the server thinks.
userURL.RawQuery += "&connect_timeout=0"
rootURL.RawQuery += "&connect_timeout=0"

fmt.Fprintln(os.Stderr, "-- sanity checks --")

// We use a closure here and below to ensure the defers are run
// before the rest of the test.

func() {
// Sanity check: verify that secure mode is enabled: password is
// required. If this part fails, this means the test cluster is
// not properly configured, and the remainder of the test below
// would report false positives.
unauthURL := userURL
unauthURL.User = url.User("foo")
dbSQL, err := pgxConn(t, unauthURL)
if err == nil {
defer func() { _ = dbSQL.Close() }()
}
if !testutils.IsError(err, "password authentication failed for user foo") {
t.Fatalf("expected password error, got %v", err)
}
}()

func() {
// Sanity check: verify that the new user is able to log in with password.
dbSQL, err := pgxConn(t, userURL)
if err != nil {
t.Fatal(err)
}
defer func() { _ = dbSQL.Close() }()
row := dbSQL.QueryRow("SELECT current_user")
var username string
if err := row.Scan(&username); err != nil {
t.Fatal(err)
}
if username != "foo" {
t.Fatalf("invalid username: expected foo, got %q", username)
}
}()

// Configure the login timeout to just 1s.
if _, err := tc.ServerConn(0).Exec(`SET CLUSTER SETTING server.user_login.timeout = '1s'`); err != nil {
t.Fatal(err)
}

fmt.Fprintln(os.Stderr, "-- shutting down node 2 --")

// This is intended to make the system ranges unavailable.
tc.StopServer(1)

fmt.Fprintln(os.Stderr, "-- expect timeout --")

func() {
// Now attempt to connect again. We're expecting a timeout within 5 seconds.
start := timeutil.Now()
dbSQL, err := pgxConn(t, userURL)
if err == nil {
defer func() { _ = dbSQL.Close() }()
}
if !testutils.IsError(err, "timeout while retrieving user account") {
t.Fatalf("expected timeout error during connection, got %v", err)
}
timeoutDur := timeutil.Now().Sub(start)
if timeoutDur > 5*time.Second {
t.Fatalf("timeout lasted for more than 5 second (%s)", timeoutDur)
}
}()

fmt.Fprintln(os.Stderr, "-- no timeout for root --")

func() {
dbSQL, err := pgxConn(t, rootURL)
if err != nil {
t.Fatal(err)
}
defer func() { _ = dbSQL.Close() }()
// A simple query must work for 'root' even without a system range available.
if _, err := dbSQL.Exec("SELECT 1"); err != nil {
t.Fatal(err)
}
}()

}

func pgxConn(t *testing.T, connURL url.URL) (*pgx.Conn, error) {
pgxConfig, err := pgx.ParseConnectionString(connURL.String())
if err != nil {
t.Fatal(err)
}

// Override the conninfo to avoid a bunch of pg_catalog
// queries when the connection is being set up.
pgxConfig.CustomConnInfo = func(c *pgx.Conn) (*pgtype.ConnInfo, error) {
return c.ConnInfo, nil
}

return pgx.Connect(pgxConfig)
}

0 comments on commit 13af75f

Please sign in to comment.