From 43d55ab0eeb0b342967ad6f88cf653a5b0fa948e Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Mon, 29 Aug 2016 17:55:47 -0400 Subject: [PATCH 1/9] storage: test command queue context cancellation Closes #7634. --- storage/replica_test.go | 85 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/storage/replica_test.go b/storage/replica_test.go index 571436c7cec..ee1b51533a5 100644 --- a/storage/replica_test.go +++ b/storage/replica_test.go @@ -1934,6 +1934,91 @@ func TestReplicaCommandQueueInconsistent(t *testing.T) { // Success. } +// TestReplicaCommandQueueCancellation verifies that commands which are +// waiting on the command queue do not execute if their context is cancelled. +func TestReplicaCommandQueueCancellation(t *testing.T) { + defer leaktest.AfterTest(t)() + // Intercept commands with matching command IDs and block them. + blockingStart := make(chan struct{}) + blockingDone := make(chan struct{}) + + tc := testContext{} + tsc := TestStoreContext() + tsc.TestingKnobs.TestingCommandFilter = + func(filterArgs storagebase.FilterArgs) *roachpb.Error { + if filterArgs.Hdr.UserPriority == 42 { + blockingStart <- struct{}{} + <-blockingDone + } + return nil + } + tc.StartWithStoreContext(t, tsc) + defer tc.Stop() + + defer close(blockingDone) // make sure teardown can happen + + startBlockingCmd := func(ctx context.Context, keys ...roachpb.Key) <-chan *roachpb.Error { + done := make(chan *roachpb.Error) + + if err := tc.stopper.RunAsyncTask(func() { + ba := roachpb.BatchRequest{ + Header: roachpb.Header{ + UserPriority: 42, + }, + } + for _, key := range keys { + args := putArgs(key, nil) + ba.Add(&args) + } + + _, pErr := tc.Sender().Send(ctx, ba) + done <- pErr + }); err != nil { + t.Fatal(err) + } + + return done + } + + key1 := roachpb.Key("one") + key2 := roachpb.Key("two") + + // Wait until the command queue is blocked. + cmd1Done := startBlockingCmd(context.Background(), key1) + <-blockingStart + + // Put a cancelled blocking command in the command queue. This command will + // block on the previous command, but will not itself reach the filter since + // its context is cancelled. + ctx, cancel := context.WithCancel(context.Background()) + cancel() + cmd2Done := startBlockingCmd(ctx, key1, key2) + + // Wait until both commands are in the command queue. + util.SucceedsSoon(t, func() error { + tc.rng.mu.Lock() + chans := tc.rng.mu.cmdQ.getWait(false, roachpb.Span{Key: key1}, roachpb.Span{Key: key2}) + tc.rng.mu.Unlock() + if a, e := len(chans), 2; a < e { + return errors.Errorf("%d of %d commands in the command queue", a, e) + } + return nil + }) + + // Finish the previous command. + blockingDone <- struct{}{} + if pErr := <-cmd1Done; pErr != nil { + t.Fatal(pErr) + } + + // If this deadlocks, the command has unexpectedly begun executing and was + // trapped in the command filter. Indeed, the absence of such a deadlock is + // what's being tested here. + if pErr := <-cmd2Done; !testutils.IsPError(pErr, context.Canceled.Error()) { + t.Fatal(pErr) + } +} + func SendWrapped(sender client.Sender, ctx context.Context, header roachpb.Header, args roachpb.Request) (roachpb.Response, roachpb.BatchResponse_Header, *roachpb.Error) { var ba roachpb.BatchRequest ba.Add(args) From 0090eeb144c7a49bbfa88ab95e86097536955e69 Mon Sep 17 00:00:00 2001 From: Ben Darnell Date: Tue, 30 Aug 2016 09:56:50 +0800 Subject: [PATCH 2/9] scripts: s/slave/worker/g --- scripts/{gceslave.sh => gceworker.sh} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename scripts/{gceslave.sh => gceworker.sh} (95%) diff --git a/scripts/gceslave.sh b/scripts/gceworker.sh similarity index 95% rename from scripts/gceslave.sh rename to scripts/gceworker.sh index 5c96f34375b..3ba5ad739da 100755 --- a/scripts/gceslave.sh +++ b/scripts/gceworker.sh @@ -3,9 +3,9 @@ set -euxo pipefail export CLOUDSDK_CORE_PROJECT=${CLOUDSDK_CORE_PROJECT-${GOOGLE_PROJECT-cockroach-$(id -un)}} -export CLOUDSDK_COMPUTE_ZONE=${GCESLAVE_ZONE-${CLOUDSDK_COMPUTE_ZONE-us-east1-b}} +export CLOUDSDK_COMPUTE_ZONE=${GCEWORKER_ZONE-${CLOUDSDK_COMPUTE_ZONE-us-east1-b}} -name=${GCESLAVE_NAME-gceslave} +name=${GCEWORKER_NAME-gceworker} cd "$(dirname "${0}")" From c3a32b1791ca942a4a960879851c13ea86c4fe9f Mon Sep 17 00:00:00 2001 From: Ben Darnell Date: Tue, 30 Aug 2016 10:10:32 +0800 Subject: [PATCH 3/9] build: Update etcd dependency Notable changes include coreos/etcd#6286 (which fixes the issue that was preventing us from upgrading sooner), and coreos/etcd#5809 (which fixes a possible cause of failed leader transfers discussed in #8834) Fixes #8017 --- GLOCKFILE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GLOCKFILE b/GLOCKFILE index eba28beede7..5baa71fb3cc 100644 --- a/GLOCKFILE +++ b/GLOCKFILE @@ -42,7 +42,7 @@ github.com/cockroachdb/cockroach-go 2e4a60d41697eebb308b1def89f0abaf1c056137 github.com/cockroachdb/pq 40c6b2414c76cdb84aacc955f79dc844e48ad0c0 github.com/cockroachdb/stress 029c9348806514969d1109a6ae36e521af411ca7 github.com/codahale/hdrhistogram f8ad88b59a584afeee9d334eff879b104439117b -github.com/coreos/etcd fb64c8ccfeb9408b80c3f657a52ec6cad2fdb3f8 +github.com/coreos/etcd 48f4a7d037ca8fd276fff09ef068074193b24dfa github.com/cpuguy83/go-md2man 2724a9c9051aa62e9cca11304e7dd518e9e41599 github.com/davecgh/go-spew 5215b55f46b2b919f50a1df0eaa5886afe4e3b3d github.com/docker/distribution c9fd26e9efe2c7405d7072ad181844275977b5e3 From bac0bb60458096f4aeececbdeeecd417ea207acc Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Mon, 29 Aug 2016 16:26:29 -0400 Subject: [PATCH 4/9] roachpb: shallow copy `BatchRequest.Txn` when mutating Fixes a race detected in `server.TestIntentResolution`. --- roachpb/batch.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/roachpb/batch.go b/roachpb/batch.go index f4b393058dc..ee0459b5825 100644 --- a/roachpb/batch.go +++ b/roachpb/batch.go @@ -590,8 +590,9 @@ func (*BatchRequest) GetUser() string { // Store, a sequence counter less than or equal to the last observed one incurs // a transaction restart (if the request is transactional). func (ba *BatchRequest) SetNewRequest() { - if ba.Txn == nil { - return + if ba.Txn != nil { + txn := *ba.Txn + txn.Sequence++ + ba.Txn = &txn } - ba.Txn.Sequence++ } From c6ced2cfcd4e2201a30f74f8a897506e0fdcf499 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Tue, 30 Aug 2016 00:12:53 -0400 Subject: [PATCH 5/9] scripts: allow varying Go version Supports only 1.6 and 1.7. Multi-version support was easy enough and is going to be useful when - playing with Backtrace right now (no 1.7 support) - comparing Go versions (think benchmarking) Note that we lint against using all but one Go version at a time, but that check is easy enough to circumvent. Also reduced the memory assigned to 32GiB (from 64GiB), added `set -x` to the bootstrap script and added the Go version to the default machine name (easy enough to preserve the current behavior with export GCEWORKER_NAME=gceworker). --- scripts/bootstrap-debian.sh | 6 +- scripts/gceworker.sh | 7 +- scripts/parallelbuilds-go1.6.patch | 125 +++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 scripts/parallelbuilds-go1.6.patch diff --git a/scripts/bootstrap-debian.sh b/scripts/bootstrap-debian.sh index 92860c716e4..0864541e53b 100755 --- a/scripts/bootstrap-debian.sh +++ b/scripts/bootstrap-debian.sh @@ -3,9 +3,9 @@ # On a (recent enough) Debian/Ubuntu system, bootstraps a source Go install # (with improved parallel build patches) and the cockroach repo. -set -euo pipefail +set -euxo pipefail -GOVERSION="1.7" +GOVERSION="${GOVERSION-1.7}" cd "$(dirname "${0}")" @@ -15,7 +15,7 @@ mkdir -p ~/go-bootstrap curl "https://storage.googleapis.com/golang/go${GOVERSION}.linux-amd64.tar.gz" | tar -C ~/go-bootstrap -xvz --strip=1 curl "https://storage.googleapis.com/golang/go${GOVERSION}.src.tar.gz" | tar -C ~ -xvz -patch -p1 -d ../go < parallelbuilds-go1.7.patch +patch -p1 -d ../go < "parallelbuilds-go${GOVERSION}.patch" (cd ~/go/src && GOROOT_BOOTSTRAP=~/go-bootstrap ./make.bash) diff --git a/scripts/gceworker.sh b/scripts/gceworker.sh index 3ba5ad739da..eff09b448ba 100755 --- a/scripts/gceworker.sh +++ b/scripts/gceworker.sh @@ -4,8 +4,9 @@ set -euxo pipefail export CLOUDSDK_CORE_PROJECT=${CLOUDSDK_CORE_PROJECT-${GOOGLE_PROJECT-cockroach-$(id -un)}} export CLOUDSDK_COMPUTE_ZONE=${GCEWORKER_ZONE-${CLOUDSDK_COMPUTE_ZONE-us-east1-b}} +GOVERSION=${GOVERSION-1.7} -name=${GCEWORKER_NAME-gceworker} +name=${GCEWORKER_NAME-gceworker$(echo "${GOVERSION}" | tr -d '.')} cd "$(dirname "${0}")" @@ -13,7 +14,7 @@ case ${1-} in create) gcloud compute instances \ create "${name}" \ - --machine-type "custom-32-65536" \ + --machine-type "custom-32-32768" \ --network "default" \ --maintenance-policy "MIGRATE" \ --image "/debian-cloud/debian-8-jessie-v20160803" \ @@ -23,7 +24,7 @@ case ${1-} in sleep 20 # avoid SSH timeout on copy-files gcloud compute copy-files . "${name}:scripts" - gcloud compute ssh "${name}" ./scripts/bootstrap-debian.sh + gcloud compute ssh "${name}" "GOVERSION=${GOVERSION} ./scripts/bootstrap-debian.sh" # Install automatic shutdown after ten minutes of operation without a # logged in user. To disable this, `sudo touch /.active`. # This is much more intricate than it looks. A few complications which diff --git a/scripts/parallelbuilds-go1.6.patch b/scripts/parallelbuilds-go1.6.patch new file mode 100644 index 00000000000..8c9018e848c --- /dev/null +++ b/scripts/parallelbuilds-go1.6.patch @@ -0,0 +1,125 @@ +diff --git a/src/cmd/go/build.go b/src/cmd/go/build.go +index f2a2a60..53c962c 100644 +--- a/src/cmd/go/build.go ++++ b/src/cmd/go/build.go +@@ -694,6 +694,8 @@ type builder struct { + exec sync.Mutex + readySema chan bool + ready actionQueue ++ ++ tasks chan func() + } + + // An action represents a single action in the action graph. +@@ -1236,6 +1238,7 @@ func (b *builder) do(root *action) { + } + + b.readySema = make(chan bool, len(all)) ++ b.tasks = make(chan func(), buildP) + + // Initialize per-action execution state. + for _, a := range all { +@@ -1312,6 +1315,8 @@ func (b *builder) do(root *action) { + a := b.ready.pop() + b.exec.Unlock() + handle(a) ++ case task := <-b.tasks: ++ task() + case <-interrupted: + setExitStatus(1) + return +@@ -3141,12 +3146,16 @@ func (b *builder) cgo(p *Package, cgoExe, obj string, pcCFLAGS, pcLDFLAGS, cgofi + staticLibs = []string{"-Wl,--start-group", "-lmingwex", "-lmingw32", "-Wl,--end-group"} + } + ++ var tasks []func() ++ var results chan error ++ + cflags := stringList(cgoCPPFLAGS, cgoCFLAGS) + for _, cfile := range cfiles { ++ cfile := cfile + ofile := obj + cfile[:len(cfile)-1] + "o" +- if err := b.gcc(p, ofile, cflags, obj+cfile); err != nil { +- return nil, nil, err +- } ++ tasks = append(tasks, func() { ++ results <- b.gcc(p, ofile, cflags, obj+cfile) ++ }) + linkobj = append(linkobj, ofile) + if !strings.HasSuffix(ofile, "_cgo_main.o") { + outObj = append(outObj, ofile) +@@ -3154,35 +3163,65 @@ func (b *builder) cgo(p *Package, cgoExe, obj string, pcCFLAGS, pcLDFLAGS, cgofi + } + + for _, file := range gccfiles { ++ file := file + ofile := obj + cgoRe.ReplaceAllString(file[:len(file)-1], "_") + "o" +- if err := b.gcc(p, ofile, cflags, file); err != nil { +- return nil, nil, err +- } ++ tasks = append(tasks, func() { ++ results <- b.gcc(p, ofile, cflags, file) ++ }) + linkobj = append(linkobj, ofile) + outObj = append(outObj, ofile) + } + + cxxflags := stringList(cgoCPPFLAGS, cgoCXXFLAGS) + for _, file := range gxxfiles { ++ file := file + // Append .o to the file, just in case the pkg has file.c and file.cpp + ofile := obj + cgoRe.ReplaceAllString(file, "_") + ".o" +- if err := b.gxx(p, ofile, cxxflags, file); err != nil { +- return nil, nil, err +- } ++ tasks = append(tasks, func() { ++ results <- b.gxx(p, ofile, cxxflags, file) ++ }) + linkobj = append(linkobj, ofile) + outObj = append(outObj, ofile) + } + + for _, file := range mfiles { ++ file := file + // Append .o to the file, just in case the pkg has file.c and file.m + ofile := obj + cgoRe.ReplaceAllString(file, "_") + ".o" +- if err := b.gcc(p, ofile, cflags, file); err != nil { +- return nil, nil, err +- } ++ tasks = append(tasks, func() { ++ results <- b.gcc(p, ofile, cflags, file) ++ }) + linkobj = append(linkobj, ofile) + outObj = append(outObj, ofile) + } + ++ // Give the results channel enough capacity so that sending the ++ // result is guaranteed not to block. ++ results = make(chan error, len(tasks)) ++ ++ // Feed the tasks into the b.tasks channel on a separate goroutine ++ // because the b.tasks channel's limited capacity might cause ++ // sending the task to block. ++ go func() { ++ for _, task := range tasks { ++ b.tasks <- task ++ } ++ }() ++ ++ // Loop until we've received results from all of our tasks or an ++ // error occurs. ++ for count := 0; count < len(tasks); { ++ select { ++ case err := <-results: ++ if err != nil { ++ return nil, nil, err ++ } ++ count++ ++ case task := <-b.tasks: ++ task() ++ } ++ } ++ + linkobj = append(linkobj, p.SysoFiles...) + dynobj := obj + "_cgo_.o" + pie := (goarch == "arm" && goos == "linux") || goos == "android" From 77b071b217904d85d63f0c7213030e7e09316098 Mon Sep 17 00:00:00 2001 From: Peter Mattis Date: Tue, 30 Aug 2016 10:30:12 -0400 Subject: [PATCH 6/9] build: update brew to beta-20160829 --- build/cockroach.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/cockroach.rb b/build/cockroach.rb index 6984761dc19..cc4250e22b9 100644 --- a/build/cockroach.rb +++ b/build/cockroach.rb @@ -4,8 +4,8 @@ class Cockroach < Formula desc "Distributed SQL database" homepage "https://www.cockroachlabs.com" url "https://github.com/cockroachdb/cockroach.git", - :tag => "beta-20160728", - :revision => "844e419503ab060aa091c40a7126cb6766fb6621" + :tag => "beta-20160829", + :revision => "ce2bc501f35e5a0d5707fd11d88ca28224aa34b9" head "https://github.com/cockroachdb/cockroach.git" depends_on "go" => :build From 99cfccadeba135f79b8873493c9e029e537e13d6 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Tue, 30 Aug 2016 10:42:57 -0400 Subject: [PATCH 7/9] gossip: reverse client culling direction Most tests implicitly assume this is how it works by starting clients from low ID nodes to higher ID nodes. --- gossip/client.go | 4 ++-- gossip/client_test.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gossip/client.go b/gossip/client.go index fffa3b9eb2e..d88060c5c6c 100644 --- a/gossip/client.go +++ b/gossip/client.go @@ -236,9 +236,9 @@ func (c *client) handleResponse(g *Gossip, reply *Response) error { // matches an incoming or the client is connecting to itself. if g.mu.is.NodeID == c.peerID { return errors.Errorf("stopping outgoing client to node %d (%s); loopback connection", c.peerID, c.addr) - } else if g.hasIncomingLocked(c.peerID) && g.mu.is.NodeID < c.peerID { + } else if g.hasIncomingLocked(c.peerID) && g.mu.is.NodeID > c.peerID { // To avoid mutual shutdown, we only shutdown our client if our - // node ID is less than the peer's. + // node ID is higher than the peer's. return errors.Errorf("stopping outgoing client to node %d (%s); already have incoming", c.peerID, c.addr) } diff --git a/gossip/client_test.go b/gossip/client_test.go index 3d54b822bfc..b94a18e911f 100644 --- a/gossip/client_test.go +++ b/gossip/client_test.go @@ -333,7 +333,7 @@ func TestClientDisconnectRedundant(t *testing.T) { // Check which of the clients is connected to the other. ok1 := local.findClient(func(c *client) bool { return c.addr.String() == rAddr.String() }) != nil ok2 := remote.findClient(func(c *client) bool { return c.addr.String() == lAddr.String() }) != nil - // We expect node 1 to disconnect; if both are still connected, + // We expect node 2 to disconnect; if both are still connected, // it's possible that node 1 gossiped before node 2 connected, in // which case we have to gossip from node 1 to trigger the // disconnect redundant client code. @@ -341,7 +341,7 @@ func TestClientDisconnectRedundant(t *testing.T) { if err := local.AddInfo("local-key", nil, time.Second); err != nil { t.Fatal(err) } - } else if !ok1 && ok2 && verifyServerMaps(local, 1) && verifyServerMaps(remote, 0) { + } else if ok1 && !ok2 && verifyServerMaps(local, 0) && verifyServerMaps(remote, 1) { return nil } return errors.New("local client to remote not yet closed as redundant") From d996a5bccccd80d1ade35e509fe8a5ed11eddfa1 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Tue, 30 Aug 2016 11:18:45 -0400 Subject: [PATCH 8/9] Update grpc Picks up https://github.com/grpc/grpc-go/pull/864. --- GLOCKFILE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GLOCKFILE b/GLOCKFILE index 5baa71fb3cc..fa26ae0b7ac 100644 --- a/GLOCKFILE +++ b/GLOCKFILE @@ -109,7 +109,7 @@ golang.org/x/sys a646d33e2ee3172a661fc09bca23bb4889a41bc8 golang.org/x/text 2910a502d2bf9e43193af9d68ca516529614eed3 golang.org/x/tools 0e9f43fcb67267967af8c15d7dc54b373e341d20 google.golang.org/appengine e951d3868b377b14f4e60efa3a301532ee3c1ebf -google.golang.org/grpc b7aa4e95cbb5ec9d07c52a7541ce178ef9626316 +google.golang.org/grpc 79b7c349179cdd6efd8bac4a1ce7f01b98c16e9b gopkg.in/check.v1 4f90aeace3a26ad7021961c297b22c42160c7b25 gopkg.in/inf.v0 3887ee99ecf07df5b447e9b00d9c0b2adaa9f3e4 gopkg.in/yaml.v1 9f9df34309c04878acc86042b16630b0f696e1de From 50661c5d9d45af2383baff1d9cb1b279b7e28554 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Tue, 30 Aug 2016 11:08:48 -0400 Subject: [PATCH 9/9] gossip: avoid racing client removal against ID discovery Fixes #8937. --- gossip/client.go | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/gossip/client.go b/gossip/client.go index d88060c5c6c..c597a6b3b33 100644 --- a/gossip/client.go +++ b/gossip/client.go @@ -19,6 +19,7 @@ package gossip import ( "fmt" "net" + "sync" "time" "github.com/pkg/errors" @@ -80,8 +81,19 @@ func (c *client) start( ) { stopper.RunWorker(func() { ctx, cancel := context.WithCancel(context.TODO()) - defer cancel() + var wg sync.WaitGroup defer func() { + // This closes the outgoing stream, causing any attempt to send or + // receive to return an error. + // + // Note: it is still possible for incoming gossip to be processed after + // this point. + cancel() + + // The stream is closed, but there may still be some incoming gossip + // being processed. Wait until that is complete to avoid racing the + // client's removal against the discovery of its remote's node ID. + wg.Wait() disconnected <- c }() @@ -109,7 +121,7 @@ func (c *client) start( // Start gossiping. log.Infof(ctx, "node %d: started gossip client to %s", nodeID, c.addr) - if err := c.gossip(ctx, g, stream, stopper); err != nil { + if err := c.gossip(ctx, g, stream, stopper, &wg); err != nil { if !grpcutil.IsClosedConnection(err) { g.mu.Lock() peerID := c.peerID @@ -248,7 +260,7 @@ func (c *client) handleResponse(g *Gossip, reply *Response) error { // gossip loops, sending deltas of the infostore and receiving deltas // in turn. If an alternate is proposed on response, the client addr // is modified and method returns for forwarding by caller. -func (c *client) gossip(ctx context.Context, g *Gossip, stream Gossip_GossipClient, stopper *stop.Stopper) error { +func (c *client) gossip(ctx context.Context, g *Gossip, stream Gossip_GossipClient, stopper *stop.Stopper, wg *sync.WaitGroup) error { sendGossipChan := make(chan struct{}, 1) // Register a callback for gossip updates. @@ -262,7 +274,12 @@ func (c *client) gossip(ctx context.Context, g *Gossip, stream Gossip_GossipClie defer g.RegisterCallback(".*", updateCallback)() errCh := make(chan error, 1) + // This wait group is used to allow the caller to wait until gossip + // processing is terminated. + wg.Add(1) stopper.RunWorker(func() { + defer wg.Done() + errCh <- func() error { for { reply, err := stream.Recv()