Skip to content

Commit

Permalink
tests: add client failover on defrag test case in integration and e2e
Browse files Browse the repository at this point in the history
Signed-off-by: Chao Chen <chaochn@amazon.com>
  • Loading branch information
chaochn47 committed Sep 22, 2023
1 parent 612e5c9 commit a8b2f92
Show file tree
Hide file tree
Showing 4 changed files with 211 additions and 1 deletion.
1 change: 1 addition & 0 deletions .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
run: |
set -euo pipefail
go clean -testcache
make gofail-enable
echo "${TARGET}"
case "${TARGET}" in
Expand Down
3 changes: 2 additions & 1 deletion server/etcdserver/api/v3rpc/maintenance.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ import (
"time"

"github.com/dustin/go-humanize"
"go.etcd.io/raft/v3"
"go.uber.org/zap"

"go.etcd.io/raft/v3"

pb "go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
"go.etcd.io/etcd/api/v3/version"
Expand Down
131 changes: 131 additions & 0 deletions tests/e2e/failover_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
// Copyright 2023 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !cluster_proxy

package e2e

import (
"context"
"testing"
"time"

"github.com/stretchr/testify/require"
"google.golang.org/grpc"
_ "google.golang.org/grpc/health"

clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/tests/v3/framework/config"
"go.etcd.io/etcd/tests/v3/framework/e2e"
)

// Client settings and pass/fail thresholds used by the failover test in this file.
const (
// The keepalive and dial settings below are kept in sync with how Kubernetes configures its etcd client:
// https://github.com/kubernetes/kubernetes/blob/release-1.28/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go#L59-L71
// keepaliveTime is passed to the client as DialKeepAliveTime.
keepaliveTime = 30 * time.Second
// keepaliveTimeout is passed to the client as DialKeepAliveTimeout.
keepaliveTimeout = 10 * time.Second
// dialTimeout is passed to the client as DialTimeout.
dialTimeout = 20 * time.Second

// clientRuntime is how long the background client keeps issuing requests.
clientRuntime = 10 * time.Second
// failedRequests is the maximum allowed difference between total and
// successful requests, i.e. the test tolerates no more than 5 failed requests.
failedRequests = 5
)

// TestFailover checks that a client configured with round-robin load
// balancing and gRPC health checking keeps serving requests while a failure
// is injected into the cluster.
//
// For every case it starts a cluster, runs a background goroutine that issues
// short-deadline Get requests for clientRuntime, injects the failure on the
// test goroutine, and finally requires that no more than failedRequests of
// the issued requests failed.
func TestFailover(t *testing.T) {
	tcs := []struct {
		name            string
		clusterOptions  []e2e.EPClusterOption
		failureInjector func(t *testing.T, clus *e2e.EtcdProcessCluster)
	}{
		{
			name:            "defrag",
			clusterOptions:  []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithGoFailEnabled(true)},
			failureInjector: triggerDefrag,
		},
	}

	for _, tc := range tcs {
		t.Run(tc.name, func(t *testing.T) {
			e2e.BeforeTest(t)
			clus, cerr := e2e.NewEtcdProcessCluster(context.TODO(), t, tc.clusterOptions...)
			require.NoError(t, cerr)
			t.Cleanup(func() { clus.Stop() })
			endpoints := clus.EndpointsGRPC()

			// The client is created on the test goroutine because
			// require.NoError calls t.FailNow, which must not be invoked from
			// any other goroutine. It also guarantees the request loop below
			// never runs without a usable client.
			cc, cerr := clientv3.New(clientv3.Config{
				DialTimeout:          dialTimeout,
				DialKeepAliveTime:    keepaliveTime,
				DialKeepAliveTimeout: keepaliveTimeout,
				Endpoints:            endpoints,
				DialOptions: []grpc.DialOption{
					grpc.WithDisableServiceConfig(),
					grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy": "round_robin", "healthCheckConfig": {"serviceName": ""}}`),
				},
			})
			require.NoError(t, cerr)
			// Registered after the cluster cleanup, so the client is closed
			// before the cluster is stopped.
			t.Cleanup(func() { cc.Close() })

			cnt, success := 0, 0
			// Buffered so the goroutine can always finish, even if the test
			// goroutine has already exited through a failed requirement.
			errc := make(chan error, 1)

			go func() {
				var lastErr error
				// The send publishes cnt and success to the receiver and
				// reports the last request error seen, if any.
				defer func() { errc <- lastErr }()

				timeout := time.After(clientRuntime)
				for {
					select {
					case <-timeout:
						return
					default:
					}
					cctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
					_, err := cc.Get(cctx, "health")
					cancel()
					cnt++
					if err != nil {
						lastErr = err
						continue
					}
					success++
				}
			}()

			tc.failureInjector(t, clus)

			// Wait for the request loop to finish before reading the counters.
			if err := <-errc; err != nil {
				t.Logf("etcd client failed to fail over, error (%v)", err)
			}
			t.Logf("request failure rate is %.2f%%, traffic volume success %d requests, total %d requests", (1-float64(success)/float64(cnt))*100, success, cnt)
			// expect no more than 5 failed requests
			require.InDelta(t, cnt, success, failedRequests)
		})
	}
}

// triggerDefrag arms the "defragBeforeCopy" failpoint on the first cluster
// member with a sleep(8000) action (presumably milliseconds; confirm against
// gofail) and then defragments that same member through etcdctl, allowing up
// to one minute. Any error fails the test immediately.
func triggerDefrag(t *testing.T, clus *e2e.EtcdProcessCluster) {
	ctx := context.Background()
	member := clus.Procs[0]

	require.NoError(t, member.Failpoints().Setup(ctx, "defragBeforeCopy", `sleep(8000)`))
	require.NoError(t, member.Etcdctl().Defragment(ctx, config.DefragOption{Timeout: time.Minute}))
}
77 changes: 77 additions & 0 deletions tests/integration/v3_failover_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"testing"
"time"

"github.com/stretchr/testify/require"
"google.golang.org/grpc"

"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
Expand All @@ -29,6 +30,18 @@ import (
clientv3test "go.etcd.io/etcd/tests/v3/integration/clientv3"
)

// Client settings and pass/fail thresholds used by TestFailoverOnDefrag.
const (
// The keepalive and dial settings below are kept in sync with how Kubernetes configures its etcd client:
// https://github.com/kubernetes/kubernetes/blob/release-1.28/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go#L59-L71
// keepaliveTime is passed to the client as DialKeepAliveTime.
keepaliveTime = 30 * time.Second
// keepaliveTimeout is passed to the client as DialKeepAliveTimeout.
keepaliveTimeout = 10 * time.Second
// dialTimeout is passed to the client as DialTimeout.
dialTimeout = 20 * time.Second

// clientRuntime is how long the background client keeps issuing requests.
clientRuntime = 10 * time.Second
// failedRequests is the maximum allowed difference between total and
// successful requests, i.e. the test tolerates no more than 5 failed requests.
failedRequests = 5
)

func TestFailover(t *testing.T) {
cases := []struct {
name string
Expand Down Expand Up @@ -172,3 +185,67 @@ func shouldRetry(err error) bool {
}
return false
}

// TestFailoverOnDefrag checks that a client configured with round-robin load
// balancing and gRPC health checking keeps serving requests while one member
// of a three-node cluster is being defragmented.
//
// The "defragBeforeCopy" failpoint is set to sleep(10000) so the defragment
// call is slowed down. A background goroutine issues short-deadline Get
// requests for clientRuntime, and the test requires that no more than
// failedRequests of them failed.
func TestFailoverOnDefrag(t *testing.T) {
	integration2.BeforeTest(t, integration2.WithFailpoint("defragBeforeCopy", `sleep(10000)`))
	clus := integration2.NewCluster(t, &integration2.ClusterConfig{Size: 3})
	defer clus.Terminate(t)
	endpoints := clus.Endpoints()

	// The client is created on the test goroutine because require.NoError
	// calls t.FailNow, which must not be invoked from any other goroutine.
	// It also guarantees the request loop below never runs without a usable
	// client.
	cc, cerr := clientv3.New(clientv3.Config{
		DialTimeout:          dialTimeout,
		DialKeepAliveTime:    keepaliveTime,
		DialKeepAliveTimeout: keepaliveTimeout,
		Endpoints:            endpoints,
		DialOptions: []grpc.DialOption{
			grpc.WithDisableServiceConfig(),
			grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy": "round_robin", "healthCheckConfig": {"serviceName": ""}}`),
		},
	})
	require.NoError(t, cerr)
	// Deferred after clus.Terminate, so the client is closed first.
	defer cc.Close()

	cnt, success := 0, 0
	// Buffered so the goroutine can always finish, even if the test goroutine
	// has already exited through a failed requirement.
	errc := make(chan error, 1)

	go func() {
		var lastErr error
		// The send publishes cnt and success to the receiver and reports the
		// last request error seen, if any.
		defer func() { errc <- lastErr }()

		timeout := time.After(clientRuntime)
		for {
			select {
			case <-timeout:
				return
			default:
			}
			cctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
			_, err := cc.Get(cctx, "health")
			cancel()
			cnt++
			if err != nil {
				lastErr = err
				continue
			}
			success++
		}
	}()

	_, err := clus.Client(0).Defragment(context.Background(), endpoints[0])
	require.NoError(t, err)

	// Wait for the request loop to finish before reading the counters.
	if err := <-errc; err != nil {
		t.Logf("etcd client failed to fail over, error (%v)", err)
	}
	t.Logf("request failure rate is %.2f%%, traffic volume success %d requests, total %d requests", (1-float64(success)/float64(cnt))*100, success, cnt)
	// expect no more than 5 failed requests
	require.InDelta(t, cnt, success, failedRequests)
}

0 comments on commit a8b2f92

Please sign in to comment.