Fixing a flaky test (#924)
* Fixing a flaky test
* Fixing some CI issue
* Reducing flakiness of the demo tests
* Adding some extra delay before we give up on pinging a node that is starting in the demo test
AnomalRoil committed Mar 9, 2022
1 parent d9099c9 commit 8413033
Showing 6 changed files with 65 additions and 18 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -68,7 +68,7 @@ test-integration:
cd demo && go build && ./demo -build -test -debug

coverage:
go get -u github.com/ory/go-acc
go install github.com/ory/go-acc@latest
go get -v -t -d ./...
COVERAGE=true go-acc ./...

2 changes: 2 additions & 0 deletions chain/beacon/node.go
@@ -174,6 +174,7 @@ func (h *Handler) Start() error {
}

h.Lock()
// XXX: do we really need both started and running?
h.started = true
h.Unlock()

@@ -297,6 +298,7 @@ func (h *Handler) run(startTime int64) {
setRunning := sync.Once{}

h.Lock()
// XXX: do we really need both started and running?
h.running = true
h.Unlock()

2 changes: 2 additions & 0 deletions core/drand_test.go
@@ -345,6 +345,7 @@ func TestRunDKGReshareTimeout(t *testing.T) {
var doneReshare = make(chan *key.Group)
go func() {
t.Log("[reshare] Start reshare")
// XXX: note that RunReshare already calls AdvanceMockClock on its own after a while!
group, err := dt.RunReshare(t,
&reshareConfig{
oldRun: nodesToKeep,
@@ -357,6 +358,7 @@ func TestRunDKGReshareTimeout(t *testing.T) {
}()
time.Sleep(3 * time.Second)

// XXX: this isn't always the case: the reshare can already have moved past this phase because of the Sleep above.
t.Log("Move to response phase")
dt.AdvanceMockClock(t, timeout)

40 changes: 32 additions & 8 deletions core/util_test.go
@@ -239,6 +239,22 @@ func (d *DrandTestScenario) Ids(n int, newGroup bool) []string {
return addresses
}

// waitRunning waits until the Status of the daemon the client controls reports "isRunning: true"
func (d *DrandTestScenario) waitRunning(client *net.ControlClient, node *MockNode) {
isRunning := false
for !isRunning {
r, err := client.Status(d.beaconID)
require.NoError(d.t, err)
// XXX: may need to be changed if running and started aren't both necessary; using "isStarted" could work too
if r.Beacon.IsRunning {
d.t.Log("[DEBUG] ", node.GetAddr(), " Status: isRunning")
isRunning = true
} else {
time.Sleep(100 * time.Millisecond)
}
}
}
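
As written, waitRunning polls the daemon status every 100ms and never gives up, so a daemon that never reaches the running state would hang the test until the suite-level timeout. Below is a minimal sketch of a bounded variant, reusing the same imports and types as util_test.go and assuming the testify version in use provides require.Eventually; the name waitRunningBounded and the 30-second budget are illustrative, not part of the test harness.

// waitRunningBounded polls the daemon status every 100ms, like waitRunning above,
// but fails the test if the daemon does not report "isRunning" within 30 seconds.
func (d *DrandTestScenario) waitRunningBounded(client *net.ControlClient, node *MockNode) {
	require.Eventually(d.t, func() bool {
		r, err := client.Status(d.beaconID)
		return err == nil && r.Beacon.IsRunning
	}, 30*time.Second, 100*time.Millisecond,
		"node %s never reported isRunning", node.GetAddr())
}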

// RunDKG runs the DKG with the initial nodes created during NewDrandTest
func (d *DrandTestScenario) RunDKG() *key.Group {
// common secret between participants
@@ -248,15 +264,15 @@ func (d *DrandTestScenario) RunDKG() *key.Group {
controlClient, err := net.NewControlClient(leaderNode.drand.opts.controlPort)
require.NoError(d.t, err)

d.t.Log("[RunDKG] Start")
d.t.Log("[RunDKG] Start: Leader = ", leaderNode.GetAddr())

// the leaderNode will return the group over this channel
var wg sync.WaitGroup
wg.Add(d.n)

// first run the leader and then run the other nodes
go func() {
d.t.Log("[RunDKG] Leader init")
d.t.Log("[RunDKG] Leader (", leaderNode.GetAddr(), ") init")

// TODO: Control Client needs every single parameter, not a protobuf type. This means that it will be difficult to extend
groupPacket, err := controlClient.InitDKGLeader(
@@ -268,13 +284,12 @@ func (d *DrandTestScenario) RunDKG() *key.Group {
require.NoError(d.t, err)

d.t.Logf("[RunDKG] Leader Finished. GroupHash %x", group.Hash())

// We need to make sure the daemon is running before continuing
d.waitRunning(controlClient, leaderNode)
wg.Done()
}()

// make sure the leader is up and running to start the setup
// TODO: replace this with a ping loop and refactor to make it reusable
time.Sleep(1 * time.Second)

// all other nodes will send their PK to the leader that will create the group
for _, node := range d.nodes[1:] {
go func(n *MockNode) {
@@ -285,13 +300,17 @@ func (d *DrandTestScenario) RunDKG() *key.Group {
group, err := key.GroupFromProto(groupPacket)
require.NoError(d.t, err)

d.t.Logf("[RunDKG] NonLeader Finished. GroupHash %x", group.Hash())
d.t.Logf("[RunDKG] NonLeader %s Finished. GroupHash %x", n.GetAddr(), group.Hash())

// We need to make sure the daemon is running before continuing
d.waitRunning(client, n)
wg.Done()
}(node)
}

// wait for all to return
wg.Wait()

d.t.Logf("[RunDKG] Leader %s FINISHED", leaderNode.addr)

// we check that we can fetch the group using control functionalities on the leaderNode node
@@ -665,7 +684,7 @@ func (d *DrandTestScenario) RunReshare(t *testing.T, c *reshareConfig) (*key.Gro
d.resharedNodes = append(d.resharedNodes, leader)

// leave some time to make sure leader is listening
// Note: Remove this. Ping until leader is ready. Use a ping + lambda + retry
// TODO: Remove this. Ping until leader is ready. Use a ping + lambda + retry
time.Sleep(1 * time.Second)

// run the current nodes
@@ -730,6 +749,7 @@ func (d *DrandTestScenario) RunReshare(t *testing.T, c *reshareConfig) (*key.Gro
if !relyOnTimeout {
continue
}
// XXX: check if this is really intended
d.AdvanceMockClock(t, c.timeout)
t.Logf("[reshare] Advance clock: %d", d.Now().Unix())
case <-outgoingChan:
@@ -875,3 +895,7 @@ func (b *testBroadcast) BroadcastDKG(c context.Context, p *drand.DKGPacket) (*dr
defer b.Unlock()
return ret, nil
}

func (n *MockNode) GetAddr() string {
return n.addr
}
12 changes: 12 additions & 0 deletions demo/demo_test.go
@@ -1,7 +1,10 @@
package main

import (
"fmt"
"os"
"testing"
"time"

"github.com/drand/drand/common"

@@ -11,6 +14,15 @@ import (
)

func TestLocalOrchestration(t *testing.T) {

// Let us have a 2-minute deadline; if it fires, os.Exit(1) terminates the whole test binary
time.AfterFunc(
2*time.Minute,
func() {
fmt.Println("Deadline reached")
os.Exit(1)
})

sch, beaconID := scheme.GetSchemeFromEnv(), common.GetBeaconIDFromEnv()

o := lib.NewOrchestrator(3, 2, "4s", true, "", false, sch, beaconID, true)
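
A note on the watchdog added above: time.AfterFunc arms the timer when TestLocalOrchestration starts, and the callback fires two minutes later even if the test has already passed, with os.Exit(1) taking down the whole test binary (including any other test still running). Below is a minimal sketch of the same pattern with the timer disarmed on completion; the test name is hypothetical and the orchestration body is elided.

func TestLocalOrchestrationWithDeadline(t *testing.T) {
	// Arm a 2-minute watchdog, but disarm it when the test returns so that a
	// passing test cannot kill the rest of the test binary.
	timer := time.AfterFunc(2*time.Minute, func() {
		fmt.Println("Deadline reached")
		os.Exit(1)
	})
	defer timer.Stop()

	// ... same orchestration as TestLocalOrchestration above ...
}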
25 changes: 16 additions & 9 deletions demo/lib/orchestrator.go
@@ -556,15 +556,14 @@ func (e *Orchestrator) StartNode(idxs ...int) {

fmt.Printf("[+] Attempting to start node %s again ...\n", foundNode.PrivateAddr())
foundNode.Start(e.certFolder)
trial := 0
var started bool
for trial < 5 {
for trial := 1; trial < 10; trial += 1 {
if foundNode.Ping() {
fmt.Printf("\t- Node %s started correctly\n", foundNode.PrivateAddr())
started = true
break
}
time.Sleep(1 * time.Second)
time.Sleep(time.Duration(trial*trial) * time.Second)
}
if !started {
panic(fmt.Errorf("[-] Could not start node %s ... \n", foundNode.PrivateAddr()))
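
The new loop pings up to nine times and sleeps trial*trial seconds after each failed attempt, so in the worst case it waits 1+4+9+...+81 = 285 seconds before giving up and panicking. A hedged sketch extracting that pattern into a reusable helper follows; waitForPing is hypothetical, not part of the orchestrator API.

// waitForPing retries the given ping with a quadratically growing delay
// (1s, 4s, 9s, ..., 81s). It returns true as soon as a ping succeeds and
// false after nine failures, having slept 285 seconds in total.
func waitForPing(ping func() bool) bool {
	for trial := 1; trial < 10; trial++ {
		if ping() {
			return true
		}
		time.Sleep(time.Duration(trial*trial) * time.Second)
	}
	return false
}

With such a helper, the loop above would reduce to started := waitForPing(foundNode.Ping).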
@@ -583,14 +582,22 @@ func (e *Orchestrator) PrintLogs() {
}
func (e *Orchestrator) Shutdown() {
fmt.Println("[+] Shutdown all nodes")
for _, node := range e.nodes {
fmt.Printf("\t- Stop old node %s\n", node.PrivateAddr())
node.Stop()
for _, no := range e.nodes {
fmt.Printf("\t- Stop old node %s\n", no.PrivateAddr())
go func(n node.Node) {
n.Stop()
fmt.Println("\t- Successfully stopped Node", n.Index(), "(", n.PrivateAddr(), ")")
}(no)
}
for _, node := range e.newNodes {
fmt.Printf("\t- Stop new node %s\n", node.PrivateAddr())
node.Stop()
for _, no := range e.newNodes {
fmt.Printf("\t- Stop new node %s\n", no.PrivateAddr())
go func(n node.Node) {
n.Stop()
fmt.Println("\t- Successfully stopped Node", n.Index(), "(", n.PrivateAddr(), ")")
}(no)
}
// We leave some time for the nodes to shut down gracefully before the whole process terminates
time.Sleep(10 * time.Second)
}
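
Stopping the nodes in goroutines makes Shutdown much faster than stopping them one by one, at the price of a fixed 10-second sleep at the end. The sketch below shows a deterministic alternative that waits for every Stop() to return instead, assuming Stop only returns once the node has actually shut down; shutdownAndWait is a hypothetical variant, not the committed code.

// shutdownAndWait stops all nodes concurrently and blocks until every Stop()
// call has returned, removing the need for a fixed sleep.
func (e *Orchestrator) shutdownAndWait() {
	fmt.Println("[+] Shutdown all nodes")
	var wg sync.WaitGroup
	stop := func(n node.Node) {
		defer wg.Done()
		n.Stop()
		fmt.Println("\t- Successfully stopped Node", n.Index(), "(", n.PrivateAddr(), ")")
	}
	for _, no := range e.nodes {
		fmt.Printf("\t- Stop old node %s\n", no.PrivateAddr())
		wg.Add(1)
		go stop(no)
	}
	for _, no := range e.newNodes {
		fmt.Printf("\t- Stop new node %s\n", no.PrivateAddr())
		wg.Add(1)
		go stop(no)
	}
	wg.Wait()
}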

func runCommand(c *exec.Cmd, add ...string) []byte {