Fixing a flaky test (#924)
* Fixing a flaky test
* Fixing some CI issue
* Reducing flakiness of the demo tests
* Adding some extra delay before we give up on pinging a node that is starting in the demo test
AnomalRoil committed Mar 9, 2022
1 parent d9099c9 commit 8413033
Showing 6 changed files with 65 additions and 18 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -68,7 +68,7 @@ test-integration:
cd demo && go build && ./demo -build -test -debug

coverage:
go get -u github.com/ory/go-acc
go install github.com/ory/go-acc@latest
go get -v -t -d ./...
COVERAGE=true go-acc ./...

2 changes: 2 additions & 0 deletions chain/beacon/node.go
@@ -174,6 +174,7 @@ func (h *Handler) Start() error {
}

h.Lock()
// XXX: do we really need both started and running?
h.started = true
h.Unlock()

@@ -297,6 +298,7 @@ func (h *Handler) run(startTime int64) {
setRunning := sync.Once{}

h.Lock()
// XXX: do we really need both started and running?
h.running = true
h.Unlock()

2 changes: 2 additions & 0 deletions core/drand_test.go
@@ -345,6 +345,7 @@ func TestRunDKGReshareTimeout(t *testing.T) {
var doneReshare = make(chan *key.Group)
go func() {
t.Log("[reshare] Start reshare")
// XXX: note that RunReshare already calls AdvanceMockClock on its own after a while!
group, err := dt.RunReshare(t,
&reshareConfig{
oldRun: nodesToKeep,
@@ -357,6 +358,7 @@ func TestRunDKGReshareTimeout(t *testing.T) {
}()
time.Sleep(3 * time.Second)

// XXX: this isn't always the case: the reshare can already have moved past this phase because of the Sleep above.
t.Log("Move to response phase")
dt.AdvanceMockClock(t, timeout)

40 changes: 32 additions & 8 deletions core/util_test.go
@@ -239,6 +239,22 @@ func (d *DrandTestScenario) Ids(n int, newGroup bool) []string {
return addresses
}

// waitRunning waits until the Status of the daemon the client controls reports "isRunning: true"
func (d *DrandTestScenario) waitRunning(client *net.ControlClient, node *MockNode) {
isRunning := false
for !isRunning {
r, err := client.Status(d.beaconID)
require.NoError(d.t, err)
// XXX: may need to be changed if running and started aren't both necessary; using "isStarted" could work too
if r.Beacon.IsRunning {
d.t.Log("[DEBUG] ", node.GetAddr(), " Status: isRunning")
isRunning = true
} else {
time.Sleep(100 * time.Millisecond)
}
}
}
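
As written, waitRunning polls the daemon status every 100ms and never gives up, so a daemon that never reaches the running state would hang the test until the suite-level timeout. Below is a minimal sketch of a bounded variant, reusing the same imports and types as util_test.go and assuming the testify version in use provides require.Eventually; the name waitRunningBounded and the 30-second budget are illustrative, not part of the test harness.

// waitRunningBounded polls the daemon status every 100ms, like waitRunning above,
// but fails the test if the daemon does not report "isRunning" within 30 seconds.
func (d *DrandTestScenario) waitRunningBounded(client *net.ControlClient, node *MockNode) {
	require.Eventually(d.t, func() bool {
		r, err := client.Status(d.beaconID)
		return err == nil && r.Beacon.IsRunning
	}, 30*time.Second, 100*time.Millisecond,
		"node %s never reported isRunning", node.GetAddr())
}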

// RunDKG runs the DKG with the initial nodes created during NewDrandTest
func (d *DrandTestScenario) RunDKG() *key.Group {
// common secret between participants
@@ -248,15 +264,15 @@ func (d *DrandTestScenario) RunDKG() *key.Group {
controlClient, err := net.NewControlClient(leaderNode.drand.opts.controlPort)
require.NoError(d.t, err)

d.t.Log("[RunDKG] Start")
d.t.Log("[RunDKG] Start: Leader = ", leaderNode.GetAddr())

// the leaderNode will return the group over this channel
var wg sync.WaitGroup
wg.Add(d.n)

// first run the leader and then run the other nodes
go func() {
d.t.Log("[RunDKG] Leader init")
d.t.Log("[RunDKG] Leader (", leaderNode.GetAddr(), ") init")

// TODO: Control Client needs every single parameter, not a protobuf type. This means that it will be difficult to extend
groupPacket, err := controlClient.InitDKGLeader(
@@ -268,13 +284,12 @@ func (d *DrandTestScenario) RunDKG() *key.Group {
require.NoError(d.t, err)

d.t.Logf("[RunDKG] Leader Finished. GroupHash %x", group.Hash())

// We need to make sure the daemon is running before continuing
d.waitRunning(controlClient, leaderNode)
wg.Done()
}()

// make sure the leader is up and running to start the setup
// TODO: replace this with a ping loop and refactor to make it reusable
time.Sleep(1 * time.Second)

// all other nodes will send their PK to the leader that will create the group
for _, node := range d.nodes[1:] {
go func(n *MockNode) {
@@ -285,13 +300,17 @@ func (d *DrandTestScenario) RunDKG() *key.Group {
group, err := key.GroupFromProto(groupPacket)
require.NoError(d.t, err)

d.t.Logf("[RunDKG] NonLeader Finished. GroupHash %x", group.Hash())
d.t.Logf("[RunDKG] NonLeader %s Finished. GroupHash %x", n.GetAddr(), group.Hash())

// We need to make sure the daemon is running before continuing
d.waitRunning(client, n)
wg.Done()
}(node)
}

// wait for all to return
wg.Wait()

d.t.Logf("[RunDKG] Leader %s FINISHED", leaderNode.addr)

// we check that we can fetch the group using control functionalities on the leaderNode node
@@ -665,7 +684,7 @@ func (d *DrandTestScenario) RunReshare(t *testing.T, c *reshareConfig) (*key.Gro
d.resharedNodes = append(d.resharedNodes, leader)

// leave some time to make sure leader is listening
// Note: Remove this. Ping until leader is ready. Use a ping + lambda + retry
// TODO: Remove this. Ping until leader is ready. Use a ping + lambda + retry
time.Sleep(1 * time.Second)

// run the current nodes
@@ -730,6 +749,7 @@ func (d *DrandTestScenario) RunReshare(t *testing.T, c *reshareConfig) (*key.Gro
if !relyOnTimeout {
continue
}
// XXX: check if this is really intended
d.AdvanceMockClock(t, c.timeout)
t.Logf("[reshare] Advance clock: %d", d.Now().Unix())
case <-outgoingChan:
@@ -875,3 +895,7 @@ func (b *testBroadcast) BroadcastDKG(c context.Context, p *drand.DKGPacket) (*dr
defer b.Unlock()
return ret, nil
}

func (n *MockNode) GetAddr() string {
return n.addr
}
12 changes: 12 additions & 0 deletions demo/demo_test.go
@@ -1,7 +1,10 @@
package main

import (
"fmt"
"os"
"testing"
"time"

"github.com/drand/drand/common"

@@ -11,6 +14,15 @@ import (
)

func TestLocalOrchestration(t *testing.T) {

// Let us have a 2-minute deadline; if it fires, os.Exit(1) terminates the whole test binary
time.AfterFunc(
2*time.Minute,
func() {
fmt.Println("Deadline reached")
os.Exit(1)
})

sch, beaconID := scheme.GetSchemeFromEnv(), common.GetBeaconIDFromEnv()

o := lib.NewOrchestrator(3, 2, "4s", true, "", false, sch, beaconID, true)
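
A note on the watchdog added above: time.AfterFunc arms the timer when TestLocalOrchestration starts, and the callback fires two minutes later even if the test has already passed, with os.Exit(1) taking down the whole test binary (including any other test still running). Below is a minimal sketch of the same pattern with the timer disarmed on completion; the test name is hypothetical and the orchestration body is elided.

func TestLocalOrchestrationWithDeadline(t *testing.T) {
	// Arm a 2-minute watchdog, but disarm it when the test returns so that a
	// passing test cannot kill the rest of the test binary.
	timer := time.AfterFunc(2*time.Minute, func() {
		fmt.Println("Deadline reached")
		os.Exit(1)
	})
	defer timer.Stop()

	// ... same orchestration as TestLocalOrchestration above ...
}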
25 changes: 16 additions & 9 deletions demo/lib/orchestrator.go
@@ -556,15 +556,14 @@ func (e *Orchestrator) StartNode(idxs ...int) {

fmt.Printf("[+] Attempting to start node %s again ...\n", foundNode.PrivateAddr())
foundNode.Start(e.certFolder)
trial := 0
var started bool
for trial < 5 {
for trial := 1; trial < 10; trial += 1 {
if foundNode.Ping() {
fmt.Printf("\t- Node %s started correctly\n", foundNode.PrivateAddr())
started = true
break
}
time.Sleep(1 * time.Second)
time.Sleep(time.Duration(trial*trial) * time.Second)
}
if !started {
panic(fmt.Errorf("[-] Could not start node %s ... \n", foundNode.PrivateAddr()))
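
The new loop pings up to nine times and sleeps trial*trial seconds after each failed attempt, so in the worst case it waits 1+4+9+...+81 = 285 seconds before giving up and panicking. A hedged sketch extracting that pattern into a reusable helper follows; waitForPing is hypothetical, not part of the orchestrator API.

// waitForPing retries the given ping with a quadratically growing delay
// (1s, 4s, 9s, ..., 81s). It returns true as soon as a ping succeeds and
// false after nine failures, having slept 285 seconds in total.
func waitForPing(ping func() bool) bool {
	for trial := 1; trial < 10; trial++ {
		if ping() {
			return true
		}
		time.Sleep(time.Duration(trial*trial) * time.Second)
	}
	return false
}

With such a helper, the loop above would reduce to started := waitForPing(foundNode.Ping).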
@@ -583,14 +582,22 @@ func (e *Orchestrator) PrintLogs() {
}
func (e *Orchestrator) Shutdown() {
fmt.Println("[+] Shutdown all nodes")
for _, node := range e.nodes {
fmt.Printf("\t- Stop old node %s\n", node.PrivateAddr())
node.Stop()
for _, no := range e.nodes {
fmt.Printf("\t- Stop old node %s\n", no.PrivateAddr())
go func(n node.Node) {
n.Stop()
fmt.Println("\t- Successfully stopped Node", n.Index(), "(", n.PrivateAddr(), ")")
}(no)
}
for _, node := range e.newNodes {
fmt.Printf("\t- Stop new node %s\n", node.PrivateAddr())
node.Stop()
for _, no := range e.newNodes {
fmt.Printf("\t- Stop new node %s\n", no.PrivateAddr())
go func(n node.Node) {
n.Stop()
fmt.Println("\t- Successfully stopped Node", n.Index(), "(", n.PrivateAddr(), ")")
}(no)
}
// We leave some time for the nodes to shut down gracefully before the whole process terminates
time.Sleep(10 * time.Second)
}
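
Stopping the nodes in goroutines makes Shutdown much faster than stopping them one by one, at the price of a fixed 10-second sleep at the end. The sketch below shows a deterministic alternative that waits for every Stop() to return instead, assuming Stop only returns once the node has actually shut down; shutdownAndWait is a hypothetical variant, not the committed code.

// shutdownAndWait stops all nodes concurrently and blocks until every Stop()
// call has returned, removing the need for a fixed sleep.
func (e *Orchestrator) shutdownAndWait() {
	fmt.Println("[+] Shutdown all nodes")
	var wg sync.WaitGroup
	stop := func(n node.Node) {
		defer wg.Done()
		n.Stop()
		fmt.Println("\t- Successfully stopped Node", n.Index(), "(", n.PrivateAddr(), ")")
	}
	for _, no := range e.nodes {
		fmt.Printf("\t- Stop old node %s\n", no.PrivateAddr())
		wg.Add(1)
		go stop(no)
	}
	for _, no := range e.newNodes {
		fmt.Printf("\t- Stop new node %s\n", no.PrivateAddr())
		wg.Add(1)
		go stop(no)
	}
	wg.Wait()
}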

func runCommand(c *exec.Cmd, add ...string) []byte {