From 7c793f67a7c6466a5e9b99a7ca9b96d2ba93de99 Mon Sep 17 00:00:00 2001 From: Jay J Wylie Date: Fri, 12 Oct 2012 14:20:07 -0700 Subject: [PATCH] Additional hardening of tests to reduce the number of intermittent BindException errors due to a TOCTOU issue with getLocalCluster. test/integration/voldemort/performance/RoutedStoreParallelismTest.java - switched to startVoldemortCluster test/unit/voldemort/server/gossip/GossiperTest.java - hand-coded test-specific startParallelVoldemortCluster. Not pretty. Not pretty at all. But, should retry in the face of such exceptions. Switched TODO to comment about possible susceptability to BindExceptions: - test/unit/voldemort/client/rebalance/RebalanceTest.java - test/unit/voldemort/scheduled/StreamingSlopPusherTest.java --- .../RoutedStoreParallelismTest.java | 47 ++++----- .../client/rebalance/RebalanceTest.java | 3 +- .../scheduled/StreamingSlopPusherTest.java | 3 +- .../voldemort/server/gossip/GossiperTest.java | 95 ++++++++++++------- 4 files changed, 81 insertions(+), 67 deletions(-) diff --git a/test/integration/voldemort/performance/RoutedStoreParallelismTest.java b/test/integration/voldemort/performance/RoutedStoreParallelismTest.java index 7ad76fb554..a82ab617a5 100644 --- a/test/integration/voldemort/performance/RoutedStoreParallelismTest.java +++ b/test/integration/voldemort/performance/RoutedStoreParallelismTest.java @@ -29,7 +29,6 @@ import joptsimple.OptionParser; import joptsimple.OptionSet; import voldemort.ServerTestUtils; -import voldemort.TestUtils; import voldemort.VoldemortException; import voldemort.client.ClientConfig; import voldemort.cluster.Cluster; @@ -40,7 +39,6 @@ import voldemort.cluster.failuredetector.FailureDetectorUtils; import voldemort.cluster.failuredetector.MutableStoreVerifier; import voldemort.server.StoreRepository; -import voldemort.server.VoldemortConfig; import voldemort.server.VoldemortServer; import voldemort.store.SleepyStore; import voldemort.store.Store; @@ -141,15 +139,6 @@ public static void main(String[] args) throws Throwable { ClientConfig clientConfig = new ClientConfig().setMaxConnectionsPerNode(maxConnectionsPerNode) .setMaxThreads(maxThreads); - Map serverMap = new HashMap(); - - int[][] partitionMap = new int[numNodes][1]; - - for(int i = 0; i < numNodes; i++) { - partitionMap[i][0] = i; - } - - Cluster cluster = ServerTestUtils.getLocalCluster(numNodes, partitionMap); String storeDefinitionFile = "test/common/voldemort/config/single-store.xml"; StoreDefinition storeDefinition = new StoreDefinitionsMapper().readStoreList(new File(storeDefinitionFile)) .get(0); @@ -161,33 +150,31 @@ public static void main(String[] args) throws Throwable { clientConfig.getSocketBufferSize(), clientConfig.getSocketKeepAlive()); - // TODO: add a variant of ServerTestUtils.startVoldemortCluster that - // accepts StoreDefinitions in the interface. - for(int i = 0; i < cluster.getNumberOfNodes(); i++) { - VoldemortConfig config = ServerTestUtils.createServerConfig(true, - i, - TestUtils.createTempDir() - .getAbsolutePath(), - null, - storeDefinitionFile, - new Properties()); - - VoldemortServer server = ServerTestUtils.startVoldemortServer(socketStoreFactory, - config, - cluster); - serverMap.put(i, server); + VoldemortServer[] servers = new VoldemortServer[numNodes]; + int[][] partitionMap = new int[numNodes][1]; + for(int i = 0; i < numNodes; i++) { + partitionMap[i][0] = i; + } + Cluster cluster = ServerTestUtils.startVoldemortCluster(numNodes, + servers, + partitionMap, + socketStoreFactory, + true, + null, + storeDefinitionFile, + new Properties()); + Map serverMap = new HashMap(); + for(int i = 0; i < cluster.getNumberOfNodes(); i++) { + serverMap.put(i, servers[i]); Store store = new InMemoryStorageEngine("test-sleepy"); - if(i < numSlowNodes) store = new SleepyStore(delay, store); - - StoreRepository storeRepository = server.getStoreRepository(); + StoreRepository storeRepository = servers[i].getStoreRepository(); storeRepository.addLocalStore(store); } Map> stores = new HashMap>(); - for(Node node: cluster.getNodes()) { Store socketStore = ServerTestUtils.getSocketStore(socketStoreFactory, "test-sleepy", diff --git a/test/unit/voldemort/client/rebalance/RebalanceTest.java b/test/unit/voldemort/client/rebalance/RebalanceTest.java index 9c763e4848..9f0f06123c 100644 --- a/test/unit/voldemort/client/rebalance/RebalanceTest.java +++ b/test/unit/voldemort/client/rebalance/RebalanceTest.java @@ -81,7 +81,8 @@ protected Cluster getCurrentCluster(int nodeId) { } } - // TODO: refactor to take advantage of ServerTestUtils.startVoldemortCluster + // This method may be susceptible to BindException issues due to TOCTOU + // problem with getLocalCluster. @Override protected Cluster startServers(Cluster cluster, String storeXmlFile, diff --git a/test/unit/voldemort/scheduled/StreamingSlopPusherTest.java b/test/unit/voldemort/scheduled/StreamingSlopPusherTest.java index e48e9f95ac..53f32949d1 100644 --- a/test/unit/voldemort/scheduled/StreamingSlopPusherTest.java +++ b/test/unit/voldemort/scheduled/StreamingSlopPusherTest.java @@ -78,7 +78,8 @@ public void setUp() throws Exception { } } - // TODO: Refactor to take advantage of ServerTestUtils.startVoldemortCluster + // This method may be susceptible to BindException issues due to TOCTOU + // problem with getLocalCluster. private void startServers(int... nodeIds) { for(int nodeId: nodeIds) { if(nodeId < NUM_SERVERS) { diff --git a/test/unit/voldemort/server/gossip/GossiperTest.java b/test/unit/voldemort/server/gossip/GossiperTest.java index 861070399a..5f55558f6f 100644 --- a/test/unit/voldemort/server/gossip/GossiperTest.java +++ b/test/unit/voldemort/server/gossip/GossiperTest.java @@ -45,7 +45,6 @@ public class GossiperTest { private List servers = new ArrayList(); private Cluster cluster; - private Properties props = new Properties(); private static final int socketBufferSize = 4096; private static final int adminSocketBufferSize = 8192; private SocketStoreFactory socketStoreFactory = new ClientRequestExecutorPool(2, @@ -54,6 +53,8 @@ public class GossiperTest { socketBufferSize); private static String storesXmlfile = "test/common/voldemort/config/stores.xml"; private final boolean useNio; + private CountDownLatch countDownLatch; + final private Properties props = new Properties(); public GossiperTest(boolean useNio) { this.useNio = useNio; @@ -64,22 +65,11 @@ public static Collection configs() { return Arrays.asList(new Object[][] { { false }, { true } }); } - @Before - public void setUp() { - props.put("enable.gossip", "true"); - props.put("gossip.interval.ms", "250"); - props.put("socket.buffer.size", String.valueOf(socketBufferSize)); - props.put("admin.streams.buffer.size", String.valueOf(adminSocketBufferSize)); - - // Start all in parallel to avoid exceptions during gossip - + private void attemptParallelClusterStart(ExecutorService executorService) { + // Start all servers in parallel to avoid exceptions during gossip. cluster = ServerTestUtils.getLocalCluster(3, new int[][] { { 0, 1, 2, 3 }, { 4, 5, 6, 7 }, { 8, 9, 10, 11 } }); - ExecutorService executorService = Executors.newFixedThreadPool(3); - final CountDownLatch countDownLatch = new CountDownLatch(3); - // TODO: Add a variant of ServerTestUtils.startVoldemortCluster that - // starts servers in parallel? for(int i = 0; i < 3; i++) { final int j = i; executorService.submit(new Runnable() { @@ -95,16 +85,41 @@ public void run() { storesXmlfile, props), cluster)); - countDownLatch.countDown(); - } catch(IOException e) { + } catch(IOException ioe) { logger.error("Caught IOException during parallel server start: " - + e.getMessage()); - e.printStackTrace(); - throw new RuntimeException(); + + ioe.getMessage()); + RuntimeException re = new RuntimeException(); + re.initCause(ioe); + throw re; + } finally { + // Ensure setup progresses in face of errors + countDownLatch.countDown(); } } }); } + } + + @Before + public void setUp() { + props.put("enable.gossip", "true"); + props.put("gossip.interval.ms", "250"); + props.put("socket.buffer.size", String.valueOf(socketBufferSize)); + props.put("admin.streams.buffer.size", String.valueOf(adminSocketBufferSize)); + + ExecutorService executorService = Executors.newFixedThreadPool(3); + countDownLatch = new CountDownLatch(3); + + boolean clusterStarted = false; + while(!clusterStarted) { + try { + attemptParallelClusterStart(executorService); + clusterStarted = true; + } catch(RuntimeException re) { + logger.info("Some server thread threw a RuntimeException. Will print out stacktrace and then try again. Assumption is that the RuntimeException is due to BindException that in turn is due to TOCTOU issue with getLocalCluster"); + re.printStackTrace(); + } + } try { countDownLatch.await(); @@ -122,13 +137,9 @@ private AdminClient getAdminClient(Cluster newCluster) { return new AdminClient(newCluster, new AdminClientConfig()); } - // Protect against this test running forever until the root cause of running - // forever is found. - @Test(timeout = 1800) - public void testGossiper() throws Exception { - // First create a new cluster: - // Allocate ports for all nodes in the new cluster, to match existing - // cluster + private Cluster attemptStartAdditionalServer() throws IOException { + // Set up a new cluster that is one bigger than the original cluster + int originalSize = cluster.getNumberOfNodes(); int numOriginalPorts = originalSize * 3; int ports[] = new int[numOriginalPorts + 3]; @@ -162,13 +173,27 @@ public void testGossiper() throws Exception { storesXmlfile, props), newCluster); + // This step is only reached if startVoldemortServer does *not* throw a + // BindException due to TOCTOU problem with getLocalCluster servers.add(newServer); + return newCluster; + } - // Wait a while until the new server starts - try { - Thread.sleep(500); - } catch(InterruptedException e) { - Thread.currentThread().interrupt(); + // Protect against this test running forever until the root cause of running + // forever is found. + @Test(timeout = 1800) + public void testGossiper() throws Exception { + Cluster newCluster = null; + + boolean startedAdditionalServer = false; + while(!startedAdditionalServer) { + try { + newCluster = attemptStartAdditionalServer(); + startedAdditionalServer = true; + } catch(IOException ioe) { + logger.warn("Caught an IOException when attempting to start additional server. Will print stacktrace and then attempt to start additional server again."); + ioe.printStackTrace(); + } } // Get the new cluster.xml @@ -178,8 +203,7 @@ public void testGossiper() throws Exception { MetadataStore.CLUSTER_KEY); // Increment the version, let what would be the "donor node" know about - // it - // to seed the Gossip. + // it to seed the Gossip. Version version = versionedClusterXML.getVersion(); ((VectorClock) version).incrementVersion(3, ((VectorClock) version).getTimestamp() + 1); ((VectorClock) version).incrementVersion(0, ((VectorClock) version).getTimestamp() + 1); @@ -194,6 +218,7 @@ public void testGossiper() throws Exception { } // Wait up to five seconds for Gossip to spread + final Cluster newFinalCluster = newCluster; try { TestUtils.assertWithBackoff(5000, new Attempt() { @@ -206,11 +231,11 @@ public void checkCondition() { assertEquals("server " + nodeId + " has heard " + " the gossip about number of nodes", clusterAtServer.getNumberOfNodes(), - newCluster.getNumberOfNodes()); + newFinalCluster.getNumberOfNodes()); assertEquals("server " + nodeId + " has heard " + " the gossip about partitions", clusterAtServer.getNodeById(nodeId).getPartitionIds(), - newCluster.getNodeById(nodeId).getPartitionIds()); + newFinalCluster.getNodeById(nodeId).getPartitionIds()); serversSeen++; } assertEquals("saw all servers", serversSeen, servers.size());