Skip to content

Commit

Permalink
HBASE-24545 Add backoff to SCP check on WAL split completion (apache#…
Browse files Browse the repository at this point in the history
…1891)

Signed-off-by: Duo Zhang <zhangduo@apache.org>
  • Loading branch information
saintstack authored and clarax committed Nov 15, 2020
1 parent 4593f1b commit 1d7790a
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,21 @@ boolean enqueueSplitTask(String taskname, TaskBatch batch) {
return false;
}

/**
 * Returns how long, in milliseconds, to wait before the next check on batch completion.
 * The more work outstanding, the less often we check:
 * <ul>
 * <li>fewer than 10 remaining tasks: 100ms (the minimum)</li>
 * <li>fewer than 100 remaining tasks: one second</li>
 * <li>otherwise: one minute (the maximum)</li>
 * </ul>
 * Backing off eases the case where many threads are waiting on a completion. With the
 * zk-based implementation, for example, the '/hbase/splitWAL' dir is scanned every time
 * through the wait loop. If there are lots of WALs to split -- could be tens of thousands
 * on a big cluster -- a scan takes a while, and if the Master has many SCPs waiting on wal
 * splitting -- could be up to 10 x the configured PE thread count (default would be 160) --
 * then the Master puts a lot of load on zk.
 * @param remainingTasks count of tasks still outstanding
 * @return the wait time in milliseconds: 100, 1000 or 60000
 */
static int getBatchWaitTimeMillis(int remainingTasks) {
  if (remainingTasks < 10) {
    // Nearly done; poll at the fastest rate.
    return 100;
  }
  if (remainingTasks < 100) {
    return 1000;
  }
  // Lots still to do; check only once a minute.
  return 60_000;
}

private void waitForSplittingCompletion(TaskBatch batch, MonitoredTask status) {
synchronized (batch) {
while ((batch.done + batch.error) != batch.installed) {
Expand All @@ -338,7 +353,7 @@ private void waitForSplittingCompletion(TaskBatch batch, MonitoredTask status) {
return;
}
}
batch.wait(100);
batch.wait(getBatchWaitTimeMillis(remainingTasks));
if (server.isStopped()) {
LOG.warn("Stopped while waiting for log splits to be completed");
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,8 @@ private void splitLogs(final MasterProcedureEnv env) throws IOException {
MasterWalManager mwm = env.getMasterServices().getMasterWalManager();
AssignmentManager am = env.getMasterServices().getAssignmentManager();
// TODO: For Matteo. Below BLOCKs!!!! Redo so can relinquish executor while it is running.
// PROBLEM!!! WE BLOCK HERE.
// PROBLEM!!! WE BLOCK HERE. Can block for hours if hundreds of WALs to split and hundreds
// of SCPs running because big cluster crashed down.
am.getRegionStates().logSplitting(this.serverName);
mwm.splitLog(this.serverName);
if (!carryingMeta) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
Expand Down Expand Up @@ -156,6 +156,15 @@ public void teardown() throws IOException, KeeperException {
TEST_UTIL.shutdownMiniZKCluster();
}

/**
 * Checks each wait-time tier returned by SplitLogManager.getBatchWaitTimeMillis, including
 * the values on either side of the tier boundaries (9/10 and 99/100) so that an off-by-one
 * in the threshold comparisons is caught.
 */
@Test
public void testBatchWaitMillis() {
  // Fewer than 10 remaining tasks: shortest wait.
  assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(0));
  assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(1));
  assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(9));
  // From 10 up to 99 remaining tasks: one second.
  assertEquals(1000, SplitLogManager.getBatchWaitTimeMillis(10));
  assertEquals(1000, SplitLogManager.getBatchWaitTimeMillis(99));
  // 100 or more remaining tasks: one minute.
  assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(100));
  assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(101));
  assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(1011));
}

/**
 * A deferred expression that yields a long when evaluated.
 */
private interface Expr {
/**
 * Evaluates the expression.
 * @return the resulting value
 */
long eval();
}
Expand Down

0 comments on commit 1d7790a

Please sign in to comment.