Skip to content

Commit

Permalink
pool: update scrubber messages to be less ambigous
Browse files Browse the repository at this point in the history
The wording of the pool scrubber is ambigous, resulting in a support
ticket. The specific problem in the ticket is that the scrubber failed
to clearly distinguish between a temporary problem and a file
corruption.

This patch addresses this issue by making the following changes:

  o  file corruptions are clearly labeled with the word "corruption",
     rather than the ambigous "error".

  o  an additional counter is introduced for manually triggered full runs
     and background scrubbing that describes the number of files that
     (for whatever reason) could not be checked.  These are NOT reported
     as corruption.

  o  the language used when describing a file that could not be checked
     has been improved slightly, avoiding the ambigious term "error".

  o  a CacheException does not abort a run; an IOException does abort
     a run.

  o  logged messages are updated to distinguish between problems found by
     the 'csm check' command those found by background scrubbing.

  o  If a run is aborted, this is recorded and reported as part of the
     'csm status' command.

Target: master
Patch: https://rb.dcache.org/r/8168/
Acked-by: Gerd Behrmann
Request: 2.12
Request: 2.11
Request: 2.10
Ticket: http://rt.dcache.org/Ticket/Display.html?id=8728
  • Loading branch information
paulmillar committed Apr 30, 2015
1 parent ec21804 commit 346d058
Showing 1 changed file with 62 additions and 20 deletions.
Expand Up @@ -39,6 +39,8 @@
import org.dcache.util.Args;
import org.dcache.util.Checksum;

import static com.google.common.base.Preconditions.checkNotNull;

public class ChecksumScanner
implements CellCommandListener, CellLifeCycleAware
{
Expand Down Expand Up @@ -99,6 +101,7 @@ private class FullScan extends Singleton
{
private volatile int _totalCount;
private volatile int _badCount;
private volatile int _unableCount;

public FullScan()
{
Expand Down Expand Up @@ -129,14 +132,17 @@ public void runIt() throws Exception
_bad.put(id, e.getActualChecksums().get());
_badCount++;
} catch (CacheException e) {
_log.warn("Checksum scanner failed for {}: {}", id, e.getMessage());
_badCount++;
_log.warn("csm scan command unable to verify {}: {}", id, e.getMessage());
_unableCount++;
} catch (IOException e) {
_log.error("Checksum scanner failed with IO error for {}: {}", id, e.getMessage());
_badCount++;
_unableCount++;
throw new IOException("failed to read " + id + ": " + e.getMessage(), e);
}
_totalCount++;
}
} catch (IOException e) {
_log.error("Aborting 'cms check' full-scan: {}", e.getMessage());
setAbortMessage("failure in underlying storage: " + e.getMessage());
} finally {
startScrubber();
}
Expand All @@ -145,8 +151,9 @@ public void runIt() throws Exception
public String toString()
{
return super.toString() + " "
+ _totalCount + " checked; "
+ _badCount + " errors detected";
+ _totalCount + " files: "
+ _badCount + " corrupt, "
+ _unableCount + " unable to check";
}
}

Expand Down Expand Up @@ -219,6 +226,7 @@ private class Scrubber extends Singleton
private volatile int _badCount;
private volatile int _numFiles;
private volatile int _totalCount;
private volatile int _unableCount;

private PnfsId _lastFileChecked;
private long _lastCheckpoint;
Expand Down Expand Up @@ -352,17 +360,24 @@ public void runIt() throws InterruptedException
_numFiles = toScan.length;
_badCount = 0;
_totalCount = 0;
_unableCount = 0;
scanFiles(toScan);
if (_badCount > 0) {
_log.warn("Finished scrubbing. Found {} bad files of {}",
_badCount, _numFiles);
}
isFinished = true;
} catch (IllegalStateException | TimeoutCacheException e) {
} catch (IOException e) {
_log.error("Aborting scrubber run: {}", e.getMessage());
setAbortMessage("failure in underlying storage: " + e.getMessage());
Thread.sleep(FAILURE_RATELIMIT_DELAY);
} catch (IllegalStateException e) {
_log.error("Aborting scrubber run: {}", e.getMessage());
setAbortMessage("illegal state: " + e.getMessage());
Thread.sleep(FAILURE_RATELIMIT_DELAY);
} catch (CacheException | NoSuchAlgorithmException e) {
_log.error("Received an unexpected error during scrubbing: {}",
e.getMessage());
} catch (NoSuchAlgorithmException e) {
_log.error("Aborting scrubber run: {}", e.getMessage());
setAbortMessage("checksum algorithm not supported: " + e.getMessage());
Thread.sleep(FAILURE_RATELIMIT_DELAY);
}
}
Expand Down Expand Up @@ -416,7 +431,7 @@ private void checkpointIfNeeded()
}

private void scanFiles(PnfsId[] repository)
throws InterruptedException, NoSuchAlgorithmException, CacheException
throws InterruptedException, NoSuchAlgorithmException, IOException
{
for (PnfsId id : repository) {
try {
Expand Down Expand Up @@ -444,13 +459,15 @@ private void scanFiles(PnfsId[] repository)
} catch (IllegalTransitionException | CacheException f) {
_log.warn("Failed to mark {} as BROKEN: {}", id, f.getMessage());
}
} catch (IOException e) {
_unableCount++;
throw new IOException("Unable to read " + id + ": " + e.getMessage(), e);
} catch (FileNotInCacheException | NotInTrashCacheException e) {
/* It was removed before we could get it. No problem.
*/
} catch (IOException e) {
_log.error("Failed to verify the checksum of {} (skipping): {}",
id, e.getMessage());
_badCount++;
} catch (CacheException e) {
_log.warn("Scrubber unable to verify {}: {}", id, e.getMessage());
_unableCount++;
}
_lastFileChecked = id;
_totalCount++;
Expand All @@ -462,8 +479,10 @@ private void scanFiles(PnfsId[] repository)
@Override
public String toString()
{
return super.toString() + " " + _totalCount + " of " + _numFiles +
" checked; " + _badCount + " errors detected";
return super.toString() + " processed "
+ _totalCount + " of " + _numFiles + " files: "
+ _badCount + " corrupt, "
+ _unableCount + " unable to check";
}
}

Expand All @@ -473,6 +492,7 @@ abstract private class Singleton

private Exception _lastException;
private Thread _currentThread;
private String _abortMessage;

private Singleton(String name)
{
Expand All @@ -493,24 +513,35 @@ public synchronized boolean isActive()
return (_currentThread != null);
}

public synchronized void setAbortMessage(String reason)
{
_abortMessage = checkNotNull(reason);
}

private synchronized void stopped()
{
_currentThread = null;
}

public synchronized void setException(Exception e)
{
_lastException = e;
}

public synchronized void start()
{
if (isActive()) {
throw new IllegalStateException("Still active");
}
_abortMessage = null;
_lastException = null;
_currentThread = new Thread(_name) {
@Override
public void run() {
try {
runIt();
} catch (Exception ee) {
_lastException = ee;
setException(ee);
} finally {
stopped();
}
Expand All @@ -519,10 +550,21 @@ public void run() {
_currentThread.start();
}

@Override
public synchronized String toString()
{
return _name + (isActive() ? " Active " : " Idle ") +
(_lastException == null ? "" : _lastException.toString());
return _name + " " + getState() + (_lastException == null ? "" : " " + _lastException.toString()) + " ";
}

private synchronized String getState()
{
if (isActive()) {
return "Active";
} else if (_abortMessage != null) {
return "Aborted (" + _abortMessage + ")";
} else {
return "Idle";
}
}
}

Expand Down

0 comments on commit 346d058

Please sign in to comment.