Skip to content

Commit

Permalink
dcache-resilience: repair handling of broken files*
Browse files Browse the repository at this point in the history
Motivation:

When a checksum or broken file message/error is generated,
Resilience makes a best effort to (a) remove the broken
copy and (b) make another replica.

This, of course, is not always possible, particularly if
the broken file is the only accessible copy.

However, there is currently a logical error in how
the handler method determines whether it should remove
and reprocess the file.

Modifications:

Distinguish cases (a) when there are actually less than
two known locations and (b) when there is only one
readable location (which happens to be the corrupted one).

Result:

Faulty behavior, particularly the thrashing noted in
the case of a restaging operation which results in
a checksum error, no longer occurs.

Target: master
Request: 4.0
Request: 3.2
Request: 3.1
Request: 3.0
Request: 2.16
Require-notes: yes
Require-book: no
Acked-by: Tigran
  • Loading branch information
alrossi committed Feb 14, 2018
1 parent 5b37a84 commit a352dfc
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 24 deletions.
Expand Up @@ -83,14 +83,15 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
import org.dcache.resilience.handlers.FileOperationHandler;
import org.dcache.resilience.handlers.FileTaskCompletionHandler;
import org.dcache.resilience.handlers.PoolTaskCompletionHandler;
import org.dcache.resilience.util.BrokenFileTask;
import org.dcache.resilience.util.CacheExceptionUtils;
import org.dcache.resilience.util.CacheExceptionUtils.FailureType;
import org.dcache.resilience.util.CheckpointUtils;
import org.dcache.resilience.util.ForegroundBackgroundAllocator;
import org.dcache.resilience.util.ForegroundBackgroundAllocator.ForegroundBackgroundAllocation;
import org.dcache.resilience.util.Operation;
import org.dcache.resilience.util.OperationHistory;
import org.dcache.resilience.util.OperationStatistics;
import org.dcache.resilience.util.ForegroundBackgroundAllocator;
import org.dcache.resilience.util.ForegroundBackgroundAllocator.ForegroundBackgroundAllocation;
import org.dcache.resilience.util.ResilientFileTask;
import org.dcache.resilience.util.StandardForegroundBackgroundAllocator;
import org.dcache.util.RunnableModule;
Expand Down Expand Up @@ -327,20 +328,26 @@ private void postProcess(FileOperation operation) {

boolean retry = false;
boolean abort = false;
boolean broken = false;

if (operation.getState() == FileOperation.FAILED) {
FailureType type =
CacheExceptionUtils.getFailureType(operation.getException(),
source != null);

switch (type) {
case BROKEN:
if (source != null) {
pool = poolInfoMap.getPool(operation.getSource());
operationHandler.handleBrokenFileLocation(operation.getPnfsId(),
pool);
broken = true;

/*
* Remove this operation, then let the
* broken file handling decide if the
* process should be retried.
*/
operation.setOpCount(0);
break;
}
// fall through - possibly retriable with another source
// fall through, may be retriable
case NEWSOURCE:
operation.addSourceToTriedLocations();
operation.resetSourceAndTarget();
Expand Down Expand Up @@ -435,6 +442,16 @@ private void postProcess(FileOperation operation) {
* removal.
*/
remove(operation.getPnfsId(), abort);

/*
* If the operation reported a broken source, pass it off
* to the handler.
*/
if (broken) {
pool = poolInfoMap.getPool(operation.getSource());
new BrokenFileTask(operation.getPnfsId(), pool, operationHandler)
.submit();
}
}
}

Expand Down
Expand Up @@ -62,6 +62,7 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
import com.google.common.collect.ImmutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;
import java.util.Collections;
import java.util.NoSuchElementException;
Expand All @@ -71,11 +72,11 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import dmg.cells.nucleus.CellPath;

import diskCacheV111.util.CacheException;
import diskCacheV111.util.PnfsId;

import dmg.cells.nucleus.CellPath;
import org.dcache.alarms.AlarmMarkerFactory;
import org.dcache.alarms.PredefinedAlarm;
import org.dcache.cells.CellStub;
import org.dcache.pool.migration.PoolMigrationCopyFinishedMessage;
import org.dcache.pool.migration.PoolSelectionStrategy;
Expand Down Expand Up @@ -170,24 +171,46 @@ public void handleBrokenFileLocation(PnfsId pnfsId, String pool) {
= FileUpdate.getAttributes(pnfsId, pool,
MessageType.CORRUPT_FILE,
namespace);
if (attributes == null || attributes.getLocations().size() < 2) {
int actual = 0;
int countable = 0;

if (attributes != null) {
actual = attributes.getLocations().size();
countable = poolInfoMap.getCountableLocations(attributes.getLocations());
}

if (actual <= 1) {
/*
* This is the only copy, or it is not/no longer in the
* namespace. In either case, do nothing, but cancel
* any running operations for this pnfsid.
* namespace. In either case, do nothing.
*/
fileOpMap.cancel(pnfsId, true);
LOGGER.error(AlarmMarkerFactory.getMarker(PredefinedAlarm.INACCESSIBLE_FILE,
pnfsId.toString()),
"{}: Repair of broken replicas is not possible, "
+ "file currently inaccessible", pnfsId);
return;
}

removeTarget(pnfsId, pool);
FileUpdate update = new FileUpdate(pnfsId, pool,
MessageType.CLEAR_CACHE_LOCATION, false);

/*
* Bypass the message guard check of CDC session.
*/
handleLocationUpdate(update);
if (countable > 1) {
FileUpdate update = new FileUpdate(pnfsId, pool,
MessageType.CLEAR_CACHE_LOCATION,
false);
/*
* Bypass the message guard check of CDC session.
*/
handleLocationUpdate(update);
} else {
/*
* No alternate readable source; cannot attempt to make
* any further replicas.
*/
LOGGER.error(AlarmMarkerFactory.getMarker(PredefinedAlarm.INACCESSIBLE_FILE,
pnfsId.toString()),
"{}: Repair of broken replicas is not possible, "
+ "file currently inaccessible", pnfsId);
}
} catch (CacheException e) {
LOGGER.error("Error during handling of broken file removal ({}, {}): {}",
pnfsId, pool, new ExceptionMessage(e));
Expand Down
Expand Up @@ -118,7 +118,7 @@ public final class CheckpointUtils {
/**
* <p>Read back in from the checkpoint file operation records.
* These are converted to {@link FileUpdate} objects and passed
* to {@link FileOperationHandler#handleBrokenFileLocation(PnfsId, String)}
* to {@link FileOperationHandler#handleLocationUpdate(FileUpdate)}(PnfsId, String)}
* for registration.</p>
*
* <p>The file to be reloaded is renamed, so that any checkpointing
Expand Down
Expand Up @@ -73,6 +73,7 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
import org.dcache.pool.migration.Task;
import org.dcache.resilience.TestBase;
import org.dcache.resilience.TestMessageProcessor;
import org.dcache.resilience.TestSynchronousExecutor;
import org.dcache.resilience.TestSynchronousExecutor.Mode;
import org.dcache.resilience.data.FileOperation;
import org.dcache.resilience.data.FileUpdate;
Expand All @@ -84,7 +85,11 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
import org.dcache.vehicles.FileAttributes;
import org.dcache.vehicles.resilience.RemoveReplicaMessage;

import static org.junit.Assert.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

public final class FileOperationHandlerTest extends TestBase
implements TestMessageProcessor {
Expand Down Expand Up @@ -296,8 +301,7 @@ public void shouldTryAnotherSourceOnBrokenFile()
whenVerifyIsRun();
afterInspectingSourceAndTarget();
whenOperationFailsWithBrokenFileError();
whenVerifyIsRun();
assertTrue(theNewSourceIsDifferent());
assertNotNull(repRmMessage);
}

@Test
Expand Down Expand Up @@ -644,6 +648,7 @@ private void whenOperationFailsFatally() throws IOException {
}

private void whenOperationFailsWithBrokenFileError() throws IOException {
fileOperationHandler.setTaskService(new TestSynchronousExecutor(Mode.RUN));
fileOperationMap.scan();
fileOperationMap.updateOperation(update.pnfsId,
new CacheException(
Expand Down

0 comments on commit a352dfc

Please sign in to comment.