Skip to content

Commit

Permalink
pool: update HTTP TPC to support retrying GET and HEAD requests for DPM
Browse files Browse the repository at this point in the history
Motivation:

If DPM is calculating a checksum, then any RFC 3230 (i.e., with a
'Want-Digest' header) GET or HEAD request returns '202 Accepted' respond
status line and an HTML page as the response entity.  Since dCache
considers any 2xx response as success, the HTML page is accepted as the
file's contents, resulting in data corruption.

Modification:

Add a special case that treats a 202 status code ('Accepted') for GET or
HEAD requests as the server indicating the result is not yet ready, and
retry the request after a short delay.

Only accept a 200 status code ('OK') when processing a GET response: all
other status codes are treated as indicating an error.

Result:

dCache now supports a DPM-specific HTTP extension that indicates the
checksum calculation is not yet complete, avoiding potential data
corruption with third-party copy.

Target: master
Request: 4.2
Request: 4.1
Request: 4.0
Request: 3.2
Requires-notes: yes
Requires-book: no
Patch: https://rb.dcache.org/r/11242/
  • Loading branch information
paulmillar committed Oct 15, 2018
1 parent 4b4cd46 commit 910ed77
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 15 deletions.
@@ -1,5 +1,7 @@
package diskCacheV111.util;

import org.dcache.util.Exceptions;

/**
* Some unspecified problem when transferring a file between dCache and some
* remote, third-party storage.
Expand All @@ -8,6 +10,14 @@ public class ThirdPartyTransferFailedCacheException extends CacheException
{
private static final long serialVersionUID = 1L;

public static void checkThirdPartyTransferSuccessful(boolean isSuccessful,
String template, Object...arguments)
throws ThirdPartyTransferFailedCacheException
{
Exceptions.genericCheck(isSuccessful, ThirdPartyTransferFailedCacheException::new,
template, arguments);
}

public ThirdPartyTransferFailedCacheException(String message)
{
super(CacheException.THIRD_PARTY_TRANSFER_FAILED, message);
Expand Down
Expand Up @@ -4,6 +4,7 @@
import org.apache.http.HttpEntity;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
Expand All @@ -23,8 +24,6 @@
import java.nio.channels.Channels;
import java.nio.file.OpenOption;
import java.nio.file.StandardOpenOption;
import java.security.NoSuchAlgorithmException;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Map;
import java.util.Optional;
Expand All @@ -49,8 +48,12 @@

import static com.google.common.collect.Maps.uniqueIndex;
import static org.dcache.namespace.FileAttribute.CHECKSUM;
import static org.dcache.util.ByteUnit.GiB;
import static org.dcache.util.ByteUnit.MiB;
import static org.dcache.util.Exceptions.genericCheck;
import static org.dcache.util.TimeUtils.describeDuration;
import static diskCacheV111.util.ThirdPartyTransferFailedCacheException.checkThirdPartyTransferSuccessful;
import static java.util.concurrent.TimeUnit.MILLISECONDS;

/**
* This class implements transfers of data between a pool and some remote
Expand Down Expand Up @@ -145,6 +148,13 @@ public class RemoteHttpDataTransferProtocol implements MoverProtocol,
/** Number of milliseconds between successive requests. */
private static final long DELAY_BETWEEN_REQUESTS = 5_000;

/**
* A guess on how long to retry a GET request. Since the file size is
* unknown, this value may be insufficient if the file is larger.
*/
private static final long GET_RETRY_DURATION = maxRetryDuration(GiB.toBytes(2L));
private static final String GET_RETRY_DURATION_DESCRIPTION = describeDuration(GET_RETRY_DURATION, MILLISECONDS);

/**
* Maximum number of redirections to follow.
* Note that, although RFC 2068 section 10.3 recommends a maximum of 5,
Expand Down Expand Up @@ -221,9 +231,7 @@ protected CloseableHttpClient createHttpClient() throws CacheException
private void receiveFile(final RemoteHttpDataTransferProtocolInfo info)
throws ThirdPartyTransferFailedCacheException
{
HttpGet get = buildGetRequest(info);

try (CloseableHttpResponse response = _client.execute(get)) {
try (CloseableHttpResponse response = doGet(info)) {
StatusLine statusLine = response.getStatusLine();
if (statusLine.getStatusCode() >= 300) {
throw new ThirdPartyTransferFailedCacheException("remote " +
Expand All @@ -248,18 +256,60 @@ private void receiveFile(final RemoteHttpDataTransferProtocolInfo info)
entity.writeTo(Channels.newOutputStream(_channel));
} catch (IOException e) {
throw new ThirdPartyTransferFailedCacheException(e.toString(), e);
} catch (InterruptedException e) {
throw new ThirdPartyTransferFailedCacheException("pool is shutting down", e);
}
}

private HttpGet buildGetRequest(RemoteHttpDataTransferProtocolInfo info) {
private CloseableHttpResponse doGet(final RemoteHttpDataTransferProtocolInfo info)
throws IOException, ThirdPartyTransferFailedCacheException, InterruptedException
{
HttpGet get = new HttpGet(info.getUri());
get.addHeader("Want-Digest", WANT_DIGEST_VALUE);
addHeadersToRequest(info, get);
get.setConfig(RequestConfig.custom()
.setConnectTimeout(CONNECTION_TIMEOUT)
.setSocketTimeout(SOCKET_TIMEOUT)
.build());
return get;

CloseableHttpResponse response = _client.execute(get);

boolean isSuccessful = false;
try {
long deadline = System.currentTimeMillis() + GET_RETRY_DURATION;
while (shouldRetry(response) && System.currentTimeMillis() < deadline) {
Thread.sleep(DELAY_BETWEEN_REQUESTS);

response.close();
response = _client.execute(get);
}

int statusCode = response.getStatusLine().getStatusCode();
String reason = response.getStatusLine().getReasonPhrase();

checkThirdPartyTransferSuccessful(!shouldRetry(response),
"remote server not ready for GET request after %s: %d %s",
GET_RETRY_DURATION_DESCRIPTION, statusCode, reason);

checkThirdPartyTransferSuccessful(statusCode == HttpStatus.SC_OK,
"remote server rejected GET: %d %s", statusCode, reason);

isSuccessful = true;
} finally {
if (!isSuccessful) {
response.close();
}
}

return response;
}


private static boolean shouldRetry(HttpResponse response)
{
// DPM will return 202 for GET or HEAD with Want-Digest if it's still
// calculating the checksum.
return response.getStatusLine().getStatusCode() == HttpStatus.SC_ACCEPTED;
}

private void sendAndCheckFile(RemoteHttpDataTransferProtocolInfo info)
Expand Down Expand Up @@ -349,12 +399,13 @@ private HttpPut buildPutRequest(RemoteHttpDataTransferProtocolInfo info,
return put;
}

private void verifyRemoteFile(RemoteHttpDataTransferProtocolInfo info)
throws ThirdPartyTransferFailedCacheException
/**
* How long to retry a GET or HEAD request for a file with given size.
* @param fileSize The file's size, in bytes.
* @return the maximum retry duration, in milliseconds.
*/
private static long maxRetryDuration(long fileSize)
{
FileAttributes attributes = _channel.getFileAttributes();
boolean isFirstAttempt = true;

/*
* We estimate how long any post-processing will take based on a
* linear model. The model is:
Expand All @@ -365,8 +416,16 @@ private void verifyRemoteFile(RemoteHttpDataTransferProtocolInfo info)
* S is the file's size, alpha is the fixed time that all files require
* and beta is the effective IO bandwidth within the remote server.
*/
long t_max = POST_PROCESSING_OFFSET +
(long)(attributes.getSize() / POST_PROCESSING_BANDWIDTH);
return POST_PROCESSING_OFFSET + (long)(fileSize / POST_PROCESSING_BANDWIDTH);
}

private void verifyRemoteFile(RemoteHttpDataTransferProtocolInfo info)
throws ThirdPartyTransferFailedCacheException
{
FileAttributes attributes = _channel.getFileAttributes();
boolean isFirstAttempt = true;

long t_max = maxRetryDuration(attributes.getSize());
long deadline = System.currentTimeMillis() + t_max;

try {
Expand All @@ -388,8 +447,18 @@ private void verifyRemoteFile(RemoteHttpDataTransferProtocolInfo info)
" " + status.getReasonPhrase());
}

if (shouldRetry(response)) {
continue;
}

Long length = getContentLength(response);

// REVISIT This is to support pre-2.12 dCache, which could
// give a '201 Created' response to a PUT request before
// post-processing (including checksum calculation) was
// completed and the final details registered in the
// namespace. This problem is fixed with dCache v2.12 or
// later.
if (length == null || (attributes.getSize() != 0 && length == 0)) {
continue;
}
Expand All @@ -413,7 +482,7 @@ private void verifyRemoteFile(RemoteHttpDataTransferProtocolInfo info)
}

throw new ThirdPartyTransferFailedCacheException("remote server failed " +
"to provide length after " + (t_max / 1_000) + "s");
"to provide length after " + describeDuration(GET_RETRY_DURATION, MILLISECONDS));
}

private HttpHead buildHeadRequest(RemoteHttpDataTransferProtocolInfo info)
Expand Down

0 comments on commit 910ed77

Please sign in to comment.