Fix SnapshotBasedIndexRecoveryIT#testNodeDisconnectsDoNotOverAccountRecoveredBytes (#90849) (#91319)

The clock used to check for timeouts does not provide enough granularity,
so the RESTORE_FILE_FROM_SNAPSHOT request is sometimes retried when the
test expects it to fail right away. This commit configures the nodes
with a retry timeout of 0, effectively disabling retries of recovery
requests.

Closes #90665
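
To see why a 1 ms timeout is not reliable here, consider a minimal sketch of a
retry check driven by a coarse-grained clock. This is hypothetical illustration,
not the actual Elasticsearch implementation: the class name and the 200 ms
granularity are invented. If the cached clock only advances in large steps, the
measured elapsed time can still read zero after the 1 ms deadline has logically
passed, so the request is retried; a timeout of zero can never be satisfied, so
the first failure is final.

// Hypothetical sketch: a retry-timeout check under a coarse clock.
// RetryTimeoutSketch and the 200 ms step are illustrative only.
public class RetryTimeoutSketch {

    // Simulates a cached relative-time clock that advances in 200 ms steps.
    static long coarseNowMillis() {
        return (System.currentTimeMillis() / 200) * 200;
    }

    // Retry while the measured elapsed time is below the configured timeout.
    static boolean shouldRetry(long startMillis, long retryTimeoutMillis) {
        long elapsed = coarseNowMillis() - startMillis;
        return elapsed < retryTimeoutMillis;
    }

    public static void main(String[] args) {
        long start = coarseNowMillis();
        // With a 1 ms timeout, elapsed usually still reads 0 here, so the
        // dropped request would be retried despite the deadline having passed.
        System.out.println("1 ms timeout retries: " + shouldRetry(start, 1));
        // With a timeout of 0, elapsed < 0 is never true: no retry, ever.
        System.out.println("0 ms timeout retries: " + shouldRetry(start, 0));
    }
}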
fcofdez committed Nov 4, 2022
1 parent 55cde1a commit b4b138c
Showing 1 changed file with 17 additions and 9 deletions.
@@ -100,6 +100,7 @@
import static org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY;
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_INTERNAL_ACTION_RETRY_TIMEOUT_SETTING;
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING;
+import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING;
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_OPERATIONS_SETTING;
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS;
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS_PER_NODE;
@@ -1255,7 +1256,6 @@ public void testRecoveryRetryKeepsTheGrantedSnapshotFileDownloadPermit() throws
);
}

-@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/90665")
public void testNodeDisconnectsDoNotOverAccountRecoveredBytes() throws Exception {
// This test reproduces a rare (but possible) scenario where a shard is recovering using
// snapshots, using logically equivalent index files, but half-way the connection between
@@ -1266,14 +1266,7 @@ public void testNodeDisconnectsDoNotOverAccountRecoveredBytes() throws Exception
// - The target updates the recovered bytes for the file it has been downloading, after the recovery state was cleared.
// This could end up over-accounting the number of recovered bytes

-Settings nodeSettings = Settings.builder()
-    // We use a really low timeout to avoid retrying the first RESTORE_FILE_FROM_SNAPSHOT after the connection is dropped
-    .put(INDICES_RECOVERY_INTERNAL_ACTION_RETRY_TIMEOUT_SETTING.getKey(), TimeValue.timeValueMillis(1))
-    .put(INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS.getKey(), 1)
-    .put(INDICES_RECOVERY_MAX_CONCURRENT_OPERATIONS_SETTING.getKey(), 1)
-    .build();
-
-List<String> dataNodes = internalCluster().startDataOnlyNodes(3, nodeSettings);
+List<String> dataNodes = internalCluster().startDataOnlyNodes(3);
String indexName = randomAlphaOfLength(10).toLowerCase(Locale.ROOT);
createIndex(
indexName,
@@ -1287,6 +1280,21 @@ public void testNodeDisconnectsDoNotOverAccountRecoveredBytes() throws Exception
);
ensureGreen(indexName);

+assertAcked(
+    client().admin()
+        .cluster()
+        .prepareUpdateSettings()
+        .setPersistentSettings(
+            Settings.builder()
+                // Do not retry the first RESTORE_FILE_FROM_SNAPSHOT after the connection is closed
+                .put(INDICES_RECOVERY_INTERNAL_ACTION_RETRY_TIMEOUT_SETTING.getKey(), TimeValue.ZERO)
+                .put(INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS.getKey(), 1)
+                .put(INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING.getKey(), 1)
+                .put(INDICES_RECOVERY_MAX_CONCURRENT_OPERATIONS_SETTING.getKey(), 1)
+                .build()
+        )
+);
+
int numDocs = randomIntBetween(300, 1000);
indexDocs(indexName, 0, numDocs);
// Flush to ensure that index_commit_seq_nos(replica) == index_commit_seq_nos(primary),
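For context on the race the test's opening comment describes, here is a minimal
hypothetical sketch; the class below is invented for illustration and is not
Elasticsearch code. A stale byte-count update from the first recovery attempt
lands after the recovery state has been reset for the retry, so the same bytes
are counted twice.

import java.util.concurrent.atomic.AtomicLong;

// Hypothetical sketch of the over-accounting race; not Elasticsearch code.
class RecoveredBytesSketch {
    private final AtomicLong recoveredBytes = new AtomicLong();

    void addRecoveredBytes(long bytes) {
        recoveredBytes.addAndGet(bytes);
    }

    void resetForRetry() {
        // The retried recovery starts accounting from scratch.
        recoveredBytes.set(0);
    }

    public static void main(String[] args) {
        RecoveredBytesSketch state = new RecoveredBytesSketch();
        long fileSize = 1024;
        // First attempt downloads the file, but the connection drops before
        // the byte-count update is applied; recovery is reset for a retry.
        state.resetForRetry();
        // The stale update from the first attempt now lands too late...
        state.addRecoveredBytes(fileSize);
        // ...and the second attempt downloads the same file again.
        state.addRecoveredBytes(fileSize);
        // The counter reads 2048 bytes recovered for a 1024-byte file.
        System.out.println("recovered = " + state.recoveredBytes.get());
    }
}

With the zero retry timeout configured above, the test stays on the no-retry
path, so a late update can no longer race with a reset of the recovery state.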
