From aae7c59210c084be8d273257bd16d17091559195 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Thu, 22 Jul 2021 12:48:12 -0500 Subject: [PATCH 01/29] Changing wait_for_snapshot to check snapshot start time rather than finish time --- .../xpack/core/ilm/WaitForSnapshotStep.java | 2 +- .../core/slm/SnapshotInvocationRecord.java | 26 ++++++++++++++++--- .../core/ilm/WaitForSnapshotStepTests.java | 4 +-- .../slm/SnapshotInvocationRecordTests.java | 7 +++-- .../xpack/slm/SnapshotLifecycleTask.java | 23 +++++++++------- 5 files changed, 44 insertions(+), 18 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java index bc821b05549d5..fc95d48207e45 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java @@ -57,7 +57,7 @@ public Result isConditionMet(Index index, ClusterState clusterState) { throw error(POLICY_NOT_FOUND_MESSAGE, policy); } SnapshotLifecyclePolicyMetadata snapPolicyMeta = snapMeta.getSnapshotConfigurations().get(policy); - if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getTimestamp() < phaseTime) { + if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) { return new Result(false, notExecutedMessage(phaseTime)); } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java index 8a3a6b91f359b..f7ec344d15d6f 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java @@ -7,6 +7,7 @@ package org.elasticsearch.xpack.core.slm; +import org.elasticsearch.Version; import org.elasticsearch.cluster.AbstractDiffable; import org.elasticsearch.cluster.Diffable; import org.elasticsearch.common.xcontent.ParseField; @@ -17,6 +18,7 @@ import org.elasticsearch.common.xcontent.ToXContentObject; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.core.RestApiVersion; import java.io.IOException; import java.util.Objects; @@ -29,19 +31,22 @@ public class SnapshotInvocationRecord extends AbstractDiffable { static final ParseField SNAPSHOT_NAME = new ParseField("snapshot_name"); + static final ParseField START_TIMESTAMP = new ParseField("snapshot_start_time").forRestApiVersion(RestApiVersion.onOrAfter(RestApiVersion.V_8)); static final ParseField TIMESTAMP = new ParseField("time"); static final ParseField DETAILS = new ParseField("details"); private String snapshotName; + private long snapshotStartTimestamp; private long timestamp; private String details; public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>("snapshot_policy_invocation_record", true, - a -> new SnapshotInvocationRecord((String) a[0], (long) a[1], (String) a[2])); + a -> new SnapshotInvocationRecord((String) a[0], (long) a[1], (long) a[2], (String) a[3])); static { PARSER.declareString(ConstructingObjectParser.constructorArg(), SNAPSHOT_NAME); + PARSER.declareLong(ConstructingObjectParser.constructorArg(), START_TIMESTAMP); PARSER.declareLong(ConstructingObjectParser.constructorArg(), TIMESTAMP); PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), DETAILS); } @@ -50,14 +55,18 @@ public static SnapshotInvocationRecord parse(XContentParser parser, String name) return PARSER.apply(parser, name); } - public SnapshotInvocationRecord(String snapshotName, long timestamp, String details) { + public SnapshotInvocationRecord(String snapshotName, long snapshotStartTimestamp, long timestamp, String details) { this.snapshotName = Objects.requireNonNull(snapshotName, "snapshot name must be provided"); + this.snapshotStartTimestamp = snapshotStartTimestamp; this.timestamp = timestamp; this.details = details; } public SnapshotInvocationRecord(StreamInput in) throws IOException { this.snapshotName = in.readString(); + if(in.getVersion().onOrAfter(Version.V_8_0_0)) { + this.snapshotStartTimestamp = in.readVLong(); + } this.timestamp = in.readVLong(); this.details = in.readOptionalString(); } @@ -66,6 +75,10 @@ public String getSnapshotName() { return snapshotName; } + public long getSnapshotStartTimestamp() { + return snapshotStartTimestamp; + } + public long getTimestamp() { return timestamp; } @@ -77,6 +90,9 @@ public String getDetails() { @Override public void writeTo(StreamOutput out) throws IOException { out.writeString(snapshotName); + if (out.getVersion().onOrAfter(Version.V_8_0_0)) { + out.writeVLong(snapshotStartTimestamp); + } out.writeVLong(timestamp); out.writeOptionalString(details); } @@ -86,6 +102,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.startObject(); { builder.field(SNAPSHOT_NAME.getPreferredName(), snapshotName); + if (builder.getRestApiVersion().matches(START_TIMESTAMP.getForRestApiVersion())) { + builder.timeField(START_TIMESTAMP.getPreferredName(), "snapshot_start_time_string", snapshotStartTimestamp); + } builder.timeField(TIMESTAMP.getPreferredName(), "time_string", timestamp); if (Objects.nonNull(details)) { builder.field(DETAILS.getPreferredName(), details); @@ -101,12 +120,13 @@ public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; SnapshotInvocationRecord that = (SnapshotInvocationRecord) o; return getTimestamp() == that.getTimestamp() && + getSnapshotStartTimestamp() == that.getSnapshotStartTimestamp() && Objects.equals(getSnapshotName(), that.getSnapshotName()) && Objects.equals(getDetails(), that.getDetails()); } @Override public int hashCode() { - return Objects.hash(getSnapshotName(), getTimestamp(), getDetails()); + return Objects.hash(getSnapshotName(), getSnapshotStartTimestamp(), getTimestamp(), getDetails()); } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java index a2626dc615eb1..383e9ca8ea854 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java @@ -101,7 +101,7 @@ public void testSlmPolicyExecutedBeforeStep() throws IOException { SnapshotLifecyclePolicyMetadata slmPolicy = SnapshotLifecyclePolicyMetadata.builder() .setModifiedDate(randomLong()) .setPolicy(new SnapshotLifecyclePolicy("", "", "", "", null, null)) - .setLastSuccess(new SnapshotInvocationRecord("", phaseTime - 10, "")) + .setLastSuccess(new SnapshotInvocationRecord("", phaseTime - 100, phaseTime - 10, "")) .build(); SnapshotLifecycleMetadata smlMetadata = new SnapshotLifecycleMetadata(Map.of(instance.getPolicy(), slmPolicy), OperationMode.RUNNING, null); @@ -126,7 +126,7 @@ public void testSlmPolicyExecutedAfterStep() throws IOException { SnapshotLifecyclePolicyMetadata slmPolicy = SnapshotLifecyclePolicyMetadata.builder() .setModifiedDate(randomLong()) .setPolicy(new SnapshotLifecyclePolicy("", "", "", "", null, null)) - .setLastSuccess(new SnapshotInvocationRecord("", phaseTime + 10, "")) + .setLastSuccess(new SnapshotInvocationRecord("", phaseTime + 10, phaseTime + 100, "")) .build(); SnapshotLifecycleMetadata smlMetadata = new SnapshotLifecycleMetadata(Map.of(instance.getPolicy(), slmPolicy), OperationMode.RUNNING, null); diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecordTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecordTests.java index 130fcc66e41a3..0fbee9d74e406 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecordTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecordTests.java @@ -37,15 +37,17 @@ protected SnapshotInvocationRecord mutateInstance(SnapshotInvocationRecord insta case 0: return new SnapshotInvocationRecord( randomValueOtherThan(instance.getSnapshotName(), () -> randomAlphaOfLengthBetween(2,10)), + instance.getTimestamp() - 100, instance.getTimestamp(), instance.getDetails()); case 1: + long timestamp = randomValueOtherThan(instance.getTimestamp(), ESTestCase::randomNonNegativeLong); return new SnapshotInvocationRecord(instance.getSnapshotName(), - randomValueOtherThan(instance.getTimestamp(), ESTestCase::randomNonNegativeLong), + timestamp - 100, timestamp, instance.getDetails()); case 2: return new SnapshotInvocationRecord(instance.getSnapshotName(), - instance.getTimestamp(), + instance.getTimestamp() - 100, instance.getTimestamp(), randomValueOtherThan(instance.getDetails(), () -> randomAlphaOfLengthBetween(2,10))); default: throw new AssertionError("failure, got illegal switch case"); @@ -56,6 +58,7 @@ public static SnapshotInvocationRecord randomSnapshotInvocationRecord() { return new SnapshotInvocationRecord( randomAlphaOfLengthBetween(5,10), randomNonNegativeLong(), + randomNonNegativeLong(), randomBoolean() ? null : randomAlphaOfLengthBetween(5, 10)); } diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java index 6af10d9350a42..bc2b0c9d526ec 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java @@ -93,18 +93,19 @@ public static Optional maybeTakeSnapshot(final String jobId, final Clien logger.info("snapshot lifecycle policy [{}] issuing create snapshot [{}]", policyMetadata.getPolicy().getId(), request.snapshot()); clientWithHeaders.admin().cluster().createSnapshot(request, new ActionListener<>() { + private long snapshotStartTime; @Override public void onResponse(CreateSnapshotResponse createSnapshotResponse) { logger.debug("snapshot response for [{}]: {}", policyMetadata.getPolicy().getId(), Strings.toString(createSnapshotResponse)); final SnapshotInfo snapInfo = createSnapshotResponse.getSnapshotInfo(); - + snapshotStartTime = snapInfo.startTime(); // Check that there are no failed shards, since the request may not entirely // fail, but may still have failures (such as in the case of an aborted snapshot) if (snapInfo.failedShards() == 0) { final long timestamp = Instant.now().toEpochMilli(); clusterService.submitStateUpdateTask("slm-record-success-" + policyMetadata.getPolicy().getId(), - WriteJobStatus.success(policyMetadata.getPolicy().getId(), request.snapshot(), timestamp)); + WriteJobStatus.success(policyMetadata.getPolicy().getId(), request.snapshot(), snapshotStartTime, timestamp)); historyStore.putAsync(SnapshotHistoryItem.creationSuccessRecord(timestamp, policyMetadata.getPolicy(), request.snapshot())); } else { @@ -126,7 +127,7 @@ public void onFailure(Exception e) { policyMetadata.getPolicy().getId(), e); final long timestamp = Instant.now().toEpochMilli(); clusterService.submitStateUpdateTask("slm-record-failure-" + policyMetadata.getPolicy().getId(), - WriteJobStatus.failure(policyMetadata.getPolicy().getId(), request.snapshot(), timestamp, e)); + WriteJobStatus.failure(policyMetadata.getPolicy().getId(), request.snapshot(), snapshotStartTime, timestamp, e)); final SnapshotHistoryItem failureRecord; try { failureRecord = SnapshotHistoryItem.creationFailureRecord(timestamp, policyMetadata.getPolicy(), @@ -166,22 +167,24 @@ private static class WriteJobStatus extends ClusterStateUpdateTask { private final String policyName; private final String snapshotName; + private final long snapshotStartTime; private final long timestamp; private final Optional exception; - private WriteJobStatus(String policyName, String snapshotName, long timestamp, Optional exception) { + private WriteJobStatus(String policyName, String snapshotName, long snapshotStartTime, long timestamp, Optional exception) { this.policyName = policyName; this.snapshotName = snapshotName; this.exception = exception; + this.snapshotStartTime = snapshotStartTime; this.timestamp = timestamp; } - static WriteJobStatus success(String policyId, String snapshotName, long timestamp) { - return new WriteJobStatus(policyId, snapshotName, timestamp, Optional.empty()); + static WriteJobStatus success(String policyId, String snapshotName, long snapshotStartTime, long timestamp) { + return new WriteJobStatus(policyId, snapshotName, snapshotStartTime, timestamp, Optional.empty()); } - static WriteJobStatus failure(String policyId, String snapshotName, long timestamp, Exception exception) { - return new WriteJobStatus(policyId, snapshotName, timestamp, Optional.of(exception)); + static WriteJobStatus failure(String policyId, String snapshotName, long snapshotStartTime, long timestamp, Exception exception) { + return new WriteJobStatus(policyId, snapshotName, snapshotStartTime, timestamp, Optional.of(exception)); } private String exceptionToString() throws IOException { @@ -220,10 +223,10 @@ public ClusterState execute(ClusterState currentState) throws Exception { if (exception.isPresent()) { stats.snapshotFailed(policyName); - newPolicyMetadata.setLastFailure(new SnapshotInvocationRecord(snapshotName, timestamp, exceptionToString())); + newPolicyMetadata.setLastFailure(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, timestamp, exceptionToString())); } else { stats.snapshotTaken(policyName); - newPolicyMetadata.setLastSuccess(new SnapshotInvocationRecord(snapshotName, timestamp, null)); + newPolicyMetadata.setLastSuccess(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, timestamp, null)); } snapLifecycles.put(policyName, newPolicyMetadata.build()); From e3b202cb83fc5fc292ca73438ff9ebf023d62775 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Thu, 22 Jul 2021 13:24:39 -0500 Subject: [PATCH 02/29] Adding a test --- .../core/slm/SnapshotInvocationRecord.java | 3 +- .../core/ilm/WaitForSnapshotStepTests.java | 46 +++++++++---------- .../xpack/slm/SnapshotLifecycleTask.java | 6 ++- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java index f7ec344d15d6f..052b9bff3a39a 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java @@ -31,7 +31,8 @@ public class SnapshotInvocationRecord extends AbstractDiffable { static final ParseField SNAPSHOT_NAME = new ParseField("snapshot_name"); - static final ParseField START_TIMESTAMP = new ParseField("snapshot_start_time").forRestApiVersion(RestApiVersion.onOrAfter(RestApiVersion.V_8)); + static final ParseField START_TIMESTAMP = new ParseField("snapshot_start_time") + .forRestApiVersion(RestApiVersion.onOrAfter(RestApiVersion.V_8)); static final ParseField TIMESTAMP = new ParseField("time"); static final ParseField DETAILS = new ParseField("details"); diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java index 383e9ca8ea854..d691342c1421a 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java @@ -95,38 +95,30 @@ public void testSlmPolicyNotExecuted() throws IOException { } public void testSlmPolicyExecutedBeforeStep() throws IOException { - long phaseTime = randomLong(); + // The snapshot was started and finished before the phase time, so we do not expect the step to finish: + assertSlmPolicyExecuted(false, false); + } - WaitForSnapshotStep instance = createRandomInstance(); - SnapshotLifecyclePolicyMetadata slmPolicy = SnapshotLifecyclePolicyMetadata.builder() - .setModifiedDate(randomLong()) - .setPolicy(new SnapshotLifecyclePolicy("", "", "", "", null, null)) - .setLastSuccess(new SnapshotInvocationRecord("", phaseTime - 100, phaseTime - 10, "")) - .build(); - SnapshotLifecycleMetadata smlMetadata = new SnapshotLifecycleMetadata(Map.of(instance.getPolicy(), slmPolicy), - OperationMode.RUNNING, null); + public void testSlmPolicyExecutedAfterStep() throws IOException { + // The snapshot was started and finished after the phase time, so we do expect the step to finish: + assertSlmPolicyExecuted(true, true); + } - IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLength(10)) - .putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, Map.of("phase_time", Long.toString(phaseTime))) - .settings(settings(Version.CURRENT)) - .numberOfShards(randomIntBetween(1, 5)).numberOfReplicas(randomIntBetween(0, 5)).build(); - ImmutableOpenMap.Builder indices = - ImmutableOpenMap.builder().fPut(indexMetadata.getIndex().getName(), indexMetadata); - Metadata.Builder meta = Metadata.builder().indices(indices.build()).putCustom(SnapshotLifecycleMetadata.TYPE, smlMetadata); - ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metadata(meta).build(); - ClusterStateWaitStep.Result result = instance.isConditionMet(indexMetadata.getIndex(), clusterState); - assertFalse(result.isComplete()); - assertTrue(getMessage(result).contains("to be executed")); + public void testSlmPolicyNotExecutedWhenStartIsBeforePhaseTime() throws IOException { + // The snapshot was started before the phase time and finished after, so we do expect the step to finish: + assertSlmPolicyExecuted(false, true); } - public void testSlmPolicyExecutedAfterStep() throws IOException { + private void assertSlmPolicyExecuted(boolean startTimeAfterPhaseTime, boolean finishTimeAfterPhaseTime) throws IOException { long phaseTime = randomLong(); WaitForSnapshotStep instance = createRandomInstance(); SnapshotLifecyclePolicyMetadata slmPolicy = SnapshotLifecyclePolicyMetadata.builder() .setModifiedDate(randomLong()) .setPolicy(new SnapshotLifecyclePolicy("", "", "", "", null, null)) - .setLastSuccess(new SnapshotInvocationRecord("", phaseTime + 10, phaseTime + 100, "")) + .setLastSuccess(new SnapshotInvocationRecord("", + phaseTime + (startTimeAfterPhaseTime ? 10 : -100), + phaseTime + (finishTimeAfterPhaseTime ? 100 : -10), "")) .build(); SnapshotLifecycleMetadata smlMetadata = new SnapshotLifecycleMetadata(Map.of(instance.getPolicy(), slmPolicy), OperationMode.RUNNING, null); @@ -140,8 +132,14 @@ public void testSlmPolicyExecutedAfterStep() throws IOException { Metadata.Builder meta = Metadata.builder().indices(indices.build()).putCustom(SnapshotLifecycleMetadata.TYPE, smlMetadata); ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metadata(meta).build(); ClusterStateWaitStep.Result result = instance.isConditionMet(indexMetadata.getIndex(), clusterState); - assertTrue(result.isComplete()); - assertNull(result.getInfomationContext()); + if (startTimeAfterPhaseTime) { + assertTrue(result.isComplete()); + assertNull(result.getInfomationContext()); + } + else { + assertFalse(result.isComplete()); + assertTrue(getMessage(result).contains("to be executed")); + } } private String getMessage(ClusterStateWaitStep.Result result) throws IOException { diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java index bc2b0c9d526ec..c9c18a9a815a5 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java @@ -171,7 +171,8 @@ private static class WriteJobStatus extends ClusterStateUpdateTask { private final long timestamp; private final Optional exception; - private WriteJobStatus(String policyName, String snapshotName, long snapshotStartTime, long timestamp, Optional exception) { + private WriteJobStatus(String policyName, String snapshotName, long snapshotStartTime, long timestamp, + Optional exception) { this.policyName = policyName; this.snapshotName = snapshotName; this.exception = exception; @@ -223,7 +224,8 @@ public ClusterState execute(ClusterState currentState) throws Exception { if (exception.isPresent()) { stats.snapshotFailed(policyName); - newPolicyMetadata.setLastFailure(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, timestamp, exceptionToString())); + newPolicyMetadata.setLastFailure(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, timestamp, + exceptionToString())); } else { stats.snapshotTaken(policyName); newPolicyMetadata.setLastSuccess(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, timestamp, null)); From 4747ea7ad71de552c93d7257fbe871befc33c6e0 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 26 Jul 2021 11:07:42 -0500 Subject: [PATCH 03/29] Adding logging for more info on intermittent failure --- .../xpack/ilm/TimeSeriesLifecycleActionsIT.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index ced6944003050..522216764aee0 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -221,7 +221,11 @@ public void testWaitForSnapshot() throws Exception { Request request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); - assertBusy(() -> assertThat(getStepKeyForIndex(client(), index).getAction(), equalTo("complete")), slmPolicy); + assertBusy(() -> { + Step.StepKey stepKey = getStepKeyForIndex(client(), index); + logger.info("step key for index {} is {}", index, stepKey); + assertThat(stepKey.getAction(), equalTo("complete")); + }, slmPolicy); } public void testWaitForSnapshotSlmExecutedBefore() throws Exception { From 5f28f212a37caac209a0a34eaf4d3af8d1044756 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 26 Jul 2021 13:47:23 -0500 Subject: [PATCH 04/29] Adding verbose logging to troubleshoot test failure --- .../xpack/core/ilm/WaitForSnapshotStep.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java index fc95d48207e45..a33f047a0f335 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java @@ -6,6 +6,8 @@ */ package org.elasticsearch.xpack.core.ilm; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.common.xcontent.ToXContentObject; @@ -25,6 +27,7 @@ public class WaitForSnapshotStep extends ClusterStateWaitStep { static final String NAME = "wait-for-snapshot"; + private static final Logger logger = LogManager.getLogger(WaitForSnapshotStep.class); private static final String MESSAGE_FIELD = "message"; private static final String POLICY_NOT_EXECUTED_MESSAGE = "waiting for policy '%s' to be executed since %s"; @@ -58,9 +61,17 @@ public Result isConditionMet(Index index, ClusterState clusterState) { } SnapshotLifecyclePolicyMetadata snapPolicyMeta = snapMeta.getSnapshotConfigurations().get(policy); if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) { + if (snapPolicyMeta.getLastSuccess() == null) { + logger.info("Not executing policy because no last snapshot success."); + } + else { + logger.info("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", + snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getTimestamp()); + } return new Result(false, notExecutedMessage(phaseTime)); } - + logger.info("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", + snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getTimestamp()); return new Result(true, null); } From 3a3bbf8a35d14088e631d005105cf204ad855303 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 26 Jul 2021 13:54:03 -0500 Subject: [PATCH 05/29] Putting end of if block and beginning of else on same line Co-authored-by: Lee Hinman --- .../elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java index d691342c1421a..cc6659bbca123 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java @@ -135,8 +135,7 @@ private void assertSlmPolicyExecuted(boolean startTimeAfterPhaseTime, boolean fi if (startTimeAfterPhaseTime) { assertTrue(result.isComplete()); assertNull(result.getInfomationContext()); - } - else { + } else { assertFalse(result.isComplete()); assertTrue(getMessage(result).contains("to be executed")); } From c9b0e30851070adf092605a1e81fcea9bf72bfd7 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 26 Jul 2021 15:23:13 -0500 Subject: [PATCH 06/29] troublshooting IT test --- .../xpack/ilm/TimeSeriesLifecycleActionsIT.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index 522216764aee0..027b53bb4a85a 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -220,7 +220,7 @@ public void testWaitForSnapshot() throws Exception { Request request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); - + Thread.sleep(1000); //TODO: This is just here for troubleshooting! assertBusy(() -> { Step.StepKey stepKey = getStepKeyForIndex(client(), index); logger.info("step key for index {} is {}", index, stepKey); @@ -270,7 +270,12 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { assertEquals(2, ((Map) ((Map) responseMap.get(slmPolicy)).get("stats")).get("snapshots_taken")); } }, slmPolicy); - + Thread.sleep(1000); //TODO: This is just here for troubleshooting! + assertBusy(() -> { + Step.StepKey stepKey = getStepKeyForIndex(client(), index); + logger.info("stepKey for index {} is {}", index, stepKey); + assertThat(stepKey.getAction(), equalTo("complete")); + }, slmPolicy); assertBusy(() -> assertThat(getStepKeyForIndex(client(), index).getAction(), equalTo("complete")), slmPolicy); } From c156d50380facee9758554f84eea4824d711a049 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 26 Jul 2021 17:07:41 -0500 Subject: [PATCH 07/29] adding a wait to the IT test --- .../ilm/TimeSeriesLifecycleActionsIT.java | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index 027b53bb4a85a..cea71f18b773c 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -200,16 +200,15 @@ public void testWaitForSnapshot() throws Exception { String slmPolicy = randomAlphaOfLengthBetween(4, 10); createNewSingletonPolicy(client(), policy, "delete", new WaitForSnapshotAction(slmPolicy)); updatePolicy(client(), index, policy); + waitForPhaseTime(); assertBusy(() -> { Map indexILMState = explainIndex(client(), index); assertThat(indexILMState.get("action"), is("wait_for_snapshot")); assertThat(indexILMState.get("failed_step"), is("wait-for-snapshot")); }, slmPolicy); - String snapshotRepo = randomAlphaOfLengthBetween(4, 10); createSnapshotRepo(client(), snapshotRepo, randomBoolean()); createSlmPolicy(slmPolicy, snapshotRepo); - assertBusy(() -> { Map indexILMState = explainIndex(client(), index); //wait for step to notice that the slm policy is created and to get out of error @@ -220,7 +219,6 @@ public void testWaitForSnapshot() throws Exception { Request request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); - Thread.sleep(1000); //TODO: This is just here for troubleshooting! assertBusy(() -> { Step.StepKey stepKey = getStepKeyForIndex(client(), index); logger.info("step key for index {} is {}", index, stepKey); @@ -251,6 +249,7 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { }, slmPolicy); updatePolicy(client(), index, policy); + waitForPhaseTime(); assertBusy(() -> { Map indexILMState = explainIndex(client(), index); @@ -270,7 +269,7 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { assertEquals(2, ((Map) ((Map) responseMap.get(slmPolicy)).get("stats")).get("snapshots_taken")); } }, slmPolicy); - Thread.sleep(1000); //TODO: This is just here for troubleshooting! + assertBusy(() -> { Step.StepKey stepKey = getStepKeyForIndex(client(), index); logger.info("stepKey for index {} is {}", index, stepKey); @@ -279,6 +278,25 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { assertBusy(() -> assertThat(getStepKeyForIndex(client(), index).getAction(), equalTo("complete")), slmPolicy); } + /* + * This method waits until phase_time gets set in the state store. Otherwise we can wind up starting a snapshot before the ILM policy + * is ready. + */ + @SuppressWarnings("unchecked") + private void waitForPhaseTime() throws Exception { + assertBusy(() -> { + Request request = new Request("GET", "/_cluster/state/metadata/" + index); + Map response = entityAsMap(client().performRequest(request)); + Map metadata = (Map) response.get("metadata"); + Map indices = (Map) metadata.get("indices"); + Map indexMap = (Map) indices.get(index); + Map ilm = (Map) indexMap.get("ilm"); + assertNotNull(ilm); + Object phase_time = ilm.get("phase_time"); + assertNotNull(phase_time); + }); + } + public void testDelete() throws Exception { createIndexWithSettings(client(), index, alias, Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)); From 9189fac047b1a54c983aa51f5fe556fedd7f4ef7 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 26 Jul 2021 17:21:06 -0500 Subject: [PATCH 08/29] renaming timestamp and making snapshotStartTime nullable --- .../xpack/core/ilm/WaitForSnapshotStep.java | 12 +++-- .../core/slm/SnapshotInvocationRecord.java | 44 +++++++++---------- .../core/ilm/WaitForSnapshotStepTests.java | 27 ++++++++++++ .../slm/SnapshotInvocationRecordTests.java | 19 +++++--- .../xpack/slm/SnapshotLifecycleTask.java | 23 +++++----- 5 files changed, 82 insertions(+), 43 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java index a33f047a0f335..9a2be3cd206bd 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java @@ -60,18 +60,22 @@ public Result isConditionMet(Index index, ClusterState clusterState) { throw error(POLICY_NOT_FOUND_MESSAGE, policy); } SnapshotLifecyclePolicyMetadata snapPolicyMeta = snapMeta.getSnapshotConfigurations().get(policy); - if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) { + if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null || + snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) { + //TODO: Clean up all this logging once I figure out the TimeSeriesLiefecycleActionsIT timing problems if (snapPolicyMeta.getLastSuccess() == null) { - logger.info("Not executing policy because no last snapshot success."); + logger.info("Not executing policy because no last snapshot success. Phase time: {}", phaseTime); + } else if (snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null) { + logger.info("Not executing policy because no last snapshot start date."); } else { logger.info("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", - snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getTimestamp()); + snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); } return new Result(false, notExecutedMessage(phaseTime)); } logger.info("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", - snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getTimestamp()); + snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); return new Result(true, null); } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java index 052b9bff3a39a..eb3d117fd2515 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java @@ -10,15 +10,14 @@ import org.elasticsearch.Version; import org.elasticsearch.cluster.AbstractDiffable; import org.elasticsearch.cluster.Diffable; -import org.elasticsearch.common.xcontent.ParseField; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.common.xcontent.ConstructingObjectParser; +import org.elasticsearch.common.xcontent.ParseField; import org.elasticsearch.common.xcontent.ToXContentObject; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.core.RestApiVersion; import java.io.IOException; import java.util.Objects; @@ -31,23 +30,22 @@ public class SnapshotInvocationRecord extends AbstractDiffable { static final ParseField SNAPSHOT_NAME = new ParseField("snapshot_name"); - static final ParseField START_TIMESTAMP = new ParseField("snapshot_start_time") - .forRestApiVersion(RestApiVersion.onOrAfter(RestApiVersion.V_8)); + static final ParseField START_TIMESTAMP = new ParseField("snapshot_start_time"); static final ParseField TIMESTAMP = new ParseField("time"); static final ParseField DETAILS = new ParseField("details"); private String snapshotName; - private long snapshotStartTimestamp; - private long timestamp; + private Long snapshotStartTimestamp; + private long snapshotFinishTimestamp; private String details; public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>("snapshot_policy_invocation_record", true, - a -> new SnapshotInvocationRecord((String) a[0], (long) a[1], (long) a[2], (String) a[3])); + a -> new SnapshotInvocationRecord((String) a[0], (Long) a[1], (long) a[2], (String) a[3])); static { PARSER.declareString(ConstructingObjectParser.constructorArg(), SNAPSHOT_NAME); - PARSER.declareLong(ConstructingObjectParser.constructorArg(), START_TIMESTAMP); + PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), START_TIMESTAMP); PARSER.declareLong(ConstructingObjectParser.constructorArg(), TIMESTAMP); PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), DETAILS); } @@ -56,19 +54,21 @@ public static SnapshotInvocationRecord parse(XContentParser parser, String name) return PARSER.apply(parser, name); } - public SnapshotInvocationRecord(String snapshotName, long snapshotStartTimestamp, long timestamp, String details) { + public SnapshotInvocationRecord(String snapshotName, Long snapshotStartTimestamp, long snapshotFinishTimestamp, String details) { this.snapshotName = Objects.requireNonNull(snapshotName, "snapshot name must be provided"); this.snapshotStartTimestamp = snapshotStartTimestamp; - this.timestamp = timestamp; + this.snapshotFinishTimestamp = snapshotFinishTimestamp; this.details = details; } public SnapshotInvocationRecord(StreamInput in) throws IOException { this.snapshotName = in.readString(); if(in.getVersion().onOrAfter(Version.V_8_0_0)) { - this.snapshotStartTimestamp = in.readVLong(); + this.snapshotStartTimestamp = in.readOptionalVLong(); + } else { + this.snapshotStartTimestamp = null; } - this.timestamp = in.readVLong(); + this.snapshotFinishTimestamp = in.readVLong(); this.details = in.readOptionalString(); } @@ -76,12 +76,12 @@ public String getSnapshotName() { return snapshotName; } - public long getSnapshotStartTimestamp() { + public Long getSnapshotStartTimestamp() { return snapshotStartTimestamp; } - public long getTimestamp() { - return timestamp; + public long getSnapshotFinishTimestamp() { + return snapshotFinishTimestamp; } public String getDetails() { @@ -92,9 +92,9 @@ public String getDetails() { public void writeTo(StreamOutput out) throws IOException { out.writeString(snapshotName); if (out.getVersion().onOrAfter(Version.V_8_0_0)) { - out.writeVLong(snapshotStartTimestamp); + out.writeOptionalVLong(snapshotStartTimestamp); } - out.writeVLong(timestamp); + out.writeVLong(snapshotFinishTimestamp); out.writeOptionalString(details); } @@ -103,10 +103,10 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.startObject(); { builder.field(SNAPSHOT_NAME.getPreferredName(), snapshotName); - if (builder.getRestApiVersion().matches(START_TIMESTAMP.getForRestApiVersion())) { + if (snapshotStartTimestamp != null) { builder.timeField(START_TIMESTAMP.getPreferredName(), "snapshot_start_time_string", snapshotStartTimestamp); } - builder.timeField(TIMESTAMP.getPreferredName(), "time_string", timestamp); + builder.timeField(TIMESTAMP.getPreferredName(), "time_string", snapshotFinishTimestamp); if (Objects.nonNull(details)) { builder.field(DETAILS.getPreferredName(), details); } @@ -120,14 +120,14 @@ public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; SnapshotInvocationRecord that = (SnapshotInvocationRecord) o; - return getTimestamp() == that.getTimestamp() && - getSnapshotStartTimestamp() == that.getSnapshotStartTimestamp() && + return getSnapshotFinishTimestamp() == that.getSnapshotFinishTimestamp() && + Objects.equals(getSnapshotStartTimestamp(), that.getSnapshotStartTimestamp()) && Objects.equals(getSnapshotName(), that.getSnapshotName()) && Objects.equals(getDetails(), that.getDetails()); } @Override public int hashCode() { - return Objects.hash(getSnapshotName(), getSnapshotStartTimestamp(), getTimestamp(), getDetails()); + return Objects.hash(getSnapshotName(), getSnapshotStartTimestamp(), getSnapshotFinishTimestamp(), getDetails()); } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java index cc6659bbca123..b81753a4a47d2 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java @@ -141,6 +141,33 @@ private void assertSlmPolicyExecuted(boolean startTimeAfterPhaseTime, boolean fi } } + public void testNullStartTime() throws IOException { + long phaseTime = randomLong(); + + WaitForSnapshotStep instance = createRandomInstance(); + SnapshotLifecyclePolicyMetadata slmPolicy = SnapshotLifecyclePolicyMetadata.builder() + .setModifiedDate(randomLong()) + .setPolicy(new SnapshotLifecyclePolicy("", "", "", "", null, null)) + .setLastSuccess(new SnapshotInvocationRecord("", + null, + phaseTime + 100, "")) + .build(); + SnapshotLifecycleMetadata smlMetadata = new SnapshotLifecycleMetadata(Map.of(instance.getPolicy(), slmPolicy), + OperationMode.RUNNING, null); + + IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLength(10)) + .putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, Map.of("phase_time", Long.toString(phaseTime))) + .settings(settings(Version.CURRENT)) + .numberOfShards(randomIntBetween(1, 5)).numberOfReplicas(randomIntBetween(0, 5)).build(); + ImmutableOpenMap.Builder indices = + ImmutableOpenMap.builder().fPut(indexMetadata.getIndex().getName(), indexMetadata); + Metadata.Builder meta = Metadata.builder().indices(indices.build()).putCustom(SnapshotLifecycleMetadata.TYPE, smlMetadata); + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metadata(meta).build(); + ClusterStateWaitStep.Result result = instance.isConditionMet(indexMetadata.getIndex(), clusterState); + assertFalse(result.isComplete()); + assertTrue(getMessage(result).contains("to be executed")); + } + private String getMessage(ClusterStateWaitStep.Result result) throws IOException { return Strings.toString(result.getInfomationContext()); } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecordTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecordTests.java index 0fbee9d74e406..d8a8c4bc23498 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecordTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecordTests.java @@ -37,17 +37,17 @@ protected SnapshotInvocationRecord mutateInstance(SnapshotInvocationRecord insta case 0: return new SnapshotInvocationRecord( randomValueOtherThan(instance.getSnapshotName(), () -> randomAlphaOfLengthBetween(2,10)), - instance.getTimestamp() - 100, - instance.getTimestamp(), + instance.getSnapshotFinishTimestamp() - 100, + instance.getSnapshotFinishTimestamp(), instance.getDetails()); case 1: - long timestamp = randomValueOtherThan(instance.getTimestamp(), ESTestCase::randomNonNegativeLong); + long timestamp = randomValueOtherThan(instance.getSnapshotFinishTimestamp(), ESTestCase::randomNonNegativeLong); return new SnapshotInvocationRecord(instance.getSnapshotName(), timestamp - 100, timestamp, instance.getDetails()); case 2: return new SnapshotInvocationRecord(instance.getSnapshotName(), - instance.getTimestamp() - 100, instance.getTimestamp(), + instance.getSnapshotFinishTimestamp() - 100, instance.getSnapshotFinishTimestamp(), randomValueOtherThan(instance.getDetails(), () -> randomAlphaOfLengthBetween(2,10))); default: throw new AssertionError("failure, got illegal switch case"); @@ -57,9 +57,18 @@ protected SnapshotInvocationRecord mutateInstance(SnapshotInvocationRecord insta public static SnapshotInvocationRecord randomSnapshotInvocationRecord() { return new SnapshotInvocationRecord( randomAlphaOfLengthBetween(5,10), - randomNonNegativeLong(), + randomNonNegativeNullableLong(), randomNonNegativeLong(), randomBoolean() ? null : randomAlphaOfLengthBetween(5, 10)); } + private static Long randomNonNegativeNullableLong() { + long value = randomLong(); + if (value < 0) { + return null; + } else { + return value; + } + } + } diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java index c9c18a9a815a5..01d8f17cc97b9 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java @@ -93,13 +93,12 @@ public static Optional maybeTakeSnapshot(final String jobId, final Clien logger.info("snapshot lifecycle policy [{}] issuing create snapshot [{}]", policyMetadata.getPolicy().getId(), request.snapshot()); clientWithHeaders.admin().cluster().createSnapshot(request, new ActionListener<>() { - private long snapshotStartTime; @Override public void onResponse(CreateSnapshotResponse createSnapshotResponse) { logger.debug("snapshot response for [{}]: {}", policyMetadata.getPolicy().getId(), Strings.toString(createSnapshotResponse)); final SnapshotInfo snapInfo = createSnapshotResponse.getSnapshotInfo(); - snapshotStartTime = snapInfo.startTime(); + long snapshotStartTime = snapInfo.startTime(); // Check that there are no failed shards, since the request may not entirely // fail, but may still have failures (such as in the case of an aborted snapshot) if (snapInfo.failedShards() == 0) { @@ -127,7 +126,7 @@ public void onFailure(Exception e) { policyMetadata.getPolicy().getId(), e); final long timestamp = Instant.now().toEpochMilli(); clusterService.submitStateUpdateTask("slm-record-failure-" + policyMetadata.getPolicy().getId(), - WriteJobStatus.failure(policyMetadata.getPolicy().getId(), request.snapshot(), snapshotStartTime, timestamp, e)); + WriteJobStatus.failure(policyMetadata.getPolicy().getId(), request.snapshot(), timestamp, e)); final SnapshotHistoryItem failureRecord; try { failureRecord = SnapshotHistoryItem.creationFailureRecord(timestamp, policyMetadata.getPolicy(), @@ -168,24 +167,24 @@ private static class WriteJobStatus extends ClusterStateUpdateTask { private final String policyName; private final String snapshotName; private final long snapshotStartTime; - private final long timestamp; + private final long snapshotFinishTime; private final Optional exception; - private WriteJobStatus(String policyName, String snapshotName, long snapshotStartTime, long timestamp, + private WriteJobStatus(String policyName, String snapshotName, long snapshotStartTime, long snapshotFinishTime, Optional exception) { this.policyName = policyName; this.snapshotName = snapshotName; this.exception = exception; this.snapshotStartTime = snapshotStartTime; - this.timestamp = timestamp; + this.snapshotFinishTime = snapshotFinishTime; } - static WriteJobStatus success(String policyId, String snapshotName, long snapshotStartTime, long timestamp) { - return new WriteJobStatus(policyId, snapshotName, snapshotStartTime, timestamp, Optional.empty()); + static WriteJobStatus success(String policyId, String snapshotName, long snapshotStartTime, long snapshotFinishTime) { + return new WriteJobStatus(policyId, snapshotName, snapshotStartTime, snapshotFinishTime, Optional.empty()); } - static WriteJobStatus failure(String policyId, String snapshotName, long snapshotStartTime, long timestamp, Exception exception) { - return new WriteJobStatus(policyId, snapshotName, snapshotStartTime, timestamp, Optional.of(exception)); + static WriteJobStatus failure(String policyId, String snapshotName, long timestamp, Exception exception) { + return new WriteJobStatus(policyId, snapshotName, timestamp, timestamp, Optional.of(exception)); } private String exceptionToString() throws IOException { @@ -224,11 +223,11 @@ public ClusterState execute(ClusterState currentState) throws Exception { if (exception.isPresent()) { stats.snapshotFailed(policyName); - newPolicyMetadata.setLastFailure(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, timestamp, + newPolicyMetadata.setLastFailure(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, snapshotFinishTime, exceptionToString())); } else { stats.snapshotTaken(policyName); - newPolicyMetadata.setLastSuccess(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, timestamp, null)); + newPolicyMetadata.setLastSuccess(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, snapshotFinishTime, null)); } snapLifecycles.put(policyName, newPolicyMetadata.build()); From 81f588ab6a42704bc86c0755e7dd5509e9481fad Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 26 Jul 2021 17:34:18 -0500 Subject: [PATCH 09/29] fixing lines more than 140 chars --- .../elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java index 9a2be3cd206bd..7e4e5b249ef3b 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java @@ -70,12 +70,16 @@ public Result isConditionMet(Index index, ClusterState clusterState) { } else { logger.info("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", - snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); + snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), + phaseTime, + snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); } return new Result(false, notExecutedMessage(phaseTime)); } logger.info("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", - snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); + snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), + phaseTime, + snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); return new Result(true, null); } From adbfc3b5c4d0538c9163f4f31ce6bbbcc11c64fc Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Tue, 27 Jul 2021 08:42:16 -0500 Subject: [PATCH 10/29] changing step log level --- .../xpack/core/ilm/WaitForSnapshotStep.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java index 7e4e5b249ef3b..0af863871dbfe 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java @@ -62,21 +62,24 @@ public Result isConditionMet(Index index, ClusterState clusterState) { SnapshotLifecyclePolicyMetadata snapPolicyMeta = snapMeta.getSnapshotConfigurations().get(policy); if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) { - //TODO: Clean up all this logging once I figure out the TimeSeriesLiefecycleActionsIT timing problems if (snapPolicyMeta.getLastSuccess() == null) { - logger.info("Not executing policy because no last snapshot success. Phase time: {}", phaseTime); + logger.debug("Not executing policy because no last snapshot success. Phase time: {}", phaseTime); } else if (snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null) { - logger.info("Not executing policy because no last snapshot start date."); + /* + * This is because we are running in mixed cluster mode, and the snapshot was taken on an older master, which then went + * down before this check could happen. We'll wait until a snapshot is taken on this newer master before passing this check. + */ + logger.debug("Not executing policy because no last snapshot start date. Phase time: {}", phaseTime); } else { - logger.info("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", + logger.debug("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); } return new Result(false, notExecutedMessage(phaseTime)); } - logger.info("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", + logger.debug("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); From 476086c2b3bfa9d8b57e6860998e489fda2176d4 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Tue, 27 Jul 2021 11:04:19 -0500 Subject: [PATCH 11/29] re-enabling logging to troubleshoot timing issue on jenkins --- .../elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java index 0af863871dbfe..5d7617948c389 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java @@ -63,23 +63,23 @@ public Result isConditionMet(Index index, ClusterState clusterState) { if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) { if (snapPolicyMeta.getLastSuccess() == null) { - logger.debug("Not executing policy because no last snapshot success. Phase time: {}", phaseTime); + logger.info("Not executing policy because no last snapshot success. Phase time: {}", phaseTime); } else if (snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null) { /* * This is because we are running in mixed cluster mode, and the snapshot was taken on an older master, which then went * down before this check could happen. We'll wait until a snapshot is taken on this newer master before passing this check. */ - logger.debug("Not executing policy because no last snapshot start date. Phase time: {}", phaseTime); + logger.info("Not executing policy because no last snapshot start date. Phase time: {}", phaseTime); } else { - logger.debug("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", + logger.info("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); } return new Result(false, notExecutedMessage(phaseTime)); } - logger.debug("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", + logger.info("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); From d1d0e54a910230b2f99d40bfb9508c5ad5005f95 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Tue, 27 Jul 2021 13:41:21 -0500 Subject: [PATCH 12/29] waiting for the delete phase to start --- .../xpack/ilm/TimeSeriesLifecycleActionsIT.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index cea71f18b773c..6c11579ebb2bf 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -279,8 +279,8 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { } /* - * This method waits until phase_time gets set in the state store. Otherwise we can wind up starting a snapshot before the ILM policy - * is ready. + * This method waits until phase_time gets set in the state store for the delete phase. Otherwise we can wind up starting a snapshot + * before the ILM policy is ready. */ @SuppressWarnings("unchecked") private void waitForPhaseTime() throws Exception { @@ -292,8 +292,11 @@ private void waitForPhaseTime() throws Exception { Map indexMap = (Map) indices.get(index); Map ilm = (Map) indexMap.get("ilm"); assertNotNull(ilm); + Object phase = ilm.get("phase"); + assertEquals("delete", phase); Object phase_time = ilm.get("phase_time"); assertNotNull(phase_time); + logger.info("Found phase time for delete phase: {}", phase_time); }); } From 9de69d00fffeebb3849fc0f1c78dae0c30ccd76d Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Tue, 27 Jul 2021 15:43:54 -0500 Subject: [PATCH 13/29] changing step logging to debug --- .../elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java index 5d7617948c389..0af863871dbfe 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java @@ -63,23 +63,23 @@ public Result isConditionMet(Index index, ClusterState clusterState) { if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) { if (snapPolicyMeta.getLastSuccess() == null) { - logger.info("Not executing policy because no last snapshot success. Phase time: {}", phaseTime); + logger.debug("Not executing policy because no last snapshot success. Phase time: {}", phaseTime); } else if (snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null) { /* * This is because we are running in mixed cluster mode, and the snapshot was taken on an older master, which then went * down before this check could happen. We'll wait until a snapshot is taken on this newer master before passing this check. */ - logger.info("Not executing policy because no last snapshot start date. Phase time: {}", phaseTime); + logger.debug("Not executing policy because no last snapshot start date. Phase time: {}", phaseTime); } else { - logger.info("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", + logger.debug("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); } return new Result(false, notExecutedMessage(phaseTime)); } - logger.info("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", + logger.debug("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); From 6101d97c9c5dd791acc65fbe981c9897542208f4 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Tue, 27 Jul 2021 17:07:05 -0500 Subject: [PATCH 14/29] was waiting for the phase time in the wrong place --- .../elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index 6c11579ebb2bf..2a3d01c32e49a 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -231,6 +231,7 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)); String slmPolicy = randomAlphaOfLengthBetween(4, 10); createNewSingletonPolicy(client(), policy, "delete", new WaitForSnapshotAction(slmPolicy)); + waitForPhaseTime(); String snapshotRepo = randomAlphaOfLengthBetween(4, 10); createSnapshotRepo(client(), snapshotRepo, randomBoolean()); @@ -249,7 +250,6 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { }, slmPolicy); updatePolicy(client(), index, policy); - waitForPhaseTime(); assertBusy(() -> { Map indexILMState = explainIndex(client(), index); From ab2b387aa741538730bbefafaa09293eb0d976aa Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Wed, 28 Jul 2021 08:35:51 -0500 Subject: [PATCH 15/29] more troubleshooting logging --- .../elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java | 8 ++++---- .../xpack/ilm/TimeSeriesLifecycleActionsIT.java | 6 +++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java index 0af863871dbfe..5d7617948c389 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java @@ -63,23 +63,23 @@ public Result isConditionMet(Index index, ClusterState clusterState) { if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) { if (snapPolicyMeta.getLastSuccess() == null) { - logger.debug("Not executing policy because no last snapshot success. Phase time: {}", phaseTime); + logger.info("Not executing policy because no last snapshot success. Phase time: {}", phaseTime); } else if (snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null) { /* * This is because we are running in mixed cluster mode, and the snapshot was taken on an older master, which then went * down before this check could happen. We'll wait until a snapshot is taken on this newer master before passing this check. */ - logger.debug("Not executing policy because no last snapshot start date. Phase time: {}", phaseTime); + logger.info("Not executing policy because no last snapshot start date. Phase time: {}", phaseTime); } else { - logger.debug("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", + logger.info("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); } return new Result(false, notExecutedMessage(phaseTime)); } - logger.debug("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", + logger.info("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index 2a3d01c32e49a..f631b65f1e845 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -56,6 +56,7 @@ import java.util.Locale; import java.util.Map; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; import static java.util.Collections.singletonMap; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; @@ -231,7 +232,6 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)); String slmPolicy = randomAlphaOfLengthBetween(4, 10); createNewSingletonPolicy(client(), policy, "delete", new WaitForSnapshotAction(slmPolicy)); - waitForPhaseTime(); String snapshotRepo = randomAlphaOfLengthBetween(4, 10); createSnapshotRepo(client(), snapshotRepo, randomBoolean()); @@ -250,6 +250,7 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { }, slmPolicy); updatePolicy(client(), index, policy); + waitForPhaseTime(); assertBusy(() -> { Map indexILMState = explainIndex(client(), index); @@ -290,6 +291,9 @@ private void waitForPhaseTime() throws Exception { Map metadata = (Map) response.get("metadata"); Map indices = (Map) metadata.get("indices"); Map indexMap = (Map) indices.get(index); + logger.info("indexMap: {}", indexMap.keySet().stream() + .map(key -> key + "=" + indexMap.get(key)) + .collect(Collectors.joining(", ", "{", "}"))); Map ilm = (Map) indexMap.get("ilm"); assertNotNull(ilm); Object phase = ilm.get("phase"); From d7d2a25488fbf3903a2382999b1d86ecbf36196c Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Wed, 28 Jul 2021 10:11:16 -0500 Subject: [PATCH 16/29] more test troubleshooting --- .../elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index f631b65f1e845..d36af5d845ce8 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -217,7 +217,7 @@ public void testWaitForSnapshot() throws Exception { assertThat(indexILMState.get("action"), is("wait_for_snapshot")); assertThat(indexILMState.get("step"), is("wait-for-snapshot")); }, slmPolicy); - + waitForPhaseTime(); Request request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); assertBusy(() -> { @@ -258,7 +258,7 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { assertThat(indexILMState.get("action"), is("wait_for_snapshot")); assertThat(indexILMState.get("step"), is("wait-for-snapshot")); }, slmPolicy); - + waitForPhaseTime(); request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); From 6d608236237578724bf6eee2deaad6ee566a4a72 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Wed, 28 Jul 2021 10:30:42 -0500 Subject: [PATCH 17/29] more test troubleshooting --- .../elasticsearch/xpack/ilm/IndexLifecycleTransition.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java index b738a5415201d..c6260e2856389 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java @@ -256,13 +256,18 @@ private static LifecycleExecutionState updateExecutionStateToStep(LifecyclePolic policyMetadata.getVersion(), policyMetadata.getModifiedDate()); newPhaseDefinition = Strings.toString(phaseExecutionInfo, false, false); updatedState.setPhaseDefinition(newPhaseDefinition); + logger.info("Updating phase time to {} for phase {} and step {}", nowAsMillis, newStep.getPhase(), newStep.getName()); updatedState.setPhaseTime(nowAsMillis); } else if (currentStep.getPhase().equals(InitializePolicyContextStep.INITIALIZATION_PHASE)) { // The "new" phase is the initialization phase, usually the phase // time would be set on phase transition, but since there is no // transition into the "new" phase, we set it any time in the "new" // phase + logger.info("Updating phase time to {} for phase {} and step {}", nowAsMillis, newStep.getPhase(), newStep.getName()); updatedState.setPhaseTime(nowAsMillis); + } else { + logger.info("Not changing phase time from {} for phase {} and step {}", existingState.getPhaseTime(), newStep.getPhase(), + newStep.getName()); } if (currentStep == null || currentStep.getAction().equals(newStep.getAction()) == false) { From 230096a765f43485c3228cbbf0266dbd471b55dd Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Wed, 28 Jul 2021 15:05:34 -0500 Subject: [PATCH 18/29] Code review feedback --- .../xpack/core/ilm/WaitForSnapshotStep.java | 9 +++--- .../core/slm/SnapshotInvocationRecord.java | 8 +++-- .../ilm/TimeSeriesLifecycleActionsIT.java | 32 +++++++++---------- .../xpack/ilm/IndexLifecycleTransition.java | 5 --- .../xpack/slm/SnapshotLifecycleTask.java | 2 +- 5 files changed, 26 insertions(+), 30 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java index 5d7617948c389..918c83bc49299 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java @@ -63,23 +63,24 @@ public Result isConditionMet(Index index, ClusterState clusterState) { if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) { if (snapPolicyMeta.getLastSuccess() == null) { - logger.info("Not executing policy because no last snapshot success. Phase time: {}", phaseTime); + logger.debug("skipping ILM policy execution because there is no last snapshot success, phase time: {}", phaseTime); } else if (snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null) { /* * This is because we are running in mixed cluster mode, and the snapshot was taken on an older master, which then went * down before this check could happen. We'll wait until a snapshot is taken on this newer master before passing this check. */ - logger.info("Not executing policy because no last snapshot start date. Phase time: {}", phaseTime); + logger.debug("skipping ILM policy execution because no last snapshot start date, phase time: {}", phaseTime); } else { - logger.info("Not executing policy because snapshot start time {} is before phase time {}. Snapshot timestamp is {}", + logger.debug("skipping ILM policy execution because snapshot start time {} is before phase time {}, snapshot timestamp " + + "is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); } return new Result(false, notExecutedMessage(phaseTime)); } - logger.info("Executing policy because snapshot start time {} is after phase time {}. Snapshot timestamp is {}", + logger.debug("executing policy because snapshot start time {} is after phase time {}, snapshot timestamp is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), phaseTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java index eb3d117fd2515..5d2334083f516 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotInvocationRecord.java @@ -18,6 +18,7 @@ import org.elasticsearch.common.xcontent.ToXContentObject; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.core.Nullable; import java.io.IOException; import java.util.Objects; @@ -30,7 +31,7 @@ public class SnapshotInvocationRecord extends AbstractDiffable { static final ParseField SNAPSHOT_NAME = new ParseField("snapshot_name"); - static final ParseField START_TIMESTAMP = new ParseField("snapshot_start_time"); + static final ParseField START_TIMESTAMP = new ParseField("start_time"); static final ParseField TIMESTAMP = new ParseField("time"); static final ParseField DETAILS = new ParseField("details"); @@ -63,7 +64,7 @@ public SnapshotInvocationRecord(String snapshotName, Long snapshotStartTimestamp public SnapshotInvocationRecord(StreamInput in) throws IOException { this.snapshotName = in.readString(); - if(in.getVersion().onOrAfter(Version.V_8_0_0)) { + if (in.getVersion().onOrAfter(Version.V_8_0_0)) { this.snapshotStartTimestamp = in.readOptionalVLong(); } else { this.snapshotStartTimestamp = null; @@ -76,6 +77,7 @@ public String getSnapshotName() { return snapshotName; } + @Nullable public Long getSnapshotStartTimestamp() { return snapshotStartTimestamp; } @@ -104,7 +106,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws { builder.field(SNAPSHOT_NAME.getPreferredName(), snapshotName); if (snapshotStartTimestamp != null) { - builder.timeField(START_TIMESTAMP.getPreferredName(), "snapshot_start_time_string", snapshotStartTimestamp); + builder.timeField(START_TIMESTAMP.getPreferredName(), "start_time_string", snapshotStartTimestamp); } builder.timeField(TIMESTAMP.getPreferredName(), "time_string", snapshotFinishTimestamp); if (Objects.nonNull(details)) { diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index d36af5d845ce8..0d67d74db32aa 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -16,15 +16,15 @@ import org.elasticsearch.client.ResponseException; import org.elasticsearch.cluster.metadata.DataStream; import org.elasticsearch.cluster.metadata.IndexMetadata; -import org.elasticsearch.core.CheckedRunnable; -import org.elasticsearch.core.Nullable; import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.core.TimeValue; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentHelper; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.common.xcontent.json.JsonXContent; +import org.elasticsearch.core.CheckedRunnable; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.core.TimeValue; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.engine.EngineConfig; import org.elasticsearch.rest.action.admin.indices.RestPutIndexTemplateAction; @@ -56,7 +56,6 @@ import java.util.Locale; import java.util.Map; import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; import static java.util.Collections.singletonMap; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; @@ -199,9 +198,10 @@ public void testWaitForSnapshot() throws Exception { createIndexWithSettings(client(), index, alias, Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)); String slmPolicy = randomAlphaOfLengthBetween(4, 10); - createNewSingletonPolicy(client(), policy, "delete", new WaitForSnapshotAction(slmPolicy)); + final String phaseName = "delete"; + createNewSingletonPolicy(client(), policy, phaseName, new WaitForSnapshotAction(slmPolicy)); updatePolicy(client(), index, policy); - waitForPhaseTime(); + waitForPhaseTime(phaseName); assertBusy(() -> { Map indexILMState = explainIndex(client(), index); assertThat(indexILMState.get("action"), is("wait_for_snapshot")); @@ -217,7 +217,7 @@ public void testWaitForSnapshot() throws Exception { assertThat(indexILMState.get("action"), is("wait_for_snapshot")); assertThat(indexILMState.get("step"), is("wait-for-snapshot")); }, slmPolicy); - waitForPhaseTime(); + waitForPhaseTime(phaseName); // The phase time was reset after the previous line because the action went into the ERROR step and back out Request request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); assertBusy(() -> { @@ -231,7 +231,8 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { createIndexWithSettings(client(), index, alias, Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)); String slmPolicy = randomAlphaOfLengthBetween(4, 10); - createNewSingletonPolicy(client(), policy, "delete", new WaitForSnapshotAction(slmPolicy)); + final String phaseName = "delete"; + createNewSingletonPolicy(client(), policy, phaseName, new WaitForSnapshotAction(slmPolicy)); String snapshotRepo = randomAlphaOfLengthBetween(4, 10); createSnapshotRepo(client(), snapshotRepo, randomBoolean()); @@ -250,7 +251,7 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { }, slmPolicy); updatePolicy(client(), index, policy); - waitForPhaseTime(); + waitForPhaseTime(phaseName); assertBusy(() -> { Map indexILMState = explainIndex(client(), index); @@ -258,7 +259,7 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { assertThat(indexILMState.get("action"), is("wait_for_snapshot")); assertThat(indexILMState.get("step"), is("wait-for-snapshot")); }, slmPolicy); - waitForPhaseTime(); + request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); @@ -280,27 +281,24 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { } /* - * This method waits until phase_time gets set in the state store for the delete phase. Otherwise we can wind up starting a snapshot + * This method waits until phase_time gets set in the state store for the given phase name. Otherwise we can wind up starting a snapshot * before the ILM policy is ready. */ @SuppressWarnings("unchecked") - private void waitForPhaseTime() throws Exception { + private void waitForPhaseTime(String phaseName) throws Exception { assertBusy(() -> { Request request = new Request("GET", "/_cluster/state/metadata/" + index); Map response = entityAsMap(client().performRequest(request)); Map metadata = (Map) response.get("metadata"); Map indices = (Map) metadata.get("indices"); Map indexMap = (Map) indices.get(index); - logger.info("indexMap: {}", indexMap.keySet().stream() - .map(key -> key + "=" + indexMap.get(key)) - .collect(Collectors.joining(", ", "{", "}"))); Map ilm = (Map) indexMap.get("ilm"); assertNotNull(ilm); Object phase = ilm.get("phase"); - assertEquals("delete", phase); + assertEquals(phaseName, phase); Object phase_time = ilm.get("phase_time"); assertNotNull(phase_time); - logger.info("Found phase time for delete phase: {}", phase_time); + logger.info("found phase time for {} phase: {}", phaseName, phase_time); }); } diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java index c6260e2856389..b738a5415201d 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java @@ -256,18 +256,13 @@ private static LifecycleExecutionState updateExecutionStateToStep(LifecyclePolic policyMetadata.getVersion(), policyMetadata.getModifiedDate()); newPhaseDefinition = Strings.toString(phaseExecutionInfo, false, false); updatedState.setPhaseDefinition(newPhaseDefinition); - logger.info("Updating phase time to {} for phase {} and step {}", nowAsMillis, newStep.getPhase(), newStep.getName()); updatedState.setPhaseTime(nowAsMillis); } else if (currentStep.getPhase().equals(InitializePolicyContextStep.INITIALIZATION_PHASE)) { // The "new" phase is the initialization phase, usually the phase // time would be set on phase transition, but since there is no // transition into the "new" phase, we set it any time in the "new" // phase - logger.info("Updating phase time to {} for phase {} and step {}", nowAsMillis, newStep.getPhase(), newStep.getName()); updatedState.setPhaseTime(nowAsMillis); - } else { - logger.info("Not changing phase time from {} for phase {} and step {}", existingState.getPhaseTime(), newStep.getPhase(), - newStep.getName()); } if (currentStep == null || currentStep.getAction().equals(newStep.getAction()) == false) { diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java index 01d8f17cc97b9..3b0a874d0837d 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java @@ -223,7 +223,7 @@ public ClusterState execute(ClusterState currentState) throws Exception { if (exception.isPresent()) { stats.snapshotFailed(policyName); - newPolicyMetadata.setLastFailure(new SnapshotInvocationRecord(snapshotName, snapshotStartTime, snapshotFinishTime, + newPolicyMetadata.setLastFailure(new SnapshotInvocationRecord(snapshotName, null, snapshotFinishTime, exceptionToString())); } else { stats.snapshotTaken(policyName); From e4d669aa3f3cf6fb646958aa0915f5ae970c3ad3 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Wed, 28 Jul 2021 17:48:35 -0500 Subject: [PATCH 19/29] temporary log messages for troubleshooting --- .../java/org/elasticsearch/snapshots/SnapshotInfo.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java index 5911b719dd0a9..b8040f2705307 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java @@ -7,6 +7,8 @@ */ package org.elasticsearch.snapshots; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; import org.elasticsearch.action.ShardOperationFailedException; import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsRequest; @@ -44,7 +46,7 @@ * Information about a snapshot */ public final class SnapshotInfo implements Comparable, ToXContent, Writeable { - + private static final Logger logger = LogManager.getLogger(SnapshotInfo.class); public static final String INDEX_DETAILS_XCONTENT_PARAM = "index_details"; public static final String INCLUDE_REPOSITORY_XCONTENT_PARAM = "include_repository"; @@ -137,6 +139,7 @@ private void setIndexSnapshotDetails(Map indexSnap } private void setStartTime(long startTime) { + logger.info("setting start time in setStartTime: {}", startTime); this.startTime = startTime; } @@ -445,6 +448,7 @@ public SnapshotInfo( this.state = state; this.reason = reason; this.version = version; + logger.info("setting start time in constructor: {}", startTime); this.startTime = startTime; this.endTime = endTime; this.totalShards = totalShards; @@ -469,6 +473,7 @@ public static SnapshotInfo readFrom(final StreamInput in) throws IOException { final SnapshotState state = in.readBoolean() ? SnapshotState.fromValue(in.readByte()) : null; final String reason = in.readOptionalString(); final long startTime = in.readVLong(); + logger.info("setting start time in readFrom: {}", startTime); final long endTime = in.readVLong(); final int totalShards = in.readVInt(); final int successfulShards = in.readVInt(); @@ -903,6 +908,7 @@ public static SnapshotInfo fromXContentInternal(final String repoName, final XCo break; case START_TIME: startTime = parser.longValue(); + logger.info("setting start time in fromXContent: {}", startTime); break; case END_TIME: endTime = parser.longValue(); From 494fe6daf23946e91f2615929b5ff2f9733f4468 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Thu, 29 Jul 2021 13:39:06 -0500 Subject: [PATCH 20/29] more debugging logging --- .../org/elasticsearch/cluster/SnapshotsInProgress.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java index 73ec207c93297..dae5b799b2d52 100644 --- a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java +++ b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java @@ -11,6 +11,9 @@ import com.carrotsearch.hppc.ObjectContainer; import com.carrotsearch.hppc.cursors.ObjectCursor; import com.carrotsearch.hppc.cursors.ObjectObjectCursor; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; import org.elasticsearch.cluster.ClusterState.Custom; import org.elasticsearch.core.Nullable; @@ -31,6 +34,7 @@ import org.elasticsearch.snapshots.Snapshot; import org.elasticsearch.snapshots.SnapshotFeatureInfo; import org.elasticsearch.snapshots.SnapshotId; +import org.elasticsearch.snapshots.SnapshotsService; import java.io.IOException; import java.util.Collections; @@ -54,7 +58,13 @@ public class SnapshotsInProgress extends AbstractNamedDiffable implement private final List entries; + private static final Logger logger = LogManager.getLogger(SnapshotsInProgress.class); + public static SnapshotsInProgress of(List entries) { + logger.info("snapshots in progress: {}", entries.size()); + for (Entry entry : entries) { + logger.info("snapshotId: {}, startTime: {}", entry.snapshot.getSnapshotId(), entry.startTime); + } if (entries.isEmpty()) { return EMPTY; } From a1389d8f78dae2245625719fb689017df58bf162 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Thu, 29 Jul 2021 16:50:47 -0500 Subject: [PATCH 21/29] testing a theory --- .../org/elasticsearch/threadpool/ThreadPool.java | 2 ++ .../xpack/ilm/TimeSeriesLifecycleActionsIT.java | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java b/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java index 92def23402f66..abbad8e531b99 100644 --- a/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java +++ b/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java @@ -591,8 +591,10 @@ long relativeTimeInNanos() { */ long absoluteTimeInMillis() { if (0 < interval) { + logger.info("Using cached time {}", absoluteMillis); return absoluteMillis; } + logger.info("Using real time"); return System.currentTimeMillis(); } diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index 0d67d74db32aa..c25179eea5729 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -217,7 +217,13 @@ public void testWaitForSnapshot() throws Exception { assertThat(indexILMState.get("action"), is("wait_for_snapshot")); assertThat(indexILMState.get("step"), is("wait-for-snapshot")); }, slmPolicy); - waitForPhaseTime(phaseName); // The phase time was reset after the previous line because the action went into the ERROR step and back out + waitForPhaseTime(phaseName); // The phase time was reset because the action went into the ERROR step and back out + /* + * The phase time is set at System.currentTimeMillis(), but the snapshot start time is set at ThreadPool.absoluteTimeInMillis(), + * which can be 200 ms or more behind real time. So if a snapshot is created right after the ILM policy, the snapshot time can be + * before the policy phase time. + */ + Thread.sleep(500); Request request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); assertBusy(() -> { @@ -259,7 +265,12 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { assertThat(indexILMState.get("action"), is("wait_for_snapshot")); assertThat(indexILMState.get("step"), is("wait-for-snapshot")); }, slmPolicy); - + /* + * The phase time is set at System.currentTimeMillis(), but the snapshot start time is set at ThreadPool.absoluteTimeInMillis(), + * which can be 200 ms or more behind real time. So if a snapshot is created right after the ILM policy, the snapshot time can be + * before the policy phase time. + */ + Thread.sleep(500); request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); From 068568e2084796e8d11d0c4da671575916ec248e Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Fri, 30 Jul 2021 08:16:41 -0500 Subject: [PATCH 22/29] Using ThreadPool.absoluteTimeInMillis() for ILM policy time --- .../elasticsearch/cluster/SnapshotsInProgress.java | 1 - .../xpack/ilm/TimeSeriesLifecycleActionsIT.java | 14 ++------------ .../elasticsearch/xpack/ilm/IndexLifecycle.java | 2 +- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java index dae5b799b2d52..a7ceb7a99ce29 100644 --- a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java +++ b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java @@ -34,7 +34,6 @@ import org.elasticsearch.snapshots.Snapshot; import org.elasticsearch.snapshots.SnapshotFeatureInfo; import org.elasticsearch.snapshots.SnapshotId; -import org.elasticsearch.snapshots.SnapshotsService; import java.io.IOException; import java.util.Collections; diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index c25179eea5729..ef6988d65db4d 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -218,12 +218,7 @@ public void testWaitForSnapshot() throws Exception { assertThat(indexILMState.get("step"), is("wait-for-snapshot")); }, slmPolicy); waitForPhaseTime(phaseName); // The phase time was reset because the action went into the ERROR step and back out - /* - * The phase time is set at System.currentTimeMillis(), but the snapshot start time is set at ThreadPool.absoluteTimeInMillis(), - * which can be 200 ms or more behind real time. So if a snapshot is created right after the ILM policy, the snapshot time can be - * before the policy phase time. - */ - Thread.sleep(500); + Request request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); assertBusy(() -> { @@ -265,12 +260,7 @@ public void testWaitForSnapshotSlmExecutedBefore() throws Exception { assertThat(indexILMState.get("action"), is("wait_for_snapshot")); assertThat(indexILMState.get("step"), is("wait-for-snapshot")); }, slmPolicy); - /* - * The phase time is set at System.currentTimeMillis(), but the snapshot start time is set at ThreadPool.absoluteTimeInMillis(), - * which can be 200 ms or more behind real time. So if a snapshot is created right after the ILM policy, the snapshot time can be - * before the policy phase time. - */ - Thread.sleep(500); + request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); assertOK(client().performRequest(request)); diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java index b8b4ae9842367..367a23fa5bfab 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java @@ -196,7 +196,7 @@ public Collection createComponents(Client client, ClusterService cluster ilmHistoryStore.set(new ILMHistoryStore(settings, new OriginSettingClient(client, INDEX_LIFECYCLE_ORIGIN), clusterService, threadPool)); indexLifecycleInitialisationService.set(new IndexLifecycleService(settings, client, clusterService, threadPool, - getClock(), System::currentTimeMillis, xContentRegistry, ilmHistoryStore.get(), getLicenseState())); + getClock(), threadPool::absoluteTimeInMillis, xContentRegistry, ilmHistoryStore.get(), getLicenseState())); components.add(indexLifecycleInitialisationService.get()); SnapshotLifecycleTemplateRegistry templateRegistry = new SnapshotLifecycleTemplateRegistry(settings, clusterService, threadPool, From 145dbdd9462b2f312c34bd48f7674c2e8aed4a41 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Fri, 30 Jul 2021 10:48:13 -0500 Subject: [PATCH 23/29] Changing WaitForSnapshotStep to use action time instead of phase time --- .../xpack/core/ilm/WaitForSnapshotStep.java | 28 +++++++++---------- .../ilm/TimeSeriesLifecycleActionsIT.java | 25 +++++++++++++++++ 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java index 918c83bc49299..563e0f614ab8d 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStep.java @@ -21,8 +21,8 @@ /*** * A step that waits for snapshot to be taken by SLM to ensure we have backup before we delete the index. - * It will signal error if it can't get data needed to do the check (phase time from ILM and SLM metadata) - * and will only return success if execution of SLM policy took place after index entered deleted phase. + * It will signal error if it can't get data needed to do the check (action time from ILM and SLM metadata) + * and will only return success if execution of SLM policy took place after index entered the wait for snapshot action. */ public class WaitForSnapshotStep extends ClusterStateWaitStep { @@ -33,7 +33,7 @@ public class WaitForSnapshotStep extends ClusterStateWaitStep { private static final String POLICY_NOT_EXECUTED_MESSAGE = "waiting for policy '%s' to be executed since %s"; private static final String POLICY_NOT_FOUND_MESSAGE = "configured policy '%s' not found"; private static final String NO_INDEX_METADATA_MESSAGE = "no index metadata found for index '%s'"; - private static final String NO_PHASE_TIME_MESSAGE = "no information about ILM phase start in index metadata for index '%s'"; + private static final String NO_ACTION_TIME_MESSAGE = "no information about ILM action start in index metadata for index '%s'"; private final String policy; @@ -49,10 +49,10 @@ public Result isConditionMet(Index index, ClusterState clusterState) { throw error(NO_INDEX_METADATA_MESSAGE, index.getName()); } - Long phaseTime = LifecycleExecutionState.fromIndexMetadata(indexMetadata).getPhaseTime(); + Long actionTime = LifecycleExecutionState.fromIndexMetadata(indexMetadata).getActionTime(); - if (phaseTime == null) { - throw error(NO_PHASE_TIME_MESSAGE, index.getName()); + if (actionTime == null) { + throw error(NO_ACTION_TIME_MESSAGE, index.getName()); } SnapshotLifecycleMetadata snapMeta = clusterState.metadata().custom(SnapshotLifecycleMetadata.TYPE); @@ -61,28 +61,28 @@ public Result isConditionMet(Index index, ClusterState clusterState) { } SnapshotLifecyclePolicyMetadata snapPolicyMeta = snapMeta.getSnapshotConfigurations().get(policy); if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null || - snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) { + snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < actionTime) { if (snapPolicyMeta.getLastSuccess() == null) { - logger.debug("skipping ILM policy execution because there is no last snapshot success, phase time: {}", phaseTime); + logger.debug("skipping ILM policy execution because there is no last snapshot success, action time: {}", actionTime); } else if (snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null) { /* * This is because we are running in mixed cluster mode, and the snapshot was taken on an older master, which then went * down before this check could happen. We'll wait until a snapshot is taken on this newer master before passing this check. */ - logger.debug("skipping ILM policy execution because no last snapshot start date, phase time: {}", phaseTime); + logger.debug("skipping ILM policy execution because no last snapshot start date, action time: {}", actionTime); } else { - logger.debug("skipping ILM policy execution because snapshot start time {} is before phase time {}, snapshot timestamp " + + logger.debug("skipping ILM policy execution because snapshot start time {} is before action time {}, snapshot timestamp " + "is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), - phaseTime, + actionTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); } - return new Result(false, notExecutedMessage(phaseTime)); + return new Result(false, notExecutedMessage(actionTime)); } - logger.debug("executing policy because snapshot start time {} is after phase time {}, snapshot timestamp is {}", + logger.debug("executing policy because snapshot start time {} is after action time {}, snapshot timestamp is {}", snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(), - phaseTime, + actionTime, snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp()); return new Result(true, null); } diff --git a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index ef6988d65db4d..d3f23987b0b64 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -228,6 +228,31 @@ public void testWaitForSnapshot() throws Exception { }, slmPolicy); } + /* + * This test more rapidly creates a policy and then executes a snapshot, in an attempt to reproduce a timing bug where the snapshot + * time gets set to a time earlier than the policy's action's time. + */ + public void testWaitForSnapshotFast() throws Exception { + createIndexWithSettings(client(), index, alias, Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)); + String slmPolicy = randomAlphaOfLengthBetween(4, 10); + final String phaseName = "delete"; + String snapshotRepo = randomAlphaOfLengthBetween(4, 10); + createSnapshotRepo(client(), snapshotRepo, randomBoolean()); + createSlmPolicy(slmPolicy, snapshotRepo); + createNewSingletonPolicy(client(), policy, phaseName, new WaitForSnapshotAction(slmPolicy)); + updatePolicy(client(), index, policy); + waitForPhaseTime(phaseName); + + Request request = new Request("PUT", "/_slm/policy/" + slmPolicy + "/_execute"); + assertOK(client().performRequest(request)); + assertBusy(() -> { + Step.StepKey stepKey = getStepKeyForIndex(client(), index); + logger.info("step key for index {} is {}", index, stepKey); + assertThat(stepKey.getAction(), equalTo("complete")); + }, slmPolicy); + } + public void testWaitForSnapshotSlmExecutedBefore() throws Exception { createIndexWithSettings(client(), index, alias, Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)); From c3e189537a6ff25c58e92edff93495168b67d799 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Fri, 30 Jul 2021 10:55:33 -0500 Subject: [PATCH 24/29] Undoing a bunch of troubleshooting logging --- .../org/elasticsearch/cluster/SnapshotsInProgress.java | 8 -------- .../java/org/elasticsearch/snapshots/SnapshotInfo.java | 7 ------- .../java/org/elasticsearch/threadpool/ThreadPool.java | 2 -- .../elasticsearch/xpack/slm/SnapshotLifecycleTask.java | 4 ++-- 4 files changed, 2 insertions(+), 19 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java index a7ceb7a99ce29..eeb6d4505db1d 100644 --- a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java +++ b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java @@ -12,8 +12,6 @@ import com.carrotsearch.hppc.cursors.ObjectCursor; import com.carrotsearch.hppc.cursors.ObjectObjectCursor; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; import org.elasticsearch.cluster.ClusterState.Custom; import org.elasticsearch.core.Nullable; @@ -57,13 +55,7 @@ public class SnapshotsInProgress extends AbstractNamedDiffable implement private final List entries; - private static final Logger logger = LogManager.getLogger(SnapshotsInProgress.class); - public static SnapshotsInProgress of(List entries) { - logger.info("snapshots in progress: {}", entries.size()); - for (Entry entry : entries) { - logger.info("snapshotId: {}, startTime: {}", entry.snapshot.getSnapshotId(), entry.startTime); - } if (entries.isEmpty()) { return EMPTY; } diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java index b8040f2705307..cc0a91e98d4d2 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java @@ -7,8 +7,6 @@ */ package org.elasticsearch.snapshots; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; import org.elasticsearch.action.ShardOperationFailedException; import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsRequest; @@ -46,7 +44,6 @@ * Information about a snapshot */ public final class SnapshotInfo implements Comparable, ToXContent, Writeable { - private static final Logger logger = LogManager.getLogger(SnapshotInfo.class); public static final String INDEX_DETAILS_XCONTENT_PARAM = "index_details"; public static final String INCLUDE_REPOSITORY_XCONTENT_PARAM = "include_repository"; @@ -139,7 +136,6 @@ private void setIndexSnapshotDetails(Map indexSnap } private void setStartTime(long startTime) { - logger.info("setting start time in setStartTime: {}", startTime); this.startTime = startTime; } @@ -448,7 +444,6 @@ public SnapshotInfo( this.state = state; this.reason = reason; this.version = version; - logger.info("setting start time in constructor: {}", startTime); this.startTime = startTime; this.endTime = endTime; this.totalShards = totalShards; @@ -473,7 +468,6 @@ public static SnapshotInfo readFrom(final StreamInput in) throws IOException { final SnapshotState state = in.readBoolean() ? SnapshotState.fromValue(in.readByte()) : null; final String reason = in.readOptionalString(); final long startTime = in.readVLong(); - logger.info("setting start time in readFrom: {}", startTime); final long endTime = in.readVLong(); final int totalShards = in.readVInt(); final int successfulShards = in.readVInt(); @@ -908,7 +902,6 @@ public static SnapshotInfo fromXContentInternal(final String repoName, final XCo break; case START_TIME: startTime = parser.longValue(); - logger.info("setting start time in fromXContent: {}", startTime); break; case END_TIME: endTime = parser.longValue(); diff --git a/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java b/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java index abbad8e531b99..92def23402f66 100644 --- a/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java +++ b/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java @@ -591,10 +591,8 @@ long relativeTimeInNanos() { */ long absoluteTimeInMillis() { if (0 < interval) { - logger.info("Using cached time {}", absoluteMillis); return absoluteMillis; } - logger.info("Using real time"); return System.currentTimeMillis(); } diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java index 3b0a874d0837d..b2f13bf75dd67 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java @@ -98,13 +98,13 @@ public void onResponse(CreateSnapshotResponse createSnapshotResponse) { logger.debug("snapshot response for [{}]: {}", policyMetadata.getPolicy().getId(), Strings.toString(createSnapshotResponse)); final SnapshotInfo snapInfo = createSnapshotResponse.getSnapshotInfo(); - long snapshotStartTime = snapInfo.startTime(); + // Check that there are no failed shards, since the request may not entirely // fail, but may still have failures (such as in the case of an aborted snapshot) if (snapInfo.failedShards() == 0) { final long timestamp = Instant.now().toEpochMilli(); clusterService.submitStateUpdateTask("slm-record-success-" + policyMetadata.getPolicy().getId(), - WriteJobStatus.success(policyMetadata.getPolicy().getId(), request.snapshot(), snapshotStartTime, timestamp)); + WriteJobStatus.success(policyMetadata.getPolicy().getId(), request.snapshot(), snapInfo.startTime(), timestamp)); historyStore.putAsync(SnapshotHistoryItem.creationSuccessRecord(timestamp, policyMetadata.getPolicy(), request.snapshot())); } else { From fc08331b5766ae47b4b188a8cacf9a57cf54e327 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Fri, 30 Jul 2021 11:00:25 -0500 Subject: [PATCH 25/29] adding a comment about supplying time to indexLifecycleInitialisationService --- .../org/elasticsearch/xpack/ilm/IndexLifecycle.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java index 367a23fa5bfab..2cea548f3cbf5 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java @@ -139,6 +139,7 @@ import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.function.LongSupplier; import java.util.function.Supplier; import static org.elasticsearch.xpack.core.ClientHelper.INDEX_LIFECYCLE_ORIGIN; @@ -195,8 +196,15 @@ public Collection createComponents(Client client, ClusterService cluster ilmTemplateRegistry.initialize(); ilmHistoryStore.set(new ILMHistoryStore(settings, new OriginSettingClient(client, INDEX_LIFECYCLE_ORIGIN), clusterService, threadPool)); + /* + * Here we use threadPool::absoluteTimeInMillis rather than System::currentTimeInMillis because snapshot start time is set using + * ThreadPool.absoluteTimeInMillis(). ThreadPool.absoluteTimeInMillis() returns a cached time that can be several hundred + * milliseconds behind System.currentTimeMillis(). The result is that a snapshot taken after a policy is created can have a start + * time that is before the policy's (or action's) start time if System::currentTimeInMillis is used here. + */ + LongSupplier nowSupplier = threadPool::absoluteTimeInMillis; indexLifecycleInitialisationService.set(new IndexLifecycleService(settings, client, clusterService, threadPool, - getClock(), threadPool::absoluteTimeInMillis, xContentRegistry, ilmHistoryStore.get(), getLicenseState())); + getClock(), nowSupplier, xContentRegistry, ilmHistoryStore.get(), getLicenseState())); components.add(indexLifecycleInitialisationService.get()); SnapshotLifecycleTemplateRegistry templateRegistry = new SnapshotLifecycleTemplateRegistry(settings, clusterService, threadPool, From 0f43a21b60f8f9314894b443e503ecd3c7ac04ea Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Fri, 30 Jul 2021 11:04:57 -0500 Subject: [PATCH 26/29] cleanup --- .../java/org/elasticsearch/cluster/SnapshotsInProgress.java | 1 - .../src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java | 1 + .../main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java index eeb6d4505db1d..73ec207c93297 100644 --- a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java +++ b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java @@ -11,7 +11,6 @@ import com.carrotsearch.hppc.ObjectContainer; import com.carrotsearch.hppc.cursors.ObjectCursor; import com.carrotsearch.hppc.cursors.ObjectObjectCursor; - import org.elasticsearch.Version; import org.elasticsearch.cluster.ClusterState.Custom; import org.elasticsearch.core.Nullable; diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java index cc0a91e98d4d2..5911b719dd0a9 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotInfo.java @@ -44,6 +44,7 @@ * Information about a snapshot */ public final class SnapshotInfo implements Comparable, ToXContent, Writeable { + public static final String INDEX_DETAILS_XCONTENT_PARAM = "index_details"; public static final String INCLUDE_REPOSITORY_XCONTENT_PARAM = "include_repository"; diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java index 2cea548f3cbf5..5365a7df070f6 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java @@ -199,7 +199,7 @@ public Collection createComponents(Client client, ClusterService cluster /* * Here we use threadPool::absoluteTimeInMillis rather than System::currentTimeInMillis because snapshot start time is set using * ThreadPool.absoluteTimeInMillis(). ThreadPool.absoluteTimeInMillis() returns a cached time that can be several hundred - * milliseconds behind System.currentTimeMillis(). The result is that a snapshot taken after a policy is created can have a start + * milliseconds behind System.currentTimeMillis(). The result is that a snapshot taken after a policy is created can have a start * time that is before the policy's (or action's) start time if System::currentTimeInMillis is used here. */ LongSupplier nowSupplier = threadPool::absoluteTimeInMillis; From 416095c2c5f7ea8748b57831a199e31f4e6ad674 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Fri, 30 Jul 2021 11:16:58 -0500 Subject: [PATCH 27/29] checkstyle fix --- .../org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java index b2f13bf75dd67..b50bbd7c586b8 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java @@ -98,13 +98,13 @@ public void onResponse(CreateSnapshotResponse createSnapshotResponse) { logger.debug("snapshot response for [{}]: {}", policyMetadata.getPolicy().getId(), Strings.toString(createSnapshotResponse)); final SnapshotInfo snapInfo = createSnapshotResponse.getSnapshotInfo(); - // Check that there are no failed shards, since the request may not entirely // fail, but may still have failures (such as in the case of an aborted snapshot) if (snapInfo.failedShards() == 0) { + long snapshotStartTime = snapInfo.startTime(); final long timestamp = Instant.now().toEpochMilli(); clusterService.submitStateUpdateTask("slm-record-success-" + policyMetadata.getPolicy().getId(), - WriteJobStatus.success(policyMetadata.getPolicy().getId(), request.snapshot(), snapInfo.startTime(), timestamp)); + WriteJobStatus.success(policyMetadata.getPolicy().getId(), request.snapshot(), snapshotStartTime, timestamp)); historyStore.putAsync(SnapshotHistoryItem.creationSuccessRecord(timestamp, policyMetadata.getPolicy(), request.snapshot())); } else { From c2dd0c3cf37b3b7e6b99ac5f9d7c8325f980b900 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Fri, 30 Jul 2021 12:03:38 -0500 Subject: [PATCH 28/29] Updating WaitForSnapshotStepTests for switch to action date --- .../xpack/core/ilm/WaitForSnapshotStepTests.java | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java index b81753a4a47d2..2fbf7a88043a5 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java @@ -58,7 +58,7 @@ protected WaitForSnapshotStep copyInstance(WaitForSnapshotStep instance) { public void testNoSlmPolicies() { IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLength(10)) - .putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, Map.of("phase_time", Long.toString(randomLong()))) + .putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, Map.of("action_time", Long.toString(randomLong()))) .settings(settings(Version.CURRENT)) .numberOfShards(randomIntBetween(1, 5)).numberOfReplicas(randomIntBetween(0, 5)).build(); ImmutableOpenMap.Builder indices = @@ -82,7 +82,7 @@ public void testSlmPolicyNotExecuted() throws IOException { IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLength(10)) - .putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, Map.of("phase_time", Long.toString(randomLong()))) + .putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, Map.of("action_time", Long.toString(randomLong()))) .settings(settings(Version.CURRENT)) .numberOfShards(randomIntBetween(1, 5)).numberOfReplicas(randomIntBetween(0, 5)).build(); ImmutableOpenMap.Builder indices = @@ -124,7 +124,7 @@ private void assertSlmPolicyExecuted(boolean startTimeAfterPhaseTime, boolean fi OperationMode.RUNNING, null); IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLength(10)) - .putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, Map.of("phase_time", Long.toString(phaseTime))) + .putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, Map.of("action_time", Long.toString(phaseTime))) .settings(settings(Version.CURRENT)) .numberOfShards(randomIntBetween(1, 5)).numberOfReplicas(randomIntBetween(0, 5)).build(); ImmutableOpenMap.Builder indices = @@ -163,9 +163,8 @@ public void testNullStartTime() throws IOException { ImmutableOpenMap.builder().fPut(indexMetadata.getIndex().getName(), indexMetadata); Metadata.Builder meta = Metadata.builder().indices(indices.build()).putCustom(SnapshotLifecycleMetadata.TYPE, smlMetadata); ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metadata(meta).build(); - ClusterStateWaitStep.Result result = instance.isConditionMet(indexMetadata.getIndex(), clusterState); - assertFalse(result.isComplete()); - assertTrue(getMessage(result).contains("to be executed")); + IllegalStateException e = expectThrows(IllegalStateException.class, () -> instance.isConditionMet(indexMetadata.getIndex(), clusterState)); + assertTrue(e.getMessage().contains("no information about ILM action start")); } private String getMessage(ClusterStateWaitStep.Result result) throws IOException { From ab92c05d82c0d17dc37a989a55796e3315e9e195 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Fri, 30 Jul 2021 12:36:28 -0500 Subject: [PATCH 29/29] more checkstyle --- .../elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java index 2fbf7a88043a5..c3f0648ac43ef 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/WaitForSnapshotStepTests.java @@ -163,7 +163,8 @@ public void testNullStartTime() throws IOException { ImmutableOpenMap.builder().fPut(indexMetadata.getIndex().getName(), indexMetadata); Metadata.Builder meta = Metadata.builder().indices(indices.build()).putCustom(SnapshotLifecycleMetadata.TYPE, smlMetadata); ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metadata(meta).build(); - IllegalStateException e = expectThrows(IllegalStateException.class, () -> instance.isConditionMet(indexMetadata.getIndex(), clusterState)); + IllegalStateException e = expectThrows(IllegalStateException.class, () -> instance.isConditionMet(indexMetadata.getIndex(), + clusterState)); assertTrue(e.getMessage().contains("no information about ILM action start")); }