Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changing wait_for_snapshot to check start time rather than finish time #75644

Merged
merged 34 commits into from
Jul 30, 2021
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
aae7c59
Changing wait_for_snapshot to check snapshot start time rather than f…
masseyke Jul 22, 2021
e3b202c
Adding a test
masseyke Jul 22, 2021
4747ea7
Adding logging for more info on intermittent failure
masseyke Jul 26, 2021
5f28f21
Adding verbose logging to troubleshoot test failure
masseyke Jul 26, 2021
3a3bbf8
Putting end of if block and beginning of else on same line
masseyke Jul 26, 2021
c9b0e30
troublshooting IT test
masseyke Jul 26, 2021
7fc7b9c
Merge branch 'fix/73357' of github.com:masseyke/elasticsearch into fi…
masseyke Jul 26, 2021
c156d50
adding a wait to the IT test
masseyke Jul 26, 2021
9189fac
renaming timestamp and making snapshotStartTime nullable
masseyke Jul 26, 2021
81f588a
fixing lines more than 140 chars
masseyke Jul 26, 2021
adbfc3b
changing step log level
masseyke Jul 27, 2021
e550492
Merge branch 'master' into fix/73357
elasticmachine Jul 27, 2021
476086c
re-enabling logging to troubleshoot timing issue on jenkins
masseyke Jul 27, 2021
c7dc045
Merge branch 'fix/73357' of github.com:masseyke/elasticsearch into fi…
masseyke Jul 27, 2021
d1d0e54
waiting for the delete phase to start
masseyke Jul 27, 2021
9de69d0
changing step logging to debug
masseyke Jul 27, 2021
6101d97
was waiting for the phase time in the wrong place
masseyke Jul 27, 2021
ab2b387
more troubleshooting logging
masseyke Jul 28, 2021
d7d2a25
more test troubleshooting
masseyke Jul 28, 2021
6d60823
more test troubleshooting
masseyke Jul 28, 2021
35ca1ea
Merge branch 'master' into fix/73357
elasticmachine Jul 28, 2021
230096a
Code review feedback
masseyke Jul 28, 2021
e4d669a
temporary log messages for troubleshooting
masseyke Jul 28, 2021
494fe6d
more debugging logging
masseyke Jul 29, 2021
a1389d8
testing a theory
masseyke Jul 29, 2021
ef3c0f9
Merge branch 'master' into fix/73357
elasticmachine Jul 29, 2021
068568e
Using ThreadPool.absoluteTimeInMillis() for ILM policy time
masseyke Jul 30, 2021
145dbdd
Changing WaitForSnapshotStep to use action time instead of phase time
masseyke Jul 30, 2021
c3e1895
Undoing a bunch of troubleshooting logging
masseyke Jul 30, 2021
fc08331
adding a comment about supplying time to indexLifecycleInitialisation…
masseyke Jul 30, 2021
0f43a21
cleanup
masseyke Jul 30, 2021
416095c
checkstyle fix
masseyke Jul 30, 2021
c2dd0c3
Updating WaitForSnapshotStepTests for switch to action date
masseyke Jul 30, 2021
ab92c05
more checkstyle
masseyke Jul 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
*/
package org.elasticsearch.xpack.core.ilm;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.common.xcontent.ToXContentObject;
Expand All @@ -25,6 +27,7 @@
public class WaitForSnapshotStep extends ClusterStateWaitStep {

static final String NAME = "wait-for-snapshot";
private static final Logger logger = LogManager.getLogger(WaitForSnapshotStep.class);

private static final String MESSAGE_FIELD = "message";
private static final String POLICY_NOT_EXECUTED_MESSAGE = "waiting for policy '%s' to be executed since %s";
Expand Down Expand Up @@ -57,10 +60,30 @@ public Result isConditionMet(Index index, ClusterState clusterState) {
throw error(POLICY_NOT_FOUND_MESSAGE, policy);
}
SnapshotLifecyclePolicyMetadata snapPolicyMeta = snapMeta.getSnapshotConfigurations().get(policy);
if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getTimestamp() < phaseTime) {
if (snapPolicyMeta.getLastSuccess() == null || snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null ||
snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() < phaseTime) {
if (snapPolicyMeta.getLastSuccess() == null) {
logger.debug("skipping ILM policy execution because there is no last snapshot success, phase time: {}", phaseTime);
} else if (snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp() == null) {
/*
* This is because we are running in mixed cluster mode, and the snapshot was taken on an older master, which then went
* down before this check could happen. We'll wait until a snapshot is taken on this newer master before passing this check.
*/
logger.debug("skipping ILM policy execution because no last snapshot start date, phase time: {}", phaseTime);
}
else {
logger.debug("skipping ILM policy execution because snapshot start time {} is before phase time {}, snapshot timestamp " +
"is {}",
snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(),
phaseTime,
snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp());
}
return new Result(false, notExecutedMessage(phaseTime));
}

logger.debug("executing policy because snapshot start time {} is after phase time {}, snapshot timestamp is {}",
snapPolicyMeta.getLastSuccess().getSnapshotStartTimestamp(),
phaseTime,
snapPolicyMeta.getLastSuccess().getSnapshotFinishTimestamp());
return new Result(true, null);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,18 @@

package org.elasticsearch.xpack.core.slm;

import org.elasticsearch.Version;
import org.elasticsearch.cluster.AbstractDiffable;
import org.elasticsearch.cluster.Diffable;
import org.elasticsearch.common.xcontent.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ParseField;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.core.Nullable;

import java.io.IOException;
import java.util.Objects;
Expand All @@ -29,19 +31,22 @@ public class SnapshotInvocationRecord extends AbstractDiffable<SnapshotInvocatio
implements Writeable, ToXContentObject, Diffable<SnapshotInvocationRecord> {

static final ParseField SNAPSHOT_NAME = new ParseField("snapshot_name");
static final ParseField START_TIMESTAMP = new ParseField("start_time");
static final ParseField TIMESTAMP = new ParseField("time");
static final ParseField DETAILS = new ParseField("details");

private String snapshotName;
private long timestamp;
private Long snapshotStartTimestamp;
private long snapshotFinishTimestamp;
private String details;

public static final ConstructingObjectParser<SnapshotInvocationRecord, String> PARSER =
new ConstructingObjectParser<>("snapshot_policy_invocation_record", true,
a -> new SnapshotInvocationRecord((String) a[0], (long) a[1], (String) a[2]));
a -> new SnapshotInvocationRecord((String) a[0], (Long) a[1], (long) a[2], (String) a[3]));

static {
PARSER.declareString(ConstructingObjectParser.constructorArg(), SNAPSHOT_NAME);
PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), START_TIMESTAMP);
PARSER.declareLong(ConstructingObjectParser.constructorArg(), TIMESTAMP);
PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), DETAILS);
}
Expand All @@ -50,24 +55,35 @@ public static SnapshotInvocationRecord parse(XContentParser parser, String name)
return PARSER.apply(parser, name);
}

public SnapshotInvocationRecord(String snapshotName, long timestamp, String details) {
public SnapshotInvocationRecord(String snapshotName, Long snapshotStartTimestamp, long snapshotFinishTimestamp, String details) {
this.snapshotName = Objects.requireNonNull(snapshotName, "snapshot name must be provided");
this.timestamp = timestamp;
this.snapshotStartTimestamp = snapshotStartTimestamp;
this.snapshotFinishTimestamp = snapshotFinishTimestamp;
this.details = details;
}

public SnapshotInvocationRecord(StreamInput in) throws IOException {
this.snapshotName = in.readString();
this.timestamp = in.readVLong();
if (in.getVersion().onOrAfter(Version.V_8_0_0)) {
this.snapshotStartTimestamp = in.readOptionalVLong();
} else {
this.snapshotStartTimestamp = null;
}
this.snapshotFinishTimestamp = in.readVLong();
this.details = in.readOptionalString();
}

public String getSnapshotName() {
return snapshotName;
}

public long getTimestamp() {
return timestamp;
@Nullable
public Long getSnapshotStartTimestamp() {
masseyke marked this conversation as resolved.
Show resolved Hide resolved
return snapshotStartTimestamp;
}

public long getSnapshotFinishTimestamp() {
return snapshotFinishTimestamp;
}

public String getDetails() {
Expand All @@ -77,7 +93,10 @@ public String getDetails() {
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(snapshotName);
out.writeVLong(timestamp);
if (out.getVersion().onOrAfter(Version.V_8_0_0)) {
out.writeOptionalVLong(snapshotStartTimestamp);
}
out.writeVLong(snapshotFinishTimestamp);
out.writeOptionalString(details);
}

Expand All @@ -86,7 +105,10 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.startObject();
{
builder.field(SNAPSHOT_NAME.getPreferredName(), snapshotName);
builder.timeField(TIMESTAMP.getPreferredName(), "time_string", timestamp);
if (snapshotStartTimestamp != null) {
builder.timeField(START_TIMESTAMP.getPreferredName(), "start_time_string", snapshotStartTimestamp);
}
builder.timeField(TIMESTAMP.getPreferredName(), "time_string", snapshotFinishTimestamp);
if (Objects.nonNull(details)) {
builder.field(DETAILS.getPreferredName(), details);
}
Expand All @@ -100,13 +122,14 @@ public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
SnapshotInvocationRecord that = (SnapshotInvocationRecord) o;
return getTimestamp() == that.getTimestamp() &&
return getSnapshotFinishTimestamp() == that.getSnapshotFinishTimestamp() &&
Objects.equals(getSnapshotStartTimestamp(), that.getSnapshotStartTimestamp()) &&
Objects.equals(getSnapshotName(), that.getSnapshotName()) &&
Objects.equals(getDetails(), that.getDetails());
}

@Override
public int hashCode() {
return Objects.hash(getSnapshotName(), getTimestamp(), getDetails());
return Objects.hash(getSnapshotName(), getSnapshotStartTimestamp(), getSnapshotFinishTimestamp(), getDetails());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,30 @@ public void testSlmPolicyNotExecuted() throws IOException {
}

public void testSlmPolicyExecutedBeforeStep() throws IOException {
// The snapshot was started and finished before the phase time, so we do not expect the step to finish:
assertSlmPolicyExecuted(false, false);
}

public void testSlmPolicyExecutedAfterStep() throws IOException {
// The snapshot was started and finished after the phase time, so we do expect the step to finish:
assertSlmPolicyExecuted(true, true);
}

public void testSlmPolicyNotExecutedWhenStartIsBeforePhaseTime() throws IOException {
// The snapshot was started before the phase time and finished after, so we do expect the step to finish:
assertSlmPolicyExecuted(false, true);
}

private void assertSlmPolicyExecuted(boolean startTimeAfterPhaseTime, boolean finishTimeAfterPhaseTime) throws IOException {
long phaseTime = randomLong();

WaitForSnapshotStep instance = createRandomInstance();
SnapshotLifecyclePolicyMetadata slmPolicy = SnapshotLifecyclePolicyMetadata.builder()
.setModifiedDate(randomLong())
.setPolicy(new SnapshotLifecyclePolicy("", "", "", "", null, null))
.setLastSuccess(new SnapshotInvocationRecord("", phaseTime - 10, ""))
.setLastSuccess(new SnapshotInvocationRecord("",
phaseTime + (startTimeAfterPhaseTime ? 10 : -100),
phaseTime + (finishTimeAfterPhaseTime ? 100 : -10), ""))
.build();
SnapshotLifecycleMetadata smlMetadata = new SnapshotLifecycleMetadata(Map.of(instance.getPolicy(), slmPolicy),
OperationMode.RUNNING, null);
Expand All @@ -115,18 +132,25 @@ public void testSlmPolicyExecutedBeforeStep() throws IOException {
Metadata.Builder meta = Metadata.builder().indices(indices.build()).putCustom(SnapshotLifecycleMetadata.TYPE, smlMetadata);
ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metadata(meta).build();
ClusterStateWaitStep.Result result = instance.isConditionMet(indexMetadata.getIndex(), clusterState);
assertFalse(result.isComplete());
assertTrue(getMessage(result).contains("to be executed"));
if (startTimeAfterPhaseTime) {
assertTrue(result.isComplete());
assertNull(result.getInfomationContext());
} else {
assertFalse(result.isComplete());
assertTrue(getMessage(result).contains("to be executed"));
}
}

public void testSlmPolicyExecutedAfterStep() throws IOException {
public void testNullStartTime() throws IOException {
long phaseTime = randomLong();

WaitForSnapshotStep instance = createRandomInstance();
SnapshotLifecyclePolicyMetadata slmPolicy = SnapshotLifecyclePolicyMetadata.builder()
.setModifiedDate(randomLong())
.setPolicy(new SnapshotLifecyclePolicy("", "", "", "", null, null))
.setLastSuccess(new SnapshotInvocationRecord("", phaseTime + 10, ""))
.setLastSuccess(new SnapshotInvocationRecord("",
null,
phaseTime + 100, ""))
.build();
SnapshotLifecycleMetadata smlMetadata = new SnapshotLifecycleMetadata(Map.of(instance.getPolicy(), slmPolicy),
OperationMode.RUNNING, null);
Expand All @@ -140,8 +164,8 @@ public void testSlmPolicyExecutedAfterStep() throws IOException {
Metadata.Builder meta = Metadata.builder().indices(indices.build()).putCustom(SnapshotLifecycleMetadata.TYPE, smlMetadata);
ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metadata(meta).build();
ClusterStateWaitStep.Result result = instance.isConditionMet(indexMetadata.getIndex(), clusterState);
assertTrue(result.isComplete());
assertNull(result.getInfomationContext());
assertFalse(result.isComplete());
assertTrue(getMessage(result).contains("to be executed"));
}

private String getMessage(ClusterStateWaitStep.Result result) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,17 @@ protected SnapshotInvocationRecord mutateInstance(SnapshotInvocationRecord insta
case 0:
return new SnapshotInvocationRecord(
randomValueOtherThan(instance.getSnapshotName(), () -> randomAlphaOfLengthBetween(2,10)),
instance.getTimestamp(),
instance.getSnapshotFinishTimestamp() - 100,
instance.getSnapshotFinishTimestamp(),
instance.getDetails());
case 1:
long timestamp = randomValueOtherThan(instance.getSnapshotFinishTimestamp(), ESTestCase::randomNonNegativeLong);
return new SnapshotInvocationRecord(instance.getSnapshotName(),
randomValueOtherThan(instance.getTimestamp(), ESTestCase::randomNonNegativeLong),
timestamp - 100, timestamp,
instance.getDetails());
case 2:
return new SnapshotInvocationRecord(instance.getSnapshotName(),
instance.getTimestamp(),
instance.getSnapshotFinishTimestamp() - 100, instance.getSnapshotFinishTimestamp(),
randomValueOtherThan(instance.getDetails(), () -> randomAlphaOfLengthBetween(2,10)));
default:
throw new AssertionError("failure, got illegal switch case");
Expand All @@ -55,8 +57,18 @@ protected SnapshotInvocationRecord mutateInstance(SnapshotInvocationRecord insta
public static SnapshotInvocationRecord randomSnapshotInvocationRecord() {
return new SnapshotInvocationRecord(
randomAlphaOfLengthBetween(5,10),
randomNonNegativeNullableLong(),
randomNonNegativeLong(),
randomBoolean() ? null : randomAlphaOfLengthBetween(5, 10));
}

private static Long randomNonNegativeNullableLong() {
long value = randomLong();
if (value < 0) {
return null;
} else {
return value;
}
}

}
Loading