Skip to content

Commit

Permalink
[ML] Fix possible race condition when starting datafeed (elastic#51302)
Browse files Browse the repository at this point in the history
The ID of the datafeed's associated job was being obtained
frequently by looking up the datafeed task in a map that
was being modified in other threads.  This could lead to
NPEs if the datafeed stopped running at an unexpected time.

This change reduces the number of places where a datafeed's
associated job ID is looked up to avoid the possibility of
failures when the datafeed's task is removed from the map
of running tasks during multi-step operations in other
threads.

Fixes elastic#51285
  • Loading branch information
droberts195 authored and debadair committed Jan 28, 2020
1 parent d9d58da commit 76f4f21
Showing 1 changed file with 25 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,14 @@ public void run(TransportStartDatafeedAction.DatafeedTask task, Consumer<Excepti

ActionListener<DatafeedJob> datafeedJobHandler = ActionListener.wrap(
datafeedJob -> {
String jobId = datafeedJob.getJobId();
Holder holder = new Holder(task, datafeedId, datafeedJob,
new ProblemTracker(auditor, datafeedJob.getJobId()), finishHandler);
new ProblemTracker(auditor, jobId), finishHandler);
runningDatafeedsOnThisNode.put(task.getAllocationId(), holder);
task.updatePersistentTaskState(DatafeedState.STARTED, new ActionListener<PersistentTask<?>>() {
@Override
public void onResponse(PersistentTask<?> persistentTask) {
taskRunner.runWhenJobIsOpened(task);
taskRunner.runWhenJobIsOpened(task, jobId);
}

@Override
Expand Down Expand Up @@ -267,17 +268,23 @@ protected void doRun() {
}
}

private String getJobId(TransportStartDatafeedAction.DatafeedTask task) {
return runningDatafeedsOnThisNode.get(task.getAllocationId()).getJobId();
/**
* Returns <code>null</code> if the datafeed is not running on this node.
*/
private String getJobIdIfDatafeedRunningOnThisNode(TransportStartDatafeedAction.DatafeedTask task) {
Holder holder = runningDatafeedsOnThisNode.get(task.getAllocationId());
if (holder == null) {
return null;
}
return holder.getJobId();
}

private JobState getJobState(PersistentTasksCustomMetaData tasks, TransportStartDatafeedAction.DatafeedTask datafeedTask) {
return MlTasks.getJobStateModifiedForReassignments(getJobId(datafeedTask), tasks);
private JobState getJobState(PersistentTasksCustomMetaData tasks, String jobId) {
return MlTasks.getJobStateModifiedForReassignments(jobId, tasks);
}

private boolean jobHasOpenAutodetectCommunicator(PersistentTasksCustomMetaData tasks,
TransportStartDatafeedAction.DatafeedTask datafeedTask) {
PersistentTasksCustomMetaData.PersistentTask<?> jobTask = MlTasks.getJobTask(getJobId(datafeedTask), tasks);
private boolean jobHasOpenAutodetectCommunicator(PersistentTasksCustomMetaData tasks, String jobId) {
PersistentTasksCustomMetaData.PersistentTask<?> jobTask = MlTasks.getJobTask(jobId, tasks);
if (jobTask == null) {
return false;
}
Expand Down Expand Up @@ -492,14 +499,14 @@ private class TaskRunner implements ClusterStateListener {

private final List<TransportStartDatafeedAction.DatafeedTask> tasksToRun = new CopyOnWriteArrayList<>();

private void runWhenJobIsOpened(TransportStartDatafeedAction.DatafeedTask datafeedTask) {
private void runWhenJobIsOpened(TransportStartDatafeedAction.DatafeedTask datafeedTask, String jobId) {
ClusterState clusterState = clusterService.state();
PersistentTasksCustomMetaData tasks = clusterState.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);
if (getJobState(tasks, datafeedTask) == JobState.OPENED && jobHasOpenAutodetectCommunicator(tasks, datafeedTask)) {
if (getJobState(tasks, jobId) == JobState.OPENED && jobHasOpenAutodetectCommunicator(tasks, jobId)) {
runTask(datafeedTask);
} else {
logger.info("Datafeed [{}] is waiting for job [{}] to be opened",
datafeedTask.getDatafeedId(), getJobId(datafeedTask));
datafeedTask.getDatafeedId(), jobId);
tasksToRun.add(datafeedTask);
}
}
Expand Down Expand Up @@ -530,17 +537,19 @@ public void clusterChanged(ClusterChangedEvent event) {

List<TransportStartDatafeedAction.DatafeedTask> remainingTasks = new ArrayList<>();
for (TransportStartDatafeedAction.DatafeedTask datafeedTask : tasksToRun) {
if (runningDatafeedsOnThisNode.containsKey(datafeedTask.getAllocationId()) == false) {
String jobId = getJobIdIfDatafeedRunningOnThisNode(datafeedTask);
if (jobId == null) {
// Datafeed is not running on this node any more
continue;
}
JobState jobState = getJobState(currentTasks, datafeedTask);
if (jobState == JobState.OPENING || jobHasOpenAutodetectCommunicator(currentTasks, datafeedTask) == false) {
JobState jobState = getJobState(currentTasks, jobId);
if (jobState == JobState.OPENING || jobHasOpenAutodetectCommunicator(currentTasks, jobId) == false) {
remainingTasks.add(datafeedTask);
} else if (jobState == JobState.OPENED) {
runTask(datafeedTask);
} else {
logger.warn("Datafeed [{}] is stopping because job [{}] state is [{}]",
datafeedTask.getDatafeedId(), getJobId(datafeedTask), jobState);
datafeedTask.getDatafeedId(), jobId, jobState);
datafeedTask.stop("job_never_opened", TimeValue.timeValueSeconds(20));
}
}
Expand Down

0 comments on commit 76f4f21

Please sign in to comment.