Skip to content

Commit

Permalink
[ML] Fix possible race condition when starting datafeed
Browse files Browse the repository at this point in the history
The ID of the datafeed's associated job was being obtained
frequently by looking up the datafeed task in a map that
was being modified in other threads.  This could lead to
NPEs if the datafeed stopped running at an unexpected time.

This change reduces the number of places where a datafeed's
associated job ID is looked up to avoid the possibility of
failures when the datafeed's task is removed from the map
of running tasks during multi-step operations in other
threads.

Fixes elastic#51285
  • Loading branch information
droberts195 committed Jan 22, 2020
1 parent fac1247 commit b61ab9b
Showing 1 changed file with 25 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,14 @@ public void run(TransportStartDatafeedAction.DatafeedTask task, Consumer<Excepti

ActionListener<DatafeedJob> datafeedJobHandler = ActionListener.wrap(
datafeedJob -> {
String jobId = datafeedJob.getJobId();
Holder holder = new Holder(task, datafeedId, datafeedJob,
new ProblemTracker(auditor, datafeedJob.getJobId()), finishHandler);
new ProblemTracker(auditor, jobId), finishHandler);
runningDatafeedsOnThisNode.put(task.getAllocationId(), holder);
task.updatePersistentTaskState(DatafeedState.STARTED, new ActionListener<PersistentTask<?>>() {
@Override
public void onResponse(PersistentTask<?> persistentTask) {
taskRunner.runWhenJobIsOpened(task);
taskRunner.runWhenJobIsOpened(task, jobId);
}

@Override
Expand Down Expand Up @@ -267,17 +268,23 @@ protected void doRun() {
}
}

private String getJobId(TransportStartDatafeedAction.DatafeedTask task) {
return runningDatafeedsOnThisNode.get(task.getAllocationId()).getJobId();
/**
* Returns <code>null</code> if the datafeed is not running on this node.
*/
private String getJobIdIfDatafeedRunningOnThisNode(TransportStartDatafeedAction.DatafeedTask task) {
Holder holder = runningDatafeedsOnThisNode.get(task.getAllocationId());
if (holder == null) {
return null;
}
return holder.getJobId();
}

private JobState getJobState(PersistentTasksCustomMetaData tasks, TransportStartDatafeedAction.DatafeedTask datafeedTask) {
return MlTasks.getJobStateModifiedForReassignments(getJobId(datafeedTask), tasks);
private JobState getJobState(PersistentTasksCustomMetaData tasks, String jobId) {
return MlTasks.getJobStateModifiedForReassignments(jobId, tasks);
}

private boolean jobHasOpenAutodetectCommunicator(PersistentTasksCustomMetaData tasks,
TransportStartDatafeedAction.DatafeedTask datafeedTask) {
PersistentTasksCustomMetaData.PersistentTask<?> jobTask = MlTasks.getJobTask(getJobId(datafeedTask), tasks);
private boolean jobHasOpenAutodetectCommunicator(PersistentTasksCustomMetaData tasks, String jobId) {
PersistentTasksCustomMetaData.PersistentTask<?> jobTask = MlTasks.getJobTask(jobId, tasks);
if (jobTask == null) {
return false;
}
Expand Down Expand Up @@ -492,14 +499,14 @@ private class TaskRunner implements ClusterStateListener {

private final List<TransportStartDatafeedAction.DatafeedTask> tasksToRun = new CopyOnWriteArrayList<>();

private void runWhenJobIsOpened(TransportStartDatafeedAction.DatafeedTask datafeedTask) {
private void runWhenJobIsOpened(TransportStartDatafeedAction.DatafeedTask datafeedTask, String jobId) {
ClusterState clusterState = clusterService.state();
PersistentTasksCustomMetaData tasks = clusterState.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);
if (getJobState(tasks, datafeedTask) == JobState.OPENED && jobHasOpenAutodetectCommunicator(tasks, datafeedTask)) {
if (getJobState(tasks, jobId) == JobState.OPENED && jobHasOpenAutodetectCommunicator(tasks, jobId)) {
runTask(datafeedTask);
} else {
logger.info("Datafeed [{}] is waiting for job [{}] to be opened",
datafeedTask.getDatafeedId(), getJobId(datafeedTask));
datafeedTask.getDatafeedId(), jobId);
tasksToRun.add(datafeedTask);
}
}
Expand Down Expand Up @@ -530,17 +537,19 @@ public void clusterChanged(ClusterChangedEvent event) {

List<TransportStartDatafeedAction.DatafeedTask> remainingTasks = new ArrayList<>();
for (TransportStartDatafeedAction.DatafeedTask datafeedTask : tasksToRun) {
if (runningDatafeedsOnThisNode.containsKey(datafeedTask.getAllocationId()) == false) {
String jobId = getJobIdIfDatafeedRunningOnThisNode(datafeedTask);
if (jobId == null) {
// Datafeed is not running on this node any more
continue;
}
JobState jobState = getJobState(currentTasks, datafeedTask);
if (jobState == JobState.OPENING || jobHasOpenAutodetectCommunicator(currentTasks, datafeedTask) == false) {
JobState jobState = getJobState(currentTasks, jobId);
if (jobState == JobState.OPENING || jobHasOpenAutodetectCommunicator(currentTasks, jobId) == false) {
remainingTasks.add(datafeedTask);
} else if (jobState == JobState.OPENED) {
runTask(datafeedTask);
} else {
logger.warn("Datafeed [{}] is stopping because job [{}] state is [{}]",
datafeedTask.getDatafeedId(), getJobId(datafeedTask), jobState);
datafeedTask.getDatafeedId(), jobId, jobState);
datafeedTask.stop("job_never_opened", TimeValue.timeValueSeconds(20));
}
}
Expand Down

0 comments on commit b61ab9b

Please sign in to comment.