Skip to content

Commit

Permalink
[ML] fix race condition with job snapshot upgrade and feature reset (#…
Browse files Browse the repository at this point in the history
…85121) (#85396)

This addresses a race condition in feature reset && model snapshot upgrade.

It is possible that feature reset is called soon after the task is assigned, meaning the persistent task is cancelled before the model snapshot upgrader is set on the task.

This means that the task could continue executing even after being cancelled as nothing else checks if the feature is being reset or if the task has been cancelled.

closes #84709
  • Loading branch information
benwtrent committed Mar 28, 2022
1 parent a2ae13f commit fe97af9
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,17 @@ public void upgradeSnapshot(SnapshotUpgradeTask task, Consumer<Exception> closeH
);
return;
}
if (resetInProgress) {
logger.trace(
() -> new ParameterizedMessage(
"Aborted upgrading snapshot [{}] for job [{}] as ML feature is being reset",
snapshotId,
jobId
)
);
closeHandler.accept(null);
return;
}
// We need to fork, otherwise we restore model state from a network thread (several GET api calls):
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME).execute(new AbstractRunnable() {
@Override
Expand All @@ -500,6 +511,17 @@ protected void doRun() {
closeHandler.accept(null);
return;
}
if (resetInProgress) {
logger.trace(
() -> new ParameterizedMessage(
"Aborted upgrading snapshot [{}] for job [{}] as ML feature is being reset",
snapshotId,
jobId
)
);
closeHandler.accept(null);
return;
}
runSnapshotUpgrade(task, job, params, closeHandler);
}
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,10 @@ public final class JobModelSnapshotUpgrader {
}

synchronized void start() {
task.setJobModelSnapshotUpgrader(this);
if (task.setJobModelSnapshotUpgrader(this) == false) {
this.killProcess(task.getReasonCancelled());
return;
}

// A TP with no queue, so that we fail immediately if there are no threads available
ExecutorService autodetectExecutorService = threadPool.executor(MachineLearning.JOB_COMMS_THREAD_POOL_NAME);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,15 @@ protected synchronized void onCancelled() {
}
}

public synchronized void setJobModelSnapshotUpgrader(JobModelSnapshotUpgrader jobModelSnapshotUpgrader) {
/**
* @param jobModelSnapshotUpgrader the snapshot upgrader
* @return false if the task has been canceled and upgrader not set
*/
public synchronized boolean setJobModelSnapshotUpgrader(JobModelSnapshotUpgrader jobModelSnapshotUpgrader) {
if (this.isCancelled()) {
return false;
}
this.jobModelSnapshotUpgrader = jobModelSnapshotUpgrader;
return true;
}
}

0 comments on commit fe97af9

Please sign in to comment.