|
32 | 32 | NO_REMOTE_GALAXY_FOR_METADATA_MESSAGE = "Pulsar misconfiguration - Pulsar client configured to set metadata remotely, but remote Pulsar isn't properly configured with a galaxy_home directory."
|
33 | 33 | NO_REMOTE_DATATYPES_CONFIG = "Pulsar client is configured to use remote datatypes configuration when setting metadata externally, but Pulsar is not configured with this information. Defaulting to datatypes_conf.xml."
|
34 | 34 | GENERIC_REMOTE_ERROR = "Failed to communicate with remote job server."
|
| 35 | +FAILED_REMOTE_ERROR = "Remote job server indicated a problem running or monitoring this job." |
| 36 | +LOST_REMOTE_ERROR = "Remote job server could not determine this job's state." |
35 | 37 |
|
36 | 38 | # Is there a good way to infer some default for this? Can only use
|
37 | 39 | # url_for from web threads. https://gist.github.com/jmchilton/9098762
|
@@ -170,8 +172,13 @@ def _update_job_state_for_status(self, job_state, pulsar_status):
|
170 | 172 | if pulsar_status == "complete":
|
171 | 173 | self.mark_as_finished(job_state)
|
172 | 174 | return None
|
173 |
| - if pulsar_status == "failed": |
174 |
| - self.fail_job(job_state) |
| 175 | + if pulsar_status in ["failed", "lost"]: |
| 176 | + if pulsar_status == "failed": |
| 177 | + message = FAILED_REMOTE_ERROR |
| 178 | + else: |
| 179 | + message = LOST_REMOTE_ERROR |
| 180 | + if not job_state.job_wrapper.get_job().finished: |
| 181 | + self.fail_job(job_state, message) |
175 | 182 | return None
|
176 | 183 | if pulsar_status == "running" and not job_state.running:
|
177 | 184 | job_state.running = True
|
@@ -398,12 +405,12 @@ def finish_job( self, job_state ):
|
398 | 405 | log.exception("Job wrapper finish method failed")
|
399 | 406 | job_wrapper.fail("Unable to finish job", exception=True)
|
400 | 407 |
|
401 |
| - def fail_job( self, job_state ): |
| 408 | + def fail_job( self, job_state, message=GENERIC_REMOTE_ERROR ): |
402 | 409 | """
|
403 | 410 | Separated out so we can use the worker threads for it.
|
404 | 411 | """
|
405 | 412 | self.stop_job( self.sa_session.query( self.app.model.Job ).get( job_state.job_wrapper.job_id ) )
|
406 |
| - job_state.job_wrapper.fail( getattr( job_state, "fail_message", GENERIC_REMOTE_ERROR ) ) |
| 413 | + job_state.job_wrapper.fail( getattr( job_state, "fail_message", message ) ) |
407 | 414 |
|
408 | 415 | def check_pid( self, pid ):
|
409 | 416 | try:
|
|
0 commit comments