From 1438fb2d0076019655f9c15c44f0c0b620e25c6f Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 4 Feb 2022 15:59:23 -0500 Subject: [PATCH 1/2] fix a bug in repeat submission When a previously failed job is resubmitted but the resubmission does not actually go through, its stale job id should be cleared. Otherwise, the job will be considered failed again (although it was never actually submitted). --- dpdispatcher/submission.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 4938d6de..b305dcba 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -610,8 +610,8 @@ def register_job_id(self, job_id): def submit_job(self): job_id = self.machine.do_submit(self) + self.register_job_id(job_id) if job_id: - self.register_job_id(job_id) self.job_state = JobStatus.waiting else: self.job_state = JobStatus.unsubmitted From b0ae5891eccaa3b7a1d30b63c1a4c0fbbfe26250 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 4 Feb 2022 16:08:11 -0500 Subject: [PATCH 2/2] skip logging if it is not submitted --- dpdispatcher/submission.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index b305dcba..e1f82961 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -559,11 +559,12 @@ def handle_unexpected_job_state(self): if ( self.fail_count ) > 0 and ( self.fail_count % 3 == 0 ) : raise RuntimeError(f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times.job_detail:{self}") self.submit_job() - dlog.info("job:{job_hash} re-submit after terminated; new job_id is {job_id}".format(job_hash=self.job_hash, job_id=self.job_id)) - time.sleep(0.2) - self.get_job_state() - dlog.info(f"job:{self.job_hash} job_id:{self.job_id} after re-submitting; the state now is {repr(self.job_state)}") - self.handle_unexpected_job_state() + if self.job_state != JobStatus.unsubmitted: + dlog.info("job:{job_hash} re-submit after 
terminated; new job_id is {job_id}".format(job_hash=self.job_hash, job_id=self.job_id)) + time.sleep(0.2) + self.get_job_state() + dlog.info(f"job:{self.job_hash} job_id:{self.job_id} after re-submitting; the state now is {repr(self.job_state)}") + self.handle_unexpected_job_state() if job_state == JobStatus.unsubmitted: dlog.debug(f"job: {self.job_hash} unsubmitted; submit it")