Skip to content

Commit

Permalink
Merge pull request #5786 from garlick/sched_free_noresponse
Browse files Browse the repository at this point in the history
job-manager: drop sched.free response requirement
  • Loading branch information
mergify[bot] committed Mar 11, 2024
2 parents 3f26e6e + a1abce4 commit 9373f55
Show file tree
Hide file tree
Showing 13 changed files with 16 additions and 92 deletions.
1 change: 0 additions & 1 deletion src/cmd/flux-queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ def alloc_query(handle):
sys.exit(1)
print("{0} alloc requests queued".format(query["queue_length"]))
print("{0} alloc requests pending to scheduler".format(query["alloc_pending"]))
print("{0} free requests pending to scheduler".format(query["free_pending"]))
print("{0} running jobs".format(query["running"]))


Expand Down
6 changes: 1 addition & 5 deletions src/common/libschedutil/free.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,7 @@

int schedutil_free_respond (schedutil_t *util, const flux_msg_t *msg)
{
flux_jobid_t id;

if (flux_request_unpack (msg, NULL, "{s:I}", "id", &id) < 0)
return -1;
return flux_respond_pack (util->h, msg, "{s:I}", "id", id);
return 0;
}

/*
Expand Down
2 changes: 1 addition & 1 deletion src/common/libschedutil/free.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
extern "C" {
#endif

/* Respond to a free request.
/* This is a no-op now that sched.free doesn't require a response.
*/
int schedutil_free_respond (schedutil_t *util, const flux_msg_t *msg);

Expand Down
52 changes: 2 additions & 50 deletions src/modules/job-manager/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ struct alloc {
unsigned int alloc_limit;
// e.g. for mode limited w/ limit=1, max of 1
unsigned int alloc_pending_count;
unsigned int free_pending_count;
char *sched_sender; // for disconnect
};

Expand Down Expand Up @@ -106,55 +105,16 @@ static void interface_teardown (struct alloc *alloc, char *s, int errnum)
*/
if (job->alloc_pending)
requeue_pending (alloc, job);
/* jobs with free request pending (much smaller window for this
* to be true) need to be picked up again after 'ready'.
*/
job->free_pending = 0;
job = zhashx_next (ctx->active_jobs);
}
alloc->ready = false;
alloc->alloc_pending_count = 0;
alloc->free_pending_count = 0;
free (alloc->sched_sender);
alloc->sched_sender = NULL;
drain_check (alloc->ctx->drain);
}
}

/* Handle a sched.free response.
*/
static void free_response_cb (flux_t *h, flux_msg_handler_t *mh,
const flux_msg_t *msg, void *arg)
{
struct job_manager *ctx = arg;
flux_jobid_t id = 0;
struct job *job;

if (flux_response_decode (msg, NULL, NULL) < 0)
goto teardown;
if (flux_msg_unpack (msg, "{s:I}", "id", &id) < 0)
goto teardown;
if (!(job = zhashx_lookup (ctx->active_jobs, &id))) {
flux_log (h, LOG_ERR, "sched.free-response: id=%s not active",
idf58 (id));
errno = EINVAL;
goto teardown;
}
if (!job->has_resources) {
flux_log (h, LOG_ERR, "sched.free-response: %s not allocated",
idf58 (id));
errno = EINVAL;
goto teardown;
}
job->free_pending = 0;
ctx->alloc->free_pending_count--;
if (event_job_post_pack (ctx->event, job, "free", 0, NULL) < 0)
goto teardown;
return;
teardown:
interface_teardown (ctx->alloc, "free response error", errno);
}

/* Send sched.free request for job.
* Update flags.
*/
Expand Down Expand Up @@ -580,17 +540,15 @@ static void check_cb (flux_reactor_t *r, flux_watcher_t *w,
int alloc_send_free_request (struct alloc *alloc, struct job *job)
{
assert (job->state == FLUX_JOB_STATE_CLEANUP);
if (!job->free_pending && alloc->ready) {
if (alloc->ready) {
if (free_request (alloc, job) < 0)
return -1;
job->free_pending = 1;
if ((job->flags & FLUX_JOB_DEBUG))
(void)event_job_post_pack (alloc->ctx->event,
job,
"debug.free-request",
0,
NULL);
alloc->free_pending_count++;
}
return 0;
}
Expand Down Expand Up @@ -742,10 +700,9 @@ static void alloc_query_cb (flux_t *h,

if (flux_respond_pack (h,
msg,
"{s:i s:i s:i s:i}",
"{s:i s:i s:i}",
"queue_length", zlistx_size (alloc->queue),
"alloc_pending", alloc->alloc_pending_count,
"free_pending", alloc->free_pending_count,
"running", alloc->ctx->running_jobs) < 0)
flux_log_error (h, "%s: flux_respond", __FUNCTION__);
return;
Expand Down Expand Up @@ -805,11 +762,6 @@ static const struct flux_msg_handler_spec htab[] = {
alloc_response_cb,
0
},
{ FLUX_MSGTYPE_RESPONSE,
"sched.free",
free_response_cb,
0
},
FLUX_MSGHANDLER_TABLE_END,
};

Expand Down
6 changes: 4 additions & 2 deletions src/modules/job-manager/event.c
Original file line number Diff line number Diff line change
Expand Up @@ -327,16 +327,18 @@ int event_job_action (struct event *event, struct job *job)
&& !job->perilog_active
&& !job->alloc_bypass
&& !job->start_pending
&& !job->free_pending) {
&& !job->free_posted) {
if (alloc_send_free_request (ctx->alloc, job) < 0)
return -1;
if (event_job_post_pack (ctx->event, job, "free", 0, NULL) < 0)
return -1;
job->free_posted = 1;
}

/* Post cleanup event when cleanup is complete.
*/
if (!job->alloc_queued
&& !job->alloc_pending
&& !job->free_pending
&& !job->start_pending
&& !job->has_resources
&& !job_event_is_queued (job, "epilog-start")
Expand Down
2 changes: 1 addition & 1 deletion src/modules/job-manager/job.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ struct job {
uint8_t alloc_queued:1; // queued for alloc, but alloc request not sent
uint8_t alloc_pending:1;// alloc request sent to sched
uint8_t alloc_bypass:1; // alloc bypass enabled
uint8_t free_pending:1; // free request sent to sched
uint8_t free_posted:1; // free event already posted
uint8_t has_resources:1;
uint8_t start_pending:1;// start request sent to job-exec
uint8_t reattach:1;
Expand Down
3 changes: 1 addition & 2 deletions src/modules/job-manager/jobtap.c
Original file line number Diff line number Diff line change
Expand Up @@ -2356,8 +2356,7 @@ static int jobtap_emit_perilog_event (struct jobtap *jobtap,
*/
if ((prolog && job->start_pending)
|| ((prolog && start) && job->state == FLUX_JOB_STATE_CLEANUP)
|| (!prolog && job->state != FLUX_JOB_STATE_CLEANUP)
|| (!prolog && job->free_pending)) {
|| (!prolog && job->state != FLUX_JOB_STATE_CLEANUP)) {
errno = EINVAL;
return -1;
}
Expand Down
8 changes: 0 additions & 8 deletions src/modules/job-manager/test/job.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ void test_create (void)
&& job->flags == 0,
"job_create set id, urgency, userid, and t_submit to expected values");
ok (!job->alloc_pending
&& !job->free_pending
&& !job->has_resources,
"job_create set no internal flags");
ok (job->handle == NULL,
Expand Down Expand Up @@ -190,7 +189,6 @@ void test_create_from_eventlog (void)
ok (job->id == 2,
"job_create_from_eventlog log=(submit) set id from param");
ok (!job->alloc_pending
&& !job->free_pending
&& !job->has_resources,
"job_create_from_eventlog log=(submit) set no internal flags");
ok (job->userid == 66,
Expand Down Expand Up @@ -224,7 +222,6 @@ void test_create_from_eventlog (void)
ok (job->t_submit == 42.2,
"job_create_from_eventlog log=(submit+urgency) set t_submit from submit");
ok (!job->alloc_pending
&& !job->free_pending
&& !job->has_resources,
"job_create_from_eventlog log=(submit+urgency) set no internal flags");
ok (job->state == FLUX_JOB_STATE_DEPEND,
Expand Down Expand Up @@ -252,7 +249,6 @@ void test_create_from_eventlog (void)
ok (job->t_submit == 42.2,
"job_create_from_eventlog log=(submit+depend+priority) set t_submit from submit");
ok (!job->alloc_pending
&& !job->free_pending
&& !job->has_resources,
"job_create_from_eventlog log=(submit+depend+priority) set no internal flags");
ok (job->state == FLUX_JOB_STATE_SCHED,
Expand All @@ -276,7 +272,6 @@ void test_create_from_eventlog (void)
ok (job->t_submit == 42.2,
"job_create_from_eventlog log=(submit+ex0) set t_submit from submit");
ok (!job->alloc_pending
&& !job->free_pending
&& !job->has_resources,
"job_create_from_eventlog log=(submit+ex0) set no internal flags");
ok (job->state == FLUX_JOB_STATE_CLEANUP,
Expand All @@ -296,7 +291,6 @@ void test_create_from_eventlog (void)
ok (job->state == FLUX_JOB_STATE_DEPEND,
"job_create_from_eventlog log=(submit+ex1) set state=DEPEND");
ok (!job->alloc_pending
&& !job->free_pending
&& !job->has_resources,
"job_create_from_eventlog log=(submit+ex1) set no internal flags");
job_decref (job);
Expand All @@ -312,7 +306,6 @@ void test_create_from_eventlog (void)
error.text);
}
ok (!job->alloc_pending
&& !job->free_pending
&& job->has_resources,
"job_create_from_eventlog log=(submit+depend+priority+alloc) set has_resources flag");
ok (job->R_redacted != NULL,
Expand Down Expand Up @@ -345,7 +338,6 @@ void test_create_from_eventlog (void)
error.text);
}
ok (!job->alloc_pending
&& !job->free_pending
&& !job->has_resources,
"job_create_from_eventlog log=(submit+depend+priority+alloc+ex0+free) set no internal flags");
ok (job->state == FLUX_JOB_STATE_CLEANUP,
Expand Down
8 changes: 4 additions & 4 deletions src/modules/sched-simple/sched.c
Original file line number Diff line number Diff line change
Expand Up @@ -365,16 +365,16 @@ void free_cb (flux_t *h, const flux_msg_t *msg, const char *R_str, void *arg)

if (flux_request_unpack (msg, NULL, "{s:o}", "R", &R) < 0) {
flux_log (h, LOG_ERR, "free: error unpacking sched.free request");
if (flux_respond_error (h, msg, EINVAL, NULL) < 0)
flux_log_error (h, "free_cb: flux_respond_error");
return;
}

if (try_free (h, ss, R) < 0) {
if (flux_respond_error (h, msg, errno, NULL) < 0)
flux_log_error (h, "free_cb: flux_respond_error");
flux_log_error (h, "free: could not free R");
return;
}
/* This is a no-op now that sched.free requires no response
* but we still call it to get test coverage.
*/
if (schedutil_free_respond (ss->util_ctx, msg) < 0)
flux_log_error (h, "free_cb: schedutil_free_respond");

Expand Down
6 changes: 0 additions & 6 deletions t/t2223-job-manager-queue-priority-order-limited.t
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
Scheduling is started
2 alloc requests queued
2 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_all1.exp counts_all1.out
Expand All @@ -58,7 +57,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
Scheduling is stopped
0 alloc requests queued
0 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_all2.exp counts_all2.out
Expand Down Expand Up @@ -89,7 +87,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
Scheduling is started
1 alloc requests queued
2 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_all3.exp counts_all3.out
Expand Down Expand Up @@ -173,7 +170,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
debug: Scheduling is started
6 alloc requests queued
2 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_named1.exp counts_named1.out
Expand Down Expand Up @@ -211,7 +207,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
debug: Scheduling is stopped
0 alloc requests queued
2 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_named2.exp counts_named2.out
Expand Down Expand Up @@ -253,7 +248,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
debug: Scheduling is started
2 alloc requests queued
2 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_named3.exp counts_named3.out
Expand Down
6 changes: 0 additions & 6 deletions t/t2224-job-manager-queue-priority-order-unlimited.t
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
Scheduling is started
0 alloc requests queued
3 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_all1.exp counts_all1.out
Expand All @@ -57,7 +56,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
Scheduling is stopped
0 alloc requests queued
0 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_all2.exp counts_all2.out
Expand Down Expand Up @@ -87,7 +85,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
Scheduling is started
0 alloc requests queued
2 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_all3.exp counts_all3.out
Expand Down Expand Up @@ -144,7 +141,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
debug: Scheduling is started
0 alloc requests queued
8 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_named1.exp counts_named1.out
Expand Down Expand Up @@ -182,7 +178,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
debug: Scheduling is stopped
0 alloc requests queued
2 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_named2.exp counts_named2.out
Expand Down Expand Up @@ -224,7 +219,6 @@ test_expect_success 'job-manager: queue counts are as expected' '
debug: Scheduling is started
0 alloc requests queued
4 alloc requests pending to scheduler
0 free requests pending to scheduler
2 running jobs
EOT
test_cmp counts_named3.exp counts_named3.out
Expand Down
2 changes: 0 additions & 2 deletions t/t2240-queue-cmd.t
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,6 @@ test_expect_success 'flux-queue: queue status -v shows expected counts' '
Scheduling is started
1 alloc requests queued
1 alloc requests pending to scheduler
0 free requests pending to scheduler
1 running jobs
EOT
test_cmp stat.exp stat.out
Expand All @@ -282,7 +281,6 @@ test_expect_success 'flux-queue: queue status -v shows expected counts' '
Scheduling is stopped
0 alloc requests queued
0 alloc requests pending to scheduler
0 free requests pending to scheduler
0 running jobs
EOT
test_cmp stat2.exp stat2.out
Expand Down
6 changes: 2 additions & 4 deletions t/t2300-sched-simple.t
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,7 @@ test_expect_success 'sched-simple: remove sched-simple and cancel jobs' '
'
test_expect_success 'sched-simple: there are no outstanding sched requests' '
flux queue status -v >queue_status.out &&
grep "0 alloc requests pending to scheduler" queue_status.out &&
grep "0 free requests pending to scheduler" queue_status.out
grep "0 alloc requests pending to scheduler" queue_status.out
'
test_expect_success 'sched-simple: reload in unlimited mode' '
flux module load sched-simple mode=unlimited &&
Expand Down Expand Up @@ -257,8 +256,7 @@ test_expect_success 'sched-simple: remove sched-simple and cancel jobs' '
'
test_expect_success 'sched-simple: there are no outstanding sched requests' '
flux queue status -v >queue_status.out &&
grep "0 alloc requests pending to scheduler" queue_status.out &&
grep "0 free requests pending to scheduler" queue_status.out
grep "0 alloc requests pending to scheduler" queue_status.out
'

test_expect_success 'sched-simple: load sched-simple and wait for queue drain' '
Expand Down

0 comments on commit 9373f55

Please sign in to comment.