Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

testsuite: fix some test races and improve debugging #5609

Merged
merged 10 commits into from
Dec 7, 2023
10 changes: 7 additions & 3 deletions src/bindings/python/flux/uri/resolvers/lsf.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,13 @@


def check_lsf_jobid(pid, jobid):
with open(f"/proc/{pid}/environ", encoding="utf-8") as envfile:
if f"LSB_JOBID={jobid}" in envfile.read():
return True
try:
with open(f"/proc/{pid}/environ", encoding="utf-8") as envfile:
if f"LSB_JOBID={jobid}" in envfile.read():
return True
except FileNotFoundError:

Check warning on line 55 in src/bindings/python/flux/uri/resolvers/lsf.py

View check run for this annotation

Codecov / codecov/patch

src/bindings/python/flux/uri/resolvers/lsf.py#L55

Added line #L55 was not covered by tests
# if the pid disappears while we try to read it, treat that as False
pass

Check warning on line 57 in src/bindings/python/flux/uri/resolvers/lsf.py

View check run for this annotation

Codecov / codecov/patch

src/bindings/python/flux/uri/resolvers/lsf.py#L57

Added line #L57 was not covered by tests
return False


Expand Down
10 changes: 4 additions & 6 deletions t/python/t0015-job-output.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,16 +225,14 @@ def event_watch(jobid):
return stdout, stderr

jobid = self.submit(hold=True)
future = self.executor.submit(event_watch, jobid)

# Due to #5344, output_event_watch(nowait=True) won't error until
# job starts
# Note: We don't test for FileNotFoundError here because there is
# ultimately no way to make a race-free test. There would have to
# be some way to ensure the watch request has been registered first.
self.release_job(jobid)
with self.assertRaises(FileNotFoundError):
stdout, stderr = future.result()
event_wait(self.fh, jobid, "shell.init", "guest.exec.eventlog")

# Now nowait should work:
event_wait(self.fh, jobid, "shell.init", "guest.exec.eventlog")
stdout, stderr = event_watch(jobid)
self.assertEqual(stdout, self.test_stdout)
self.assertEqual(stderr, self.test_stderr)
Expand Down
1 change: 1 addition & 0 deletions t/t0005-exec.t
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ test_expect_success 'signal forwarding works' '
cat >test_signal.sh <<-EOF &&
#!/bin/bash
sig=\${1-INT}
rm -f sleepready.out
mkfifo input.fifo
stdbuf --output=L \
flux exec -v -n awk "BEGIN {print \"hi\"} {print}" input.fifo \
Expand Down
8 changes: 7 additions & 1 deletion t/t1102-cmddriver.t
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,13 @@ test_expect_success 'cmddriver inserts its path at end of PATH' '
'
# Ensure a PATH that already returns current flux first is not modified
# by flux(1)
test_expect_success READLINK 'cmddriver does not adjust PATH if unnecessary' '
#
# The following test assumes flux(1) is not in /bin or /usr/bin. Skip the
# test if it is.
#
fluxdir=$(dirname $fluxcmd)
test "$fluxdir" = "/bin" -o "$fluxdir" = "/usr/bin" || test_set_prereq NOBINDIR
test_expect_success READLINK,NOBINDIR 'cmddriver does not adjust PATH if unnecessary' '
fluxdir=$(dirname $fluxcmd) &&
mypath=/foo:/bar:$fluxdir:/usr/bin:/bin &&
newpath=$(PATH=$mypath $fluxcmd env $path_printenv PATH) &&
Expand Down
10 changes: 9 additions & 1 deletion t/t2201-job-cmd.t
Original file line number Diff line number Diff line change
Expand Up @@ -711,13 +711,17 @@ test_expect_success 'flux-job: kill --signal works' '

test_expect_success 'flux job: killall -f kills one job' '
id=$(flux submit sleep 600) &&
flux job wait-event $id start &&
flux job wait-event -vt 30 -p guest.exec.eventlog $id shell.init &&
flux job killall -f &&
run_timeout 60 flux queue drain
'

test_expect_success 'flux job: cancel can operate on multiple jobs' '
ids=$(flux submit --bcc=1-3 sleep 600) &&
for id in ${ids}; do
flux job wait-event \
-vt 30 -p guest.exec.eventlog $id shell.init
done &&
flux job cancel ${ids} cancel multiple jobs &&
for id in ${ids}; do
flux job wait-event -t 30 ${id} exception >exception.out &&
Expand All @@ -737,6 +741,10 @@ test_expect_success 'flux job: raise can operate on multiple jobs' '
# N.B. SIGTERM == 15
test_expect_success 'flux job: kill can operate on multiple jobs' '
ids=$(flux submit --wait-event=start --bcc=1-3 sleep 600) &&
for id in ${ids}; do
flux job wait-event \
-t 30 -p guest.exec.eventlog ${id} shell.init
done &&
flux job kill ${ids} &&
for id in ${ids}; do
flux job wait-event -t 30 ${id} finish >killmulti.out &&
Expand Down
2 changes: 2 additions & 0 deletions t/t2233-job-info-update.t
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,8 @@ test_expect_success NO_CHAIN_LINT 'job-info: update watch can be canceled (multi
wait $watchpidA &&
kill -s USR1 $watchpidB &&
wait $watchpidB &&
test_debug "echo watch10A: $(cat watch10A.out)" &&
test_debug "echo watch10B: $(cat watch10B.out)" &&
flux cancel $jobid &&
test $(cat watch10A.out | wc -l) -eq 2 &&
test $(cat watch10B.out | wc -l) -eq 1 &&
Expand Down
10 changes: 10 additions & 0 deletions t/t2260-job-list.t
Original file line number Diff line number Diff line change
Expand Up @@ -1863,6 +1863,7 @@ test_expect_success 'verify task count preserved across restart' '
jobid1=`cat success1.id` &&
jobid2=`cat success2.id` &&
obj=$(flux job list -s inactive | grep ${jobid1}) &&
test_debug "echo $obj | jq -S ." &&
echo $obj | jq -e ".success == true" &&
obj=$(flux job list -s inactive | grep ${jobid2}) &&
echo $obj | jq -e ".success == false"
Expand All @@ -1875,6 +1876,7 @@ test_expect_success 'flux job list outputs exceptions correctly (no exception)'
echo $jobid > exceptions1.id &&
wait_jobid_state $jobid inactive &&
obj=$(flux job list -s inactive | grep $jobid) &&
test_debug "echo $obj | jq -S ." &&
echo $obj | jq -e ".exception_occurred == false" &&
echo $obj | jq -e ".exception_severity == null" &&
echo $obj | jq -e ".exception_type == null" &&
Expand All @@ -1886,6 +1888,7 @@ test_expect_success 'flux job list outputs exceptions correctly (exception)' '
echo $jobid > exceptions2.id &&
wait_jobid_state $jobid inactive &&
obj=$(flux job list -s inactive | grep $jobid) &&
test_debug "echo $obj | jq -S ." &&
echo $obj | jq -e ".exception_occurred == true" &&
echo $obj | jq -e ".exception_severity == 0" &&
echo $obj | jq -e ".exception_type == \"exec\"" &&
Expand All @@ -1901,6 +1904,7 @@ test_expect_success 'flux job list outputs exceptions correctly (exception cance
flux cancel $jobid &&
wait_jobid_state $jobid inactive &&
obj=$(flux job list -s inactive | grep $jobid) &&
test_debug "echo $obj | jq -S ." &&
echo $obj | jq -e ".exception_occurred == true" &&
echo $obj | jq -e ".exception_severity == 0" &&
echo $obj | jq -e ".exception_type == \"cancel\"" &&
Expand All @@ -1916,6 +1920,7 @@ test_expect_success 'flux job list outputs exceptions correctly (exception cance
flux cancel -m "mecanceled" $jobid &&
wait_jobid_state $jobid inactive &&
obj=$(flux job list -s inactive | grep $jobid) &&
test_debug "echo $obj | jq -S ." &&
echo $obj | jq -e ".exception_occurred == true" &&
echo $obj | jq -e ".exception_severity == 0" &&
echo $obj | jq -e ".exception_type == \"cancel\"" &&
Expand All @@ -1927,10 +1932,15 @@ test_expect_success 'flux job list outputs exceptions correctly (exception cance
test_expect_success 'flux job list outputs exceptions correctly (user exception)' '
jobid=`flux submit ./sleepinf.sh | flux job id` &&
echo $jobid > exceptions5.id &&
test_debug "echo started $jobid, waiting for first line of output" &&
flux job wait-event -W -p guest.output $jobid data &&
test_debug "echo raising user exception foo" &&
flux job raise --type=foo --severity=0 -m "foobar" $jobid &&
test_debug "echo waiting for $jobid to become inactive" &&
wait_jobid_state $jobid inactive &&
test_debug "flux job list -s inactive | jq" &&
obj=$(flux job list -s inactive | grep $jobid) &&
test_debug "echo $obj | jq -S ." &&
echo $obj | jq -e ".exception_occurred == true" &&
echo $obj | jq -e ".exception_severity == 0" &&
echo $obj | jq -e ".exception_type == \"foo\"" &&
Expand Down
2 changes: 1 addition & 1 deletion t/t2500-job-attach.t
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ test_expect_success 'attach: cannot attach to interactive pty when --read-only s
test_expect_success 'attach: --stdin-ranks works' '
id=$(flux submit -N4 -t20s cat) &&
echo hello from 0 \
| flux job attach --label-io -i0 $id >stdin-ranks.out 2>&1 &&
| flux job attach --label-io -i0 $id >stdin-ranks.out &&
flux job eventlog -p guest.input $id &&
cat <<-EOF >stdin-ranks.expected &&
0: hello from 0
Expand Down
30 changes: 15 additions & 15 deletions t/t2607-job-shell-input.t
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ test_expect_success NO_CHAIN_LINT 'flux-shell: attach twice, one with data' '
mkfifo stdin4.pipe
id=$(flux submit -n1 \
${TEST_SUBPROCESS_DIR}/test_echo -O -n)
flux job attach $id < stdin4.pipe > pipe4A.out 2> pipe4A.err &
flux job attach $id < stdin4.pipe > pipe4A.out &
pid1=$!
flux job attach $id < input_stdin_file > pipe4B.out 2> pipe4B.err &
flux job attach $id < input_stdin_file > pipe4B.out &
pid2=$!
exec 9> stdin4.pipe &&
wait $pid1 &&
Expand All @@ -79,18 +79,18 @@ test_expect_success 'flux-shell: multiple jobs, each want stdin' '
i=1 &&
for id in $(cat pipe5.jobids); do
flux job attach $id \
<input_stdin_file >pipe5_${i}.out 2>pipe5_${i}.err &&
<input_stdin_file >pipe5_${i}.out &&
test_cmp input_stdin_file pipe5_${i}.out &&
i=$((i+1))
done
'

test_expect_success NO_CHAIN_LINT 'flux-shell: no stdin desired in job' '
id=$(flux submit -n1 sleep 60)
flux job attach $id < input_stdin_file 2> pipe6A.err &
flux job attach $id < input_stdin_file &
pid=$! &&
flux job wait-event -W -p guest.input -m eof=true $id data 2> pipe6C.err &&
flux cancel $id 2> pipe6D.err &&
flux job wait-event -W -p guest.input -m eof=true $id data &&
flux cancel $id &&
test_expect_code 143 wait $pid
'

Expand All @@ -103,24 +103,24 @@ test_expect_success NO_CHAIN_LINT 'flux-shell: no stdin desired in job' '
test_expect_success 'flux-shell: task completed, try to pipe into stdin' '
${LPTEST} 79 500 > big_dataset &&
id=$(flux submit -n1 cat big_dataset) &&
flux job wait-event $id clean 2> pipe7A.err &&
test_must_fail flux job attach $id < input_stdin_file 2> pipe7B.err
flux job wait-event $id clean &&
test_must_fail flux job attach $id < input_stdin_file
'

test_expect_success 'flux-shell: task completed, try to pipe into stdin, no error if read only' '
${LPTEST} 79 500 > big_dataset &&
id=$(flux submit -n1 cat big_dataset) &&
flux job wait-event $id clean 2> pipe8A.err &&
flux job attach --read-only $id < input_stdin_file 2> pipe8B.err
flux job wait-event $id clean &&
flux job attach --read-only $id < input_stdin_file
'

test_expect_success NO_CHAIN_LINT 'flux-shell: pipe to stdin twice, second fails' '
id=$(flux submit -n1 sleep 60)
flux job attach $id < input_stdin_file 2> pipe9A.err &
flux job attach $id < input_stdin_file &
pid=$!
flux job wait-event -W -p guest.input -m eof=true $id data 2> pipe9C.err &&
test_must_fail flux job attach $id < input_stdin_file 2> pipe9D.err &&
flux cancel $id 2> pipe9E.err &&
flux job wait-event -W -p guest.input -m eof=true $id data &&
test_must_fail flux job attach $id < input_stdin_file &&
flux cancel $id &&
test_expect_code 143 wait $pid
'

Expand Down Expand Up @@ -154,7 +154,7 @@ test_expect_success 'flux-shell: multiple jobs, each want stdin via file' '
test_debug "cat file2.jobids" &&
i=1 &&
for id in $(cat file2.jobids); do
flux job attach $id >file2_${i}.out 2>file2_${i}.err &&
flux job attach $id >file2_${i}.out &&
test_cmp input_stdin_file file2_${i}.out &&
i=$((i+1))
done
Expand Down
2 changes: 2 additions & 0 deletions t/t2712-python-cli-alloc.t
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ test_expect_success NO_CHAIN_LINT 'flux alloc --bg can be interrupted' '
run_mini_bg &&
$waitfile -t 180 -v -p waiting sigint.log &&
kill -INT $(cat sigint.pid) &&
sleep 0.1 &&
(kill -INT $(cat sigint.pid) || true) &&
$waitfile -t 180 -v -p Interrupt sigint.log &&
wait $pid
'
Expand Down
32 changes: 5 additions & 27 deletions t/t2801-top-cmd.t
Original file line number Diff line number Diff line change
Expand Up @@ -284,11 +284,7 @@ test_expect_success 'flux-top shows expected data in queues' '
grep "1 complete" all.out &&
grep "0 pending" all.out &&
grep "3 running" all.out &&
grep "2 failed" all.out &&
test $(grep bash all.out | wc -l) -eq 2 &&
test $(grep sleep all.out | wc -l) -eq 1 &&
test $(grep batch all.out | wc -l) -eq 2 &&
test $(grep debug all.out | wc -l) -eq 1
grep "2 failed" all.out
'
test_expect_success 'flux-top fails on invalid queue' '
test_must_fail flux top --queue=foobar
Expand All @@ -301,11 +297,7 @@ test_expect_success 'flux-top shows expected data in batch queue' '
grep "0 complete" batchq.out &&
grep "0 pending" batchq.out &&
grep "2 running" batchq.out &&
grep "0 failed" batchq.out &&
test $(grep bash batchq.out | wc -l) -eq 2 &&
test $(grep sleep batchq.out | wc -l) -eq 0 &&
test $(grep batch batchq.out | wc -l) -eq 2 &&
test $(grep debug batchq.out | wc -l) -eq 0
grep "0 failed" batchq.out
'
test_expect_success 'flux-top shows expected data in debug queue' '
$runpty flux top --queue=debug --test-exit \
Expand All @@ -316,9 +308,7 @@ test_expect_success 'flux-top shows expected data in debug queue' '
grep "0 pending" debugq.out &&
grep "1 running" debugq.out &&
grep "0 failed" debugq.out &&
test $(grep bash debugq.out | wc -l) -eq 0 &&
test $(grep sleep debugq.out | wc -l) -eq 1 &&
test $(grep batch debugq.out | wc -l) -eq 0 &&
test $(grep debug debugq.out | wc -l) -eq 1
'
test_expect_success 'cancel all jobs' '
Expand All @@ -332,11 +322,7 @@ test_expect_success 'flux-top shows expected data in queues after cancels' '
grep "1 complete" allC.out &&
grep "0 pending" allC.out &&
grep "0 running" allC.out &&
grep "5 failed" allC.out &&
test $(grep bash allC.out | wc -l) -eq 0 &&
test $(grep sleep allC.out | wc -l) -eq 0 &&
test $(grep batch allC.out | wc -l) -eq 0 &&
test $(grep debug allC.out | wc -l) -eq 0
grep "5 failed" allC.out
'
test_expect_success 'flux-top shows expected data in batch queue after cancels' '
$runpty flux top --queue=batch --test-exit \
Expand All @@ -346,11 +332,7 @@ test_expect_success 'flux-top shows expected data in batch queue after cancels'
grep "0 complete" batchqC.out &&
grep "0 pending" batchqC.out &&
grep "0 running" batchqC.out &&
grep "2 failed" batchqC.out &&
test $(grep bash batchqC.out | wc -l) -eq 0 &&
test $(grep sleep batchqC.out | wc -l) -eq 0 &&
test $(grep batch batchqC.out | wc -l) -eq 0 &&
test $(grep debug batchqC.out | wc -l) -eq 0
grep "2 failed" batchqC.out
'
test_expect_success 'flux-top shows expected data in debug queue after cancels' '
$runpty flux top --queue=debug --test-exit \
Expand All @@ -360,11 +342,7 @@ test_expect_success 'flux-top shows expected data in debug queue after cancels'
grep "0 complete" debugqC.out &&
grep "0 pending" debugqC.out &&
grep "0 running" debugqC.out &&
grep "1 failed" debugqC.out &&
test $(grep bash debugqC.out | wc -l) -eq 0 &&
test $(grep sleep debugqC.out | wc -l) -eq 0 &&
test $(grep batch debugqC.out | wc -l) -eq 0 &&
test $(grep debug debugqC.out | wc -l) -eq 0
grep "1 failed" debugqC.out
'
# for interactive test below, job submission order here is important.
# first two jobs are to batch queue, last is to debug queue. This
Expand Down
13 changes: 11 additions & 2 deletions t/t3307-system-leafcrash.t
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ transition through lost occurs.

. `dirname $0`/sharness.sh

#
# With --chain-lint, most tests below are skipped and this can cause
# the final test to hang. Therefore just skip all tests with chain-lint.
#
if ! test_have_prereq NO_CHAIN_LINT; then
skip_all='test may hang with --chain-lint, skipping all.'
test_done
fi

test_under_flux 2 system

startctl="flux python ${SHARNESS_TEST_SRCDIR}/scripts/startctl.py"
Expand All @@ -23,7 +32,7 @@ test_expect_success 'tell brokers to log to stderr' '
'

# Degraded at parent means child was lost
test_expect_success NO_CHAIN_LINT 'start overlay status wait in the background' '
test_expect_success 'start overlay status wait in the background' '
flux overlay status --timeout=0 --wait degraded &
echo $! >subtree.pid
'
Expand All @@ -37,7 +46,7 @@ test_expect_success 'restart broker 1' '
$startctl run 1
'

test_expect_success NO_CHAIN_LINT 'ensure child was lost' '
test_expect_success 'ensure child was lost' '
wait $(cat subtree.pid)
'

Expand Down