Skip to content

Commit

Permalink
Testing CI: Fix the intermittently failing build with SGE (#515)
Browse files Browse the repository at this point in the history
* Keep some debug messages and try not to restart the SGE slaves at startup

* Use run-master and run-slave from git repo (instead of docker image), remove some useless debug commands

* Handle the case where no SGE worker logs exist during CI cleanup
  • Loading branch information
guillaumeeb committed Sep 7, 2021
1 parent 7e8a9a0 commit 79a9a3b
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 9 deletions.
8 changes: 5 additions & 3 deletions ci/sge.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ function jobqueue_before_install {

docker ps -a
docker images
docker exec sge_master qconf -sq dask.q
}

function jobqueue_install {
Expand All @@ -23,7 +24,8 @@ function jobqueue_script {
}

# Print the SGE daemon logs found under /tmp in the sge_master, slave_one and
# slave_two containers.
# NOTE(review): this page renders a diff without +/- markers, so the three bare
# `docker exec` lines are presumably the removed version and the `|| echo`
# lines their replacement -- confirm against the real ci/sge.sh.
function jobqueue_after_script {
docker exec sge_master bash -c 'cat /tmp/sge*'
docker exec slave_one bash -c 'cat /tmp/exec*'
docker exec slave_two bash -c 'cat /tmp/exec*'
echo "Daemon logs"
# If a command fails (for instance because no matching log file exists),
# print a placeholder message instead.
docker exec sge_master bash -c 'cat /tmp/sge*' || echo "No sge_master logs"
docker exec slave_one bash -c 'cat /tmp/exec*' || echo "No slave_one logs"
docker exec slave_two bash -c 'cat /tmp/exec*' || echo "No slave_two logs"
}
6 changes: 3 additions & 3 deletions ci/sge/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ services:
#network_mode: host
volumes:
- ../..:/dask-jobqueue
command: bash /run-master.sh
command: bash /dask-jobqueue/ci/sge/run-master.sh

slave-one:
image: daskdev/dask-jobqueue:sge-slave
Expand All @@ -28,7 +28,7 @@ services:
#network_mode: host
volumes:
- ../..:/dask-jobqueue
command: bash /run-slave.sh
command: bash /dask-jobqueue/ci/sge/run-slave.sh
links:
- "master:sge_master"
depends_on:
Expand All @@ -46,7 +46,7 @@ services:
#network_mode: host
volumes:
- ../..:/dask-jobqueue
command: bash /run-slave.sh
command: bash /dask-jobqueue/ci/sge/run-slave.sh
links:
- "master:sge_master"
depends_on:
Expand Down
3 changes: 3 additions & 0 deletions ci/sge/run-master.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
sudo service gridengine-master restart

while ! ping -c1 slave_one &>/dev/null; do :; done
# Sometimes the SGE configuration is inaccessible at first; poll until qconf can read it
while ! qconf -sconf &>/dev/null; do sleep 0.1; done
cat /var/lib/gridengine//default/common/act_qmaster

qconf -Msconf /scheduler.txt
qconf -Ahgrp /hosts.txt
Expand Down
5 changes: 2 additions & 3 deletions ci/sge/run-slave.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#!/bin/bash

# start sge
sudo service gridengine-exec restart

sleep 4
#wait a bit for master configuration
sleep 3

sudo service gridengine-exec restart

Expand Down

0 comments on commit 79a9a3b

Please sign in to comment.