Skip to content

Commit

Permalink
Merge pull request #47 from guillaumeeb/pbs_docker_ci
Browse files Browse the repository at this point in the history
Pbs docker ci
  • Loading branch information
Joe Hamman committed May 14, 2018
2 parents f7b14fe + ca514d9 commit 8d243b6
Show file tree
Hide file tree
Showing 13 changed files with 222 additions and 14 deletions.
8 changes: 6 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ matrix:
- OS=ubuntu-14.04
# JOBQUEUE=none is for tests that do not need a cluster to run
- JOBQUEUE=none
- python: "3.6"
env:
- OS=ubuntu-14.04
- JOBQUEUE=pbs
env:
global:
- DOCKER_COMPOSE_VERSION=1.6.0
Expand All @@ -40,8 +44,8 @@ install:
- jobqueue_install
script:
- jobqueue_script
after_success:
- jobqueue_after_success
after_script:
- jobqueue_after_script

# TODO
# - pip install --no-cache-dir coveralls
Expand Down
4 changes: 2 additions & 2 deletions ci/none.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ function jobqueue_script {
py.test --verbose
}

function jobqueue_after_success {
echo "Hurrah"
function jobqueue_after_script {
echo "Done."
}
40 changes: 40 additions & 0 deletions ci/pbs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

set -x

function jobqueue_before_install {
docker version
docker-compose version

# start pbs cluster
cd ./ci/pbs
./start-pbs.sh
cd -

docker exec -it -u pbsuser pbs_master pbsnodes -a
docker ps -a
docker images
}

function jobqueue_install {
docker exec -it pbs_master /bin/bash -c "cd /dask-jobqueue; python setup.py install"
}

function jobqueue_script {
docker exec -it -u pbsuser pbs_master /bin/bash -c "cd /dask-jobqueue; py.test dask_jobqueue --verbose -E pbs"
}

function jobqueue_after_script {
docker exec -it -u pbsuser pbs_master qstat -fx
docker exec -it pbs_master bash -c 'cat /var/spool/pbs/sched_logs/*'
docker exec -it pbs_master bash -c 'cat /var/spool/pbs/server_logs/*'
docker exec -it pbs_master bash -c 'cat /var/spool/pbs/server_priv/accounting/*'
docker exec -it pbs_slave_1 bash -c 'cat /var/spool/pbs/mom_logs/*'
docker exec -it pbs_slave_1 bash -c 'cat /var/spool/pbs/spool/*'
docker exec -it pbs_slave_1 bash -c 'cat /tmp/*.e*'
docker exec -it pbs_slave_1 bash -c 'cat /tmp/*.o*'
docker exec -it pbs_slave_2 bash -c 'cat /var/spool/pbs/mom_logs/*'
docker exec -it pbs_slave_2 bash -c 'cat /var/spool/pbs/spool/*'
docker exec -it pbs_slave_2 bash -c 'cat /tmp/*.e*'
docker exec -it pbs_slave_2 bash -c 'cat /tmp/*.o*'
}
42 changes: 42 additions & 0 deletions ci/pbs/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# inspired from https://github.com/PBSPro/pbspro/blob/v18.1.beta/docker/centos7/
# multi-stage build
# build script will be triggered
FROM centos:7.4.1708 AS builder
# install dependencies for building
RUN yum install -y gcc make rpm-build libtool hwloc-devel libX11-devel \
libXt-devel libedit-devel libical-devel ncurses-devel perl \
postgresql-devel python-devel tcl-devel tk-devel swig expat-devel \
openssl-devel libXext libXft git
# get known PBS Pro source code
RUN git clone --branch v14.1.2 https://github.com/pbspro/pbspro.git /src/pbspro
COPY build.sh /
RUN bash /build.sh

# base image
FROM centos:7.4.1708
LABEL description="PBS Professional Open Source and conda"

#The pbs master node name, can be overriden if needed
ENV PBS_MASTER pbs_master
ENV PATH /opt/pbs/bin:/opt/anaconda/bin:$PATH
ENV LANG en_US.UTF-8
ENV LC_ALL en_US.UTF-8

COPY --from=builder /root/rpmbuild/RPMS/x86_64/pbspro-server-*.rpm .
# install pbspro and useful packages
RUN yum install -y pbspro-server-*.rpm curl bzip2 git gcc sudo openssh-server && yum clean all
# install python
RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash miniconda.sh -f -b -p /opt/anaconda && \
/opt/anaconda/bin/conda clean -tipy && \
rm -f miniconda.sh
RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep
# Take the latest version of distributed due to test failure otherwise (see #47 comment by mrocklin)
RUN pip install --no-cache-dir git+https://github.com/dask/distributed.git --upgrade

# Copy entrypoint and other needed scripts
COPY ./*.sh /
RUN chmod a+x ./*.sh

# default entrypoint launch pbs master
ENTRYPOINT ["bash", "/master-entrypoint.sh"]
10 changes: 10 additions & 0 deletions ci/pbs/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
cd /src/pbspro
./autogen.sh
./configure -prefix=/opt/pbs
make dist
mkdir /root/rpmbuild /root/rpmbuild/SOURCES /root/rpmbuild/SPECS
cp pbspro-*.tar.gz /root/rpmbuild/SOURCES
cp pbspro.spec /root/rpmbuild/SPECS
cd /root/rpmbuild/SPECS
rpmbuild -ba pbspro.spec
42 changes: 42 additions & 0 deletions ci/pbs/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
version: "2"

services:

master:
build: .
container_name: pbs_master
hostname: pbs_master
volumes:
- ../..:/dask-jobqueue
command: bash /run-master.sh

slave_one:
build: .
container_name: pbs_slave_1
hostname: pbs_slave_1
volumes:
- ../..:/dask-jobqueue
entrypoint: "bash /slave-entrypoint.sh"
command: bash /run-slave.sh
links:
- "master:pbs_master"
environment:
- PBS_MASTER=pbs_master
depends_on:
- master

slave_two:
build: .
container_name: pbs_slave_2
hostname: pbs_slave_2
volumes:
- ../..:/dask-jobqueue
entrypoint: "bash /slave-entrypoint.sh"
command: bash /run-slave.sh
links:
- "master:pbs_master"
environment:
- PBS_MASTER=pbs_master
depends_on:
- master

16 changes: 16 additions & 0 deletions ci/pbs/master-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/sh
pbs_conf_file=/etc/pbs.conf
mom_conf_file=/var/spool/pbs/mom_priv/config
hostname=$(hostname)

# replace hostname in pbs.conf and mom_priv/config
sed -i "s/PBS_SERVER=.*/PBS_SERVER=$hostname/" $pbs_conf_file
sed -i "s/\$clienthost .*/\$clienthost $hostname/" $mom_conf_file

# start PBS Pro
/etc/init.d/pbs start

# create default non-root user
adduser pbsuser

exec "$@"
13 changes: 13 additions & 0 deletions ci/pbs/run-master.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

# Reduce time between PBS scheduling and add history
qmgr -c "set server scheduler_iteration = 20"
qmgr -c "set server job_history_enable = True"
qmgr -c "set server job_history_duration = 24:00:00"

# add two slaves to pbs
qmgr -c "create node pbs_slave_1"
qmgr -c "create node pbs_slave_2"

# Start hanging process to leave the container up and running
python -m http.server 8888
4 changes: 4 additions & 0 deletions ci/pbs/run-slave.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Start hanging process to leave the container up and running
python -m http.server 8888
25 changes: 25 additions & 0 deletions ci/pbs/slave-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/sh
pbs_conf_file=/etc/pbs.conf
mom_conf_file=/var/spool/pbs/mom_priv/config
hostname=$(hostname)

# replace hostname in pbs.conf and mom_priv/config
sed -i "s/PBS_SERVER=.*/PBS_SERVER=$PBS_MASTER/" $pbs_conf_file
sed -i "s/\$clienthost .*/\$clienthost $hostname/" $mom_conf_file
sed -i "s/PBS_START_SERVER=.*/PBS_START_SERVER=0/" $pbs_conf_file
sed -i "s/PBS_START_SCHED=.*/PBS_START_SCHED=0/" $pbs_conf_file
sed -i "s/PBS_START_COMM=.*/PBS_START_COMM=0/" $pbs_conf_file
sed -i "s/PBS_START_MOM=.*/PBS_START_MOM=1/" $pbs_conf_file

# Prevent PBS trying to use scp between host for stdout and stderr file of jobs
# On standard PBS deployement, you would use a shared mount, or correctly configured passwordless scp
echo "\$usecp *:/home/ /home/" >> $mom_conf_file
echo "\$usecp *:/dask-jobqueue/ /tmp/" >> $mom_conf_file

# start PBS Pro
/etc/init.d/pbs start

# create default non-root user
adduser pbsuser

exec "$@"
9 changes: 9 additions & 0 deletions ci/pbs/start-pbs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

docker-compose up -d
while [ `docker exec -it -u pbsuser pbs_master pbsnodes -a | grep "Mom = pbs_slave" | wc -l` -ne 2 ]
do
echo "Waiting for PBS slave nodes to become available";
sleep 2
done
echo "PBS properly configured"
2 changes: 1 addition & 1 deletion ci/sge.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ function jobqueue_script {
docker exec -it sge_master /bin/bash -c "cd /dask-jobqueue; py.test dask_jobqueue --verbose -E sge"
}

function jobqueue_after_success {
function jobqueue_after_script {
docker exec -it sge_master bash -c 'cat /tmp/sge*'
docker exec -it slave_one bash -c 'cat /tmp/exec*'
docker exec -it slave_two bash -c 'cat /tmp/exec*'
Expand Down
21 changes: 12 additions & 9 deletions dask_jobqueue/tests/test_pbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ def test_job_script():

@pytest.mark.env("pbs") # noqa: F811
def test_basic(loop):
with PBSCluster(walltime='00:02:00', threads_per_worker=2, memory='7GB',
interface='ib0', loop=loop) as cluster:
with PBSCluster(walltime='00:02:00', processes=1, threads=2, memory='2GB', local_directory='/tmp',
job_extra=['-V'], loop=loop) as cluster:
with Client(cluster) as client:
workers = cluster.start_workers(2)
future = client.submit(lambda x: x + 1, 10)
Expand All @@ -87,7 +87,7 @@ def test_basic(loop):

info = client.scheduler_info()
w = list(info['workers'].values())[0]
assert w['memory_limit'] == 7e9
assert w['memory_limit'] == 2e9
assert w['ncores'] == 2

cluster.stop_workers(workers)
Expand All @@ -102,7 +102,8 @@ def test_basic(loop):

@pytest.mark.env("pbs") # noqa: F811
def test_adaptive(loop):
with PBSCluster(walltime='00:02:00', loop=loop) as cluster:
with PBSCluster(walltime='00:02:00', processes=1, threads=2, memory='2GB', local_directory='/tmp',
job_extra=['-V'], loop=loop) as cluster:
cluster.adapt()
with Client(cluster) as client:
future = client.submit(lambda x: x + 1, 10)
Expand All @@ -111,7 +112,7 @@ def test_adaptive(loop):
assert cluster.jobs

start = time()
processes = cluster.config['processes']
processes = cluster.worker_processes
while len(client.scheduler_info()['workers']) != processes:
sleep(0.1)
assert time() < start + 10
Expand All @@ -123,7 +124,9 @@ def test_adaptive(loop):
sleep(0.100)
assert time() < start + 10

start = time()
while cluster.jobs:
sleep(0.100)
assert time() < start + 10
# There is probably a bug to fix in the adaptive methods of the JobQueueCluster
# Currently cluster.jobs is not cleaned up.
#start = time()
#while cluster.jobs:
# sleep(0.100)
# assert time() < start + 10

0 comments on commit 8d243b6

Please sign in to comment.