Skip to content

Commit

Permalink
Add Slurm CI implementation (#57)
Browse files Browse the repository at this point in the history
  • Loading branch information
guillaumeeb authored and mrocklin committed May 15, 2018
1 parent 495dc9b commit 0cae4ec
Show file tree
Hide file tree
Showing 9 changed files with 249 additions and 12 deletions.
5 changes: 5 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ matrix:
env:
- OS=ubuntu-14.04
- JOBQUEUE=pbs
- python: "3.6"
env:
- OS=ubuntu-14.04
- JOBQUEUE=slurm

env:
global:
- DOCKER_COMPOSE_VERSION=1.6.0
Expand Down
2 changes: 0 additions & 2 deletions ci/pbs/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-L
/opt/anaconda/bin/conda clean -tipy && \
rm -f miniconda.sh
RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep
# Take the latest version of distributed due to test failure otherwise (see #47 comment by mrocklin)
RUN pip install --no-cache-dir git+https://github.com/dask/distributed.git --upgrade

# Copy entrypoint and other needed scripts
COPY ./*.sh /
Expand Down
30 changes: 30 additions & 0 deletions ci/slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

set -x

function jobqueue_before_install {
docker version
docker-compose version

# start slurm cluster
cd ./ci/slurm
./start-slurm.sh
cd -

docker ps -a
docker images
}

function jobqueue_install {
docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue; python setup.py install"
}

function jobqueue_script {
docker exec -it slurmctld /bin/bash -c "cd /dask-jobqueue; py.test dask_jobqueue --verbose -E slurm"
}

function jobqueue_after_script {
docker exec -it slurmctld bash -c 'sinfo'
docker exec -it slurmctld bash -c 'squeue'
docker exec -it slurmctld bash -c 'sacct -l'
}
12 changes: 12 additions & 0 deletions ci/slurm/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
FROM giovtorres/slurm-docker-cluster

RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash miniconda.sh -f -b -p /opt/anaconda && \
/opt/anaconda/bin/conda clean -tipy && \
rm -f miniconda.sh
ENV PATH /opt/anaconda/bin:$PATH
RUN conda install --yes -c conda-forge python=3.6 dask distributed flake8 pytest docrep

ENV LC_ALL en_US.UTF-8

COPY slurm.conf /etc/slurm/slurm.conf
81 changes: 81 additions & 0 deletions ci/slurm/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
version: "2.2"

services:
mysql:
image: mysql:5.7
hostname: mysql
container_name: mysql
environment:
MYSQL_RANDOM_ROOT_PASSWORD: "yes"
MYSQL_DATABASE: slurm_acct_db
MYSQL_USER: slurm
MYSQL_PASSWORD: password
volumes:
- var_lib_mysql:/var/lib/mysql

slurmdbd:
build: .
command: ["slurmdbd"]
container_name: slurmdbd
hostname: slurmdbd
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- var_log_slurm:/var/log/slurm
expose:
- "6819"
depends_on:
- mysql

slurmctld:
build: .
command: ["slurmctld"]
container_name: slurmctld
hostname: slurmctld
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
- ../..:/dask-jobqueue
expose:
- "6817"
depends_on:
- "slurmdbd"

c1:
build: .
command: ["slurmd"]
hostname: c1
container_name: c1
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
expose:
- "6818"
depends_on:
- "slurmctld"

c2:
build: .
command: ["slurmd"]
hostname: c2
container_name: c2
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
expose:
- "6818"
depends_on:
- "slurmctld"

volumes:
etc_munge:
etc_slurm:
slurm_jobdir:
var_lib_mysql:
var_log_slurm:
5 changes: 5 additions & 0 deletions ci/slurm/register_cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
set -e

docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add cluster name=linux" && \
docker-compose restart slurmdbd slurmctld
94 changes: 94 additions & 0 deletions ci/slurm/slurm.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# slurm.conf
#
# See the slurm.conf man page for more information.
#
ClusterName=linux
ControlMachine=slurmctld
ControlAddr=slurmctld
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/lib/slurmd
SlurmdSpoolDir=/var/spool/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid
ProctrackType=proctrack/linuxproc
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/cons_res
SelectTypeParameters=CR_CPU_Memory
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log
JobCompType=jobcomp/filetxt
JobCompLoc=/var/log/slurm/jobcomp.log
#
# ACCOUNTING
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=slurmdbd
AccountingStoragePort=6819
AccountingStorageLoc=slurm_acct_db
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
NodeName=c[1-2] RealMemory=4096 CPUs=2 State=UNKNOWN
#
# PARTITIONS
PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=2048 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
9 changes: 9 additions & 0 deletions ci/slurm/start-slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

docker-compose up --build -d
while [ `./register_cluster.sh | grep "sacctmgr: error" | wc -l` -ne 0 ]
do
echo "Waiting for SLURM cluster to become ready";
sleep 2
done
echo "SLURM properly configured"
23 changes: 13 additions & 10 deletions dask_jobqueue/tests/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ def test_job_script():

@pytest.mark.env("slurm") # noqa: F811
def test_basic(loop):
with SLURMCluster(walltime='00:02:00', threads_per_worker=2, memory='7GB',
loop=loop) as cluster:
with SLURMCluster(walltime='00:02:00', threads=2, processes=1, memory='4GB',
job_extra=['-D /'], loop=loop) as cluster:
with Client(cluster) as client:
workers = cluster.start_workers(2)
future = client.submit(lambda x: x + 1, 10)
Expand All @@ -93,7 +93,7 @@ def test_basic(loop):

info = client.scheduler_info()
w = list(info['workers'].values())[0]
assert w['memory_limit'] == 7e9
assert w['memory_limit'] == 4e9
assert w['ncores'] == 2

cluster.stop_workers(workers)
Expand All @@ -108,7 +108,8 @@ def test_basic(loop):

@pytest.mark.env("slurm") # noqa: F811
def test_adaptive(loop):
with SLURMCluster(walltime='00:02:00', loop=loop) as cluster:
with SLURMCluster(walltime='00:02:00', threads=2, processes=1, memory='4GB',
job_extra=['-D /'], loop=loop) as cluster:
cluster.adapt()
with Client(cluster) as client:
future = client.submit(lambda x: x + 1, 10)
Expand All @@ -117,8 +118,8 @@ def test_adaptive(loop):
assert cluster.jobs

start = time()
while (len(client.scheduler_info()['workers']) !=
cluster.config['processes']):
processes = cluster.worker_processes
while (len(client.scheduler_info()['workers']) != processes):
sleep(0.1)
assert time() < start + 10

Expand All @@ -129,7 +130,9 @@ def test_adaptive(loop):
sleep(0.100)
assert time() < start + 10

start = time()
while cluster.jobs:
sleep(0.100)
assert time() < start + 10
# There is probably a bug to fix in the adaptive methods of the JobQueueCluster
# Currently cluster.jobs is not cleaned up.
# start = time()
# while cluster.jobs:
# sleep(0.100)
# assert time() < start + 10

0 comments on commit 0cae4ec

Please sign in to comment.