# THIS FILE IS CONTROLLED BY ELASTICLUSTER
# local modifications will be overwritten
# the next time `elasticluster setup` is run!
#
{%- set slurm_workers_max_vcpus = groups.slurm_worker|map('extract', hostvars, 'ansible_processor_vcpus')|list|max -%}
{%- set slurm_workers_min_vcpus = groups.slurm_worker|map('extract', hostvars, 'ansible_processor_vcpus')|list|min -%}
{%- set slurm_workers_max_free_memory_mb = groups.slurm_worker|map('extract', hostvars, ['ansible_memory_mb', 'nocache', 'free'])|list|max -%}
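# (The Jinja2 expressions above gather per-worker facts at `elasticluster
# setup` time: the largest and smallest vCPU counts across the
# `slurm_worker` group and the largest "nocache free" memory figure in MB.
# They are used further down, e.g. to suggest a `DefMemPerCPU` value.)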
# **Note:** This file needs to have identical contents on all nodes of
# the cluster. See the `slurm.conf` man page for more information.
#
# Unique name for identifying this cluster's entries in the DB
ClusterName=elasticluster
## Cluster nodes
#
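# Each worker is declared with one single-core socket per vCPU reported by
# Ansible; RealMemory is the node's total memory minus 384 MB kept as
# headroom (e.g. for the OS and the Slurm daemons). Nodes with GPUs also
# advertise them as generic resources via `Gres=gpu:<model>:<count>`.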
{% for host in groups.slurm_worker %}
# {{host}}
NodeName={{host}} RealMemory={{hostvars[host].ansible_memtotal_mb|int - 384}} CPUs={{hostvars[host].ansible_processor_vcpus}} Sockets={{hostvars[host].ansible_processor_vcpus}} CoresPerSocket=1 ThreadsPerCore=1 {% if hostvars[host].gpu_count_total|default(0) > 0 %} Gres={% for model in hostvars[host].gpu_count %}gpu:{{model}}:{{hostvars[host].gpu_count[model]}}{% if not loop.last %},{% endif %}{% endfor %}{% endif %}
{% endfor %}
## Cluster partitions
#
PartitionName=main Nodes={{ groups.slurm_worker|join(',') }} MaxTime=INFINITE State=UP Default=YES
## scheduler settings
#
SchedulerType=sched/backfill
SchedulerPort=7321
SelectType={{slurm_selecttype|default('select/cons_res')}}
SelectTypeParameters={{slurm_selecttypeparameters|default('CR_Core_Memory')}}
FastSchedule={{slurm_fastschedule|default('1')}}
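# (With the defaults above, cores and memory are treated as consumable
# resources, and FastSchedule=1 makes the scheduler trust the node
# definitions in this file rather than the hardware reported by `slurmd`.)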
{% if slurm_defmempercpu is defined %}
DefMemPerCPU={{slurm_defmempercpu|int}}
{% else %}
#DefMemPerCPU={{ (slurm_workers_max_free_memory_mb / slurm_workers_max_vcpus)|int }}
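# (Rough illustration with hypothetical numbers: a worker with 64000 MB of
# free memory and 16 vCPUs would yield a suggested 64000 / 16 = 4000 MB
# per CPU.)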
{% endif %}
# use the "multifactor" plugin with weights set up to be multi-user ready
PriorityType=priority/multifactor
PriorityWeightAge=12500
PriorityWeightFairshare=100000
PriorityWeightJobSize=12500
PriorityWeightPartition=0
PriorityWeightQOS=20000
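# With these weights the multifactor plugin computes a pending job's
# priority roughly as:
#   priority = 12500*age + 100000*fairshare + 12500*jobsize
#              + 0*partition + 20000*qos
# where each factor is normalized to the range 0.0-1.0, so fair share
# dominates the ordering.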
# fair share settings
PriorityDecayHalfLife=14-0
PriorityUsageResetPeriod=NONE
PriorityMaxAge=7-0
PriorityFavorSmall=NO
## accounting settings
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageEnforce=associations,limits,qos
AccountingStoreJobComment=YES
AccountingStorageHost={{SLURM_ACCOUNTING_HOST}}
AccountingStoragePort=6819
# the "job completion" info is redundant if the accounting
# infrastructure is enabled, so turn it off as it's an endless source
# of authentication and DB connection problems ...
JobCompType=jobcomp/none
# No power consumption acct
AcctGatherEnergyType=acct_gather_energy/none
# No IB usage accounting
AcctGatherInfinibandType=acct_gather_infiniband/none
# No filesystem accounting (only works with Lustre)
AcctGatherFilesystemType=acct_gather_filesystem/none
# No job profiling (for now)
AcctGatherProfileType=acct_gather_profile/none
#AcctGatherProfileType=acct_gather_profile/hdf5
JobAcctGatherType={{slurm_jobacctgathertype|default('jobacct_gather/linux')}}
JobAcctGatherFrequency={{slurm_jobacctgatherfrequency|default(60)}}
## resource scheduling (GRES)
#
GresTypes=gpu
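# GPU counts are advertised per node via the `Gres=` entries generated in
# the node definitions above; each GPU node additionally needs a matching
# `gres.conf` listing the actual devices (not part of this template).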
## job execution settings
#
CheckpointType=checkpoint/none
#CheckpointType=checkpoint/blcr
JobCheckpointDir=/var/lib/slurm/checkpoint
# requeue jobs on node failure, unless users ask otherwise
JobRequeue=1
# max number of jobs in a job array
MaxArraySize={{slurm_maxarraysize|default(1000)}}
# max number of jobs pending + running
MaxJobCount={{slurm_maxjobcount|default(10000)}}
MpiDefault=openmpi
# Note: apparently the `MpiParams` option is also needed for non-MPI
# jobs in Slurm 2.5.3.
MpiParams=ports=12000-12999
# track resource usage via Linux /proc tree
ProctrackType={{slurm_proctracktype|default('proctrack/linuxproc')}}
# do not propagate `ulimit` restrictions found on login nodes
PropagateResourceLimits=NONE
# automatically return nodes to service, unless they have been marked DOWN by admins
ReturnToService={{slurm_returntoservice|default(2)}}
TaskPlugin={{slurm_taskplugin|default('task/none')}}
#TaskEpilog=/etc/slurm/task_epilog
#TaskProlog=/etc/slurm/task_prolog
TmpFs=/tmp
# limit virtual memory usage to this percentage of real memory usage
VSizeFactor={{ slurm_allowedramspace|default(100)|int + slurm_allowedswapspace|default(1)|int }}
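# (With the defaults this evaluates to 100 + 1 = 101, i.e. a job's virtual
# memory may grow up to 101% of its allocated real memory.)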
# misc timeout settings (commented lines show the default)
#
BatchStartTimeout=60
CompleteWait=35
#EpilogMsgTime=2000
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
## `slurmctld` settings (controller nodes)
#
ControlMachine={{SLURM_MASTER_HOST}}
SlurmctldPidFile={{slurm_pid_dir}}/slurmctld.pid
SlurmctldPort=6817
SlurmctldTimeout=300
StateSaveLocation=/var/spool/slurm
SlurmctldDebug=error
SlurmctldLogFile=/var/log/slurm/slurmctld.log
DebugFlags=backfill,cpu_bind,priority,reservation,selecttype,steps
MailProg=/usr/bin/mail
## `slurmd` settings (compute nodes)
#
SlurmdPort=6818
SlurmdPidFile={{slurm_pid_dir}}/slurmd.pid
SlurmdSpoolDir=/var/lib/slurm/slurmd
SlurmdTimeout=300
SlurmdDebug=error
SlurmdLogFile=/var/log/slurm/slurmd.log
AuthType=auth/munge
CryptoType=crypto/munge
DisableRootJobs=NO