#!/bin/bash
# Author(s): James Owers (james.f.owers@gmail.com)
#
# SLURM batch script for the CSD3 cluster: submitted as an array job, each
# array task runs one line of a plain-text experiment file.
#
# example usage:
# ```
# EXPT_FILE=experiments.txt  # <- this has a command to run on each line
# NR_EXPTS=`cat ${EXPT_FILE} | wc -l`
# MAX_PARALLEL_JOBS=12
# sbatch --array=1-${NR_EXPTS}%${MAX_PARALLEL_JOBS} slurm_arrayjob.sh $EXPT_FILE
# ```
#
# or, equivalently and as intended, with provided `run_experiment`:
# ```
# run_experiment -b slurm_arrayjob.sh -e experiments.txt -m 12
# ```
# ====================
# Options for sbatch
# ====================
# Job name (shown in squeue/sacct)
#SBATCH -J hog
# Maximum number of nodes to use for the job
#SBATCH --nodes=1
# Organisation account
#SBATCH -A NLP-CDT-SL2-GPU
# Partition to submit to
#SBATCH -p ampere
# Megabytes of RAM required. Check `cluster-status` for node configurations
#SBATCH --mem=16000
# Generic resources to use - typically you'll want gpu:n to get n gpus
#SBATCH --gres=gpu:1
# Number of CPUs to use. Check `cluster-status` for node configurations
#SBATCH --cpus-per-task=8
# Maximum time for the job to run, format: days-hours:minutes:seconds
#SBATCH --time=08:00:00
# Location for stdout log - see https://slurm.schedmd.com/sbatch.html#lbAH
# NB: the slurm_logs directory must already exist before submission; Slurm
# does not create log directories and the job fails silently if it's missing.
#SBATCH --output="/home/%u/slurm_logs/%A_%a.out"
# Location for stderr log - see https://slurm.schedmd.com/sbatch.html#lbAH
#SBATCH --error="/home/%u/slurm_logs/%A_%a.err"
# =====================
# Logging information
# =====================
# slurm info - more at https://slurm.schedmd.com/sbatch.html#lbAJ
echo "Job running on ${SLURM_JOB_NODELIST}"
dt=$(date '+%d/%m/%Y %H:%M:%S')
echo "Job started: $dt"
# ===================
# Environment setup
# ===================
echo "Setting up bash environment"
# Make available all commands on $PATH as on headnode.
# Sourced *before* enabling `set -e` on purpose: ~/.bashrc may return a
# non-zero status (e.g. it bails out early for non-interactive shells).
source ~/.bashrc
# Bail out after the first error, and make a pipeline fail if any stage
# fails (not just the last). `-u` is deliberately omitted: `conda activate`
# is not reliably safe under `set -u`.
set -eo pipefail
# RDS (research data store) working area for this user on CSD3
RDS_HOME="/rds/user/${USER}/hpc-work"
# Input data location and output location; passed to the experiment command
# below via the ++paths.* overrides
dest_path="${RDS_HOME}/hog/data"
output_path="${RDS_HOME}/hog/output"
# Activate your conda environment
CONDA_ENV_NAME=hog
echo "Activating conda environment: ${CONDA_ENV_NAME}"
# Abort loudly if activation fails (e.g. conda's shell hook was not set up
# by ~/.bashrc) rather than running experiments in the wrong environment.
conda activate "${CONDA_ENV_NAME}" || {
    echo "ERROR: failed to activate conda environment '${CONDA_ENV_NAME}'" >&2
    exit 1
}
# ==============================
# Finally, run the experiment!
# ==============================
# Read line number ${SLURM_ARRAY_TASK_ID} from the experiment file and run it.
# ${SLURM_ARRAY_TASK_ID} is simply the number of the job within the array. If
# you execute `sbatch --array=1-100 ...` the jobs will get numbers 1 to 100
# inclusive.
experiment_text_file=$1
if [ ! -f "${experiment_text_file}" ]; then
    echo "ERROR: experiment file not found: '${experiment_text_file}'" >&2
    exit 1
fi
# sed 'Nq;d' prints line N of the file and quits
COMMAND="$(sed "${SLURM_ARRAY_TASK_ID}q;d" "${experiment_text_file}")"
if [ -z "${COMMAND}" ]; then
    echo "ERROR: no command on line ${SLURM_ARRAY_TASK_ID} of ${experiment_text_file}" >&2
    exit 1
fi
echo "Running provided command: ${COMMAND}"
# The ++key=value arguments are config overrides appended for the experiment
# command; ${SLURM_CPUS_PER_TASK} matches the --cpus-per-task request above.
eval "${COMMAND} ++num_workers=${SLURM_CPUS_PER_TASK} ++paths.input_dir=${dest_path} ++paths.output_dir=${output_path}"
echo "Command ran successfully!"
# =========================
# Post experiment logging
# =========================
# Blank line plus a separator bar, then the success banner and finish time.
printf '\n============\n'
printf '%s\n' "job finished successfully"
dt=$(date '+%d/%m/%Y %H:%M:%S')
printf 'Job finished: %s\n' "$dt"