Merge pull request #40 from fasrc/mj_nonseqarray
PR to include slurm job array examples
manasvita committed May 15, 2024
2 parents 8ded410 + 375334e commit fd68e5d
Showing 43 changed files with 1,200,681 additions and 0 deletions.
36 changes: 36 additions & 0 deletions Notes/JobArrays/Exercise_nonsequential/array_job_nonsequence.sh
@@ -0,0 +1,36 @@
#!/bin/bash

# This script runs the array job spawned by the main.sh script on a
# single node on the test partition to process folders whose names
# start with 'sub' (e.g., sub-blast) located inside a work directory.

#SBATCH -J slurm_python
#SBATCH --partition=test # Change partition name based on Cannon or FASSE and resources required
#SBATCH -o %A_%a.o
#SBATCH -e %A_%a.e
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:20:00
#SBATCH --mem=4G


# Change 'BASEDIR' and 'WORKDIR' based on desired locations
BASEDIR=$PWD
WORKDIR=$PWD/masks

# Find sub-directories inside $WORKDIR with 'sub' in their names and
# redirect the output to filelist.txt
find "$WORKDIR" -type d -name "sub*" > filelist.txt

cd "$BASEDIR"
echo "In $BASEDIR"

dirname=$(awk "NR==${SLURM_ARRAY_TASK_ID}" filelist.txt)

echo "Job array ID: $SLURM_ARRAY_JOB_ID , sub-job $SLURM_ARRAY_TASK_ID is running!"
echo "Sub-job $SLURM_ARRAY_TASK_ID is processing $dirname"

# Do science here

echo "Done processing $dirname"
18 changes: 18 additions & 0 deletions Notes/JobArrays/Exercise_nonsequential/main.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# Bash script to list the number of sub-folders starting with the
# keyword 'sub' located inside a working directory. It then submits a
# Slurm job as an array job of array size equal to the number of 'sub'
# sub-folders along with a batch script to process those folders.

# Usage:
# ./main.sh

# Change 'WORKDIR' based on desired location
WORKDIR=$PWD/masks

# Count number of sub-directories inside $WORKDIR with 'sub' in their
# names
ENTRIES=$(find "$WORKDIR" -type d -name "sub*" | wc -l)

# Guard against an empty submission: --array=1-0 is invalid
[ "$ENTRIES" -eq 0 ] && { echo "No 'sub' directories found in $WORKDIR"; exit 1; }

sbatch --array=1-$ENTRIES array_job_nonsequence.sh
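The mapping from array task ID to directory relies on awk printing line N of filelist.txt for task N. A minimal local sketch of that lookup, using throwaway example directories in a temp folder (no Slurm required):

```shell
# Sketch of the task-ID -> directory mapping used by array_job_nonsequence.sh.
# The sub-blast* paths here are made-up examples.
tmp=$(mktemp -d)
mkdir -p "$tmp/sub-blast01" "$tmp/sub-blast02" "$tmp/sub-blast03"

# Same find as the batch script; sort makes the ordering predictable
find "$tmp" -type d -name "sub*" | sort > "$tmp/filelist.txt"

# Each array task N reads line N of the list
for SLURM_ARRAY_TASK_ID in 1 2 3; do
    dirname=$(awk "NR==${SLURM_ARRAY_TASK_ID}" "$tmp/filelist.txt")
    echo "task $SLURM_ARRAY_TASK_ID -> $dirname"
done

rm -rf "$tmp"
```

Note that find's output order is filesystem-dependent; the batch script above does not sort, so the task-to-directory assignment is only stable across runs on the same filesystem.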
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
17 changes: 17 additions & 0 deletions Notes/JobArrays/Exercise_nonsequential_maxarray/README.txt
@@ -0,0 +1,17 @@
In this folder, we have a folder named 'directories'. 'directories'
contains multiple sub-folders which contain a job.sh file. This
example script simply prints 'Hello World' to the console.

In order to process these files, we can use the main.sh script as follows:
./main.sh directories f job.sh output

This will generate some .o (output) and .e (error) files along with
output folders starting with the name 'output' and the corresponding
Slurm job and task array IDs appended.

If you 'cat' or 'more' the contents of .o files, you should see 'Hello
World' in those files along with print statements from the 'echo'
command.

Additionally, each output folder will contain an output.log file that
stores the path of the joblist.txt entry being processed.
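The output-folder names described above come from array_script.sh concatenating the output prefix with the Slurm job and task array IDs. A sketch of that naming with made-up IDs (real values are assigned by Slurm at run time):

```shell
# Made-up example IDs; Slurm sets the real values per sub-job
SLURM_ARRAY_JOB_ID=12345
SLURM_ARRAY_TASK_ID=7
OUTDIR=$PWD/output

DIRNAME=$OUTDIR-${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID}
echo "Outputs for this task land in: $DIRNAME"
```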
90 changes: 90 additions & 0 deletions Notes/JobArrays/Exercise_nonsequential_maxarray/array_script.sh
@@ -0,0 +1,90 @@
#!/bin/bash

# This script runs the array job spawned using the main.sh script on a
# single node on a partition to process entries with a given keyword
# in their name and are located inside a work directory. The keyword
# is provided as a command line argument in main.sh.

# This script uses OFFSET, provided as a command line argument, to
# operate on the corresponding line number in joblist.txt that stores
# all the entries matching the $KEYWORD criterion.

# Usage:
# array_script.sh <input-folder-name> <type> <keyword> <output-folder-name> <offset>
# For example: array_script.sh directories f job.sh output 0

#SBATCH -J max_array_sample
#SBATCH --partition=test # Change partition name based on Cannon or FASSE and resources required
#SBATCH --reservation=bootcamp_cpu_2023 # Remove or update this reservation if it is no longer active
#SBATCH -o %A_%a.o
#SBATCH -e %A_%a.e
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:20:00
#SBATCH --mem=4G

# Declare 'BASEDIR' and 'WORKDIR' based on desired locations
BASEDIR=$PWD
WORKDIR=$PWD/$1
OUTDIR=$PWD/$4

# Check if the type of entries, file (f) or directory (d), is provided
TYPE=$2
if [ -z "$TYPE" ] || [ $# -lt 4 ]
then
    echo "The script needs the type of entries, file (f) or directory (d), to be specified."
    exit 1
fi

echo "type of entry is $TYPE"

# Check if the keyword to search for desired input files is provided
# by the user or not
KEYWORD=$3
if [ -z "$KEYWORD" ] || [ $# -lt 4 ]
then
    echo "This script requires the keyword used to search for the desired entries as its 3rd command line argument."
    exit 1
fi

echo "keyword is $KEYWORD"

OFFSET=$5
if [ -z "$OFFSET" ]
then
OFFSET=0
fi

# Count the entries listed in joblist.txt (written by main.sh)
ENTRIES=$(wc -l < joblist.txt)
echo "Number of entries is $ENTRIES"

# If there are no more entries to process, then exit
FOLDER_NUMBER=$((OFFSET + SLURM_ARRAY_TASK_ID))
if [ "$FOLDER_NUMBER" -ge "$ENTRIES" ]
then
    exit 0
fi

echo "At Offset $OFFSET and folder number $FOLDER_NUMBER for task ID $SLURM_ARRAY_TASK_ID"

cd $BASEDIR
echo "In $BASEDIR"

entryname=$(awk "NR==$((FOLDER_NUMBER+1))" joblist.txt)

echo "Job array ID: $SLURM_ARRAY_JOB_ID , sub-job $SLURM_ARRAY_TASK_ID is running!"
echo "Sub-job $SLURM_ARRAY_TASK_ID is processing $entryname"

# Do science here
DIRNAME=$OUTDIR-${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID}
mkdir -p "$DIRNAME"
pushd "$DIRNAME" > /dev/null

$entryname
echo "$entryname" > $DIRNAME/output.log

popd > /dev/null

echo "Done processing $entryname"
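The OFFSET arithmetic above is what lets a task in, say, the second sbatch chunk reach past the first 1000 lines of joblist.txt. A local sketch of the lookup, using a stand-in joblist and made-up IDs:

```shell
# Stand-in joblist with 2500 fake entries in a temp directory
tmp=$(mktemp -d)
seq 1 2500 | sed 's/^/entry-/' > "$tmp/joblist.txt"

# A task from the second sbatch chunk: OFFSET=1000, task ID 3
OFFSET=1000
SLURM_ARRAY_TASK_ID=3
FOLDER_NUMBER=$((OFFSET + SLURM_ARRAY_TASK_ID))

# awk's NR is 1-based while FOLDER_NUMBER is 0-based, hence the +1
entryname=$(awk "NR==$((FOLDER_NUMBER+1))" "$tmp/joblist.txt")
echo "$entryname"   # entry-1004

rm -rf "$tmp"
```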
@@ -0,0 +1 @@
echo 'Hello World'
@@ -0,0 +1 @@
echo 'Hello World'
@@ -0,0 +1 @@
echo 'Hello World'
@@ -0,0 +1 @@
echo 'Hello World'
64 changes: 64 additions & 0 deletions Notes/JobArrays/Exercise_nonsequential_maxarray/main.sh
@@ -0,0 +1,64 @@
#!/bin/bash

# Bash script to list the number of entries with a given keyword to be
# processed that are located inside a working directory. The script
# accepts this keyword as a command line argument. It then submits a
# Slurm job as an array job of array size equal to the number of
# entries to be processed with a batch script.

# Usage:
# ./main.sh input-folder-name type keyword output-folder-name
# For example: ./main.sh directories f job.sh output

# Change 'WORKDIR' based on desired location
WORKDIR=$PWD/$1

echo "Workdir is $WORKDIR"

# Check if the type of entries, file (f) or directory (d), is provided
TYPE=$2
if [ -z "$TYPE" ] || [ $# -lt 4 ]
then
    echo "The script needs the type of entries, file (f) or directory (d), to be specified."
    exit 1
fi

echo "type of entry is $TYPE"

# Check if the keyword to search for desired input files is provided
# by the user or not
KEYWORD=$3
if [ -z "$KEYWORD" ] || [ $# -lt 4 ]
then
    echo "The script, main.sh, requires the keyword used to search for the desired entries as its 3rd command line argument."
    exit 1
fi

echo "keyword is $KEYWORD"

# Find entries inside $WORKDIR with the given keyword in their names
# and redirect the output to joblist.txt
rm -f joblist.txt
find $WORKDIR -type $TYPE -name "*$KEYWORD*" 2> /dev/null > joblist.txt

# Count the number of entries to be processed
ENTRIES=$(wc -l < joblist.txt)

echo "Number of entries is $ENTRIES"

# ENTRIES is the number of files/folders that need to be processed.
# Each iteration of this loop schedules an sbatch array of size
# LIMIT+1: it compares ENTRIES to MAX_ARRAY_SIZE and assigns the
# lesser value (minus one, for 0-based array indices) to LIMIT.
# OFFSET and ENTRIES are then updated based on their previous values
# and the value of LIMIT.
OFFSET=0
MAX_ARRAY_SIZE=1000
while [[ $ENTRIES -gt 0 ]]
do
LIMIT=$(( ENTRIES > MAX_ARRAY_SIZE ? MAX_ARRAY_SIZE - 1 : ENTRIES - 1 ))
sbatch --array=0-$LIMIT array_script.sh $1 $2 $3 $4 $OFFSET
OFFSET=$((OFFSET + (LIMIT + 1) ))
ENTRIES=$(( ENTRIES - (LIMIT + 1) ))
done
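The loop above can be dry-run without submitting anything by replacing sbatch with echo. With an assumed 2500 entries and a cap of 1000, it plans three arrays:

```shell
# Dry run of the chunking arithmetic; 2500 is an assumed entry count
ENTRIES=2500
MAX_ARRAY_SIZE=1000
OFFSET=0
while [[ $ENTRIES -gt 0 ]]
do
    LIMIT=$(( ENTRIES > MAX_ARRAY_SIZE ? MAX_ARRAY_SIZE - 1 : ENTRIES - 1 ))
    echo "would run: sbatch --array=0-$LIMIT array_script.sh ... OFFSET=$OFFSET"
    OFFSET=$((OFFSET + LIMIT + 1))
    ENTRIES=$((ENTRIES - LIMIT - 1))
done
# Plans arrays 0-999 (offset 0), 0-999 (offset 1000), 0-499 (offset 2000)
```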
95 changes: 95 additions & 0 deletions Notes/JobArrays/Exercise_recursive/recursive-array.sh
@@ -0,0 +1,95 @@
#!/bin/bash

# This script submits an array job on a single node, with the
# highest-array-index task launching the next set of jobs recursively.

# Usage:
# sbatch recursive-array.sh multiplier-initial-value jobs-max-limit
# For example:
# sbatch recursive-array.sh 1 60000

#SBATCH --partition=test # Change partition name based on Cannon or FASSE and resources required
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=01:00:00
#SBATCH --mem-per-cpu=100M
#SBATCH --array=[0-4]

# Does not redirect output or error to files.

# Check if max limit for jobs is provided by the user or not
JOBLIMIT=$2
if [ -z "$JOBLIMIT" ] || [ $# -lt 2 ]
then
    echo "This script requires the total number of jobs to run as its 2nd command line argument"
    exit 1
fi

echo "JOBLIMIT is $JOBLIMIT"

if [ "$JOBLIMIT" -lt "$SLURM_ARRAY_TASK_COUNT" ]
then
    echo "The max limit for jobs needs to be at least the total number of array tasks being launched"
    exit 1
fi

# Load the latest anaconda module to get the latest python version
module load anaconda3/2022.05

# To keep track of Iteration, create a counter
MULTIPLIER=$1
if [ -z "$MULTIPLIER" ]
then
MULTIPLIER=1
fi

WORKDIR=$PWD
echo "Main directory for recursive-array script is: $WORKDIR"

# Create a directory corresponding to each iteration,
# Slurm Job ID, & Slurm Array Index.
DIRNAME=d${MULTIPLIER}-${SLURM_ARRAY_JOB_ID}-${SLURM_ARRAY_TASK_ID}
mkdir $DIRNAME

# Switch to that directory, capture output, and
# execute your science there
echo "Entering directory $DIRNAME"
pushd $DIRNAME
echo "In $DIRNAME" >> screenlog.out

echo "Multiplier is $MULTIPLIER" >> screenlog.out

# Determine my task-id
MY_TASK=$((MULTIPLIER * 10 + SLURM_ARRAY_TASK_ID))

echo "Job array ID: $SLURM_ARRAY_JOB_ID , sub-job $SLURM_ARRAY_TASK_ID is running!" >> screenlog.out
echo "Highest job array index value is $SLURM_ARRAY_TASK_MAX" >> screenlog.out
echo "Number of tasks in array job is $SLURM_ARRAY_TASK_COUNT" >> screenlog.out
echo "MY_TASK is $MY_TASK" >> screenlog.out

# Science happens here
python $WORKDIR/test_function_script.py > test-array-express-$SLURM_ARRAY_JOB_ID-$SLURM_ARRAY_TASK_ID.out

# Switch back to the working directory
echo "Exiting current directory and moving back to directory of origin" >> screenlog.out
popd

# Quit when MULTIPLIER reaches JOBLIMIT / SLURM_ARRAY_TASK_COUNT
if [ $MULTIPLIER -eq $((JOBLIMIT / SLURM_ARRAY_TASK_COUNT)) ]
then
exit
fi

# Start next generation of iterations only when array index matches
# the highest array index
if [ $SLURM_ARRAY_TASK_ID -eq $SLURM_ARRAY_TASK_MAX ]
then
echo "Continuing Next Iteration"
sbatch recursive-array.sh $((MULTIPLIER + 1)) $JOBLIMIT
ERROR=$?
if [ $ERROR -ne 0 ]
then
echo "This iteration failed. Submit the script again: sbatch recursive-array.sh $((MULTIPLIER + 1)) $JOBLIMIT"
exit
fi
fi
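Because each generation runs SLURM_ARRAY_TASK_COUNT tasks and recursion stops when MULTIPLIER reaches JOBLIMIT / SLURM_ARRAY_TASK_COUNT, the total job count works out as sketched below (example values taken from the usage comment; real values depend on the submission):

```shell
# Assumed example values matching the usage comment above
JOBLIMIT=60000
SLURM_ARRAY_TASK_COUNT=5   # --array=[0-4] gives 5 tasks per generation

GENERATIONS=$((JOBLIMIT / SLURM_ARRAY_TASK_COUNT))
TOTAL=$((GENERATIONS * SLURM_ARRAY_TASK_COUNT))
echo "$GENERATIONS generations x $SLURM_ARRAY_TASK_COUNT tasks = $TOTAL jobs"
```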
25 changes: 25 additions & 0 deletions Notes/JobArrays/Exercise_recursive/test_function_script.py
@@ -0,0 +1,25 @@
import math
import random as rnd


def run_random_func(run_time):
    test_list = []
    sum_test = 0
    for k in range(run_time):
        for i in range(run_time):
            # Note: reseeding inside the loop makes every draw identical
            rnd.seed(0)
            test_list.append(rnd.random())
            print("Random number in list is", test_list[i])
        for j in test_list:
            print("Number in test list is", j)
            sum_test += math.sqrt(j)
            print("sum is", sum_test)
        test_list = []
        print("\n")
        print("End of iteration", k)
        print("\n")


if __name__ == '__main__':
    # Increase the argument to make the script run longer
    run_random_func(5)
1 change: 1 addition & 0 deletions Notes/JobArrays/Exercise_sequential_basic/d1/a.txt
@@ -0,0 +1 @@
Hello, this is directory 1!
1 change: 1 addition & 0 deletions Notes/JobArrays/Exercise_sequential_basic/d2/a.txt
@@ -0,0 +1 @@
Hello, this is directory 2!
1 change: 1 addition & 0 deletions Notes/JobArrays/Exercise_sequential_basic/d3/a.txt
@@ -0,0 +1 @@
Hello, this is directory 3!
1 change: 1 addition & 0 deletions Notes/JobArrays/Exercise_sequential_basic/d4/a.txt
@@ -0,0 +1 @@
Hello, this is directory 4!
15 changes: 15 additions & 0 deletions Notes/JobArrays/Exercise_sequential_basic/sample_array_job.bash
@@ -0,0 +1,15 @@
#!/bin/bash

#SBATCH --partition=test # Change partition name based on Cannon or FASSE and resources required
#SBATCH --job-name=test_array
#SBATCH --time=00:05:00
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --output=%A_%a.out
#SBATCH --error=%A_%a.err
#SBATCH --array=1-4%4 # Run a 4-job array; up to 4 jobs will be launched at the same time

echo "Job array ID: $SLURM_ARRAY_JOB_ID , sub-job $SLURM_ARRAY_TASK_ID is running!"

cd d$SLURM_ARRAY_TASK_ID
cat a.txt
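The whole exercise can be simulated locally without Slurm by fixing SLURM_ARRAY_TASK_ID by hand. A sketch that recreates the d1..d4 layout in a temp directory and acts as task 3:

```shell
# Recreate the d1..d4 layout in a temp directory (throwaway copy)
tmp=$(mktemp -d)
cd "$tmp"
for i in 1 2 3 4; do
    mkdir "d$i"
    echo "Hello, this is directory $i!" > "d$i/a.txt"
done

SLURM_ARRAY_TASK_ID=3   # Slurm sets this for each sub-job in a real run
cd "d$SLURM_ARRAY_TASK_ID"
cat a.txt               # Hello, this is directory 3!

cd /
rm -rf "$tmp"
```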
@@ -0,0 +1,15 @@
#!/bin/bash

#SBATCH --partition=test # Change partition name based on Cannon or FASSE and resources required
#SBATCH --nodes 1
#SBATCH --cpus-per-task=1
#SBATCH -t 1:00:00
#SBATCH --mem=1G
#SBATCH --job-name="Fastqc_arrayjob"
#SBATCH --output=%A-%a.out
#SBATCH --error=%A-%a.err
#SBATCH --array=1-3%3 ## This will submit an array of 3 jobs, all 3 running at the same time


# Run fastqc on each sample using the SLURM_ARRAY_TASK_ID environment variable
singularity exec /cvmfs/singularity.galaxyproject.org/f/a/fastqc:0.12.1--hdfd78af_0 fastqc wgEncode${SLURM_ARRAY_TASK_ID}_Sub.fq
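The task ID is spliced directly into the input filename, so task N processes wgEncodeN_Sub.fq. A quick sketch of the substitution with an example task ID:

```shell
# Example task ID; Slurm sets this for each sub-job in a real run
SLURM_ARRAY_TASK_ID=2
echo "wgEncode${SLURM_ARRAY_TASK_ID}_Sub.fq"   # wgEncode2_Sub.fq
```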
