# RF F90 MPI 66 k

#### Slurm batch script

In [10]:
%%writefile rffm6.srm
#!/bin/bash
#SBATCH --job-name rffm6       # Job name
#SBATCH --partition cpu_small  # Select partition
#SBATCH --ntasks=1             # Total tasks
#SBATCH --time=00:05:00        # Limit execution time
#SBATCH --exclusive            # Exclusive acccess to nodes

echo '========================================'
echo '- Job ID:' $SLURM_JOB_ID
echo '- Tasks per node:' $SLURM_NTASKS_PER_NODE
echo '- # of nodes in the job:' $SLURM_JOB_NUM_NODES
echo '- # of tasks:' $SLURM_NTASKS
echo '- Dir from which sbatch was invoked:' ${SLURM_SUBMIT_DIR##*/}
cd $SLURM_SUBMIT_DIR
echo -n '- List of nodes allocated to the job: '
nodeset -e $SLURM_JOB_NODELIST

# Environment
echo '-- modules ----------------------------'
module load intel_psxe/2020
SCR=/scratch${PWD#"/prj"}
DT1=datasets/asteroid-train-66k.arff
DT2=datasets/asteroid-test-34k.arff
EXEC="parf -t "$DT1" -a "$DT2

# Start
echo '-- run --------------------------------'
echo '$ srun -n' $SLURM_NTASKS $e
echo '-- output -----------------------------'
srun -n $SLURM_NTASKS $EXEC
echo '~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

Overwriting rffm6.srm


## Check

In [11]:
! sbatch --partition=cpu_dev --ntasks=24 rffm6.srm

Submitted batch job 1344918


In [12]:
! squeue -n rfm -o "%.18i  %.9P  %.2t %.5M %.5D %.4C" --partition=cpu_dev

             JOBID  PARTITION  ST  TIME NODES CPUS


In [13]:
! squeue --partition=cpu_dev -h -r | wc -l

1


In [17]:
! squeue -n rfm -o "%.18i  %.9P  %.2t %.5M %.5D %.4C" --partition=cpu_dev

             JOBID  PARTITION  ST  TIME NODES CPUS


In [18]:
! cat /scratch${PWD#"/prj"}/slurm-1344918.out

- Job ID: 1344918
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 24
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1243
-- modules ----------------------------
-- run --------------------------------
$ srun -n 24
-- output -----------------------------
Trainset classification error is   0.07% of   66000 (kappa: 0.9901 )
 Testset classification error is   0.56% of   34000 (kappa: 0.9158 )
T: 17.4673  |  N: 24
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Run

### 1 of (1, 4, 16, 24, 48, 72, 96)

In [19]:
%%bash
sbatch --ntasks=1 rffm6.srm
sbatch --ntasks=1 rffm6.srm
sbatch --ntasks=1 rffm6.srm

Submitted batch job 1344919
Submitted batch job 1344920
Submitted batch job 1344921


In [8]:
%%bash
cat /scratch${PWD#"/prj"}/slurm-1344919.out
cat /scratch${PWD#"/prj"}/slurm-1344920.out
cat /scratch${PWD#"/prj"}/slurm-1344921.out

- Job ID: 1344919
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1450
-- modules ----------------------------
-- run --------------------------------
$ srun -n 1
-- output -----------------------------
Trainset classification error is   0.06% of   66000 (kappa: 0.9913 )
 Testset classification error is   0.75% of   34000 (kappa: 0.8864 )
T: 142.4932  |  N: 1
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1344920
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1450
-- modules ----------------------------
-- run --------------------------------
$ srun -n 1
-- output -----------------------------
Trainset classification error is   0.05% of   66000 (kappa: 0.9931 )
 Testset classification error is   0.45% of   34000 (kappa: 0.9314 )
T: 138.0251  |  N: 1
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


### 4 of (1, 4, 16, 24, 48, 72, 96)

In [20]:
%%bash
sbatch --ntasks=4 rffm6.srm
sbatch --ntasks=4 rffm6.srm
sbatch --ntasks=4 rffm6.srm

Submitted batch job 1344922
Submitted batch job 1344923
Submitted batch job 1344924


In [1]:
%%bash
cat /scratch${PWD#"/prj"}/slurm-1344922.out
cat /scratch${PWD#"/prj"}/slurm-1344923.out
cat /scratch${PWD#"/prj"}/slurm-1344924.out

- Job ID: 1344922
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 4
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1450
-- modules ----------------------------
-- run --------------------------------
$ srun -n 4
-- output -----------------------------
Trainset classification error is   0.06% of   66000 (kappa: 0.9915 )
 Testset classification error is   0.65% of   34000 (kappa: 0.9015 )
T: 46.7112  |  N: 4
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1344923
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 4
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1450
-- modules ----------------------------
-- run --------------------------------
$ srun -n 4
-- output -----------------------------
Trainset classification error is   0.06% of   66000 (kappa: 0.9913 )
 Testset classification error is   0.45% of   34000 (kappa: 0.9323 )
T: 44.1341  |  N: 4
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- 

### 16 of (1, 4, 16, 24, 48, 72, 96)

In [21]:
%%bash
sbatch --ntasks=16 rffm6.srm
sbatch --ntasks=16 rffm6.srm
sbatch --ntasks=16 rffm6.srm

Submitted batch job 1344926
Submitted batch job 1344927
Submitted batch job 1344928


In [7]:
%%bash
cat /scratch${PWD#"/prj"}/slurm-1344926.out
cat /scratch${PWD#"/prj"}/slurm-1344927.out
cat /scratch${PWD#"/prj"}/slurm-1344928.out

- Job ID: 1344926
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 16
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1450
-- modules ----------------------------
-- run --------------------------------
$ srun -n 16
-- output -----------------------------
Trainset classification error is   0.07% of   66000 (kappa: 0.9894 )
 Testset classification error is   0.82% of   34000 (kappa: 0.8753 )
T: 21.0287  |  N: 16
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1344927
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 16
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1450
-- modules ----------------------------
-- run --------------------------------
$ srun -n 16
-- output -----------------------------
Trainset classification error is   0.06% of   66000 (kappa: 0.9908 )
 Testset classification error is   0.51% of   34000 (kappa: 0.9220 )
T: 20.8398  |  N: 16
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

### 24 of (1, 4, 16, 24, 48, 72, 96)

In [22]:
%%bash
sbatch --ntasks=24 rffm6.srm
sbatch --ntasks=24 rffm6.srm
sbatch --ntasks=24 rffm6.srm

Submitted batch job 1344929
Submitted batch job 1344930
Submitted batch job 1344931


In [6]:
%%bash
cat /scratch${PWD#"/prj"}/slurm-1344929.out
cat /scratch${PWD#"/prj"}/slurm-1344930.out
cat /scratch${PWD#"/prj"}/slurm-1344931.out

- Job ID: 1344929
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 24
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1450
-- modules ----------------------------
-- run --------------------------------
$ srun -n 24
-- output -----------------------------
Trainset classification error is   0.05% of   66000 (kappa: 0.9929 )
 Testset classification error is   0.48% of   34000 (kappa: 0.9269 )
T: 18.2953  |  N: 24
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1344930
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 24
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1450
-- modules ----------------------------
-- run --------------------------------
$ srun -n 24
-- output -----------------------------
Trainset classification error is   0.05% of   66000 (kappa: 0.9927 )
 Testset classification error is   0.45% of   34000 (kappa: 0.9314 )
T: 20.4035  |  N: 24
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

### 48 of (1, 4, 16, 24, 48, 72, 96)

In [23]:
%%bash
sbatch --ntasks=48 rffm6.srm
sbatch --ntasks=48 rffm6.srm
sbatch --ntasks=48 rffm6.srm

Submitted batch job 1344932
Submitted batch job 1344933
Submitted batch job 1344934


In [5]:
%%bash
cat /scratch${PWD#"/prj"}/slurm-1344932.out
cat /scratch${PWD#"/prj"}/slurm-1344933.out
cat /scratch${PWD#"/prj"}/slurm-1344934.out

- Job ID: 1344932
- Tasks per node:
- # of nodes in the job: 2
- # of tasks: 48
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1449 sdumont1451
-- modules ----------------------------
-- run --------------------------------
$ srun -n 48
-- output -----------------------------
Trainset classification error is   0.06% of   66000 (kappa: 0.9908 )
 Testset classification error is   0.47% of   34000 (kappa: 0.9287 )
T: 14.7529  |  N: 48
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1344933
- Tasks per node:
- # of nodes in the job: 2
- # of tasks: 48
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1449 sdumont1451
-- modules ----------------------------
-- run --------------------------------
$ srun -n 48
-- output -----------------------------
Trainset classification error is   0.06% of   66000 (kappa: 0.9915 )
 Testset classification error is   0.75% of   34000 (kappa: 0.8864 )
T: 14.7901  |  N: 48
~~ end ~~~~~

### 72 of (1, 4, 16, 24, 48, 72, 96)

In [24]:
%%bash
sbatch --ntasks=72 rffm6.srm
sbatch --ntasks=72 rffm6.srm
sbatch --ntasks=72 rffm6.srm

Submitted batch job 1344935
Submitted batch job 1344936
Submitted batch job 1344937


In [4]:
%%bash
cat /scratch${PWD#"/prj"}/slurm-1344935.out
cat /scratch${PWD#"/prj"}/slurm-1344936.out
cat /scratch${PWD#"/prj"}/slurm-1344937.out

- Job ID: 1344935
- Tasks per node:
- # of nodes in the job: 3
- # of tasks: 72
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1437 sdumont1438 sdumont1439
-- modules ----------------------------
-- run --------------------------------
$ srun -n 72
-- output -----------------------------
Trainset classification error is   0.05% of   66000 (kappa: 0.9922 )
 Testset classification error is   1.29% of   34000 (kappa: 0.8049 )
T: 16.1033  |  N: 72
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1344936
- Tasks per node:
- # of nodes in the job: 3
- # of tasks: 72
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1440 sdumont1441 sdumont1442
-- modules ----------------------------
-- run --------------------------------
$ srun -n 72
-- output -----------------------------
Trainset classification error is   0.07% of   66000 (kappa: 0.9901 )
 Testset classification error is   1.39% of   34000 (kappa: 0.7893 )
T: 14.343

### 96 of (1, 4, 16, 24, 48, 72, 96)

In [25]:
%%bash
sbatch --ntasks=96 rffm6.srm
sbatch --ntasks=96 rffm6.srm
sbatch --ntasks=96 rffm6.srm

Submitted batch job 1344938
Submitted batch job 1344939
Submitted batch job 1344940


In [3]:
%%bash
cat /scratch${PWD#"/prj"}/slurm-1344938.out
cat /scratch${PWD#"/prj"}/slurm-1344939.out
cat /scratch${PWD#"/prj"}/slurm-1344940.out

- Job ID: 1344938
- Tasks per node:
- # of nodes in the job: 4
- # of tasks: 96
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1446 sdumont1447 sdumont1448 sdumont1449
-- modules ----------------------------
-- run --------------------------------
$ srun -n 96
-- output -----------------------------
Trainset classification error is   0.07% of   66000 (kappa: 0.9892 )
 Testset classification error is   0.47% of   34000 (kappa: 0.9287 )
T: 17.3701  |  N: 96
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1344939
- Tasks per node:
- # of nodes in the job: 4
- # of tasks: 96
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1450 sdumont1451 sdumont1452 sdumont1453
-- modules ----------------------------
-- run --------------------------------
$ srun -n 96
-- output -----------------------------
Trainset classification error is   0.04% of   66000 (kappa: 0.9938 )
 Testset classification error is   1.48% of   34000 (k

In [9]:
! squeue -u $(whoami) -h -t pending,running -r | wc -l

69


In [10]:
! squeue --partition=cpu_small -h -t pending,running -r | wc -l

339


In [27]:
! squeue -n rffm6 -o "%.18i  %.9P  %.2t %.5M %.5D %.4C"

             JOBID  PARTITION  ST  TIME NODES CPUS
           1344919  cpu_small  PD  0:00     1    1
           1344920  cpu_small  PD  0:00     1    1
           1344921  cpu_small  PD  0:00     1    1
           1344922  cpu_small  PD  0:00     1    4
           1344923  cpu_small  PD  0:00     1    4
           1344924  cpu_small  PD  0:00     1    4
           1344926  cpu_small  PD  0:00     1   16
           1344927  cpu_small  PD  0:00     1   16
           1344928  cpu_small  PD  0:00     1   16
           1344929  cpu_small  PD  0:00     1   24
           1344930  cpu_small  PD  0:00     1   24
           1344931  cpu_small  PD  0:00     1   24
           1344932  cpu_small  PD  0:00     2   48
           1344933  cpu_small  PD  0:00     2   48
           1344934  cpu_small  PD  0:00     2   48
           1344935  cpu_small  PD  0:00     3   72
           1344936  cpu_small  PD  0:00     3   72
           1344937  cpu_small  PD  0:00     3   72
           1344938  cpu_small  

In [11]:
! squeue --start --name=rffm6 --format "%S  %.8i  %.9P %.5j %.2t %.5M %.5D %.4C" --sort "i"

START_TIME     JOBID  PARTITION  NAME ST  TIME NODES CPUS
