# Random forest (F2PY + MPI) — 66k training instances

In [1]:
%%writefile rfnm6.py
import time as tm, parf003mpi

# Driver for the parallel random forest module.  The batch script launches
# one copy of this file per MPI task; every copy makes the call below, and
# only the copy whose returned rank is 0 prints the report.
resu = parf003mpi.random_forest(
    "datasets/asteroid-train-66k.arff",
    "datasets/asteroid-test-34k.arff",
)

# Only the first nine returned values are used, in this fixed order.
(
    p_error_count,
    p_oob_count,
    p_kappa_value,
    p_instance_count,
    p_error,
    p_testset_kappa_value,
    p_time,
    p_rank,
    p_size,
) = resu[:9]

if p_rank == 0:
    # Report NaN instead of dividing by zero when p_oob_count is 0.
    if p_oob_count:
        trainset_error_pct = p_error_count * 100 / p_oob_count
    else:
        trainset_error_pct = float("nan")

    print('Trainset classification error is',
          f'{trainset_error_pct:.2f}%',
          f'of {p_oob_count} (kappa: {p_kappa_value:.4f})')
    # p_error is multiplied by 100 here, i.e. it is treated as a fraction.
    print(f' Testset classification error is {p_error * 100:.2f}%',
          f'of {p_instance_count} (kappa: {p_testset_kappa_value:.4f})')
    # T is the time value returned by the module, N the returned size value
    # (presumably the number of MPI processes -- confirm against parf003mpi).
    print(f'T: {p_time:.4f}  |  N: {p_size:0g}')

Overwriting rfnm6.py


## Copy files to /scratch

In [2]:
# Copy the driver to the scratch mirror of the notebook directory
# (a leading /prj in $PWD is stripped and /scratch is prefixed).
# The trailing slash makes cp fail if that directory is missing instead of
# silently creating a regular file with the directory's name.
! cp rfnm6.py "/scratch${PWD#/prj}/"

## Slurm batch script

In [3]:
%%writefile rfnm6.srm
#!/bin/bash
# Slurm batch script: prints a short job summary, sets up the Intel/conda
# environment, then launches rfnm6.py with srun using one process per task.
# The --ntasks value below is only a default; the notebook passes
# --ntasks=N (and, for the check run, --partition) on the sbatch command line.
#SBATCH --job-name rfnm6       # Job name
#SBATCH --partition cpu_small  # Select partition
#SBATCH --ntasks=1             # Total tasks
#SBATCH --time=00:05:00        # Limit execution time
#SBATCH --exclusive            # Exclusive access to nodes

echo '========================================'
echo '- Job ID:' "$SLURM_JOB_ID"
# Left unquoted on purpose: this variable is empty when --ntasks-per-node is
# not given, and quoting an empty value would append a trailing space.
echo '- Tasks per node:' $SLURM_NTASKS_PER_NODE
echo '- # of nodes in the job:' "$SLURM_JOB_NUM_NODES"
echo '- # of tasks:' "$SLURM_NTASKS"
echo '- Dir from which sbatch was invoked:' "${SLURM_SUBMIT_DIR##*/}"
cd "$SLURM_SUBMIT_DIR"
echo -n '- List of nodes allocated to the job: '
# Quoted: the compressed node list (e.g. "node[01-04]") is a valid glob
# pattern and must reach nodeset unexpanded.
nodeset -e "$SLURM_JOB_NODELIST"

# Environment
echo '-- modules ----------------------------'
module load intel_psxe
source /opt/intel/parallel_studio_xe_2020/intelpython3/etc/profile.d/conda.sh
# Build the scratch path from the home directory: change to $HOME, strip a
# leading /prj from it (if present) and prefix /scratch.
cd
SCR=/scratch${PWD#/prj}
conda activate --stack "$SCR/env4"
export I_MPI_VAR_CHECK_SPELLING=0
# Stop here if the working directory is missing; otherwise srun would look
# for rfnm6.py in $HOME and could run a stale copy or fail obscurely.
cd "$SCR/rf" || { echo "cannot cd to $SCR/rf" >&2; exit 1; }

# Executable config
EXEC="python rfnm6.py"

# Start
echo '-- run --------------------------------'
echo '$ srun --mpi=pmi2 -n' "$SLURM_NTASKS" "${EXEC##*/}"
echo '-- output -----------------------------'
# $EXEC is deliberately unquoted so it splits into the command and its argument.
srun --mpi=pmi2 -n "$SLURM_NTASKS" $EXEC
echo '~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

Overwriting rfnm6.srm


## Check

In [4]:
# Smoke test: one task on the cpu_dev partition (overrides the script's cpu_small).
! sbatch --ntasks=1 --partition=cpu_dev rfnm6.srm

Submitted batch job 1349256


In [5]:
# Status of the check job: id, partition, name, state, elapsed time, nodes, CPUs.
! squeue --partition=cpu_dev --name=rfnm6 -o "%.8i  %.9P %.5j %.2t %.5M %.5D %.4C"

   JOBID  PARTITION  NAME ST  TIME NODES CPUS
 1349256    cpu_dev rfnm6  R  0:06     1   24


In [7]:
# Poll again; an empty table means the check job has left the queue.
! squeue --partition=cpu_dev --name=rfnm6 -o "%.8i  %.9P %.5j %.2t %.5M %.5D %.4C"

   JOBID  PARTITION  NAME ST  TIME NODES CPUS


In [8]:
# Show the log of the check job; the path is quoted so a directory name
# containing spaces or glob characters is passed to cat as one argument.
! cat "/scratch${PWD#/prj}/slurm-1349256.out"

- Job ID: 1349256
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1264
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 1 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.07% of 66000 (kappa: 0.9901)
 Testset classification error is 0.44% of 34000 (kappa: 0.9332)
T: 144.0629  |  N: 1
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Run

### 1 of (1, 4, 16, 24, 48, 72, 96)

In [11]:
%%bash
# Three repetitions of the 1-task configuration.
for _ in 1 2 3; do
    sbatch --ntasks=1 rfnm6.srm
done

Submitted batch job 1349296
Submitted batch job 1349297
Submitted batch job 1349298


In [2]:
%%bash
# Logs of the three 1-task jobs; the path is quoted so it is never
# word-split or glob-expanded.
for job_id in 1349296 1349297 1349298; do
    cat "/scratch${PWD#/prj}/slurm-${job_id}.out"
done

- Job ID: 1349296
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1286
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 1 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.05% of 66000 (kappa: 0.9917)
 Testset classification error is 0.63% of 34000 (kappa: 0.9051)
T: 144.4652  |  N: 1
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1349297
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1286
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 1 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.07% of 66000 (kappa: 0.9888)
 Testset classification error is 0.48% of 34000 (kappa: 0.9269)
T: 140.4211  |  N: 1
~~ end

### 4 of (1, 4, 16, 24, 48, 72, 96)

In [12]:
%%bash
# Three repetitions of the 4-task configuration.
for _ in 1 2 3; do
    sbatch --ntasks=4 rfnm6.srm
done

Submitted batch job 1349299
Submitted batch job 1349300
Submitted batch job 1349301


In [3]:
%%bash
# Logs of the three 4-task jobs; the path is quoted so it is never
# word-split or glob-expanded.
for job_id in 1349299 1349300 1349301; do
    cat "/scratch${PWD#/prj}/slurm-${job_id}.out"
done

- Job ID: 1349299
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 4
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1286
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 4 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.05% of 66000 (kappa: 0.9917)
 Testset classification error is 0.63% of 34000 (kappa: 0.9042)
T: 46.2226  |  N: 4
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1349300
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 4
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1489
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 4 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.06% of 66000 (kappa: 0.9906)
 Testset classification error is 0.51% of 34000 (kappa: 0.9234)
T: 48.3499  |  N: 4
~~ end ~

### 16 of (1, 4, 16, 24, 48, 72, 96)

In [13]:
%%bash
# Three repetitions of the 16-task configuration.
for _ in 1 2 3; do
    sbatch --ntasks=16 rfnm6.srm
done

Submitted batch job 1349302
Submitted batch job 1349303
Submitted batch job 1349304


In [1]:
%%bash
# Logs of the three 16-task jobs; the path is quoted so it is never
# word-split or glob-expanded.
for job_id in 1349302 1349303 1349304; do
    cat "/scratch${PWD#/prj}/slurm-${job_id}.out"
done

- Job ID: 1349302
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 16
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1489
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 16 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.06% of 66000 (kappa: 0.9915)
 Testset classification error is 0.88% of 34000 (kappa: 0.8672)
T: 22.8765  |  N: 16
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1349303
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 16
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1286
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 16 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.06% of 66000 (kappa: 0.9908)
 Testset classification error is 0.69% of 34000 (kappa: 0.8949)
T: 23.0209  |  N: 16
~~

### 24 of (1, 4, 16, 24, 48, 72, 96)

In [14]:
%%bash
# Three repetitions of the 24-task configuration.
for _ in 1 2 3; do
    sbatch --ntasks=24 rfnm6.srm
done

Submitted batch job 1349305
Submitted batch job 1349306
Submitted batch job 1349307


In [5]:
%%bash
# Logs of the three 24-task jobs; the path is quoted so it is never
# word-split or glob-expanded.
for job_id in 1349305 1349306 1349307; do
    cat "/scratch${PWD#/prj}/slurm-${job_id}.out"
done

- Job ID: 1349305
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 24
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1482
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 24 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.05% of 66000 (kappa: 0.9917)
 Testset classification error is 0.44% of 34000 (kappa: 0.9336)
T: 20.4023  |  N: 24
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1349306
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 24
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1483
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 24 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.07% of 66000 (kappa: 0.9901)
 Testset classification error is 0.66% of 34000 (kappa: 0.8998)
T: 17.3848  |  N: 24
~~

### 48 of (1, 4, 16, 24, 48, 72, 96)

In [15]:
%%bash
# Three repetitions of the 48-task configuration.
for _ in 1 2 3; do
    sbatch --ntasks=48 rfnm6.srm
done

Submitted batch job 1349308
Submitted batch job 1349309
Submitted batch job 1349310


In [4]:
%%bash
# Logs of the three 48-task jobs; the path is quoted so it is never
# word-split or glob-expanded.
for job_id in 1349308 1349309 1349310; do
    cat "/scratch${PWD#/prj}/slurm-${job_id}.out"
done

- Job ID: 1349308
- Tasks per node:
- # of nodes in the job: 2
- # of tasks: 48
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1482 sdumont1483
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 48 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.08% of 66000 (kappa: 0.9885)
 Testset classification error is 0.61% of 34000 (kappa: 0.9082)
T: 17.0600  |  N: 48
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1349309
- Tasks per node:
- # of nodes in the job: 2
- # of tasks: 48
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1478 sdumont1479
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 48 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.05% of 66000 (kappa: 0.9931)
 Testset classification error is 0.51% of 34000 (kappa: 0.9234)

### 72 of (1, 4, 16, 24, 48, 72, 96)

In [16]:
%%bash
# Three repetitions of the 72-task configuration.
for _ in 1 2 3; do
    sbatch --ntasks=72 rfnm6.srm
done

Submitted batch job 1349311
Submitted batch job 1349312
Submitted batch job 1349313


In [3]:
%%bash
# Logs of the three 72-task jobs; the path is quoted so it is never
# word-split or glob-expanded.
for job_id in 1349311 1349312 1349313; do
    cat "/scratch${PWD#/prj}/slurm-${job_id}.out"
done

- Job ID: 1349311
- Tasks per node:
- # of nodes in the job: 3
- # of tasks: 72
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1286 sdumont1482 sdumont1483
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 72 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.06% of 66000 (kappa: 0.9913)
 Testset classification error is 1.13% of 34000 (kappa: 0.8294)
T: 18.3066  |  N: 72
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1349312
- Tasks per node:
- # of nodes in the job: 3
- # of tasks: 72
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1478 sdumont1479 sdumont1480
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 72 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.05% of 66000 (kappa: 0.9929)
 Testset classification error is 0.51% 

### 96 of (1, 4, 16, 24, 48, 72, 96)

In [17]:
%%bash
# Three repetitions of the 96-task configuration.
for _ in 1 2 3; do
    sbatch --ntasks=96 rfnm6.srm
done

Submitted batch job 1349314
Submitted batch job 1349315
Submitted batch job 1349316


In [2]:
%%bash
# Logs of the three 96-task jobs; the path is quoted so it is never
# word-split or glob-expanded.
for job_id in 1349314 1349315 1349316; do
    cat "/scratch${PWD#/prj}/slurm-${job_id}.out"
done

- Job ID: 1349314
- Tasks per node:
- # of nodes in the job: 4
- # of tasks: 96
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1478 sdumont1479 sdumont1480 sdumont1481
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 96 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.06% of 66000 (kappa: 0.9910)
 Testset classification error is 0.56% of 34000 (kappa: 0.9158)
T: 18.5568  |  N: 96
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1349315
- Tasks per node:
- # of nodes in the job: 4
- # of tasks: 96
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1278 sdumont1286 sdumont1482 sdumont1483
-- modules ----------------------------
-- run --------------------------------
$ srun --mpi=pmi2 -n 96 python rfnm6.py
-- output -----------------------------
Trainset classification error is 0.05% of 66000 (kappa: 0.9924)
 Testset classi

In [12]:
# Count the current user's queue entries (no header, one line per array element).
! squeue --user=$(whoami) --noheader --array | wc -l

78


In [13]:
# Count all queue entries in cpu_small (no header, one line per array element).
! squeue --noheader --array --partition=cpu_small | wc -l

430


In [9]:
# Expected start times of pending rfnm6 jobs, sorted by job id.
! squeue --start --name=rfnm6 --sort=i --format="%S  %.8i  %.9P %.5j %.2t %.5M %.5D %.4C"

START_TIME     JOBID  PARTITION  NAME ST  TIME NODES CPUS


In [1]:
# Same query repeated later; an empty table means no rfnm6 job is still pending.
! squeue --start --name=rfnm6 --sort=i --format="%S  %.8i  %.9P %.5j %.2t %.5M %.5D %.4C"

START_TIME     JOBID  PARTITION  NAME ST  TIME NODES CPUS
