# RF NUMBA MPI 66 k

## Test

In a terminal, create the `mpi` IPython profile, then start the controller and four MPI engines:

    ipython profile create mpi --parallel
    ipcontroller --ip="*" --profile=mpi --quiet &
    mpirun -n 4 ipengine --location=$(hostname) --profile=mpi --quiet &

In [3]:
%%writefile rfam6.py
import argparse, logging, os, sys, datetime
import pandas as pd, numpy as np
from joblib import ( Parallel, parallel_backend, 
                     register_parallel_backend )
from joblib import delayed, cpu_count
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from scipy.io import arff
import ipyparallel as ipp
from ipyparallel.joblib import IPythonParallelBackend
from numba import jit, objmode
from time import time
t0 = time()

def eng01(clf, X_train, y_train) :
    """Fit ``clf`` while joblib's 'ipyparallel' backend is active.

    'ipyparallel' is the backend name this script registers through
    ``register_parallel_backend`` before the training function is called.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)``
    X_train, y_train : training features and labels, handed to ``fit`` as-is.

    Returns
    -------
    The same ``clf`` object, after ``fit`` has been called on it.
    """
    ipp_backend = parallel_backend('ipyparallel')
    with ipp_backend :
        clf.fit(X_train, y_train)
    return clf

def _load_xy(path) :
    """Load an ARFF file and split it into features and labels.

    Byte flags ``b'N'`` / ``b'Y'`` are mapped to 0 / 1 and the ``class``
    column is decoded from bytes to ``str`` (entries that cannot be decoded
    keep their original value). Missing values are left untouched; the
    caller is responsible for imputation.

    Returns ``(X, y)``: the frame without the ``class`` column, and the
    ``class`` column itself.
    """
    records, _meta = arff.loadarff(path)
    df = pd.DataFrame(records)
    df = df.replace(b'N', 0)
    df = df.replace(b'Y', 1)
    df['class'] = df['class'].str.decode('utf-8').fillna(df['class'])
    return df.drop(columns = ['class']), df['class']


def _error_stats(y_true, y_pred) :
    """Return ``(n_samples, error_percent, cohen_kappa)`` for predictions."""
    n_samples = y_true.size
    # normalize=False yields the COUNT of correct predictions, not a ratio.
    n_correct = metrics.accuracy_score(y_true, y_pred, normalize = False)
    error_pct = ((n_samples - n_correct) / (n_samples)) * 100
    kappa = metrics.cohen_kappa_score(y_true, y_pred)
    return n_samples, error_pct, kappa


@jit(forceobj=True)
def rfamf(trainset, testset) :
    """Train a random forest on ``trainset`` and score it on both sets.

    Parameters
    ----------
    trainset, testset : paths of ARFF files that contain a ``class`` column.

    Returns
    -------
    tuple ``(trtrsi, trperr, trkapp, tetrsi, teperr, tekapp)``:
    sample count, classification error in percent and Cohen's kappa,
    first for the training set and then for the test set.

    Notes
    -----
    * Missing values are replaced by column means learned from the
      training set only; the same fitted imputer is applied to the test
      set, so both sets are filled with identical statistics and nothing
      is learned from the test data.
    * The forest is created without ``random_state``, so the metrics may
      differ slightly from run to run.
    """
    # Get & prepare data
    X_train, y_train = _load_xy(trainset)
    X_test, y_test = _load_xy(testset)

    imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
    X_train = pd.DataFrame(imp.fit_transform(X_train),
                           columns = X_train.columns,
                           index = X_train.index)
    # transform (not fit_transform): reuse the training-set means.
    X_test = pd.DataFrame(imp.transform(X_test),
                          columns = X_test.columns,
                          index = X_test.index)

    # Fit through the ipyparallel backend, then predict locally.
    clf = RandomForestClassifier(n_estimators = 100)
    clf = eng01(clf, X_train, y_train)
    y_pred_test  = clf.predict(X_test)
    y_pred_train = clf.predict(X_train)

    trtrsi, trperr, trkapp = _error_stats(y_train, y_pred_train)
    tetrsi, teperr, tekapp = _error_stats(y_test, y_pred_test)

    return trtrsi, trperr, trkapp, tetrsi, teperr, tekapp

# Main
# Dataset paths are relative to the directory the script is started from.
trainset = "datasets/asteroid-train-66k.arff"
testset  = "datasets/asteroid-test-34k.arff"
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--profile", required=True,
    help="Name of IPython profile to use")
profile = parser.parse_args().profile

# Prepare the engines
c = ipp.Client(profile = profile)
try :
    ncli = len(c.ids)
    bview = c.load_balanced_view()
    # joblib calls the factory whenever the 'ipyparallel' backend is selected.
    register_parallel_backend('ipyparallel',
        lambda : IPythonParallelBackend(view = bview) )

    # Call Numba Code
    ( trtrsi, trperr, trkapp, tetrsi, teperr, tekapp,
         ) = rfamf(trainset, testset)
finally :
    # Shutdown the engines, even when training failed, so that no
    # controller/engine processes are left behind.
    c.shutdown(hub=True, block=False)

# Result
# t0 is taken right after the imports at the top of the script.
t1 = time() - t0
print(f'Trainset classification error is {trperr:.2f}% ',
      f'of {trtrsi} (kappa: {trkapp:.4f})')
print(f' Testset classification error is {teperr:.2f}% ',
      f'of {tetrsi} (kappa: {tekapp:.4f})')
print(f"T: {t1:.4f}  |  N: {ncli:0g}")

Overwriting rfam6.py


In [None]:
! time python rfam6.py -p mpi

In [None]:
# Stand-alone cleanup cell: connect to the cluster of the 'mpi' profile and
# ask it to stop the hub together with its engines, without blocking.
import ipyparallel as ipp
from ipyparallel.joblib import IPythonParallelBackend

c = ipp.Client(profile='mpi')
c.shutdown(block=False, hub=True)

## Copy to /scratch

In [4]:
! cp rfam6* /scratch${PWD#/prj}

## SLURM script

In [2]:
%%writefile rfam6.srm
#!/bin/bash -l
#   -l option tells bash to read all the various "profile" scripts, from
#      /etc and from your home directory. Bash normally only does this
#      for interactive sessions

#SBATCH --job-name rfam6        # Job name
#SBATCH --partition cpu_small  # Select partition
#SBATCH --ntasks=1             # Total tasks(CPUs)
#SBATCH --time=00:10:00        # Limit execution time
#SBATCH --exclusive            # Exclusive access to nodes

echo '========================================'
echo '- Job ID:' $SLURM_JOB_ID
echo '- # of nodes in the job:' $SLURM_JOB_NUM_NODES
echo '- # of tasks:' $SLURM_NTASKS
echo '- Dir from which sbatch was invoked:' ${SLURM_SUBMIT_DIR##*/}
echo -n '- Nodes allocated to the job: '
nodeset -e "$SLURM_JOB_NODELIST"

# Set path: map the home directory under /prj to its mirror under /scratch
cd
SCR="/scratch${PWD#/prj}"
# Stop here if the working directory is missing; otherwise every later
# step would run from the wrong place.
cd "$SCR/rf" || exit 1
# path to a directory which IPython will use for user data
export IPYTHONDIR="$SCR/.ipython"

# Load Python environment and MPI module
source "$SCR/env2/etc/profile.d/conda.sh"
conda activate "$SCR/env2"
module load openmpi/gnu/4.0.1

echo -n '<1. starting ipython>        ' && date
# create a new ipython profile appended with the job id number
PROFILE="job_${SLURM_JOB_ID}"
ipython profile create "${PROFILE}" --parallel --quiet

echo -n '<2. starting ipcontroller>   ' && date
# run ipcontroller on one core (in the background)
ipcontroller --ip="*" --profile="${PROFILE}" --quiet &
# fixed pause before the engines are launched
sleep 10

echo -n '<3. starting srun ipengine>  ' && date
# run ipengine on each available core (in the background)
srun --mpi=pmi2 -n "$SLURM_NTASKS" \
    ipengine --location="$(hostname)" --profile="${PROFILE}" --quiet &
# fixed pause before the Python script connects to the cluster
sleep 25

# Executable
EXEC='rfam6.py'

# run the script
echo -n '<4. starting python script > ' && date
echo '-- output -----------------------------'
python "${EXEC}" --profile "${PROFILE}"
echo '-- end --------------------------------'
echo -n '<5. quit>                    ' && date

Overwriting rfam6.srm


## Check

In [8]:
! scancel 1357077

In [9]:
! sbatch --partition cpu_dev --ntasks=96 rfam6.srm

Submitted batch job 1357078


In [10]:
! squeue --name=rfam6 --partition=cpu_dev --format="%.20S %.8i %.9P %.5j %.2t %.5M %.5D %.4C"

          START_TIME    JOBID PARTITION  NAME ST  TIME NODES CPUS
 2021-09-20T19:50:51  1357078   cpu_dev rfam6  R  0:02     4   96


In [13]:
! squeue --name=rfam6 --partition=cpu_dev --format="%.20S %.8i %.9P %.5j %.2t %.5M %.5D %.4C"

          START_TIME    JOBID PARTITION  NAME ST  TIME NODES CPUS


In [14]:
! cat /scratch${PWD#/prj}/slurm-1357078.out

- Job ID: 1357078
- # of nodes in the job: 4
- # of tasks: 96
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1243 sdumont1244 sdumont1245 sdumont1246
<1. starting ipython>        Seg Set 20 19:50:52 -03 2021
<2. starting ipcontroller>   Seg Set 20 19:51:04 -03 2021
<3. starting srun ipengine>  Seg Set 20 19:51:14 -03 2021
<4. starting python script > Seg Set 20 19:51:39 -03 2021
-- output -----------------------------
Trainset classification error is 0.00%  of 66000 (kappa: 1.0000)
 Testset classification error is 0.00%  of 34000 (kappa: 0.9997)
T: 19.4812  |  N: 96
-- end --------------------------------
<5. quit>                    Seg Set 20 19:53:07 -03 2021


## Run

### 1 of (1, 4, 16, 24, 48, 72, 96)

In [15]:
%%bash
# Submit the 1-task configuration three times (three repetitions).
for rep in 1 2 3; do
    sbatch --ntasks=1 rfam6.srm
done

Submitted batch job 1357090
Submitted batch job 1357091
Submitted batch job 1357092


In [2]:
%%bash
# Print the SLURM logs of the three 1-task jobs, in submission order.
for job in 1357090 1357091 1357092; do
    cat /scratch${PWD#"/prj"}/slurm-${job}.out
done

- Job ID: 1357090
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1479
<1. starting ipython>        Ter Set 21 02:12:48 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:13:04 -03 2021
<3. starting srun ipengine>  Ter Set 21 02:13:14 -03 2021
<4. starting python script > Ter Set 21 02:13:39 -03 2021
-- output -----------------------------
Trainset classification error is 0.00%  of 66000 (kappa: 1.0000)
 Testset classification error is 0.00%  of 34000 (kappa: 0.9997)
T: 35.2194  |  N: 1
-- end --------------------------------
<5. quit>                    Ter Set 21 02:14:57 -03 2021
- Job ID: 1357091
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1480
<1. starting ipython>        Ter Set 21 02:12:48 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:13:04 -03 2021
<3. starting srun ipengine>  Ter Set 21 02:13:14 -03 2021
<4. starting python 

### 4 of (1, 4, 16, 24, 48, 72, 96)

In [16]:
%%bash
# Submit the 4-task configuration three times (three repetitions).
for rep in 1 2 3; do
    sbatch --ntasks=4 rfam6.srm
done

Submitted batch job 1357095
Submitted batch job 1357096
Submitted batch job 1357097


In [3]:
%%bash
# Print the SLURM logs of the three 4-task jobs, in submission order.
for job in 1357095 1357096 1357097; do
    cat /scratch${PWD#"/prj"}/slurm-${job}.out
done

- Job ID: 1357095
- # of nodes in the job: 1
- # of tasks: 4
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1471
<1. starting ipython>        Ter Set 21 02:16:07 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:16:13 -03 2021
<3. starting srun ipengine>  Ter Set 21 02:16:23 -03 2021
<4. starting python script > Ter Set 21 02:16:48 -03 2021
-- output -----------------------------
Trainset classification error is 0.00%  of 66000 (kappa: 1.0000)
 Testset classification error is 0.00%  of 34000 (kappa: 0.9997)
T: 16.4930  |  N: 4
-- end --------------------------------
<5. quit>                    Ter Set 21 02:17:13 -03 2021
- Job ID: 1357096
- # of nodes in the job: 1
- # of tasks: 4
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1472
<1. starting ipython>        Ter Set 21 02:16:07 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:16:13 -03 2021
<3. starting srun ipengine>  Ter Set 21 02:16:23 -03 2021
<4. starting python 

### 16 of (1, 4, 16, 24, 48, 72, 96)

In [17]:
%%bash
# Submit the 16-task configuration three times (three repetitions).
for rep in 1 2 3; do
    sbatch --ntasks=16 rfam6.srm
done

Submitted batch job 1357098
Submitted batch job 1357099
Submitted batch job 1357100


In [4]:
%%bash
# Print the SLURM logs of the three 16-task jobs, in submission order.
for job in 1357098 1357099 1357100; do
    cat /scratch${PWD#"/prj"}/slurm-${job}.out
done

- Job ID: 1357098
- # of nodes in the job: 1
- # of tasks: 16
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1479
<1. starting ipython>        Ter Set 21 02:16:07 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:16:13 -03 2021
<3. starting srun ipengine>  Ter Set 21 02:16:23 -03 2021
<4. starting python script > Ter Set 21 02:16:48 -03 2021
-- output -----------------------------
Trainset classification error is 0.00%  of 66000 (kappa: 1.0000)
 Testset classification error is 0.00%  of 34000 (kappa: 0.9997)
T: 13.4995  |  N: 16
-- end --------------------------------
<5. quit>                    Ter Set 21 02:17:10 -03 2021
- Job ID: 1357099
- # of nodes in the job: 1
- # of tasks: 16
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1480
<1. starting ipython>        Ter Set 21 02:16:07 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:16:13 -03 2021
<3. starting srun ipengine>  Ter Set 21 02:16:23 -03 2021
<4. starting pyth

### 24 of (1, 4, 16, 24, 48, 72, 96)

In [18]:
%%bash
# Submit the 24-task configuration three times (three repetitions).
for rep in 1 2 3; do
    sbatch --ntasks=24 rfam6.srm
done

Submitted batch job 1357101
Submitted batch job 1357102
Submitted batch job 1357103


In [5]:
%%bash
# Print the SLURM logs of the three 24-task jobs, in submission order.
for job in 1357101 1357102 1357103; do
    cat /scratch${PWD#"/prj"}/slurm-${job}.out
done

- Job ID: 1357101
- # of nodes in the job: 1
- # of tasks: 24
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1471
<1. starting ipython>        Ter Set 21 02:17:48 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:17:52 -03 2021
<3. starting srun ipengine>  Ter Set 21 02:18:02 -03 2021
<4. starting python script > Ter Set 21 02:18:27 -03 2021
-- output -----------------------------
Trainset classification error is 0.00%  of 66000 (kappa: 1.0000)
 Testset classification error is 0.00%  of 34000 (kappa: 0.9997)
T: 13.8960  |  N: 24
-- end --------------------------------
<5. quit>                    Ter Set 21 02:18:47 -03 2021
srun: Job step aborted: Waiting up to 302 seconds for job step to finish.
- Job ID: 1357102
- # of nodes in the job: 1
- # of tasks: 24
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1472
<1. starting ipython>        Ter Set 21 02:17:48 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:17:52 -03 2021
<

### 48 of (1, 4, 16, 24, 48, 72, 96)

In [19]:
%%bash
# Submit the 48-task configuration three times (three repetitions).
for rep in 1 2 3; do
    sbatch --ntasks=48 rfam6.srm
done

Submitted batch job 1357104
Submitted batch job 1357105
Submitted batch job 1357106


In [6]:
%%bash
# Print the SLURM logs of the three 48-task jobs, in submission order.
for job in 1357104 1357105 1357106; do
    cat /scratch${PWD#"/prj"}/slurm-${job}.out
done

- Job ID: 1357104
- # of nodes in the job: 2
- # of tasks: 48
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1479 sdumont1480
<1. starting ipython>        Ter Set 21 02:17:48 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:17:52 -03 2021
<3. starting srun ipengine>  Ter Set 21 02:18:02 -03 2021
<4. starting python script > Ter Set 21 02:18:27 -03 2021
-- output -----------------------------
Trainset classification error is 0.00%  of 66000 (kappa: 1.0000)
 Testset classification error is 0.01%  of 34000 (kappa: 0.9994)
T: 14.5762  |  N: 48
-- end --------------------------------
<5. quit>                    Ter Set 21 02:18:48 -03 2021
srun: Job step aborted: Waiting up to 302 seconds for job step to finish.
- Job ID: 1357105
- # of nodes in the job: 2
- # of tasks: 48
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1470 sdumont1471
<1. starting ipython>        Ter Set 21 02:19:03 -03 2021
<2. starting ipcontroller>   Ter Se

### 72 of (1, 4, 16, 24, 48, 72, 96)

In [20]:
%%bash
# Submit the 72-task configuration three times (three repetitions).
for rep in 1 2 3; do
    sbatch --ntasks=72 rfam6.srm
done

Submitted batch job 1357107
Submitted batch job 1357108
Submitted batch job 1357109


In [7]:
%%bash
# Print the SLURM logs of the three 72-task jobs, in submission order.
for job in 1357107 1357108 1357109; do
    cat /scratch${PWD#"/prj"}/slurm-${job}.out
done

- Job ID: 1357107
- # of nodes in the job: 3
- # of tasks: 72
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1472 sdumont1480 sdumont1503
<1. starting ipython>        Ter Set 21 02:19:06 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:19:09 -03 2021
<3. starting srun ipengine>  Ter Set 21 02:19:19 -03 2021
<4. starting python script > Ter Set 21 02:19:44 -03 2021
-- output -----------------------------
Trainset classification error is 0.00%  of 66000 (kappa: 1.0000)
 Testset classification error is 0.01%  of 34000 (kappa: 0.9991)
T: 16.1404  |  N: 72
-- end --------------------------------
<5. quit>                    Ter Set 21 02:20:08 -03 2021
srun: Job step aborted: Waiting up to 302 seconds for job step to finish.
- Job ID: 1357108
- # of nodes in the job: 3
- # of tasks: 72
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1470 sdumont1471 sdumont1472
<1. starting ipython>        Ter Set 21 02:20:18 -03 2021
<2. startin

### 96 of (1, 4, 16, 24, 48, 72, 96)

In [21]:
%%bash
# Submit the 96-task configuration three times (three repetitions).
for rep in 1 2 3; do
    sbatch --ntasks=96 rfam6.srm
done

Submitted batch job 1357110
Submitted batch job 1357111
Submitted batch job 1357112


In [8]:
%%bash
# Print the SLURM logs of the three 96-task jobs, in submission order.
for job in 1357110 1357111 1357112; do
    cat /scratch${PWD#"/prj"}/slurm-${job}.out
done

- Job ID: 1357110
- # of nodes in the job: 4
- # of tasks: 96
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1470 sdumont1471 sdumont1472 sdumont1503
<1. starting ipython>        Ter Set 21 02:23:08 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:23:16 -03 2021
<3. starting srun ipengine>  Ter Set 21 02:23:26 -03 2021
<4. starting python script > Ter Set 21 02:23:51 -03 2021
-- output -----------------------------
Trainset classification error is 0.00%  of 66000 (kappa: 1.0000)
 Testset classification error is 0.00%  of 34000 (kappa: 0.9997)
T: 15.4500  |  N: 96
-- end --------------------------------
<5. quit>                    Ter Set 21 02:24:22 -03 2021
- Job ID: 1357111
- # of nodes in the job: 4
- # of tasks: 96
- Dir from which sbatch was invoked: rf
- Nodes allocated to the job: sdumont1470 sdumont1471 sdumont1472 sdumont1503
<1. starting ipython>        Ter Set 21 02:24:54 -03 2021
<2. starting ipcontroller>   Ter Set 21 02:24:59 -03 2021
<3.

In [11]:
! squeue -u $(whoami) -h -r | wc -l

64


In [12]:
! squeue --partition=cpu_small -h -r | wc -l

279


In [22]:
! squeue -n rfam6 -o "%.18i  %.9P  %.2t %.5M %.5D %.4C"

             JOBID  PARTITION  ST  TIME NODES CPUS
           1357107  cpu_small  PD  0:00     3   72
           1357108  cpu_small  PD  0:00     3   72
           1357109  cpu_small  PD  0:00     3   72
           1357110  cpu_small  PD  0:00     4   96
           1357111  cpu_small  PD  0:00     4   96
           1357112  cpu_small  PD  0:00     4   96
           1357104  cpu_small  PD  0:00     2   48
           1357105  cpu_small  PD  0:00     2   48
           1357106  cpu_small  PD  0:00     2   48
           1357103  cpu_small  PD  0:00     1   24
           1357102  cpu_small  PD  0:00     1   24
           1357100  cpu_small  PD  0:00     1   16
           1357101  cpu_small  PD  0:00     1   24
           1357098  cpu_small  PD  0:00     1   16
           1357099  cpu_small  PD  0:00     1   16
           1357096  cpu_small  PD  0:00     1    4
           1357097  cpu_small  PD  0:00     1    4
           1357095  cpu_small  PD  0:00     1    4
           1357092  cpu_small  

In [1]:
! squeue -n rfam6 -o "%.18i  %.9P  %.2t %.5M %.5D %.4C"

             JOBID  PARTITION  ST  TIME NODES CPUS


In [17]:
! squeue -u $(whoami) -h -r | wc -l

64


In [20]:
! squeue --start --user=$(whoami) -o "%S  %.8i  %.9P  %.2t %.5M %.5D %.4C" --sort "i"

START_TIME     JOBID  PARTITION  ST  TIME NODES CPUS
N/A   1345073        cpu  PD  0:00    21   96
N/A   1345074        cpu  PD  0:00    21   96
N/A   1345075        cpu  PD  0:00    21   96
N/A   1345081        cpu  PD  0:00    21   72
N/A   1346988  cpu_small  PD  0:00     1   24
N/A   1346989  cpu_small  PD  0:00     1   24
N/A   1346990  cpu_small  PD  0:00     1   24
N/A   1346994   cpu_long  PD  0:00     3   72
N/A   1346995   cpu_long  PD  0:00     3   72
N/A   1346996   cpu_long  PD  0:00     3   72
N/A   1346997  cpu_small  PD  0:00     2   48
N/A   1346998  cpu_small  PD  0:00     2   48
N/A   1346999  cpu_small  PD  0:00     2   48
N/A   1347001  cpu_small  PD  0:00     1   16
N/A   1347002  cpu_small  PD  0:00     1   16
N/A   1347003  cpu_small  PD  0:00     1   16
N/A   1347004  cpu_small  PD  0:00     1    4
N/A   1347005  cpu_small  PD  0:00     1    4
N/A   1347006  cpu_small  PD  0:00     1    4
N/A   1347007  cpu_small  PD  0:00     1    1
N/A   1347008  cpu_small  P