# RANDOM FOREST CYTHON SERIAL

Conda environment preloaded for this notebook:

    source /scratch/xxx/rfcs/env2/etc/profile.d/conda.sh
    conda activate /scratch/xxx/rfcs/env2

In [7]:
# Load the Cython IPython extension.
# NOTE(review): the cells visible below build the module through setups.py
# rather than a %%cython cell -- confirm this extension is still needed.
%load_ext cython

In [9]:
%%writefile rfcs.pyx
#cython: boundscheck=False, wraparound=False, cdivision=True
#cython: initializedcheck=False, language_level=3, infer_types=True
def _load_arff_xy(path):
    """Read an ARFF file and split it into a feature frame and a label series.

    The byte flags ``b'N'`` / ``b'Y'`` are replaced by 0 / 1 across the whole
    frame, and the ``class`` column is decoded from bytes to ``str``.

    Args:
        path: Path of the ARFF file to load.

    Returns:
        ``(X, y)`` where ``X`` is the frame without the ``class`` column
        (missing values are left in place) and ``y`` is the ``class`` column.
    """
    import pandas as pd
    from scipy.io import arff

    frame = pd.DataFrame(arff.loadarff(path)[0])
    frame = frame.replace(b'N', 0)
    frame = frame.replace(b'Y', 1)
    # Keep the original value wherever .str.decode() yields a missing result.
    frame['class'] = frame['class'].str.decode('utf-8').fillna(frame['class'])
    return frame.drop(columns=['class']), frame['class']


def _error_and_kappa(y_true, y_pred):
    """Return (sample count, misclassification percentage, Cohen's kappa)."""
    from sklearn import metrics

    total = y_true.size
    # normalize=False gives the number of correctly classified samples.
    hits = metrics.accuracy_score(y_true, y_pred, normalize=False)
    error_pct = ((total - hits) / total) * 100
    return total, error_pct, metrics.cohen_kappa_score(y_true, y_pred)


def rfcsf(trainset, testset, n_estimators=100, random_state=None):
    """Train a random forest on one ARFF file and evaluate it on another.

    Missing feature values are replaced by per-column means. The imputer is
    fitted on the training features only and then applied to the test
    features, so both sets are filled with the same statistics and nothing
    computed from the test set influences preprocessing.

    Args:
        trainset: Path of the training ARFF file (must contain a ``class`` column).
        testset: Path of the test ARFF file (same columns as ``trainset``).
        n_estimators: Number of trees in the forest (default 100, as before).
        random_state: Seed forwarded to ``RandomForestClassifier``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int for
            repeatable results.

    Returns:
        Tuple ``(trtrsi, trperr, trkapp, tetrsi, teperr, tekapp)``: sample
        count, classification error in percent and Cohen's kappa, first for
        the training set and then for the test set.
    """
    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer
    from sklearn.ensemble import RandomForestClassifier

    X_train_raw, y_train = _load_arff_xy(trainset)
    X_test_raw, y_test = _load_arff_xy(testset)

    # Fit the column means on the training data only; reuse them for the test data.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_train = pd.DataFrame(imputer.fit_transform(X_train_raw),
                           columns=X_train_raw.columns,
                           index=X_train_raw.index)
    X_test = pd.DataFrame(imputer.transform(X_test_raw),
                          columns=X_test_raw.columns,
                          index=X_test_raw.index)

    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 random_state=random_state)
    clf.fit(X_train, y_train)

    trtrsi, trperr, trkapp = _error_and_kappa(y_train, clf.predict(X_train))
    tetrsi, teperr, tekapp = _error_and_kappa(y_test, clf.predict(X_test))

    return trtrsi, trperr, trkapp, tetrsi, teperr, tekapp

Writing rfcs.pyx


In [10]:
%%writefile setups.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
# The single extension module: rfcs.pyx compiled with -O3 appended to the
# compiler flags.
rfcs_extension = Extension(
    'rfcs',
    sources=['rfcs.pyx'],
    extra_compile_args=['-O3'],
)

# Cython's build_ext command translates the .pyx source before compiling it.
setup(
    name='rfcs',
    ext_modules=[rfcs_extension],
    cmdclass={'build_ext': build_ext},
)

Writing setups.py


In [11]:
%%bash
# Delete previously built rfcs shared objects before rebuilding.
rm -f rfcs*.so
# Cythonize and compile rfcs.pyx; --inplace writes the .so into this directory.
python setups.py build_ext --inplace

running build_ext
cythoning rfcs.pyx to rfcs.c
building 'rfcs' extension
creating build
creating build/temp.linux-x86_64-3.8
/scratch/ampemi/xxxx.xxxx/env2/bin/x86_64-conda_cos6-linux-gnu-cc -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -Wstrict-prototypes -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -pipe -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -pipe -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /scratch/ampemi/xxxx.xxxx/env2/include -I/scratch/app/openmpi/4.0_gnu/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /scratch/ampemi/xxxx.xxxx/env2/include -fPIC -I/scratch/ampemi/xxxx.xxxx/env2/include/python3.8 -c rfcs.c -o build/temp.linux-x86_64-3.8/rfcs.o -O3
x86_64-conda_cos6-linux-gnu-gcc -pthread -shared -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,-rpath,/scratch/ampemi/x

In [12]:
# Check that the build left an rfcs shared object in the working directory.
! ls rfcs*.so

rfcs.cpython-38-x86_64-linux-gnu.so


In [18]:
# Import the compiled extension and list what it exposes.
import rfcs
help(rfcs)

Help on module rfcs:

NAME
    rfcs

FUNCTIONS
    rfcsf(...)

DATA
    __test__ = {}

FILE
    /prj/ampemi/xxxx.xxxx/rf/rfcs.cpython-38-x86_64-linux-gnu.so




In [2]:
%%writefile rfcsc6.py
"""Benchmark driver: run rfcsf on the asteroid datasets and report error, kappa and elapsed time."""
from time import perf_counter

from rfcs import rfcsf

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time() follows the wall clock and can jump if the system clock
# is adjusted during the run.
t0 = perf_counter()
trainset = "datasets/asteroid-train-66k.arff"
testset  = "datasets/asteroid-test-34k.arff"
trtrsi, trperr, trkapp, tetrsi, teperr, tekapp = rfcsf(trainset, testset)
t1 = perf_counter() - t0  # elapsed seconds for loading, training and scoring

print(f'Trainset classification error is {trperr:.2f}% ',
      f'of {trtrsi} (kappa: {trkapp:.4f})')
print(f' Testset classification error is {teperr:.2f}% ',
      f'of {tetrsi} (kappa: {tekapp:.4f})')
print(f"T: {t1:.4f}")

Overwriting rfcsc6.py


In [4]:
# Copy every rfcs*.* file to the matching directory under /scratch;
# ${PWD#"/prj"} is the current path with a leading /prj removed.
! cp rfcs*.* /scratch${PWD#"/prj"}

In [13]:
%%writefile rfcs6.srm
#!/bin/bash
#SBATCH --job-name rfcs6        # Job name
#SBATCH --partition cpu_small  # Select partition
#SBATCH --ntasks=1             # Total tasks
#SBATCH --time=00:05:00        # Limit execution time
#SBATCH --exclusive            # Exclusive access to nodes

# Job summary banner, built from the SLURM_* variables of this job.
echo '========================================'
echo '- Job ID:' $SLURM_JOB_ID
# NOTE(review): this value prints empty in the captured job output, presumably
# because --ntasks-per-node is not requested above -- confirm.
echo '- Tasks per node:' $SLURM_NTASKS_PER_NODE
echo '- # of nodes in the job:' $SLURM_JOB_NUM_NODES
echo '- # of tasks:' $SLURM_NTASKS
# ${VAR##*/} keeps only the last path component.
echo '- Dir from which sbatch was invoked:' ${SLURM_SUBMIT_DIR##*/}
# NOTE(review): this directory change is superseded by the bare `cd` in the
# Environment section below.
cd $SLURM_SUBMIT_DIR
echo -n '- List of nodes allocated to the job: '
nodeset -e $SLURM_JOB_NODELIST

# Environment
# Go to the home directory, derive its /scratch counterpart by removing a
# leading /prj from the path, then activate the conda environment stored
# there and enter the rf working directory.
cd
dir=/scratch${PWD#"/prj"}
cd $dir
source $dir/env2/etc/profile.d/conda.sh
conda activate $dir/env2
cd rf

# Executable config
EXEC="python rfcsc6.py"

# Start
# Echo the command line for the log, then launch it through srun.
# NOTE(review): --mpi=pmi2 is passed although this script requests a single
# task -- confirm it is required here.
echo '$ srun --mpi=pmi2 -n' $SLURM_NTASKS ${EXEC##*/}
echo '-- output -----------------------------'
srun --mpi=pmi2 -n $SLURM_NTASKS $EXEC
echo '~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

Overwriting rfcs6.srm


<hr style="height:10px;border-width:0;background-color:green">

## Run

In [1]:
%%bash
# Submit the same job script three times, one submission per iteration.
for run in 1 2 3; do
    sbatch rfcs6.srm
done

Submitted batch job 1347031
Submitted batch job 1347032
Submitted batch job 1347033


In [2]:
%%bash
# Print the Slurm output file of each submitted job, in submission order.
for job_id in 1347031 1347032 1347033; do
    cat /scratch${PWD#"/prj"}/slurm-${job_id}.out
done

- Job ID: 1347031
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1477
$ srun --mpi=pmi2 -n 1 python rfcsc6.py
-- output -----------------------------
Trainset classification error is 0.00%  of 66000 (kappa: 1.0000)
 Testset classification error is 0.00%  of 34000 (kappa: 0.9997)
T: 29.3792
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1347032
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: rf
- List of nodes allocated to the job: sdumont1437
$ srun --mpi=pmi2 -n 1 python rfcsc6.py
-- output -----------------------------
Trainset classification error is 0.00%  of 66000 (kappa: 1.0000)
 Testset classification error is 0.01%  of 34000 (kappa: 0.9991)
T: 30.5746
~~ end ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Job ID: 1347033
- Tasks per node:
- # of nodes in the job: 1
- # of tasks: 1
- Dir from which sbatch was invoked: rf
- List of nodes alloc

In [16]:
# Count the current user's pending/running jobs
# (-h: no header line, -r: one line per job array element).
! squeue -u $(whoami) -h -t pending,running -r | wc -l

96


In [17]:
# Count all pending/running jobs in the cpu_small partition
# (-h: no header line, -r: one line per job array element).
! squeue --partition=cpu_small -h -t pending,running -r | wc -l

449


<hr style="height:10px;border-width:0;background-color:red">

## Version

In [1]:
# Version of the python executable found on PATH.
! python --version

Python 3.8.5


In [2]:
# Version of the cython executable found on PATH.
! cython --version

Cython version 0.29.21
