# Test on Large StarAI DS

Sanity check to assess performance and to make sure I have not made any fatal mistakes.

In [1]:
import mercs
import numpy as np

import os
import pandas as pd

from mercs.core import Mercs
from os.path import dirname

# Setup

In [2]:
# The project root is taken to be two directory levels above the current
# working directory; the input files live under <root>/data/step-01.
root = dirname(dirname(os.getcwd()))
data = os.path.join(root, 'data', 'step-01')

# Dataset identifier and the train/test CSV file names derived from it.
ds = 'ad'
fns = [pattern.format(ds) for pattern in ('{}-train.csv', '{}-test.csv')]

# Read both header-less CSV files (train first, then test) and keep only the
# underlying arrays.
train, test = (
    pd.read_csv(os.path.join(data, fn), header=None).values for fn in fns
)

In [3]:

# Build the MERCS model. Keyword arguments are listed alphabetically.
# NOTE(review): random_state is fixed, presumably to make runs repeatable —
# confirm against the Mercs documentation.
clf = Mercs(
    fraction_missing=0.3,
    inference_algorithm="dask",
    max_depth=16,
    max_steps=8,
    n_jobs=8,
    nb_iterations=4,
    nb_targets=4,
    prediction_algorithm="it",
    random_state=800,
    selection_algorithm="random",
    verbose=1,
)

In [4]:
# Every column index of the training array (0 .. n_columns - 1) is declared
# as a nominal attribute.
nominal_attributes = {*range(train.shape[1])}

In [5]:
# Train the model on the training array, passing the set of nominal columns.
clf.fit(
    train,
    nominal_attributes=nominal_attributes,
)

        Training is being parallellized using Joblib. Number of jobs = 8
        
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    7.7s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   13.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   21.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:   32.1s
[Parallel(n_jobs=8)]: Done 1556 out of 1556 | elapsed:   39.3s finished


In [6]:
from joblib import dump, load
import blosc
import dill as pkl

In [7]:
# Serialize the fitted model to disk with dill (pickle protocol 4).
with open('dill.pkl', mode='wb') as out_file:
    pkl.dump(clf, out_file, protocol=4)

In [12]:
# Persist the model with joblib, using LZ4 compression.
dump(
    clf,
    'model.lz4',
    compress='lz4',
)

30 s ± 540 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Replace the model's m_codes / m_score / m_fimps attributes with their
# blosc-packed byte strings.
#
# blosc.pack_array only accepts NumPy arrays, so re-running this cell on an
# already packed model used to fail. Attributes that are already bytes are
# skipped, which makes the cell safe to execute more than once.
#
# NOTE(review): after this cell the three attributes hold bytes rather than
# arrays — presumably they must be restored with blosc.unpack_array before the
# model can be used for prediction again; confirm.
for _attr in ('m_codes', 'm_score', 'm_fimps'):
    _value = getattr(clf, _attr)
    if not isinstance(_value, bytes):
        setattr(clf, _attr, blosc.pack_array(_value))

In [None]:

%timeit clf = load('model.lz4') 

In [11]:
# Serialize the current state of the model with dill (pickle protocol 4).
# NOTE(review): this writes to the same 'dill.pkl' path as the earlier dill
# dump cell, so that file is overwritten — confirm this is intended.
with open('dill.pkl', mode='wb') as out_file:
    pkl.dump(clf, out_file, protocol=4)