# Towards Scalability

Integration test of some of the more recent innovations in MERCS, which are meant to make it scale to datasets with on the order of $10^3$ attributes.

# Prelims

In [1]:
import mercs
import numpy as np
from mercs.tests import load_iris, default_dataset
from mercs.core import Mercs

# Setup

# Sandbox

## Fit

In [2]:
# Load the default train/test pair, requesting 100 features.
# (The query code built later has 101 entries, so the arrays presumably carry
# one extra column on top of the requested features - TODO confirm.)
N_FEATURES = 100
train, test = default_dataset(n_features=N_FEATURES)

# All model hyper-parameters collected in one mapping so they can be inspected
# or tweaked without touching the constructor call.
mercs_params = dict(
    max_depth=8,
    selection_algorithm="random",
    fraction_missing=0.3,
    nb_targets=5,
    nb_iterations=40,
    n_jobs=8,
    verbose=1,
    inference_algorithm="ndask",
    max_steps=8,
    prediction_algorithm="vit",
    random_state=800,  # fixed seed
)

clf = Mercs(**mercs_params)

In [3]:
# The last column of the training array is the only attribute declared nominal.
last_column = train.shape[1] - 1
clf.fit(train, nominal_attributes={last_column})

        Training is being parallellized using Joblib. Number of jobs = 8
        
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 569 tasks      | elapsed:    5.1s
[Parallel(n_jobs=8)]: Done 825 out of 840 | elapsed:    6.5s remaining:    0.1s
[Parallel(n_jobs=8)]: Done 840 out of 840 | elapsed:    6.6s finished


## Predict

Now the more challenging part.

In [16]:
# Build the query code: one int8 entry per column of `test`.
#   1  -> the last two columns (presumably the query targets - TODO confirm)
#  -1  -> the leading `percentage_missing` share of columns (marked missing)
#   0  -> every other column
n_attributes = test.shape[1]
percentage_missing = 0.2
n_missing = int(n_attributes * percentage_missing)

q_code = np.zeros(n_attributes, dtype=np.int8)
q_code[-2:] = 1
# Applied after the 1-entries, so it wins wherever the two ranges overlap.
q_code[:n_missing] = -1
q_code

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1],
      dtype=int8)

In [17]:
# Answer the query on the test set, overriding the prediction algorithm and
# step budget that were passed to the constructor ("vit", 8) for this call only.
y_pred = clf.predict(
    test,
    q_code=q_code,
    beta=True,
    prediction_algorithm="vmi",
    max_steps=1,
)

In [18]:
# Show the predictions: one row per test instance and, in the captured output,
# two columns (matching the two entries set to 1 in q_code).
# NOTE(review): this prints the whole array; a slice such as y_pred[:5] would
# keep the saved output short.
y_pred

array([[ 0.78222369,  1.        ],
       [-0.89673495,  0.        ],
       [-0.06850258,  0.        ],
       [-0.05679976,  1.        ],
       [-0.22353182,  0.        ],
       [-0.76430179,  1.        ],
       [-1.10348502,  0.        ],
       [-0.28936421,  1.        ],
       [-1.15360028,  1.        ],
       [ 0.03813297,  1.        ],
       [ 0.59157194,  1.        ],
       [ 0.29846669,  1.        ],
       [ 0.59923386,  0.        ],
       [ 0.09757691,  1.        ],
       [-0.59856471,  1.        ],
       [-0.79036176,  1.        ],
       [-1.00834355,  1.        ],
       [-1.04441468,  1.        ],
       [-0.61165098,  1.        ],
       [-0.24555098,  1.        ],
       [-0.06088563,  0.        ],
       [-1.40852268,  1.        ],
       [-1.0692894 ,  1.        ],
       [ 0.37180343,  0.        ],
       [ 1.08615256,  0.        ],
       [-0.03411427,  1.        ],
       [-1.75818665,  0.        ],
       [-0.42872755,  1.        ],
       [ 0.36558583,

In [8]:
# Last-column entries of the rows of clf.m_codes picked out by clf.m_sel.
# Presumably this shows the role of the final attribute in each selected
# model - TODO confirm against the mercs encoding.
clf.m_codes[clf.m_sel, -1]

array([ 0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,
        0,  0, -1,  0,  0, -1], dtype=int8)

In [19]:
# Same query as before, now with the "vit" prediction algorithm and up to
# three steps. Overwrites the y_pred produced by the earlier "vmi" call.
y_pred = clf.predict(test, q_code=q_code, beta=True, prediction_algorithm="vit",  max_steps=3,) 

# Disabled scoring sketch, kept for reference.
# NOTE(review): with this q_code the earlier y_pred output has two columns, so
# the matching column would have to be selected before calling f1_score.
#y_true = test[:, -1]
#from sklearn.metrics import f1_score
#f1_score(y_true, y_pred)

In [None]:
# Inspect clf.m_sel, which is used elsewhere in this notebook as a row index
# into clf.m_codes (presumably the models selected for the last query).
clf.m_sel

In [None]:
# NOTE(review): repeats the clf.m_sel inspection from the previous cell;
# candidate for removal once the notebook is cleaned up.
clf.m_sel

In [None]:
# Look up the query-diagram node keyed ('D', 100) and materialise its
# "dask_proba" entry with .compute().
# NOTE(review): `.node[...]` looks like the networkx node-attribute accessor,
# which was removed in networkx 2.4 in favour of `.nodes[...]` - confirm the
# type of clf.q_diagram and the pinned networkx version.
clf.q_diagram.node[('D', 100)]["dask_proba"].compute()

In [None]:
# Display everything stored on the query-diagram node keyed ('D', 97).
# NOTE(review): `.node[...]` may need to become `.nodes[...]` if q_diagram is
# a networkx graph on networkx >= 2.4 - confirm.
clf.q_diagram.node[('D', 97)]

In [None]:
# Materialise the "dask" entry stored on node ('D', 97) with .compute().
# NOTE(review): `.node[...]` may need to become `.nodes[...]` if q_diagram is
# a networkx graph on networkx >= 2.4 - confirm.
clf.q_diagram.node[('D', 97)]["dask"].compute()

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Post-mortem debugging of the most recent exception.
# NOTE(review): leftover interactive debugging; remove before sharing, since it
# presumably interrupts a non-interactive "Restart & Run All".
%debug

In [None]:
# NOTE(review): another repeat of the clf.m_sel inspection made earlier in the
# notebook; candidate for removal.
clf.m_sel

# Graph-Tool

Towards a scalable graph representation. **ONE MORE TIME WITH FEELING**

In [None]:
from mercs.graph.gt import build_graph, build_diagram
from mercs.algo.turbo_inference import inference_algorithm

from mercs.utils.encoding import code_to_query

In [None]:
# Pull the fitted model's internals into notebook-level names.
# Note: this rebinds `q_code` to the code stored on the classifier, replacing
# the array that was built by hand for the prediction calls.
m_list, m_codes, q_code, m_sel = (
    clf.m_list,
    clf.m_codes,
    clf.q_code,
    clf.m_sel,
)

# Only the middle element of the decoded query is kept (named q_targ here).
_, q_targ, _ = code_to_query(clf.q_code)

In [None]:
# Build the graph from the model codes and model list, keeping it both as a
# notebook variable and on the classifier.
g = clf.g = build_graph(clf.m_codes, clf.m_list)

In [None]:
# Keys of the graph's vertex map that are of kind 'D' and have an index above 2000.
data_nodes = [
    (kind, index)
    for kind, index in clf.g.v_map
    if kind == 'D' and index > 2000
]

In [None]:
# Show the ('D', idx) keys collected from the vertex map (idx > 2000 only).
data_nodes

In [None]:
%%prun
# Profile the construction of the query diagram from the graph, the model
# list, the selected models and the classifier's stored query code, with
# prune=True. The result is stored on the classifier.
clf.q_diagram = build_diagram(clf.g, clf.m_list, clf.m_sel, clf.q_code, prune=True)

In [None]:
%%prun
# Profile inference over the query diagram for the test set; the nominal
# attributes are read from the classifier's metadata. The result is stored on
# the classifier as clf.dask.
clf.dask = inference_algorithm(clf.q_diagram, clf.m_list, clf.i_list, test, clf.metadata.get('nominal_attributes'))

# Map the first query target (q_targ comes from the code_to_query cell) to its
# vertex index and materialise that entry of the inference result.
# NOTE(review): under %%prun the value of this last expression is presumably
# not displayed - confirm, and assign it to a name if the result is needed.
v_idx = clf.g.v_map[('D', q_targ[0])]
clf.dask[v_idx].compute()