# Towards Scalability

An integration test of several recent MERCS changes that are meant to make it scale to datasets with on the order of $10^3$ attributes.

# Prelims

In [82]:
import mercs
import numpy as np
from mercs.tests import load_iris, default_dataset
from mercs.core import Mercs

## Helpers

# Setup

In [83]:
# Load the iris train/test split plus the ids of its nominal columns.
train, test, nominal_ids = load_iris() 

In [84]:
# Display the nominal column ids returned by load_iris().
nominal_ids

{4}

In [86]:
# Display the (rows, columns) shape of the training data.
train.shape

(120, 5)

# Sandbox

## Fit

In [87]:
# Smoke test on iris: depth-2 trees, with the nominal columns passed explicitly.
clf = Mercs(max_depth=2)
clf.fit(train, nominal_attributes=nominal_ids)

In [97]:
# Switch to the synthetic dataset. This rebinds `train`, `test` and `clf`,
# replacing the iris objects created in the cells above.
train, test = default_dataset(n_features=2 * 10 ** 1)
clf = Mercs(
    max_depth=4,
    selection_algorithm="random",
    fraction_missing=0.6,
    nb_targets=3,
    nb_iterations=2,
    n_jobs=8,
    verbose=1,
    inference_algorithm="ndask",
    max_steps=8,
    prediction_algorithm="vit",
)

In [98]:
# Fit on the synthetic training data; no nominal attributes are passed here.
clf.fit(train)

        Training is being parallellized using Joblib. Number of jobs = 8
        
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  14 out of  14 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  14 out of  14 | elapsed:    0.0s finished


In [99]:
# Peek at the first five fitted component models.
clf.m_list[:5]

[DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=2433, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=2433, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=

In [101]:
# Target attribute ids of one component model (index 10).
clf.m_list[10].targ_ids

[2, 18, 20]

### Introspection

Let's examine if everything is as it should be.

Looks good to me.

## Predict

Now the more challenging part.

In [8]:
# Query code with one int8 entry per column of `test`, all zeros to start.
# NOTE(review): presumably 0 = descriptive, 1 = target, -1 = missing -- confirm
# against mercs.utils.code_to_query.
q_code = np.zeros(test.shape[1], dtype=np.int8)
# Last column is set to 1, then the first five columns to -1. The order matters
# when there are five or fewer columns: the -1 assignment then wins.
q_code[-1] = 1
q_code[0:5] = -1
q_code

array([-1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1],
      dtype=int8)

In [9]:
# Answer the query encoded by `q_code` on the test data.
# NOTE(review): the meaning of `beta=True` is not visible in this notebook -- confirm in Mercs.predict.
y_pred = clf.predict(test, q_code=q_code, beta=True)

# Graph-Tool

Towards scalable graph representation.

In [10]:
from graph_tool.all import *
from mercs.graph.q_diagram import v_name
from mercs.utils import code_to_query



In [63]:
def _add_vertices(g, n):
    """Add `n` vertices to `g` and return them as a list of vertex descriptors.

    `Graph.add_vertex(n)` returns a single vertex when n == 1 and an iterator
    only when n != 1, so its return value cannot be looped over uniformly.
    Looking the new vertices up by index works for every n, including 0 and 1.
    """
    if n <= 0:
        return []

    first = g.num_vertices()
    g.add_vertex(n)
    return [g.vertex(i) for i in range(first, first + n)]


def build_graph(m_codes, m_list):
    """Build the graph of data, model and imputation vertices for a set of models.

    Parameters
    ----------
    m_codes :
        Array-like with a two-element `.shape` of (n_models, n_attributes).
        Only the shape is used.
    m_list :
        Sequence of models, indexed by model id. Each model must expose
        `desc_ids` and `targ_ids` (iterables of attribute ids).

    Returns
    -------
    Graph
        A new graph holding, in this order of creation, `n_attributes` data
        vertices, `n_models` model vertices and `n_attributes` imputation
        vertices. For every model there is an edge from each of its `desc_ids`
        to the model vertex and from the model vertex to each of its
        `targ_ids`. Vertex names are stored in `g.vp.names`, and the mapping
        name -> vertex index in the plain attribute `g.v_map`.
    """
    n_models, n_attributes = m_codes.shape

    g = Graph()

    v_map = {}
    names = g.new_vertex_property("object")

    # Data vertices are created first on an empty graph, so the vertex index
    # of a data vertex equals its attribute id. The edge lists below rely on
    # that: desc_ids/targ_ids are used directly as vertex indices.
    v_atts = _add_vertices(g, n_attributes)
    v_mods = _add_vertices(g, n_models)
    v_imps = _add_vertices(g, n_attributes)

    def register(v, v_n):
        # Record the vertex under its name, both ways.
        v_map[v_n] = int(v)
        names[v] = v_n

    for v_idx, v in enumerate(v_atts):
        register(v, v_name(v_idx, kind="data"))

    for v_idx, v in enumerate(v_mods):
        register(v, v_name(v_idx, kind="model"))

        in_edges = ((d, v) for d in m_list[v_idx].desc_ids)
        out_edges = ((v, t) for t in m_list[v_idx].targ_ids)

        g.add_edge_list(in_edges)
        g.add_edge_list(out_edges)

    # Imputation vertices start out unconnected.
    for v_idx, v in enumerate(v_imps):
        register(v, v_name(v_idx, kind="imputation"))

    g.vp.names = names
    g.v_map = v_map
    return g

In [61]:
# Pull the pieces used below out of the fitted classifier.
m_list = clf.m_list
m_codes = clf.m_codes
# Rebinds `q_code`: from here on it is the classifier's stored query code,
# not the array built by hand in the Predict section.
q_code = clf.q_code
# First entry of the model selection; used below as the single layer to process.
m_layer = clf.m_sel[0]

In [75]:
# Build the full graph for the fitted models and display its summary.
g = build_graph(clf.m_codes, clf.m_list)
g

<Graph object, directed, with 404 vertices and 8123 edges at 0x7faab16f9f50>

In [None]:
def build_diagram(g, m_list, m_sel, q_code, prune=False):
    """Mark the edges of `g` that are active for a query and a model selection.

    Parameters
    ----------
    g :
        Graph as returned by `build_graph`; it must carry the `g.v_map`
        name -> vertex index mapping.
    m_list :
        Sequence of models, indexed by model id.
    m_sel :
        Either a single layer (flat sequence of model ids) or a sequence of
        layers (each a list or numpy array of model ids).
    q_code :
        Query code, handed to `code_to_query`.
    prune : bool, default False
        If True, `_prune(g)` is called before returning.

    Returns
    -------
    The same graph `g`. The filters computed here are attached to it as
    `g.ep.e_filter` and `g.vp.v_filter`, and the imputation edges collected
    over all layers as the plain attribute `g.imputation_edges` (they are not
    added to the graph).
    """
    # A flat selection is a single layer; wrap it so the loop below always
    # iterates over layers.
    if len(m_sel) > 0 and not isinstance(m_sel[0], (list, np.ndarray)):
        m_sel = [m_sel]

    # Init (graph properties)
    g_a_src = g.new_vertex_property("bool", False)  # available sources
    g_f_tgt = g.new_vertex_property("bool", False)  # forbidden targets

    # NOTE(review): v_filter is created and passed along, but no code in this
    # notebook ever sets an entry of it.
    v_filter = g.new_vertex_property("bool", False)
    e_filter = g.new_edge_property("bool", False)

    # Availability of attributes: the first set returned by code_to_query
    # seeds the available sources. No target is forbidden at the start, so
    # g_f_tgt stays all-False here.
    a_src, _a_tgt, _ = code_to_query(q_code, return_sets=True)
    for a in a_src:
        g_a_src[g.v_map[v_name(a, kind="data")]] = True

    imputation_edges = []
    for m_layer in m_sel:
        models = [(m_idx, m_list[m_idx]) for m_idx in m_layer]

        # The layer function returns five values; the last is the list of
        # imputation edges it collected for this layer.
        g_a_src, g_f_tgt, v_filter, e_filter, layer_edges = build_diagram_single_layer(
            models, g_a_src, g_f_tgt, v_filter, e_filter, g
        )
        imputation_edges.extend(layer_edges)

    # Keep the results reachable from the returned graph.
    g.vp.v_filter = v_filter
    g.ep.e_filter = e_filter
    g.imputation_edges = imputation_edges

    if prune:
        # NOTE(review): `_prune` is not defined in any cell of this notebook;
        # it has to be provided before prune=True can be used.
        _prune(g)

    return g

In [None]:
def build_diagram_single_layer(models, g_a_src, g_f_tgt, v_filter, e_filter, g, imputation_edges=None):
    """Activate the edges of one layer of models and update attribute availability.

    Parameters
    ----------
    models :
        Iterable of `(model_id, model)` pairs; only the model id is used.
    g_a_src :
        Vertex property map (bool): attributes currently available as source.
    g_f_tgt :
        Vertex property map (bool): attributes that may no longer be targeted.
    v_filter :
        Vertex property map (bool); passed through unchanged.
    e_filter :
        Edge property map (bool); set to True for every activated edge.
    g :
        Graph carrying the `g.v_map` name -> vertex index mapping.
    imputation_edges : list, optional
        List to append to. A new list is created when omitted.

    Returns
    -------
    tuple
        `(g_a_src, g_f_tgt, v_filter, e_filter, imputation_edges)`. Each
        imputation edge is `[imputation_vertex_index, model_vertex_index, True]`.
    """
    if imputation_edges is None:
        imputation_edges = []

    # Resolve every model vertex once; both passes below use the same vertices.
    model_vertices = []
    for m_idx, _ in models:
        v_idx = g.v_map[v_name(m_idx, kind="model")]
        model_vertices.append((v_idx, g.vertex(v_idx)))

    # Pass 1 - inputs. An input that is available activates its edge and is
    # marked as forbidden target; an unavailable one is recorded as an edge
    # from the matching imputation vertex.
    for v_idx, vertex in model_vertices:
        for e in vertex.in_edges():
            a = e.source()
            if g_a_src[a]:
                e_filter[e] = True
                g_f_tgt[a] = True
            else:
                i_idx = g.v_map[v_name(int(a), kind="imputation")]
                imputation_edges.append([i_idx, v_idx, True])

    # Pass 2 - outputs. This runs only after pass 1 is complete for all
    # models, so g_f_tgt already reflects every input used in this layer.
    for _, vertex in model_vertices:
        for e in vertex.out_edges():
            a = e.target()
            if not g_f_tgt[a]:
                e_filter[e] = True
                g_a_src[a] = True

    # The imputation edges are returned, not added to the graph:
    # g.add_edge_list(imputation_edges, eprops=[e_filter])

    return g_a_src, g_f_tgt, v_filter, e_filter, imputation_edges

In [77]:
# Boolean vertex maps, created with False as the initial value argument:
# attributes available as source / attributes forbidden as target.
g_a_src = g.new_vertex_property("bool", False)
g_f_tgt = g.new_vertex_property("bool", False)

# Boolean filters over vertices and edges, filled in by the cells below.
v_filter = g.new_vertex_property("bool", False)
e_filter = g.new_edge_property("bool", False)

In [78]:
# No target is forbidden before the first layer is processed.
f_tgt = set()
a_src, a_tgt, _ = code_to_query(q_code, return_sets=True)

# Translate attribute ids into vertex indices. The name -> index mapping
# lives on the graph (`g.v_map`, set by build_graph); there is no top-level
# `v_map` on a fresh kernel.
a_src = [g.v_map[v_name(a, kind="data")] for a in a_src]
f_tgt = [g.v_map[v_name(a, kind="data")] for a in f_tgt]

models = [(m_idx, m_list[m_idx]) for m_idx in m_layer]

for a in a_src:
    g_a_src[a] = True

# Forbidden targets go into `g_f_tgt`, the map created for that purpose.
for a in f_tgt:
    g_f_tgt[a] = True

In [68]:
def valid_src(a):
    """Return True when `a` is among the available sources `a_src`."""
    return a in a_src


def valid_tgt(a):
    """Return True when `a` is not among the forbidden targets `f_tgt`."""
    return a not in f_tgt

In [69]:
# Model ids of the layer being processed.
[model_id for model_id, _ in models]


[1, 12, 18, 27, 33, 65, 111, 112, 128, 166]

In [79]:
# Inline walk over one layer of models. The name -> index mapping is read
# from the graph (`g.v_map`), since no top-level `v_map` exists on a fresh kernel.
imputation_edges = []

# Pass 1 - inputs: activate edges from available attributes; for unavailable
# ones, record an edge from the matching imputation vertex instead.
for m_idx, _ in models:
    v_idx = g.v_map[v_name(m_idx, kind="model")]
    vertex = g.vertex(v_idx)

    for e in vertex.in_edges():
        a = e.source()
        if g_a_src[a]:
            e_filter[e] = True
            g_f_tgt[a] = True
        else:
            i_idx = g.v_map[v_name(int(a), kind="imputation")]
            print(i_idx)  # debug trace; this is what the recorded output below shows
            imputation_edges.append([i_idx, v_idx, True])

# Pass 2 - outputs: runs only after pass 1 has finished for every model, so
# `g_f_tgt` already reflects all inputs used in this layer.
for m_idx, _ in models:
    v_idx = g.v_map[v_name(m_idx, kind="model")]
    vertex = g.vertex(v_idx)

    for e in vertex.out_edges():
        a = e.target()

        if not g_f_tgt[a]:
            e_filter[e] = True
            g_a_src[a] = True

# Adds the imputation edges to the graph; the third entry of each row is
# written into `e_filter`. Re-running this cell adds the same edges again.
g.add_edge_list(imputation_edges, eprops=[e_filter])

403
303
304
306
307
303
403
306
307
303
306
403
305
305
304
305


In [80]:
# Collect (source name, target name) for every edge whose `e_filter` is True.
# Names are read from the graph's own `names` vertex property (set by
# build_graph); there is no top-level `names` on a fresh kernel.
l = [
    (g.vp.names[e.source()], g.vp.names[e.target()])
    for e in find_edge(g, e_filter, True)
]

In [81]:
# Display the active edges collected above.
l

[(('D', 5), ('M', 1)),
 (('D', 5), ('M', 112)),
 (('D', 5), ('M', 128)),
 (('D', 6), ('M', 18)),
 (('D', 6), ('M', 65)),
 (('D', 6), ('M', 111)),
 (('D', 7), ('M', 12)),
 (('D', 7), ('M', 27)),
 (('D', 7), ('M', 33)),
 (('D', 8), ('M', 1)),
 (('D', 8), ('M', 33)),
 (('D', 8), ('M', 112)),
 (('D', 8), ('M', 166)),
 (('D', 9), ('M', 1)),
 (('D', 9), ('M', 12)),
 (('D', 9), ('M', 18)),
 (('D', 9), ('M', 33)),
 (('D', 9), ('M', 111)),
 (('D', 9), ('M', 112)),
 (('D', 10), ('M', 18)),
 (('D', 10), ('M', 27)),
 (('D', 10), ('M', 33)),
 (('D', 10), ('M', 112)),
 (('D', 11), ('M', 27)),
 (('D', 11), ('M', 111)),
 (('D', 11), ('M', 112)),
 (('D', 11), ('M', 166)),
 (('D', 12), ('M', 1)),
 (('D', 12), ('M', 12)),
 (('D', 12), ('M', 18)),
 (('D', 12), ('M', 33)),
 (('D', 12), ('M', 65)),
 (('D', 12), ('M', 128)),
 (('D', 13), ('M', 1)),
 (('D', 13), ('M', 27)),
 (('D', 13), ('M', 33)),
 (('D', 13), ('M', 65)),
 (('D', 13), ('M', 128)),
 (('D', 14), ('M', 1)),
 (('D', 14), ('M', 27)),
 (('D', 14),

In [37]:
# NOTE(review): `vn` is not defined in any cell shown in this notebook;
# leftover scratch cell that fails on a fresh kernel.
vn

''

In [20]:
# NOTE(review): `t` is not defined in any cell shown in this notebook;
# leftover scratch cell that fails on a fresh kernel.
sorted(t)

[]