# Parallel Composition

## Initialization

In [1]:
import os, sys
import pandas as pd
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from os.path import dirname

  from numpy.core.umath_tests import inner1d


In [2]:
# The data lives under <parent of the working directory>/resc/data/tidy/nltcs.
note_dir = os.getcwd()
root_dir = dirname(note_dir)
data_dir = os.path.join(root_dir, 'resc', 'data', 'tidy', 'nltcs')

# Keep only the files whose name contains 'F00', skipping the 'bayesfusion' ones.
rel_fnames = [
    os.path.join(data_dir, fname)
    for fname in os.listdir(data_dir)
    if 'F00' in fname and 'bayesfusion' not in fname
]

In [3]:
# NOTE(review): rel_fnames comes from os.listdir, whose ordering is not
# guaranteed -- confirm that entry 0 really is the train file and entry 1 the test file.
train_fn = rel_fnames[0]
test_fn = rel_fnames[1]

In [4]:
# Read both splits as DataFrames; header=None because the first row is read as data.
df_train = pd.read_csv(train_fn, header=None)
df_test = pd.read_csv(test_fn, header=None)

# Plain numpy arrays of the same data
train = df_train.values
test = df_test.values

## Train some models

### Selection

In [5]:
nb_rows, nb_atts = train.shape

In [6]:
# One row per model, one column per attribute; a 1 marks a target attribute.
# The main diagonal gives model i the target i, the first super-diagonal adds
# target i+1 (the last row therefore keeps a single target).
first_target = np.eye(nb_atts, dtype=np.int64)
second_target = np.eye(nb_atts, k=1, dtype=np.int64)
m_codes = first_target + second_target

m_codes

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

### Induction

In [7]:
def learn_model(m_code, train, **kwargs):
    """Fit a random forest on the attribute split encoded by ``m_code``.

    Parameters
    ----------
    m_code : 1-D array-like of 0/1 flags, one per column of ``train``
        Columns flagged 0 are used as descriptive (input) attributes,
        columns flagged 1 as target attributes.
    train : 2-D numpy array
        Training data; rows are samples, columns are attributes.
    **kwargs
        Forwarded unchanged to ``RandomForestClassifier``.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier, with two extra attributes ``desc_ids`` and
        ``targ_ids`` holding the column indices it was trained on.
    """
    m_code = np.asarray(m_code)
    desc_ids, targ_ids = np.where(m_code == 0)[0], np.where(m_code == 1)[0]
    X, Y = train[:, desc_ids], train[:, targ_ids]

    # X is deliberately left 2-D even when it has a single column: the
    # estimator's fit expects X of shape (n_samples, n_features), so
    # flattening it would make the fit fail.
    # A single target column, on the other hand, is passed as a 1-D vector.
    if Y.shape[1] == 1:
        Y = Y.ravel()

    clf = RandomForestClassifier(**kwargs)
    # Remember which columns this model consumes and predicts.
    clf.desc_ids = desc_ids
    clf.targ_ids = targ_ids

    clf.fit(X, Y)
    return clf

In [8]:
# Train one depth-limited model per row of m_codes.
m_list = [learn_model(code, train, max_depth=4) for code in m_codes]

## Combine Models

### Query

In [9]:
m_codes

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [10]:
q_targ = np.array([1])
q_desc = np.arange(6,16)

So models 0 and 1 can both predict our target (attribute 1), since both of their codes mark column 1 as a target.

In [11]:
rel_models = m_list[0:2]

In [13]:
# Slice the query's descriptive and target columns out of the test set.
# q_targ is an index array, so Y_true keeps a column axis.
X_test = test[:, q_desc]
Y_true = test[:, q_targ]

### Parallel Composition

A new, powerful way of combining models.

In [64]:
class ParallelComposition(object):
    """Bookkeeping wrapper that combines several fitted estimators.

    Every estimator must expose three attributes:

    * ``desc_ids`` -- indices of the attributes it uses as input,
    * ``targ_ids`` -- indices of the attributes it predicts,
    * ``classes_`` -- the class labels per target (a list with one array per
      target, or a single flat array when there is only one target).

    Parameters
    ----------
    estimators : sequence of fitted estimators
        Must be non-empty.
    targ_ids : array-like of int, optional
        Restrict the composition to these targets. At least one of them must
        be predicted by some estimator. Defaults to the union of the
        estimators' targets.

    Attributes
    ----------
    estimators_ : the estimators as passed in (also available as ``estimators``).
    desc_ids : sorted unique union of the estimators' ``desc_ids``.
    targ_ids : the targets of the composition.
    classes_ : list with, per entry of ``targ_ids``, the sorted unique class
        labels known to any estimator for that target (an empty list when no
        estimator predicts it).
    """

    def __init__(self, estimators, targ_ids=None):

        self.estimators_ = estimators

        # Bookkeeping desc_ids/targ_ids
        self.desc_ids = np.unique(np.concatenate([e.desc_ids for e in estimators]))
        self.targ_ids = np.unique(np.concatenate([e.targ_ids for e in estimators]))

        if targ_ids is not None:
            assert np.intersect1d(self.targ_ids, targ_ids).shape[0] > 0, \
                "None of the requested targ_ids is predicted by the given estimators."
            self.targ_ids = np.asarray(targ_ids)

        # Bookkeeping classes_ - init
        # Sized on self.targ_ids (the argument may be None); a comprehension
        # gives every target its own list instead of n aliases of one list.
        self.classes_ = [[] for _ in self.targ_ids]

        # Bookkeeping classes_ - doing it
        for e in self.estimators_:
            self._add_classes_e(e)

    @property
    def estimators(self):
        """Alias of ``estimators_``."""
        return self.estimators_

    def _add_classes_e(self, e):
        """Merge the class labels of estimator ``e`` into ``self.classes_``."""
        e_classes_all = e.classes_

        # scikit-learn documents classes_ as a flat array for single-output
        # estimators and as a list of arrays otherwise; normalise to the
        # list form so it can be indexed per target.
        if len(e_classes_all) > 0 and np.ndim(e_classes_all[0]) == 0:
            e_classes_all = [e_classes_all]

        idx_map = self._map_elements_idx(e.targ_ids, self.targ_ids)

        for idx_e, idx_s in idx_map:                    # `s` stands for `self`
            e_classes_ = np.asarray(e_classes_all[idx_e])
            s_classes_ = self.classes_[idx_s]

            if len(s_classes_) == 0:
                # Nothing recorded yet: skip the concatenation with an empty
                # list, which would silently turn integer labels into floats.
                merged = e_classes_
            else:
                merged = np.concatenate((s_classes_, e_classes_))

            self.classes_[idx_s] = np.unique(merged)

    @staticmethod
    def _map_elements_idx(a1, a2):
        """
        Create a map that connects elements that occur in both arrays.

        The output is a tuple list, with a tuple being;
            (index of element e in a1, index of element e in a2)

        Pairs are listed in the order of `a1` and are matched by value, so the
        result is also correct when the common elements appear in a different
        order in the two arrays.

        N.b.:   If an element occurs more than once in `a2`, its first
                occurrence is used. Duplicates in `a1` each get their own tuple.
        """
        first_pos_a2 = {}
        for idx_a2, elem in enumerate(a2):
            first_pos_a2.setdefault(elem, idx_a2)

        return [(idx_a1, first_pos_a2[elem])
                for idx_a1, elem in enumerate(a1)
                if elem in first_pos_a2]

In [18]:
comp = ParallelComposition(rel_models)

In [23]:
comp.estimators[0].classes_

[array([0, 1]), array([0, 1])]

In [35]:
a1 = np.array([1,1,3])
np.where(a1==1)[0][0]

0

In [52]:
m = np.array([4,6,6])
c = np.array([2,3,4,5,5,6])

In [78]:
x = [1,2,3]

In [81]:
y = np.array(["numeric"])

In [88]:
z = []

In [93]:
b = [[]] * 4
b

[[], [], [], []]

In [94]:
b[1] = [1]

In [95]:
b

[[], [1], [], []]

In [87]:
np.concatenate((z,y))

array(['numeric'], dtype='<U32')

In [53]:
map_elements_idx(m,c)

[(0, 2), (1, 5)]

In [58]:
m.extend(2)

AttributeError: 'numpy.ndarray' object has no attribute 'extend'