# Parallel Composition

## Initialization

In [1]:
import os, sys
import pandas as pd
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from os.path import dirname

  from numpy.core.umath_tests import inner1d


In [2]:
# Resolve the data directory relative to the parent of the working directory
note_dir = os.getcwd()
root_dir = dirname(note_dir)
data_dir = os.path.join(root_dir, 'resc', 'data', 'tidy', 'nltcs')

# Keep only files whose name contains 'F00' and does not contain 'bayesfusion'
rel_fnames = [
    os.path.join(data_dir, fname)
    for fname in os.listdir(data_dir)
    if 'F00' in fname and 'bayesfusion' not in fname
]

In [3]:
# NOTE(review): rel_fnames is built from os.listdir, whose order is arbitrary;
# confirm that index 0 really is the training file and index 1 the test file.
train_fn, test_fn = rel_fnames[0], rel_fnames[1]

In [4]:
# Read both CSV files; header=None because the first row is treated as data
df_train = pd.read_csv(train_fn, header=None)
df_test = pd.read_csv(test_fn, header=None)

# Plain NumPy views of the DataFrame contents
train = df_train.values
test = df_test.values

## Train some models

### Selection

In [5]:
# Number of rows and number of attribute columns of the training array
nb_rows, nb_atts = train.shape

In [6]:
# One row per model. The main diagonal marks attribute i as a target;
# the first superdiagonal adds attribute i+1 as a second target
# (the last row therefore keeps a single target).
m_codes = np.eye(nb_atts, dtype=np.int64) + np.eye(nb_atts, k=1, dtype=np.int64)

m_codes

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

### Induction

In [7]:
def learn_model(m_code, train, **kwargs):
    """
    Fit a random forest for a single model code.

    Parameters
    ----------
    m_code : array-like, 1-D
        One entry per column of `train`. Entries equal to 0 mark descriptive
        (input) columns, entries equal to 1 mark target columns.
    train : np.ndarray, 2-D
        Training data; columns are indexed by the positions in `m_code`.
    **kwargs
        Forwarded unchanged to `RandomForestClassifier`.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier, with two extra attributes attached:
        `desc_ids` (input column indices) and `targ_ids` (target column
        indices).
    """
    m_code = np.asarray(m_code)
    desc_ids = np.where(m_code == 0)[0]
    targ_ids = np.where(m_code == 1)[0]
    X, Y = train[:, desc_ids], train[:, targ_ids]

    clf = RandomForestClassifier(**kwargs)
    clf.desc_ids = desc_ids
    clf.targ_ids = targ_ids

    # Only the targets are flattened for the single-target case. X has to stay
    # 2-D (n_samples, n_features) even with a single descriptive column,
    # otherwise `fit` rejects it.
    if Y.shape[1] == 1:
        Y = Y.ravel()

    clf.fit(X, Y)
    return clf

In [8]:
# Train one depth-limited model per row of m_codes
m_list = [learn_model(code, train, max_depth=4) for code in m_codes]

## Combine Models

### Query

In [9]:
m_codes

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [10]:
# Query: attribute 1 is the target, attributes 6 through 15 are the inputs
q_targ = np.asarray([1])
q_desc = np.arange(start=6, stop=16)

Models 0 and 1 both have attribute 1 among their targets (see rows 0 and 1 of `m_codes`), so both can predict our query target.

In [11]:
# The first two trained models
rel_models = m_list[:2]

In [12]:
# Split the test array into the query's input columns and target column(s)
X_test = test[:, q_desc]
Y_true = test[:, q_targ]

### Parallel Composition

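A new, powerful way of combining models: several models that predict the same target are used side by side, and their class-probability estimates for that target are added together.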

In [13]:
class MonoModel(object):
    """
    Minimal model interface with placeholder `fit` and `predict` methods.

    Both methods accept their arguments and return None.
    """

    def fit(self, X, **kwargs):
        """Placeholder: accept training data `X` and return None."""
        return

    def predict(self, X, **kwargs):
        """Placeholder: accept input data `X` and return None."""
        return

In [14]:
class ParallelComposition(object):
    """
    Combine several fitted estimators by summing their class probabilities.

    Every estimator added through `add_estimator` must expose `desc_ids`,
    `targ_ids`, `classes_` and `predict_proba`. The composition keeps the
    sorted union of all descriptive and target ids, and the union of the
    class labels per target.

    Attributes
    ----------
    estimators_ : list
        The added estimators (`estimators` is kept as an alias of the same
        list).
    desc_ids, targ_ids : np.ndarray
        Sorted unique descriptive / target ids over all estimators.
    classes_ : list of np.ndarray
        One array of class labels per entry of `targ_ids`.
    n_classes_ : list of int
        Number of class labels per target.
    n_outputs_, n_features_ : int
        Number of targets / descriptive ids.
    """

    def __init__(self):

        self.estimators_ = []
        self.estimators = self.estimators_      # alias of the same list object

        # Integer dtype so that concatenating real ids does not yield floats
        self.desc_ids = np.array([], dtype=int)
        self.targ_ids = np.array([], dtype=int)

        self.classes_ = []

        self.n_classes_ = []
        self.n_outputs_ = 0
        self.n_features_ = 0

        return

    def fit(self, X, Y, **kwargs):
        """Placeholder; the composed estimators are expected to be fitted already."""
        return

    def predict_proba(self, X, **kwargs):
        """
        Sum the class probabilities of all estimators, per target.

        The sums are not renormalised. Returns a single (n_rows, n_classes)
        array when there is exactly one target, otherwise a list with one
        such array per target (empty list when there are no targets).
        """
        nb_rows, nb_atts = X.shape

        s_proba = [np.zeros((nb_rows, n)) for n in self.n_classes_]

        # NOTE(review): X is handed unchanged to every estimator; estimators
        # with differing desc_ids would each need their own column subset.
        for e in self.estimators_:
            e_proba = self._predict_proba_estimator_tidy(e, X, **kwargs)
            s_proba = self._add_proba_estimator(e, e_proba, s_proba)

        # redo sklearn convention from hell
        if len(s_proba) == 1:
            return s_proba[0]
        else:
            return s_proba

    def predict(self, X, **kwargs):
        """
        Predict, per target, the class with the highest summed probability.

        Returns a 1-D array of length n_rows for a single target, otherwise
        an (n_rows, n_outputs_) array. The result is a float array, so class
        labels are assumed to be numeric. Targets without any known class
        are left at 0.
        """
        nb_rows, nb_atts = X.shape

        s_proba = self.predict_proba(X, **kwargs)
        if isinstance(s_proba, np.ndarray):
            s_proba = [s_proba]

        s_pred = np.zeros((nb_rows, self.n_outputs_))

        for t_idx, t_proba in enumerate(s_proba):
            t_classes = np.asarray(self.classes_[t_idx])
            if t_classes.shape[0] == 0:
                continue
            s_pred[:, t_idx] = t_classes[np.argmax(t_proba, axis=1)]

        # redo sklearn convention from hell
        if s_pred.shape[1] == 1:
            return s_pred.ravel()
        else:
            return s_pred

    # Updates (i.e., recalculate)
    def _update_classes_(self):
        """Rebuild `classes_` from scratch over all estimators."""
        # Re-initialize (easier)
        self.classes_ = [np.array([]) for _ in range(len(self.targ_ids))]

        for e in self.estimators_:
            self._add_classes_estimator(e)
        return

    def _update_n_classes_(self):
        self.n_classes_ = [len(c) for c in self.classes_]
        return

    def _update_n_outputs_(self):
        self.n_outputs_ = len(self.targ_ids)
        return

    def _update_n_features_(self):
        self.n_features_ = len(self.desc_ids)
        return

    # Add (i.e., incremental update)
    def add_estimator(self, e):
        """Register estimator `e` and refresh all bookkeeping attributes."""
        self.estimators_.append(e)

        self._add_ids_estimator(e)

        self._update_classes_()

        self._update_n_classes_()
        self._update_n_outputs_()
        self._update_n_features_()

        return

    def _add_ids_estimator(self, e):
        """Merge the ids of `e` into the sorted unique id arrays."""
        self.desc_ids = np.unique(np.concatenate((self.desc_ids, e.desc_ids)))
        self.targ_ids = np.unique(np.concatenate((self.targ_ids, e.targ_ids)))
        return

    def _add_classes_estimator(self, e):
        """Merge the class labels of `e` into `classes_`, target by target."""
        e_classes_list = self._classes_estimator_tidy(e)
        idx_map = self._map_elements_idx(e.targ_ids, self.targ_ids)

        def combine(classes_1, classes_2):
            classes_1, classes_2 = np.asarray(classes_1), np.asarray(classes_2)
            # Skip an empty side so its float dtype does not leak into the labels
            if classes_2.size == 0:
                return np.unique(classes_1)
            if classes_1.size == 0:
                return np.unique(classes_2)
            return np.unique(np.concatenate((classes_1, classes_2)))

        for idx_e, idx_s in idx_map:                    # `s` stands for `self`
            e_classes_ = e_classes_list[idx_e]
            s_classes_ = self.classes_[idx_s]

            self.classes_[idx_s] = combine(e_classes_, s_classes_)

        return

    def _add_proba_estimator(self, e, e_proba, s_proba):
        """
        Add the probabilities `e_proba` of estimator `e` onto `s_proba`.

        `e_proba` and `s_proba` are lists with one (n_rows, n_classes) array
        per target of `e` / of `self`. Columns are aligned through the class
        labels. `s_proba` is updated in place and also returned.
        """
        e_classes_list = self._classes_estimator_tidy(e)
        t_idx_map = self._map_elements_idx(e.targ_ids, self.targ_ids)

        for t_idx_e, t_idx_s in t_idx_map:                  # `s` stands for `self`
            l_idx_map = self._map_elements_idx(e_classes_list[t_idx_e],
                                               self.classes_[t_idx_s])
            if not l_idx_map:
                # No shared class labels, nothing to add for this target
                continue
            l_idx_map = np.array(l_idx_map)

            l_idx_e, l_idx_s = l_idx_map[:, 0], l_idx_map[:, 1]

            s_proba[t_idx_s][:, l_idx_s] += e_proba[t_idx_e][:, l_idx_e]

        return s_proba

    # Estimator - utilities
    def _classes_estimator_tidy(self, e):
        """
        Ensure the class labels of `e` come as a list with one array per target.
        """
        e_classes_ = e.classes_

        # undo sklearn convention from hell (bare 1-D array for one target)
        if isinstance(e_classes_, np.ndarray) and e_classes_.ndim == 1:
            return [e_classes_]
        return e_classes_

    def _predict_estimator_tidy(self, e, X, **kwargs):
        """
        Ensure matrix.

        A 1-D prediction becomes a single column, (n_rows, 1), matching the
        (n_rows, n_outputs) layout used by `predict`.
        """
        e_pred = np.asarray(e.predict(X, **kwargs))

        # undo sklearn convention from hell
        if e_pred.ndim == 1:
            return e_pred.reshape(-1, 1)
        return np.atleast_2d(e_pred)

    def _predict_proba_estimator_tidy(self, e, X, **kwargs):
        """
        Ensure it is returned as a list.

        Raises TypeError when `e.predict_proba` returns neither an
        np.ndarray nor a list.
        """
        e_proba = e.predict_proba(X, **kwargs)

        # undo sklearn convention from hell
        if isinstance(e_proba, np.ndarray):
            return [e_proba]
        elif isinstance(e_proba, list):
            return e_proba
        else:
            # Literal braces are doubled so that .format only fills the last field
            msg = """
            e_proba has to be {{np.ndarray, list}},
            instead the type was:   {}
            """.format(type(e_proba))
            raise TypeError(msg)

    # Random utilities
    def _map_elements_idx(self, a1, a2):
        """
        Create a map that connects elements that occur in both arrays.

        The output is a tuple list, with a tuple being;
            (index of element e in a1, index of element e in a2)

        The pairing is done by value, so the arrays need not be ordered
        identically.

        N.b.:   Does not crash in case of double entries; a repeated element
                of a1 is paired with its first occurrence in a2. Still
                ambiguous, i.e., do not do this.
        """
        first_pos_a2 = {}
        for idx_a2, elem in enumerate(np.asarray(a2).ravel().tolist()):
            first_pos_a2.setdefault(elem, idx_a2)

        return [(idx_a1, first_pos_a2[elem])
                for idx_a1, elem in enumerate(np.asarray(a1).ravel().tolist())
                if elem in first_pos_a2]

    def filter_matrix(self, matrix, ids_1, ids_2):
        """
        Keep the parts of `matrix` whose id in `ids_1` also occurs in `ids_2`.

        For an np.ndarray the matching columns are returned; for a list the
        matching entries are returned. Raises TypeError for any other type.
        """
        idx_map = self._map_elements_idx(ids_1, ids_2)
        relevant_idx = np.array([idx_1 for idx_1, _ in idx_map], dtype=int)

        if isinstance(matrix, np.ndarray):
            # Case 1: Outcome of predict_proba of a single targets
            # Case 2: Outcome of predict
            return matrix[:, relevant_idx]
        elif isinstance(matrix, list):
            # Case 1: Outcome of predict_proba of a multiple targets
            keep = set(relevant_idx.tolist())
            return [c for idx, c in enumerate(matrix) if idx in keep]
        else:
            # Literal braces are doubled so that .format only fills the last field
            msg = """
            Matrix has to be {{np.ndarray, list}},
            instead the type was:   {}
            """.format(type(matrix))
            raise TypeError(msg)

In [15]:
a = np.array([1,2,3])
a

array([1, 2, 3])

In [16]:
a = np.atleast_2d(a)

In [17]:
a

array([[1, 2, 3]])

In [18]:
a.shape

(1, 3)

In [19]:
a[0]

array([1, 2, 3])

In [20]:
def consistent

SyntaxError: invalid syntax (<ipython-input-20-854c961f0974>, line 1)

In [None]:
def _map_elements_idx(self, a1, a2):
    """
    Create a map that connects elements that occur in both arrays.

    The output is a tuple list, with a tuple being;
        (index of element e in a1, index of element e in a2)

    The pairing is done by value, so the arrays need not be ordered
    identically. `self` is not used; it is only there so the function can be
    pasted into a class unchanged.

    N.b.:   Does not crash in case of double entries; a repeated element
            of a1 is paired with its first occurrence in a2. Still
            ambiguous, i.e., do not do this.
    """
    # First position of every distinct value of a2
    first_pos_a2 = {}
    for idx_a2, elem in enumerate(np.asarray(a2).ravel().tolist()):
        first_pos_a2.setdefault(elem, idx_a2)

    return [(idx_a1, first_pos_a2[elem])
            for idx_a1, elem in enumerate(np.asarray(a1).ravel().tolist())
            if elem in first_pos_a2]

In [None]:
_map_elements_idx(3, [1,2], [2])

In [None]:
def filter_output(f, in_ids, out_ids):
    """
    Unfinished helper: builds a sample index map, calls `f`, returns None.

    NOTE(review): `in_ids` and `out_ids` are accepted but not used yet, and
    the index map is computed from hard-coded sample arguments.
    """
    _map_elements_idx(3, [1, 2], [2])
    f()
    return None

In [None]:
def get_two():
    """Return the constant 2."""
    two = 2
    return two

In [None]:
def alter_function(f):
    """Call `f` without arguments and return its result increased by 4."""
    base = f()
    return base + 4

In [None]:
alter_function(get_two)

In [None]:
x[0] = np.array([1,2])

In [None]:
x = []

In [None]:
x.append(4)

In [None]:
x

In [None]:
class ParallelComposition(object):
    """
    Bookkeeping for a group of fitted estimators that are combined by
    summing their class probabilities.

    Every estimator must expose `desc_ids`, `targ_ids` and `classes_`.

    Parameters
    ----------
    estimators : list
        The estimators to combine. Must not be empty (np.concatenate raises
        ValueError on an empty list).
    targ_ids : array-like, optional
        If given, replaces the union of the estimators' target ids. It has
        to share at least one id with that union.

    Attributes
    ----------
    desc_ids, targ_ids
        Sorted unique descriptive ids; target ids (union or as passed in).
    classes_ : list of np.ndarray
        One array of class labels per entry of `targ_ids`.
    n_classes_ : list of int
    n_outputs_, n_features_ : int
    """

    def __init__(self, estimators, targ_ids=None):

        # Estimators
        self.estimators_ = estimators

        # Bookkeeping desc_ids/targ_ids
        self.desc_ids = np.unique(np.concatenate([e.desc_ids for e in estimators]))
        self.targ_ids = np.unique(np.concatenate([e.targ_ids for e in estimators]))

        if targ_ids is not None:
            assert np.intersect1d(self.targ_ids, targ_ids).shape[0] > 0, \
                "targ_ids shares no id with the targets of the estimators"
            self.targ_ids = targ_ids

        # Bookkeeping classes_
        self.classes_ = [np.array([]) for _ in range(len(self.targ_ids))]    # Init

        for e in self.estimators_:
            self._add_classes_estimator(e)

        # n_classes_/n_outputs_/n_features_
        self.n_classes_ = self.get_n_classes_()
        self.n_outputs_ = self.get_n_outputs_()
        self.n_features_ = self.get_n_features_()

        return

    def get_n_classes_(self):
        """Number of class labels per target."""
        return [len(c) for c in self.classes_]

    def get_n_outputs_(self):
        """Number of targets."""
        return len(self.targ_ids)

    def get_n_features_(self):
        """Number of descriptive ids."""
        return len(self.desc_ids)

    # Former (misspelled) name, kept so existing calls keep working
    gen_n_features_ = get_n_features_

    def _classes_estimator_tidy(self, e):
        """
        Ensure the class labels of `e` come as a list with one array per target.
        """
        e_classes_ = e.classes_

        # A bare 1-D array is the single-target form; wrap it
        if isinstance(e_classes_, np.ndarray) and e_classes_.ndim == 1:
            return [e_classes_]
        return e_classes_

    def _add_proba_estimator(self, e, e_proba, s_proba):
        """
        Add the probabilities `e_proba` of estimator `e` onto `s_proba`.

        `e_proba` and `s_proba` are lists with one (n_rows, n_classes) array
        per target of `e` / of `self`. Columns are aligned through the class
        labels. `s_proba` is updated in place and also returned.
        """
        e_classes_list = self._classes_estimator_tidy(e)
        t_idx_map = self._map_elements_idx(e.targ_ids, self.targ_ids)

        for t_idx_e, t_idx_s in t_idx_map:                  # `s` stands for `self`
            l_idx_map = self._map_elements_idx(e_classes_list[t_idx_e],
                                               self.classes_[t_idx_s])
            if not l_idx_map:
                # No shared class labels, nothing to add for this target
                continue
            l_idx_map = np.array(l_idx_map)

            l_idx_e, l_idx_s = l_idx_map[:, 0], l_idx_map[:, 1]

            # Estimator-side columns index e_proba, self-side columns index s_proba
            s_proba[t_idx_s][:, l_idx_s] += e_proba[t_idx_e][:, l_idx_e]

        return s_proba

    def _add_classes_estimator(self, e):
        """Merge the class labels of `e` into `classes_`, target by target."""
        e_classes_list = self._classes_estimator_tidy(e)
        idx_map = self._map_elements_idx(e.targ_ids, self.targ_ids)

        def combine(classes_1, classes_2):
            classes_1, classes_2 = np.asarray(classes_1), np.asarray(classes_2)
            # Skip an empty side so its float dtype does not leak into the labels
            if classes_2.size == 0:
                return np.unique(classes_1)
            if classes_1.size == 0:
                return np.unique(classes_2)
            return np.unique(np.concatenate((classes_1, classes_2)))

        for idx_e, idx_s in idx_map:                    # `s` stands for `self`
            e_classes_ = e_classes_list[idx_e]
            s_classes_ = self.classes_[idx_s]

            self.classes_[idx_s] = combine(e_classes_, s_classes_)

        return

    def _map_elements_idx(self, a1, a2):
        """
        Create a map that connects elements that occur in both arrays.

        The output is a tuple list, with a tuple being;
            (index of element e in a1, index of element e in a2)

        The pairing is done by value, so the arrays need not be ordered
        identically.

        N.b.:   Does not crash in case of double entries; a repeated element
                of a1 is paired with its first occurrence in a2. Still
                ambiguous, i.e., do not do this.
        """
        first_pos_a2 = {}
        for idx_a2, elem in enumerate(np.asarray(a2).ravel().tolist()):
            first_pos_a2.setdefault(elem, idx_a2)

        return [(idx_a1, first_pos_a2[elem])
                for idx_a1, elem in enumerate(np.asarray(a1).ravel().tolist())
                if elem in first_pos_a2]

### Demo

#### Toy Dataset

In [None]:
numeric_part = np.random.rand(40, 2)

In [None]:
# 40x3 array of zeros whose first and last columns receive the numeric data
template = np.zeros((40, 3))
template[:, 0::2] = numeric_part
template

In [None]:
numeric_part = 

In [None]:
# Two nominal columns: values in {0, 1} and values in {1, 2, 3}
nominal_part = np.zeros((40, 2))
nominal_part[:, 0] = np.random.randint(0, 2, size=40)
nominal_part[:, 1] = np.random.randint(1, 4, size=40)

In [None]:
# Toy dataset: numeric columns first, nominal columns last
dataset = np.zeros((40, 4))
dataset[:, :2], dataset[:, 2:] = numeric_part, nominal_part
dataset

In [None]:
pc = ParallelComposition(rel_models)

In [None]:
pc.n_classes_

In [None]:
tl = [(1,2),(3,4)]

In [None]:
np.array(tl)[:,0]

In [None]:
pc.targ_ids

In [None]:
pc.classes_

In [None]:
rf.n_classes_

In [None]:
import numpy as np
np.zeros(10)