# Parallel Composition Demo

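This notebook builds a few small test cases for `morpheus.ParallelComposition`: first composing identical estimators, then composing estimators that predict different targets.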

## Prelims

In [1]:
import os, sys
import pandas as pd
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from os.path import dirname

  from numpy.core.umath_tests import inner1d


In [2]:
# Resolve the project directories relative to the notebook's working
# directory; the repository root is taken to be its parent directory.
note_dir = os.getcwd()
root_dir = dirname(note_dir)
data_dir = os.path.join(root_dir, 'resc', 'data', 'tidy', 'nltcs')
src_dir = os.path.join(root_dir, "src")

# Make the project sources importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Seed NumPy's global random state so that the randomly generated column and
# any estimator relying on the global state give the same results on a fresh
# "Restart & Run All".
SEED = 42
np.random.seed(SEED)

In [3]:
import morpheus

In [4]:
# Filter relevant stuff out
# Collect the files whose name contains "F00", skipping "bayesfusion" ones
rel_fnames = [
    os.path.join(data_dir, fname)
    for fname in os.listdir(data_dir)
    if 'F00' in fname and 'bayesfusion' not in fname
]

# NOTE(review): os.listdir gives no ordering guarantee, so which file becomes
# "train" and which "test" depends on the filesystem -- confirm the intended files.
train_fn = rel_fnames[0]
test_fn = rel_fnames[1]

# Read both CSV files (they have no header row) into DataFrames
df_train = pd.read_csv(train_fn, header=None)
df_test = pd.read_csv(test_fn, header=None)

# Plain numpy arrays, used by the rest of the notebook
train = df_train.values
test = df_test.values

## Creating Dataset

In [5]:
def add_extra_column(a, seed=None):
    """
    Append a sorted, randomly generated integer column to a 2D array.

    The new column holds integers drawn uniformly from {1, 2, 3} (the upper
    bound 4 passed to ``randint`` is exclusive) and is sorted in ascending
    order, so equal values form contiguous runs.

    Parameters
    ----------
    a : np.ndarray
        Array of shape (n_rows, n_atts).
    seed : int or None, optional
        If given, the column is drawn from a private ``RandomState(seed)``,
        which makes the result reproducible and leaves NumPy's global random
        state untouched. If None (default), the global random state is used.

    Returns
    -------
    np.ndarray
        New array of shape (n_rows, n_atts + 1); ``a`` itself is not modified.
    """
    assert isinstance(a, np.ndarray), "a must be a numpy.ndarray"
    n_rows, _ = a.shape  # also enforces that `a` is 2-dimensional

    # np.random (module) and RandomState both expose the same `randint` API
    rng = np.random if seed is None else np.random.RandomState(seed)
    extra_col = rng.randint(1, 4, size=(n_rows, 1))
    extra_col.sort(axis=0)

    return np.concatenate((a, extra_col), axis=1)

In [6]:
# Keep only the first six columns of each split
train = train[:, :6]
test = test[:, :6]

In [7]:
# Append the synthetic sorted column as the last attribute of each split.
# NOTE: not idempotent -- re-running this cell appends yet another column.
train = add_extra_column(train)
test = add_extra_column(test)

## Merging identical models

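Composing estimators that share the same descriptive and target attributes is the simplest case and should work without any special handling.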

### Learning individual estimators

In [8]:
def learn_model(train, desc_ids, targ_ids, *, estimator_cls=None, **kwargs):
    """
    Fit an estimator that predicts the target columns from the descriptive columns.

    Parameters
    ----------
    train : np.ndarray
        2D training array; columns are attributes.
    desc_ids : list of int
        Column indices used as input features.
    targ_ids : list of int
        Column indices used as prediction targets.
    estimator_cls : class, optional
        Estimator class to instantiate (keyword-only). Defaults to
        ``RandomForestClassifier``.
    **kwargs
        Passed unchanged to the estimator's constructor.

    Returns
    -------
    The fitted estimator, with ``desc_ids`` and ``targ_ids`` stored on it as
    attributes.
    """
    if estimator_cls is None:
        estimator_cls = RandomForestClassifier

    X, Y = train[:, desc_ids], train[:, targ_ids]

    clf = estimator_cls(**kwargs)
    clf.desc_ids = desc_ids
    clf.targ_ids = targ_ids

    # X must stay 2-D, shape (n_samples, n_features), even when there is only
    # one descriptive attribute: scikit-learn rejects 1-D feature arrays.
    # A single target, on the other hand, is passed as a flat label vector.
    if Y.shape[1] == 1:
        Y = Y.ravel()

    clf.fit(X, Y)
    return clf

In [9]:
# Dimensions of the training array (rows, attributes) at this point
n_rows, n_atts = train.shape
print(n_rows, n_atts)

19416 7


In [10]:
# Column indices of all attributes in `train`
atts_ids = list(range(n_atts))
atts_ids

[0, 1, 2, 3, 4, 5, 6]

We assume the last `n_targs` attributes to be the targets; all preceding attributes are descriptive.

In [11]:
# Number of trailing attributes treated as targets.
# Must be >= 1: with 0 the slice `[:-0]` would yield an empty list.
n_targs = 1

# Split the attribute indices into descriptive (leading) and target (trailing)
desc_ids, targ_ids = atts_ids[:-n_targs], atts_ids[-n_targs:]

In [12]:
# Train two estimators on the very same descriptive/target attributes.
# learn_model already stores desc_ids / targ_ids on the estimator it returns,
# so no separate tagging is needed here.
m_list = [learn_model(train, desc_ids, targ_ids) for _ in range(2)]

### Composing estimators

In [13]:
# Start from an empty composition ...
pc = morpheus.ParallelComposition()

# ... and register each trained estimator with it, one at a time
for m in m_list:
    pc.add_estimator(m)

In [14]:
pc.desc_ids, pc.targ_ids

(array([0., 1., 2., 3., 4., 5.]), array([6.]))

In [15]:
pc.classes_

[array([1., 2., 3.])]

In [16]:
m_list[0].classes_, m_list[1].classes_

(array([1, 2, 3]), array([1, 2, 3]))

### Predictions

In [17]:
pc.targ_weights

array([2.])

In [18]:
pc.targ_types 

['nominal']

In [19]:
pc.predict_proba(test[:, desc_ids])

array([[0.34161002, 0.33561552, 0.32277446],
       [0.32641046, 0.33854101, 0.33504854],
       [0.35685126, 0.33205136, 0.31109737],
       ...,
       [0.34161002, 0.33561552, 0.32277446],
       [0.34161002, 0.33561552, 0.32277446],
       [0.35685126, 0.33205136, 0.31109737]])

## Merging slightly different models.

### Challenge one: different targets.

In [20]:
n_targs = 1

# Two models that share their descriptive attributes but predict different targets
desc_ids_1, targ_ids_1 = [0, 1, 2, 3], [6]
desc_ids_2, targ_ids_2 = [0, 1, 2, 3], [5]

# Parallel lists: entry i holds the descriptive / target ids of model i
all_desc_ids = [desc_ids_1, desc_ids_2]
all_targ_ids = [targ_ids_1, targ_ids_2]



In [21]:
atts_ids

[0, 1, 2, 3, 4, 5, 6]

In [22]:
# Train one estimator per (descriptive ids, target ids) pair.
# NOTE: the loop variables rebind the notebook-level `desc_ids` / `targ_ids`;
# after this cell they hold the ids of the last pair.
m_list = []
for desc_ids, targ_ids in zip(all_desc_ids, all_targ_ids):
    print(desc_ids, targ_ids)
    # learn_model already stores desc_ids / targ_ids on the estimator it returns
    m_list.append(learn_model(train, desc_ids, targ_ids))

[0, 1, 2, 3] [6]
[0, 1, 2, 3] [5]


In [32]:
# Fresh composition for the estimators with different targets
pc2 = morpheus.ParallelComposition()

# Register each trained estimator with it, one at a time
for m in m_list:
    pc2.add_estimator(m)

In [51]:
# Add the first estimator a second time, so its target is covered twice.
# NOTE(review): the recorded output further down shows target weights [11.  6.],
# which a single fresh run of the cells shown here would presumably not produce
# (assuming each add_estimator call adds 1 to its target's weight -- TODO confirm).
# The execution counts also jump (22 -> 32 -> 51), so these cells were likely
# re-executed several times; re-run with "Restart & Run All" before trusting the output.
pc2.add_estimator(m_list[0])

In [52]:
pc2.pretty_print()


        # Main
        Descriptive attributes:     [0. 1. 2. 3.]
        Target attributes:          [5. 6.]
        
        # On types (mostly nominals)
        Target attribute types:                     ['nominal', 'nominal']
        N_classes of nominal target attributes:     [2, 3]
        Classes of nominal target attributes:       [array([0., 1.]), array([1., 2., 3.])]
        
        # Weights
        Total weights of target attributes:         [11.  6.]
        


In [53]:
pc2.predict_proba(test[:, desc_ids_1])

[array([[0.87787441, 0.12212559],
        [0.31215805, 0.68784195],
        [0.87787441, 0.12212559],
        ...,
        [0.87787441, 0.12212559],
        [0.87787441, 0.12212559],
        [0.87787441, 0.12212559]]),
 array([[0.33534155, 0.33880594, 0.32585251],
        [0.33579801, 0.33093043, 0.33327156],
        [0.33534155, 0.33880594, 0.32585251],
        ...,
        [0.33534155, 0.33880594, 0.32585251],
        [0.33534155, 0.33880594, 0.32585251],
        [0.33534155, 0.33880594, 0.32585251]])]