# Parallel Composition Demo

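This notebook sets up a few small test scenarios for `morpheus.ParallelComposition`: merging identical models, merging models with different or partly overlapping targets, and mixing classifiers with a regressor.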

## Prelims

In [1]:
import os, sys
import pandas as pd
import numpy as np
import sklearn
from sklearn.tree import (DecisionTreeClassifier,
                          DecisionTreeRegressor)
from sklearn.ensemble import (RandomForestClassifier,
                              RandomForestRegressor)

from os.path import dirname

  from numpy.core.umath_tests import inner1d


In [2]:
# --- Paths -------------------------------------------------------------------
# All locations are derived from the current working directory; its parent is
# taken as the project root.
# NOTE(review): this assumes the kernel was started in the notebook's own
# folder -- confirm.
note_dir = os.getcwd()
root_dir = dirname(note_dir)
data_dir = os.path.join(root_dir, 'resc', 'data', 'tidy', 'nltcs')
src_dir = os.path.join(root_dir, "src")

# Make the project sources importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# --- Reproducibility ---------------------------------------------------------
# Seed NumPy's global generator once, so that "Restart & Run All" regenerates
# the same synthetic columns. scikit-learn estimators created without an
# explicit random_state draw from this global generator as well.
SEED = 42
np.random.seed(SEED)

In [3]:
import morpheus

In [4]:
# Keep only the files whose name contains 'F00', leaving out anything that
# mentions 'bayesfusion'.
# NOTE(review): os.listdir() returns entries in arbitrary order, so which file
# ends up as "train" and which as "test" below depends on the filesystem --
# consider selecting the two files by name instead.
rel_fnames = [
    os.path.join(data_dir, fname)
    for fname in os.listdir(data_dir)
    if 'F00' in fname and 'bayesfusion' not in fname
]

train_fn = rel_fnames[0]
test_fn = rel_fnames[1]

# The CSV files are read without a header row.
df_train = pd.read_csv(train_fn, header=None)
df_test = pd.read_csv(test_fn, header=None)

# The same data as NumPy arrays.
train = df_train.values
test = df_test.values

## Creating Dataset

In [5]:
def add_extra_column(a, nominal=True, random_state=None):
    """
    Return a new array: `a` with one randomly generated column appended.

    Parameters
    ----------
    a : np.ndarray
        2-D array (rows x attributes). It is not modified.
    nominal : bool, default True
        If True, the new column holds integers drawn from {1, 2, 3} and is
        sorted in ascending order over the rows. If False, it holds floats
        drawn uniformly from [0, 1).
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``, making the generated
        column reproducible. With None, NumPy's global generator is used.

    Returns
    -------
    np.ndarray
        Array of shape (n_rows, n_atts + 1).
    """
    assert isinstance(a, np.ndarray)
    n_rows, _ = a.shape  # unpacking also rejects anything that is not 2-D

    # The np.random module and a RandomState instance expose the same
    # randint/rand API, so either can serve as the generator below.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    if nominal:
        extra_col = rng.randint(1, 4, size=(n_rows, 1))  # upper bound is exclusive
        extra_col.sort(axis=0)
    else:
        extra_col = rng.rand(n_rows, 1)

    return np.concatenate((a, extra_col), axis=1)

In [6]:
# Keep only the first five attributes. The slice is taken from the loaded
# frames rather than from `train`/`test` themselves, so the result does not
# depend on which other cells have already modified those arrays.
train, test = df_train.values[:, 0:5], df_test.values[:, 0:5]

In [7]:
# Append a synthetic nominal attribute (values 1..3) as column 5.
# Slicing to the first five columns makes the cell idempotent: re-running it
# replaces the synthetic column instead of stacking another one on top.
train, test = add_extra_column(train[:, :5]), add_extra_column(test[:, :5])

In [8]:
# Inspect the result: the last column is the synthetic attribute appended above.
train

array([[0, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1],
       ...,
       [0, 0, 0, 1, 1, 3],
       [0, 0, 0, 0, 0, 3],
       [0, 0, 1, 1, 1, 3]])

## Merging identical models

Merging two models trained on the same attributes is the simplest case and should work without any special handling.

### Learning individual estimators

In [9]:
def learn_model(train, desc_ids, targ_ids, **kwargs):
    """
    Fit a random forest predicting columns `targ_ids` from columns `desc_ids`.

    A RandomForestClassifier is tried first; if its `fit` raises a ValueError,
    a RandomForestRegressor is fitted on the same data instead.
    NOTE(review): presumably the fallback is meant for continuous targets,
    which the classifier rejects -- confirm.

    Parameters
    ----------
    train : np.ndarray
        2-D array holding both descriptive and target columns.
    desc_ids : list of int
        Column indices used as inputs.
    targ_ids : list of int
        Column indices used as targets.
    **kwargs
        Passed unchanged to the forest constructor.

    Returns
    -------
    The fitted estimator, with `desc_ids` and `targ_ids` attached as attributes.
    """
    # X is deliberately left 2-D, even for a single descriptive attribute:
    # scikit-learn expects X of shape (n_samples, n_features), so flattening it
    # made both the classifier and the fallback regressor fail.
    X, Y = train[:, desc_ids], train[:, targ_ids]

    # A single target, on the other hand, is passed as a 1-D vector.
    if Y.shape[1] == 1:
        Y = Y.ravel()

    try:
        clf = RandomForestClassifier(**kwargs)
        clf.fit(X, Y)
    except ValueError:
        clf = RandomForestRegressor(**kwargs)
        clf.fit(X, Y)

    # Remember which columns this model reads and predicts.
    clf.desc_ids = desc_ids
    clf.targ_ids = targ_ids
    return clf

In [10]:
# Dimensions of the training array: number of rows and number of attributes.
n_rows, n_atts = train.shape
print(n_rows, n_atts)

19416 6


In [11]:
# Column indices of all attributes, 0 .. n_atts - 1.
atts_ids = list(range(n_atts))
atts_ids

[0, 1, 2, 3, 4, 5]

We take the last `n_targs` attributes as targets; all preceding attributes are descriptive.

In [12]:
n_targs = 1

# Split at an explicit position instead of slicing with -n_targs: the negative
# form breaks for n_targs = 0, where atts_ids[:-0] is empty and atts_ids[-0:]
# is the whole list.
first_targ = max(len(atts_ids) - n_targs, 0)
desc_ids = atts_ids[:first_targ]
targ_ids = atts_ids[first_targ:]

In [13]:
# Train two models on exactly the same descriptive/target split.
# learn_model already attaches desc_ids/targ_ids to the estimator it returns,
# so they are not assigned a second time here.
m_list = [learn_model(train, desc_ids, targ_ids) for _ in range(2)]

### Composing estimators

In [14]:
# Start from an empty composition and register the trained models one by one.
pc = morpheus.ParallelComposition()
for estimator in m_list:
    pc.add_estimator(estimator)

In [15]:
# Descriptive and target attribute ids exposed by the composition.
pc.desc_ids, pc.targ_ids

(array([0., 1., 2., 3., 4.]), array([5.]))

In [16]:
# Class labels held by the composition (displayed as a list of arrays).
pc.classes_

[array([1., 2., 3.])]

In [17]:
# Class labels of the two underlying forests, for comparison with the composition's.
m_list[0].classes_, m_list[1].classes_

(array([1, 2, 3]), array([1, 2, 3]))

### Predictions

In [18]:
# Weight stored per target attribute.
# NOTE(review): presumably this counts the estimators predicting each target -- confirm in morpheus.
pc.targ_weights

array([2.])

In [19]:
# Type recorded for each target attribute.
pc.targ_types 

['nominal']

In [20]:
# Class probabilities from the composition for the test rows (descriptive columns only).
pc.predict_proba(test[:, desc_ids])

array([[0.34583191, 0.33682909, 0.317339  ],
       [0.33921641, 0.32625181, 0.33453179],
       [0.33237788, 0.34099974, 0.32662237],
       ...,
       [0.34583191, 0.33682909, 0.317339  ],
       [0.34583191, 0.33682909, 0.317339  ],
       [0.33237788, 0.34099974, 0.32662237]])

## Merging slightly different models

### Challenge one: different targets

In [21]:
# Two models over the same descriptive attributes, each with its own target.
desc_ids_1, targ_ids_1 = [0, 1, 2, 3], [5]
desc_ids_2, targ_ids_2 = [0, 1, 2, 3], [4]

# Collected per model, in matching order.
all_desc_ids = [desc_ids_1, desc_ids_2]
all_targ_ids = [targ_ids_1, targ_ids_2]



In [22]:
# Reminder of the available attribute ids.
atts_ids

[0, 1, 2, 3, 4, 5]

In [23]:
# One model per (descriptive ids, target ids) pair.
# The loop variables have their own names so that the notebook-level
# `desc_ids` / `targ_ids` (defined earlier and read by other cells) are not
# overwritten. learn_model already stores both id lists on the estimator.
m_list = []
for d_ids, t_ids in zip(all_desc_ids, all_targ_ids):
    print(d_ids, t_ids)
    m_list.append(learn_model(train, d_ids, t_ids))

[0, 1, 2, 3] [5]
[0, 1, 2, 3] [4]


In [24]:
# Fresh composition holding the two models with different targets.
pc2 = morpheus.ParallelComposition()
for estimator in m_list:
    pc2.add_estimator(estimator)

In [25]:
# Register the first model (target attribute 5) a second time.
pc2.add_estimator(m_list[0])

In [26]:
# Summary of the composition: attributes, target types, classes and weights.
pc2.pretty_print()


        # Main
        Descriptive attributes:     [0. 1. 2. 3.]
        Target attributes:          [4. 5.]
        
        ## On types (mostly nominals)
        Target attribute types:                     ['nominal', 'nominal']
        N_classes of nominal target attributes:     [2, 3]
        Classes of nominal target attributes:       [array([0., 1.]), array([1., 2., 3.])]
        
        ## Weights
        Total weights of target attributes:         [1. 2.]
        


In [27]:
# Predicted class probabilities; the output is a list with one array per target attribute.
pc2.predict_proba(test[:, desc_ids_1])

[array([[0.68445329, 0.31554671],
        [0.25336968, 0.74663032],
        [0.68445329, 0.31554671],
        ...,
        [0.68445329, 0.31554671],
        [0.68445329, 0.31554671],
        [0.68445329, 0.31554671]]),
 array([[0.33791571, 0.33926048, 0.32282382],
        [0.34386912, 0.32814362, 0.32798726],
        [0.33791571, 0.33926048, 0.32282382],
        ...,
        [0.33791571, 0.33926048, 0.32282382],
        [0.33791571, 0.33926048, 0.32282382],
        [0.33791571, 0.33926048, 0.32282382]])]

### Challenge two: partly overlapping targets

In [28]:
# Two models over the same descriptive attributes whose target sets overlap
# in attribute 4.
desc_ids_1, targ_ids_1 = [0, 1, 2], [4, 5]
desc_ids_2, targ_ids_2 = [0, 1, 2], [3, 4]

# Collected per model, in matching order.
all_desc_ids = [desc_ids_1, desc_ids_2]
all_targ_ids = [targ_ids_1, targ_ids_2]

In [29]:
# One model per (descriptive ids, target ids) pair.
# The loop variables have their own names so that the notebook-level
# `desc_ids` / `targ_ids` (defined earlier and read by other cells) are not
# overwritten. learn_model already stores both id lists on the estimator.
m_list = []
for d_ids, t_ids in zip(all_desc_ids, all_targ_ids):
    print(d_ids, t_ids)
    m_list.append(learn_model(train, d_ids, t_ids))

[0, 1, 2] [4, 5]
[0, 1, 2] [3, 4]


In [30]:
# Fresh composition holding the two models with overlapping targets.
pc2 = morpheus.ParallelComposition()
for estimator in m_list:
    pc2.add_estimator(estimator)

In [31]:
# Summary of the composition: attributes, target types, classes and weights.
pc2.pretty_print()


        # Main
        Descriptive attributes:     [0. 1. 2.]
        Target attributes:          [3. 4. 5.]
        
        ## On types (mostly nominals)
        Target attribute types:                     ['nominal', 'nominal', 'nominal']
        N_classes of nominal target attributes:     [2, 2, 3]
        Classes of nominal target attributes:       [array([0., 1.]), array([0., 1.]), array([1., 2., 3.])]
        
        ## Weights
        Total weights of target attributes:         [1. 2. 1.]
        


In [32]:
# Baseline predictions, before the extra model is added further down.
pc2.predict_proba(test[:, desc_ids_1])

[array([[0.64370031, 0.35629969],
        [0.64370031, 0.35629969],
        [0.64370031, 0.35629969],
        ...,
        [0.64370031, 0.35629969],
        [0.64370031, 0.35629969],
        [0.64370031, 0.35629969]]), array([[0.53238421, 0.46761579],
        [0.53238421, 0.46761579],
        [0.53238421, 0.46761579],
        ...,
        [0.53238421, 0.46761579],
        [0.53238421, 0.46761579],
        [0.53238421, 0.46761579]]), array([[0.34301389, 0.33438796, 0.32259815],
        [0.34301389, 0.33438796, 0.32259815],
        [0.34301389, 0.33438796, 0.32259815],
        ...,
        [0.34301389, 0.33438796, 0.32259815],
        [0.34301389, 0.33438796, 0.32259815],
        [0.34301389, 0.33438796, 0.32259815]])]

In [33]:
# Train a deliberately weak model on only nine rows (rows 1..9); adding it to
# the composition changes the combined predictions.
# learn_model attaches desc_ids/targ_ids to the estimator itself.
m = learn_model(train[1:10, :], desc_ids_2, targ_ids_2)

In [34]:
# Add the weak model (targets 3 and 4) to the existing composition.
pc2.add_estimator(m)

In [35]:
# Summary after adding the weak model; compare the weights with the previous summary.
pc2.pretty_print()


        # Main
        Descriptive attributes:     [0. 1. 2.]
        Target attributes:          [3. 4. 5.]
        
        ## On types (mostly nominals)
        Target attribute types:                     ['nominal', 'nominal', 'nominal']
        N_classes of nominal target attributes:     [2, 2, 3]
        Classes of nominal target attributes:       [array([0., 1.]), array([0., 1.]), array([1., 2., 3.])]
        
        ## Weights
        Total weights of target attributes:         [2. 3. 1.]
        


In [36]:
# Predictions after adding the weak model, to compare with the baseline above.
pc2.predict_proba(test[:, desc_ids_1])

[array([[0.66649301, 0.33350699],
        [0.66649301, 0.33350699],
        [0.66649301, 0.33350699],
        ...,
        [0.66649301, 0.33350699],
        [0.66649301, 0.33350699],
        [0.66649301, 0.33350699]]), array([[0.54420852, 0.45579148],
        [0.54420852, 0.45579148],
        [0.54420852, 0.45579148],
        ...,
        [0.54420852, 0.45579148],
        [0.54420852, 0.45579148],
        [0.54420852, 0.45579148]]), array([[0.34301389, 0.33438796, 0.32259815],
        [0.34301389, 0.33438796, 0.32259815],
        [0.34301389, 0.33438796, 0.32259815],
        ...,
        [0.34301389, 0.33438796, 0.32259815],
        [0.34301389, 0.33438796, 0.32259815],
        [0.34301389, 0.33438796, 0.32259815]])]

## Adding regressors to the mix

### Adding numerical target

In [37]:
# Append a synthetic numeric attribute (uniform floats in [0, 1)) as column 6.
# Slicing to the first six columns makes the cell idempotent: re-running it
# replaces the numeric column instead of stacking another one on top.
train, test = (add_extra_column(train[:, :6], nominal=False),
               add_extra_column(test[:, :6], nominal=False))

In [38]:
# First rows of the extended array; column 6 is the numeric attribute appended above.
pd.DataFrame(train).head() # Useful for inspection.

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,0.0,0.0,0.0,1.0,0.127645
1,1.0,0.0,1.0,1.0,1.0,1.0,0.374071
2,1.0,1.0,1.0,1.0,1.0,1.0,0.420345
3,0.0,0.0,0.0,0.0,0.0,1.0,0.89706
4,0.0,0.0,0.0,0.0,1.0,1.0,0.968476


In [39]:
# Three models over the same descriptive attributes:
#   1 -> attribute 6 (the numeric column appended above)
#   2 -> attributes 4 and 5
#   3 -> attribute 4 only
desc_ids_1, targ_ids_1 = [0, 1, 2, 3], [6]
desc_ids_2, targ_ids_2 = [0, 1, 2, 3], [4, 5]
desc_ids_3, targ_ids_3 = [0, 1, 2, 3], [4]

# Collected per model, in matching order.
all_desc_ids = [desc_ids_1, desc_ids_2, desc_ids_3]
all_targ_ids = [targ_ids_1, targ_ids_2, targ_ids_3]

In [40]:
# One model per (descriptive ids, target ids) pair.
# The loop variables have their own names so that the notebook-level
# `desc_ids` / `targ_ids` (defined earlier and read by other cells) are not
# overwritten. learn_model already stores both id lists on the estimator.
m_list = []
for d_ids, t_ids in zip(all_desc_ids, all_targ_ids):
    print(d_ids, t_ids)
    m_list.append(learn_model(train, d_ids, t_ids))

[0, 1, 2, 3] [6]
[0, 1, 2, 3] [4, 5]
[0, 1, 2, 3] [4]


### Composing estimators

In [41]:
# Fresh composition holding all three models.
pc3 = morpheus.ParallelComposition()
for estimator in m_list:
    pc3.add_estimator(estimator)

In [42]:
# Summary of the mixed composition: attributes, target types, classes and weights.
pc3.pretty_print()


        # Main
        Descriptive attributes:     [0. 1. 2. 3.]
        Target attributes:          [4. 5. 6.]
        
        ## On types (mostly nominals)
        Target attribute types:                     ['nominal', 'nominal', 'numeric']
        N_classes of nominal target attributes:     [2, 3]
        Classes of nominal target attributes:       [array([0., 1.]), array([1., 2., 3.])]
        
        ## Weights
        Total weights of target attributes:         [2. 1. 1.]
        


In [43]:
# Predicted class probabilities. The output below holds two arrays, matching
# the two nominal targets; the numeric target 6 has no entry in it.
pc3.predict_proba(test[:, desc_ids_1])

[array([[0.68556959, 0.31443041],
        [0.25755724, 0.74244276],
        [0.68556959, 0.31443041],
        ...,
        [0.68556959, 0.31443041],
        [0.68556959, 0.31443041],
        [0.68556959, 0.31443041]]),
 array([[0.33949402, 0.33753417, 0.32297181],
        [0.33984499, 0.32831287, 0.33184214],
        [0.33949402, 0.33753417, 0.32297181],
        ...,
        [0.33949402, 0.33753417, 0.32297181],
        [0.33949402, 0.33753417, 0.32297181],
        [0.33949402, 0.33753417, 0.32297181]])]