# Build a super tiny MERCS.

In [1]:
import os, sys
import pandas as pd
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from os.path import dirname

  from numpy.core.umath_tests import inner1d


In [2]:
# Locate the dataset relative to where the notebook is running.
note_dir = os.getcwd()
# The data folder is looked up under the parent of the working directory.
root_dir = dirname(note_dir)
data_dir = os.path.join(root_dir, 'resc', 'data', 'tidy','nltcs')

In [3]:
# Full paths of the files in data_dir whose name contains 'F00' and does not
# contain 'bayesfusion' (the two `if` clauses are AND-ed).
# NOTE(review): os.listdir returns entries in arbitrary order, so the order of
# this list is not guaranteed to be stable across machines.
rel_fnames = [os.path.join(data_dir, f) for f in os.listdir(data_dir)
              if 'F00' in f
              if 'bayesfusion' not in f]

In [4]:
# NOTE(review): which file is "train" and which is "test" depends purely on the
# order of rel_fnames (built from os.listdir) — confirm index 0 is the training file.
train_fn, test_fn = rel_fnames[0], rel_fnames[1]

In [5]:
# header=None: the first line of each CSV is read as data, not as column names.
df_train, df_test = pd.read_csv(train_fn, header=None), pd.read_csv(test_fn, header=None)

In [6]:
# Work on the underlying NumPy arrays from here on (column slicing by index).
train, test = df_train.values, df_test.values

## Train a meaningful selection

### Selection

In [7]:
# Number of training rows and number of attributes (columns).
nb_rows, nb_atts = train.shape

In [8]:
# One model code per row of the identity matrix. learn_model reads a 1 as
# "target attribute" and a 0 as "descriptive attribute", so row i describes a
# model that predicts attribute i from all the other attributes.
m_codes = np.eye(nb_atts, dtype=np.int64)
m_codes

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

### Induction

In [9]:
def learn_model(m_code, train, estimator=None):
    """Fit one component model described by ``m_code`` on ``train``.

    Parameters
    ----------
    m_code : array-like of int
        One entry per column of ``train``. A 0 marks a descriptive (input)
        attribute, a 1 marks a target attribute; other values are ignored.
    train : numpy.ndarray
        Two-dimensional training data, one column per attribute.
    estimator : object, optional
        Unfitted estimator exposing ``fit(X, Y)``. When omitted, a fresh
        ``RandomForestClassifier()`` is used, as before.

    Returns
    -------
    object
        The fitted estimator, with two extra attributes: ``desc_idx`` and
        ``targ_idx``, the column indices used as inputs and as targets.

    Raises
    ------
    ValueError
        If ``m_code`` selects no descriptive or no target attribute.
    """
    m_code = np.asarray(m_code)  # also accept plain lists
    desc_idx = np.where(m_code == 0)[0]
    targ_idx = np.where(m_code == 1)[0]

    if desc_idx.size == 0 or targ_idx.size == 0:
        raise ValueError(
            "m_code must mark at least one descriptive (0) and one target (1) attribute"
        )

    # Fancy indexing keeps X two-dimensional even for a single descriptive
    # column, which is the shape estimators expect. It must NOT be flattened.
    X, Y = train[:, desc_idx], train[:, targ_idx]

    clf = RandomForestClassifier() if estimator is None else estimator
    clf.desc_idx = desc_idx
    clf.targ_idx = targ_idx

    # A single target is passed as a flat vector (single-output problem);
    # several targets stay as a 2-D array (multi-output problem).
    if Y.shape[1] == 1:
        Y = Y.ravel()

    clf.fit(X, Y)
    return clf

In [10]:
# Induction: fit one model per model code (i.e. one model per target attribute).
m_list = [learn_model(code, train) for code in m_codes]

In [11]:
# NOTE(review): this repeats the induction loop WITHOUT resetting m_list, so a
# second, independently trained model is appended for every model code
# (len(m_list) is 32 below, and two models end up predicting each target).
for m_idx, m_code in enumerate(m_codes):
    m_list.append(learn_model(m_code, train))

In [12]:
len(m_list)

32

## Parallel Composition

= Ensembles (several models that share a target are used side by side).

In [13]:
nb_atts

16

In [14]:
# Query: predict attribute 0 ...
q_targ = np.array([0])
# ... from every other attribute. Derived from nb_atts and q_targ instead of
# hard-coding the attribute count, so it stays correct for other datasets.
q_desc = np.setdiff1d(np.arange(nb_atts), q_targ)

In [20]:
# Split the test set into query inputs and ground truth. Because q_targ is an
# index array, Y_true keeps a column axis (shape (n_rows, len(q_targ))).
X_test, Y_true = test[:, q_desc], test[:, q_targ]

In [16]:
# Keep only the models that predict at least one of the queried targets.
rel_models = [
    candidate
    for candidate in m_list
    if np.intersect1d(candidate.targ_idx, q_targ).size > 0
]

In [17]:
rel_models

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False)]

In [97]:
rf = rel_models[0]

In [98]:
rf.classes_

array([0, 1])

In [25]:
rf.predict_proba(X_test)

array([[0.97972365, 0.02027635],
       [0.9846809 , 0.0153191 ],
       [0.99285714, 0.00714286],
       ...,
       [0.99380278, 0.00619722],
       [0.97972365, 0.02027635],
       [0.85936508, 0.14063492]])

Step 1: collect all predictions.

In [33]:
# For every relevant model, collect its predicted class probabilities on X_test
# together with the indices of the target attribute(s) it predicts.
# The two lists are aligned: all_proba[i] belongs to all_targ[i].
all_proba = []
all_targ = []
for m in rel_models:
    all_proba.append(m.predict_proba(X_test))
    all_targ.append(m.targ_idx)

In [34]:
all_targ

[array([0]), array([0])]

In [136]:
all_proba

array([[0.97972365, 0.02027635],
       [0.9846809 , 0.0153191 ],
       [0.99285714, 0.00714286],
       ...,
       [0.99380278, 0.00619722],
       [0.97972365, 0.02027635],
       [0.85936508, 0.14063492]])

In [137]:
rf.n_outputs_

1

In [107]:
class ParallelComposition(object):
    """Ensemble ("parallel composition") of already fitted estimators.

    Every estimator must expose ``desc_idx`` and ``targ_idx`` (integer index
    arrays naming its input and target attributes), ``classes_``,
    ``n_outputs_`` and ``predict_proba``.

    Parameters
    ----------
    estimators : sequence
        The fitted component estimators.
    targ_idx : array-like of int, optional
        Targets the composition should predict. Defaults to every target
        predicted by at least one estimator.

    Attributes
    ----------
    desc_idx : numpy.ndarray
        Sorted union of the estimators' descriptive attributes. ``X`` passed
        to ``predict_proba`` must have exactly these columns, in this order.
    targ_idx : numpy.ndarray
        Targets of the composition; the output of ``predict_proba`` follows
        this order.

    Raises
    ------
    ValueError
        If ``targ_idx`` is given and none of the requested targets is
        predicted by any estimator.
    """

    def __init__(self, estimators, targ_idx=None):
        self.estimators = estimators

        self.desc_idx = np.unique(np.concatenate([e.desc_idx for e in estimators]))
        self.targ_idx = np.unique(np.concatenate([e.targ_idx for e in estimators]))

        if targ_idx is not None:
            # Always store an array: comparing a plain list with a scalar
            # yields a single False instead of an element-wise mask.
            targ_idx = np.atleast_1d(targ_idx)
            if np.intersect1d(self.targ_idx, targ_idx).shape[0] == 0:
                raise ValueError(
                    "none of the requested targets is predicted by any estimator"
                )
            self.targ_idx = targ_idx

    def _select(self, X, e):
        """Return the columns of ``X`` that estimator ``e`` uses as input."""
        return X[:, np.isin(self.desc_idx, e.desc_idx)]

    def _all_proba(self, X):
        """Lazily yield the raw ``predict_proba`` output of every estimator."""
        return (e.predict_proba(self._select(X, e)) for e in self.estimators)

    def _all_classes(self):
        """Return the ``classes_`` of every estimator."""
        return [e.classes_ for e in self.estimators]

    def _all_targ_idx(self):
        """Return the target indices of every estimator."""
        return [e.targ_idx for e in self.estimators]

    def _all_desc_idx(self):
        """Return the descriptive indices of every estimator."""
        return [e.desc_idx for e in self.estimators]

    @staticmethod
    def _average(parts):
        """Average ``(classes, proba)`` pairs that belong to one target.

        Columns are aligned on the sorted union of the class labels, so
        estimators that saw different label sets can still be combined.
        Returns ``None`` when ``parts`` is empty.
        """
        if not parts:
            return None

        labels = np.unique(np.concatenate([classes for classes, _ in parts]))
        total = np.zeros((parts[0][1].shape[0], labels.shape[0]))
        for classes, proba in parts:
            total[:, np.searchsorted(labels, classes)] += proba
        return total / len(parts)

    def predict_proba(self, X):
        """Predict class probabilities for every target of the composition.

        Parameters
        ----------
        X : numpy.ndarray
            Two-dimensional array whose columns correspond to
            ``self.desc_idx`` (same order).

        Returns
        -------
        list
            One entry per element of ``self.targ_idx``. Each entry is an
            array of shape ``(n_samples, n_classes)`` holding the mean of the
            probabilities predicted by all estimators for that target, with
            columns ordered by the sorted union of their class labels. The
            entry is ``None`` for a requested target no estimator predicts.
        """
        # Built locally on every call; nothing outside the method is touched.
        collected = [[] for _ in self.targ_idx]

        for e in self.estimators:
            proba = e.predict_proba(self._select(X, e))
            classes = e.classes_

            if not e.n_outputs_ > 1:
                # Single-output estimators return bare arrays; wrap them so
                # both cases are "one entry per target of the estimator".
                proba, classes = [proba], [classes]

            for out_pos, targ in enumerate(e.targ_idx):
                hits = np.where(self.targ_idx == targ)[0]
                if hits.size == 0:
                    continue  # this output was not requested
                collected[hits[0]].append(
                    (np.asarray(classes[out_pos]), np.asarray(proba[out_pos]))
                )

        return [self._average(parts) for parts in collected]
    

In [138]:
a1 = np.array([1,0,2,3])
a2 = np.array([2,3])

In [145]:
np.where(a1==1)[0][0]

0

In [77]:
np.isin(a1,a2)

array([False, False,  True,  True])

In [72]:
new = X_test[:, 0:4]
new

array([[0, 0, 0, 0],
       [0, 0, 1, 1],
       [0, 0, 0, 1],
       ...,
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 1]])

In [78]:
new[:, np.isin(a1,a2)]

array([[0, 0],
       [1, 1],
       [0, 1],
       ...,
       [0, 0],
       [0, 0],
       [0, 1]])

In [41]:
a = np.unique(np.concatenate([a1,a2]))
a

array([1, 2])

In [110]:
ens = ParallelComposition(rel_models, targ_idx=[0])

In [111]:
gen = ens.predict_proba(X_test)

In [112]:
all_proba = next(gen)

In [113]:
all_proba

array([[0.97972365, 0.02027635],
       [0.9846809 , 0.0153191 ],
       [0.99285714, 0.00714286],
       ...,
       [0.99380278, 0.00619722],
       [0.97972365, 0.02027635],
       [0.85936508, 0.14063492]])

In [114]:
ens._all_classes()

[array([0, 1]), array([0, 1])]

In [116]:
ens._all_desc_idx()

[array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])]

In [117]:
ens._all_targ_idx()

[array([0]), array([0])]

In [127]:
l1 = [0,1,2,3]

In [124]:
slice(1,2)

slice(1, 2, None)

In [131]:
l1[slice(1,2),slice(2,2)]

TypeError: list indices must be integers or slices, not tuple

In [135]:
proba = [a1[:-1]/32, a2/20]
proba

[array([0.03125, 0.     , 0.0625 ]), array([0.1 , 0.15])]

In [None]:
proba = [c for c in proba
         if ]

## Sequential Composition
= Chains.