# MERCS 101 - Lecture 01: Classification

This is the first part of the tutorial, focusing on MERCS as a simple classifier.

## Preliminaries

### External Imports

In [1]:
import numpy as np
import os
import sys
from sklearn.metrics import f1_score, accuracy_score, classification_report
import pandas as pd

### MERCS imports

In [2]:
sys.path.insert(0, '..') # We add the parent dir to the path
from src.mercs.core import MERCS
from src.mercs.utils import *

import src.datasets as datasets

  from numpy.core.umath_tests import inner1d


## Induction

### Importing Data

First, we import the nursery dataset.

In [3]:
train, test = datasets.load_nursery()

This is a fully nominal dataset

In [4]:
train.head()

Unnamed: 0,Var0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8
0,2,3,0,0,0,0,0,2,2
1,2,3,0,0,0,0,0,1,1
2,2,3,0,0,0,0,0,0,0
3,2,3,0,0,0,0,2,2,2
4,2,3,0,0,0,0,2,1,1


### Training

In [5]:
model = MERCS()

In [6]:
ind_parameters = {'ind_type':           'RF',
                  'ind_n_estimators':   30}

sel_parameters = {'sel_type':           'Base',
                  'sel_its':            4,
                  'sel_param':          1}

In [7]:
train.head()

Unnamed: 0,Var0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8
0,2,3,0,0,0,0,0,2,2
1,2,3,0,0,0,0,0,1,1
2,2,3,0,0,0,0,0,0,0
3,2,3,0,0,0,0,2,2,2
4,2,3,0,0,0,0,2,1,1


In [8]:
model.fit(train, **ind_parameters, **sel_parameters)

is_nominal in this model is: [1 1 1 1 1 1 1 1 1]



## Inference

### Prediction

In [9]:
code = [0,0,0,0,0,0,0,0,1]
len(code)

9

In [10]:
pred_parameters = {'pred_type':    'MI',
                   'pred_param':   1.0,
                   'pred_its':     8}

In [11]:
y_pred = model.predict(test,
                       **pred_parameters,
                       qry_code=code)

SETTINGS.PY: I AM READING A SINGLE QUERY CODE, I.E: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Predicting q_code: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7] [8]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_sta

In [12]:
y_pred

array([[4.],
       [0.],
       [4.],
       ...,
       [1.],
       [3.],
       [0.]])

### Evaluation 

In [13]:
y_true = test[test.columns.values[np.array(code)==1]].values

In [14]:
obs = f1_score(y_true, y_pred, average='macro')
obs

0.9808938294010889

In [15]:
assert isinstance(obs, (int, float))
assert 0 <= obs <= 1

## Missing attributes

In [16]:
train.iloc[1:8000,2] = np.nan
model.fit(train, **ind_parameters, **sel_parameters)

is_nominal in this model is: [1 1 1 1 1 1 1 1 1]



In [17]:
train

Unnamed: 0,Var0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8
0,2,3,0.0,0,0,0,0,2,2
1,2,3,,0,0,0,0,1,1
2,2,3,,0,0,0,0,0,0
3,2,3,,0,0,0,2,2,2
4,2,3,,0,0,0,2,1,1
5,2,3,,0,0,0,2,0,0
6,2,3,,0,0,0,1,2,1
7,2,3,,0,0,0,1,1,1
8,2,3,,0,0,0,1,0,0
9,2,3,,0,0,1,0,2,4


In [18]:
y_pred = model.predict(test,
                       **pred_parameters,
                       qry_code=code)

obs = f1_score(y_true, y_pred, average='macro')


SETTINGS.PY: I AM READING A SINGLE QUERY CODE, I.E: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Predicting q_code: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7] [8]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_sta

  'precision', 'predicted', average, warn_for)


In [19]:
obs

0.6011381377741974

In [20]:
md = model.s['metadata']

In [21]:
md

{'FI': array([[5.79202658e-04, 1.01637576e-01, 3.90927923e-02, 4.41132298e-02,
         5.51740771e-02, 2.37647782e-02, 6.18721639e-02, 6.73766180e-01,
         0.00000000e+00],
        [2.56172342e-04, 2.67051193e-01, 2.20448651e-01, 0.00000000e+00,
         1.62871653e-01, 7.40922132e-02, 1.47425482e-01, 7.27570584e-02,
         5.50975781e-02],
        [1.01122574e-03, 2.33393762e-01, 0.00000000e+00, 2.54147861e-01,
         1.72857588e-01, 7.42995190e-02, 1.51176373e-01, 6.58778367e-02,
         4.72358337e-02],
        [7.66635637e-04, 0.00000000e+00, 2.28883222e-01, 2.07564829e-01,
         1.42178871e-01, 7.95316780e-02, 1.42922433e-01, 9.20642731e-02,
         1.06088058e-01],
        [5.84219248e-04, 2.47887939e-01, 2.10378861e-01, 2.06157638e-01,
         0.00000000e+00, 6.46402526e-02, 1.30896966e-01, 7.21702679e-02,
         6.72838571e-02],
        [3.20659190e-04, 2.30420792e-01, 1.99729867e-01, 1.86818742e-01,
         1.41888435e-01, 0.00000000e+00, 1.31738087e-01, 5.30

In [22]:
md = {k:v for k,v in md.items() if k is not 'FI'}

In [23]:
nominal_attributes = [x for x in range(md['nb_atts']) if md['is_nominal'][x] == 1]
numeric_attributes = [x for x in range(md['nb_atts']) if md['is_nominal'][x] == 0]
nominal_attributes

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [24]:
np.unique(md['is_nominal'])

array([1])

In [25]:
np.unique(md['types']).shape[0]

2

In [26]:
a = model.m_codes.copy()
b = model.m_codes.copy()

In [27]:
a.copy()

array([[0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0,

In [28]:
a = np.array([[1,2,3]])
b = np.array([[3,4,5]])
c = None

In [29]:
np.concatenate((a,b,c))

ValueError: all the input arrays must have same number of dimensions

In [None]:
a.shape

In [None]:
a = np.array([1,0,0,0])

In [None]:
u = np.unique(a)
#u.sort()
u

In [None]:
np.array_equal(np.unique(a),[1,0])