# MERCS 101 - Lecture 01: Classification

This is the first part of the tutorial, focusing on MERCS as a simple classifier.

## Preliminaries

### External Imports

In [1]:
import numpy as np
import os
import sys
from sklearn.metrics import f1_score, accuracy_score, classification_report
import pandas as pd

### MERCS imports

In [2]:
# Put the parent directory first on the import path so the local `src` package is found.
sys.path.insert(0, '..') # We add the parent dir to the path
from src.mercs.core import MERCS
# NOTE(review): the wildcard import hides which helpers this notebook actually uses;
# consider importing the needed names explicitly once they are known.
from src.mercs.utils import *

import src.datasets as datasets

  from numpy.core.umath_tests import inner1d


## Induction

### Importing Data

First, we import the nursery dataset.

In [3]:
# Load the nursery data through the project's dataset helper, unpacked into a train and a test split.
train, test = datasets.load_nursery()

This is a fully nominal dataset: every attribute is categorical and appears as an integer code in the preview below.

In [4]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,Var0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8
0,2,3,0,0,0,0,0,2,2
1,2,3,0,0,0,0,0,1,1
2,2,3,0,0,0,0,0,0,0
3,2,3,0,0,0,0,2,2,2
4,2,3,0,0,0,0,2,1,1


### Training

In [5]:
# Create a MERCS model without constructor arguments; it is trained in a later cell.
model = MERCS()

In [6]:
# Induction settings, later unpacked into `model.fit`. With these values the model
# printout further down lists RandomForestClassifier components with n_estimators=30.
ind_parameters = dict(
    ind_type='RF',
    ind_n_estimators=30,
)

# Selection settings, also unpacked into `model.fit`.
# NOTE(review): presumably these control how attributes are split into inputs and
# targets per component model -- confirm against the MERCS documentation.
sel_parameters = dict(
    sel_type='Base',
    sel_its=4,
    sel_param=2,
)

In [7]:
# NOTE(review): repeats the `train.head()` preview from the data-loading section;
# `train` has not been modified in between, so this cell could be dropped.
train.head()

Unnamed: 0,Var0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8
0,2,3,0,0,0,0,0,2,2
1,2,3,0,0,0,0,0,1,1
2,2,3,0,0,0,0,0,0,0
3,2,3,0,0,0,0,2,2,2
4,2,3,0,0,0,0,2,1,1


In [8]:
# Train on the training split; both settings dicts are unpacked into keyword arguments of `fit`.
model.fit(train, **ind_parameters, **sel_parameters)

is_nominal in this model is: [1 1 1 1 1 1 1 1 1]



## Inference

### Prediction

In [9]:
# Query code with one flag per attribute. A 1 marks an attribute to predict (only the
# last one here); presumably a 0 marks an attribute used as input -- confirm against
# the MERCS documentation.
code = [0] * 8 + [1]
len(code)

9

In [10]:
# Prediction settings, later unpacked into `model.predict`.
# NOTE(review): the meaning of 'MI' and of the two numeric knobs is not visible in
# this notebook -- see the MERCS documentation before tuning them.
pred_parameters = dict(
    pred_type='MI',
    pred_param=1.0,
    pred_its=8,
)

In [11]:
# Run inference on the test split for the query described by `code`, using the
# prediction settings defined above.
y_pred = model.predict(test, qry_code=code, **pred_parameters)

SETTINGS.PY: I AM READING A SINGLE QUERY CODE, I.E: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Predicting q_code: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6] [7, 8]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_sta

In [12]:
# Inspect the predictions (NumPy abbreviates long arrays when displaying them).
y_pred

array([[0.],
       [1.],
       [1.],
       ...,
       [3.],
       [3.],
       [3.]])

### Evaluation

In [13]:
# Ground truth: keep only the test columns whose flag in the query code equals 1.
target_mask = np.array(code) == 1
target_columns = test.columns.values[target_mask]
y_true = test[target_columns].values

In [14]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
obs = f1_score(y_true, y_pred, average='macro')
obs

0.22148465043119814

In [15]:
# Sanity checks: the score must be a real number inside the valid F1 range [0, 1].
assert isinstance(obs, (int, float))
assert 0 <= obs <= 1

## Missing attributes

In [16]:
# Blank out the column at position 2 for row positions 1..7999, then retrain the same
# model object on the partially missing data.
# NOTE(review): this edits `train` in place, so the frame no longer matches the
# previews shown in earlier cells; assigning NaN also turns that column into floats
# (visible as `0.0` when `train` is displayed afterwards). Working on a copy would
# keep the original split intact.
train.iloc[1:8000,2] = np.nan
model.fit(train, **ind_parameters, **sel_parameters)

is_nominal in this model is: [1 1 1 1 1 1 1 1 1]



In [17]:
# Show only the leading rows: they are enough to see the NaNs written into the
# column at position 2, without asking Jupyter to render the whole frame.
train.head(10)

Unnamed: 0,Var0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8
0,2,3,0.0,0,0,0,0,2,2
1,2,3,,0,0,0,0,1,1
2,2,3,,0,0,0,0,0,0
3,2,3,,0,0,0,2,2,2
4,2,3,,0,0,0,2,1,1
5,2,3,,0,0,0,2,0,0
6,2,3,,0,0,0,1,2,1
7,2,3,,0,0,0,1,1,1
8,2,3,,0,0,0,1,0,0
9,2,3,,0,0,1,0,2,4


In [18]:
# Repeat the same query with the model that was refit on the data containing NaNs.
y_pred = model.predict(test, qry_code=code, **pred_parameters)

# Score against `y_true`, which was extracted from the test split in the evaluation section.
obs = f1_score(y_true, y_pred, average='macro')


SETTINGS.PY: I AM READING A SINGLE QUERY CODE, I.E: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Predicting q_code: [0, 0, 0, 0, 0, 0, 0, 0, 1]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 7] [6, 8]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_sta

In [19]:
# Macro F1 of the model retrained on the data with missing values.
# NOTE(review): the estimators printed during prediction show random_state=None, so
# this score (and the earlier one) may differ between runs; fix a seed before
# comparing the two numbers.
obs

0.7699549860818815