# MERCS 101 - Lecture 03: Mix Classification & Regression

This is the third part of the tutorial, combining classification and regression

## Preliminaries

### External Imports

In [1]:
import numpy as np
import os
import sys

from sklearn.metrics import (mean_absolute_error,
                             mean_squared_error,
                             mean_squared_log_error,
                             f1_score)
import pandas as pd

  from numpy.core.umath_tests import inner1d


### MERCS imports

In [2]:
sys.path.insert(0, '..') # We add the parent dir to the path
from src.mercs.core import MERCS
from src.mercs.utils import *

import src.datasets as datasets

## Induction

### Importing Data

First, we import the fertility dataset.

In [3]:
train, test = datasets.load_fertility()

In [4]:
train.head()

Unnamed: 0,season,age,child_diseases,accident,surgical_intervention,high_fever,alco,smoking,h_seating,diagnosis
0,-0.33,0.69,0,1,1,0,0.8,0,0.88,1
1,-0.33,0.94,1,0,1,0,0.8,1,0.31,0
2,-0.33,0.5,1,0,0,0,1.0,-1,0.5,1
3,-0.33,0.75,0,1,1,0,1.0,-1,0.38,1
4,-0.33,0.67,1,1,0,0,0.8,-1,0.5,0


We observe that some attributes are nominal, whereas others appear numerical. MERCS can handle both, but of course has to somehow figure out which kind each attribute is.

This is, in general, very hard to do correctly and an genuine A.I. problem in its own right (type inference). 

So we won't get into that. **MERCS obeys a very, very simple policy**, i.e.: sklearn assumptions. Let us demonstrate.

In [25]:
model = MERCS()
model.fit(train)

is_nominal in this model is: [1 0 1 1 1 1 0 1 0 1]



Let us see what happened here. MERCS remembers some useful things about the datasets it encounters. Amongst which, the classlabels. Since a numeric type has no classlabels in the real sense of the word, MERCS uses a placeholder there.

Nevertheless, the classlabels-datastructure present in MERCS tells us all we need to know.

In [6]:
model.s['metadata']['clf_labels']

[['numeric'],
 ['numeric'],
 array([0., 1.]),
 array([0., 1.]),
 array([0., 1.]),
 array([-1.,  0.,  1.]),
 ['numeric'],
 array([-1.,  0.,  1.]),
 ['numeric'],
 array([0., 1.])]

So, what about the assumptions? Well, 2 things:

    1) They work
        Training went well, without any issues. This tells us that MERCS at least makes assumptions that ensure the component models can handle the data that they are fed and the outputs that they are expected to provide
    2) They're too simple
        The assumptions do not really correspond to reality, as we will see.
        
        
To see for ourselves how these assumptions compare to reality, let us simply look at reality. How many unique values are present in the DataFrame we provided?
  

In [7]:
train.nunique()

season                    3
age                      14
child_diseases            2
accident                  2
surgical_intervention     2
high_fever                3
alco                      5
smoking                   3
h_seating                13
diagnosis                 2
dtype: int64

It seems like MERCS made a mistake in the first attribute, `season`. MERCS thinks it is numeric, but it really appears more of a nominal attribute. All the rest seems to correspond.

How did this happen?

Well, MERCS knowns about numeric and nominal, and makes it decisions in utils.py, in the method `get_metadata_df`. 

An attribute is `numeric`

UNLESS:

    1) Its type is `int` (necessary for sklearn)
    2) It has 10 or less distinct values (this is a choice made by MERCS)

We can solve this by simple preprocessing and converting season to ints.

In [8]:
train['season'] = pd.factorize(train['season'])[0]
train.head()

Unnamed: 0,season,age,child_diseases,accident,surgical_intervention,high_fever,alco,smoking,h_seating,diagnosis
0,0,0.69,0,1,1,0,0.8,0,0.88,1
1,0,0.94,1,0,1,0,0.8,1,0.31,0
2,0,0.5,1,0,0,0,1.0,-1,0.5,1
3,0,0.75,0,1,1,0,1.0,-1,0.38,1
4,0,0.67,1,1,0,0,0.8,-1,0.5,0


In [9]:
train.dtypes

season                     int64
age                      float64
child_diseases             int64
accident                   int64
surgical_intervention      int64
high_fever                 int64
alco                     float64
smoking                    int64
h_seating                float64
diagnosis                  int64
dtype: object

### Preprocessing

Let us take this again from the top.

In [10]:
train, test = datasets.load_fertility()

train['season'] = pd.factorize(train['season'])[0]
test['season'] = pd.factorize(test['season'])[0]

In [11]:
train.head(13)

Unnamed: 0,season,age,child_diseases,accident,surgical_intervention,high_fever,alco,smoking,h_seating,diagnosis
0,0,0.69,0,1,1,0,0.8,0,0.88,1
1,0,0.94,1,0,1,0,0.8,1,0.31,0
2,0,0.5,1,0,0,0,1.0,-1,0.5,1
3,0,0.75,0,1,1,0,1.0,-1,0.38,1
4,0,0.67,1,1,0,0,0.8,-1,0.5,0
5,0,0.67,1,0,1,0,0.8,0,0.5,1
6,0,0.67,0,0,0,-1,0.8,-1,0.44,1
7,0,1.0,1,1,1,0,0.6,-1,0.38,1
8,1,0.64,0,0,1,0,0.8,-1,0.25,1
9,1,0.61,1,0,0,0,1.0,-1,0.25,1


### Training

In [12]:
model = MERCS()

In [13]:
ind_parameters = {'ind_type':           'RF',
                  'ind_n_estimators':   10,
                  'ind_max_depth':      4}

sel_parameters = {'sel_type':           'Base',
                  'sel_its':            8,
                  'sel_param':          2}

In [14]:
model.fit(train, **ind_parameters, **sel_parameters)

is_nominal in this model is: [1 0 1 1 1 1 0 1 0 1]



## Introspection

### Identification of types

MERCS makes some decisions regarding the attribute types automatically.

In [15]:
model.s['metadata']['clf_labels']

[array([0., 1., 2.]),
 ['numeric'],
 array([0., 1.]),
 array([0., 1.]),
 array([0., 1.]),
 array([-1.,  0.,  1.]),
 ['numeric'],
 array([-1.,  0.,  1.]),
 ['numeric'],
 array([0., 1.])]

In [16]:
model.s['metadata']['nb_values']

array([ 3, 14,  2,  2,  2,  3,  5,  3, 13,  2])

## Inference

### Prediction

In [17]:
code = [0]*model.s['metadata']['nb_atts']
code[-1] = 1
print(code)

target_boolean = np.array(code) == 1
y_true = test[test.columns.values[target_boolean]].values

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [18]:
pred_parameters = {'pred_type':     'IT',
                   'pred_param':    0.1,
                   'pred_its':      4}

In [19]:
y_pred = model.predict(test,
                       **pred_parameters,
                       qry_code=code)

SETTINGS.PY: I AM READING A SINGLE QUERY CODE, I.E: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
Predicting q_code: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [0, 1, 2, 3, 4, 5, 6, 7, 8] [9]
Model under consideration
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_

In [20]:
y_pred

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]])

### Evaluation 

In [21]:
clf_labels_targets = [model.s['metadata']['clf_labels'][t]
                      for t, check in enumerate(target_boolean)
                      if check]

clf_labels_targets 

[array([0., 1.])]

In [22]:
def verify_numeric_prediction(y_true, y_pred):
    obs_1 = mean_absolute_error(y_true, y_pred)
    obs_2 = mean_squared_error(y_true, y_pred)
    obs_3 = mean_squared_log_error(y_true, y_pred)

    obs = [obs_1, obs_2, obs_3]

    for o in obs:
        assert isinstance(o, (int, float))
        assert 0 <= o 
    return

In [23]:
def verify_nominal_prediction(y_true, y_pred):
    obs = f1_score(y_true, y_pred, average='macro')

    assert isinstance(obs, (int, float))
    assert 0 <= obs <= 1
    return

In [24]:
# Ensure every target is nominal
for t_idx, clf_labels_targ in clf_labels_targets:
    single_y_true = y_true[:][t_idx]
    single_y_pred = y_pred[:][t_idx]
    
    if isinstance(clf_labels_targ, np.ndarray):
        # Nominal target
        verify_nominal_prediction(single_y_true, single_y_pred)
    elif isinstance(clf_labels_targ, list):
        # Numeric target
        assert clf_labels_targ == ['numeric']
        
        verify_numeric_prediction(single_y_pred, single_y_pred)
    else:
        msg = """clf_labels of MERCS are either:\n
        np.ndarray, shape (classlabels,)\n
        \t for nominal attributes\n
        list, shape (1,)\n
        \t ['numeric] for numeric attributes \n"""
        raise TypeError(msg)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices