# Mercs on StarAI

Renewed version of the eternal experiment on the StarAI benchmark. This experiment assesses the multi-directionality of Mercs and compares it to Bayesian networks (BNs). Ultimately, it shows that, in terms of flexibility, Mercs can achieve essentially equivalent performance (at least if we consider MAP inference) to classical BNs. The recurring criticism is, of course, that these BN approaches are not state of the art. But that does not matter here. What matters is that such a multi-directional ensemble can, in fact, be used in this kind of changing prediction task and still keep up.

# Preliminaries

## Imports

In [1]:
import pandas as pd
import os
import numpy as np
import aaai20
import mercs

from os.path import dirname
from aaai20.io import filename_dataset, filename_query, filename_model, experiment_suffix, filename_results
from aaai20.exp import collect_results, process_outcomes, save_outcome

from sklearn.metrics import f1_score

RANDOM_STATE = 42  # NOTE(review): not referenced in the visible cells; fit_config hard-codes 42 separately

KeyboardInterrupt: 

In [None]:
import mercs

In [None]:
from mercs.core import Mercs
from mercs.utils.encoding import query_to_code, code_to_query, encode_attribute, get_att

In [None]:
import dill as pkl 

In [None]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)  # silence this warning notebook-wide; presumably raised by the f1_score calls in predict_mercs (confirm)

# Functions

## Fit

In [None]:
fit_config = {
    # Default settings; passed as keyword arguments to
    # flow_build_mercs_model in the test cell of this notebook.
    "random_state": 42,
    "classifier_algorithm": "DT",
    "regressor_algorithm": "DT",
    "clf_criterion": "gini",
    "rgr_criterion": "mse",
    "selection_algorithm": "base",
    "nb_targets": 1,
    "fraction_missing": 0.2,
    "nb_iterations": 1,
    "min_samples_leaf": 2,
    "max_depth": None,
}

In [None]:
def fit_mercs(dataset, **fit_config):
    """Fit a Mercs model on the training split of ``dataset``.

    The training CSV (step 2, "train" suffix) is read without a header row
    and cast to float. Every column is declared nominal when fitting.

    Args:
        dataset: Dataset identifier handed to ``filename_dataset``.
        **fit_config: Keyword arguments forwarded unchanged to ``Mercs``.

    Returns:
        The fitted ``Mercs`` instance.
    """
    train_path = filename_dataset(dataset, step=2, suffix="train", extension="csv")
    frame = pd.read_csv(train_path, header=None, index_col=None)
    train_matrix = frame.values.astype(float)

    # All attributes are nominal in this benchmark, so mark every column index.
    n_columns = train_matrix.shape[1]
    all_columns = set(range(n_columns))

    model = Mercs(**fit_config)
    model.fit(train_matrix, nominal_attributes=all_columns)
    return model

In [None]:
def save_mercs(dataset, classifier, keyword="default"):
    """Serialize ``classifier`` to the model file for ``dataset``.

    Args:
        dataset: Dataset identifier handed to ``filename_model``.
        classifier: Object to serialize with ``pkl.dump``.
        keyword: Tag appended to the "mercs-" suffix of the model filename.

    Returns:
        None.
    """
    model_path = filename_model(dataset, suffix=f"mercs-{keyword}")

    with open(model_path, "wb") as handle:
        pkl.dump(classifier, handle)

def load_mercs(dataset, keyword="default"):
    """Load the serialized model stored for ``dataset`` under ``keyword``.

    Args:
        dataset: Dataset identifier handed to ``filename_model``.
        keyword: Tag appended to the "mercs-" suffix of the model filename.

    Returns:
        The object read back with ``pkl.load``.
    """
    model_path = filename_model(dataset, suffix=f"mercs-{keyword}")

    # NOTE: unpickling can execute arbitrary code; only load trusted model files.
    with open(model_path, "rb") as handle:
        return pkl.load(handle)

## Predict

In [None]:
predict_config = {"prediction_algorithm": "mi"}


def predict_mercs(dataset, classifier, q_idx=None, **predict_config):
    """Run the stored queries of ``dataset`` through ``classifier`` and score them.

    For every selected query the target columns of the test set are copied
    as ground truth, blanked out with NaN, predicted by the classifier, and
    scored with micro- and macro-averaged F1.

    Args:
        dataset: Dataset identifier handed to ``filename_query`` and
            ``filename_dataset``.
        classifier: Fitted model exposing ``predict(X, q_code=..., **kw)`` and
            a ``model_data`` mapping with an ``'inf_time'`` entry (and
            optionally ``'ind_time'``).
        q_idx: Which queries to run. ``None`` runs all of them, an integer
            runs that single query index, and any other iterable runs the
            query indices it contains.
        **predict_config: Keyword arguments forwarded to ``classifier.predict``.

    Returns:
        A 4-tuple ``(q_idx_return, q_codes_return, results, timings)``:
        the list of evaluated query indices, their codes stacked with
        ``np.vstack``, a dict with ``f1_micro`` / ``f1_macro`` lists, and a
        dict with ``ind_time`` and the per-query ``inf_time`` list.

    Raises:
        ValueError: If ``q_idx`` selects none of the available queries.
    """
    f1_micro = []
    f1_macro = []
    q_codes_return = []
    q_idx_return = []
    inf_time = []

    # Load queries
    fn_qry = filename_query(dataset, suffix="default")
    q_codes = np.load(fn_qry)

    # Load data once; every query works on its own copy of this matrix.
    fn_test = filename_dataset(dataset, step=2, suffix="test", extension="csv")
    df = pd.read_csv(fn_test, header=None, index_col=None)
    test_base = df.values.astype(float)

    # Load ind_time
    ind_time = classifier.model_data.get("ind_time")

    # Build the query filter. isinstance checks (rather than an exact-type
    # lookup) also accept numpy integers, tuples, sets and ranges.
    if q_idx is None:

        def selected(idx):
            return True

    elif isinstance(q_idx, (int, np.integer)):

        def selected(idx):
            return idx == q_idx

    else:
        wanted = set(q_idx)  # built once for O(1) membership tests

        def selected(idx):
            return idx in wanted

    for query_idx, q_code in enumerate(q_codes):
        if not selected(query_idx):
            continue

        q_idx_return.append(query_idx)
        q_codes_return.append(q_code)

        # Preprocessing
        test = test_base.copy()
        target_ids = get_att(q_code, kind="targ").tolist()
        y_true = test[:, target_ids].copy()  # Extract ground truth
        test[:, target_ids] = np.nan  # The answers must never reach the algorithm

        # Predictions and evaluation
        y_pred = classifier.predict(test, q_code=q_code, **predict_config)
        inf_time.append(classifier.model_data["inf_time"])

        f1_micro.append(f1_score(y_true, y_pred, average="micro"))
        f1_macro.append(f1_score(y_true, y_pred, average="macro"))

    if not q_codes_return:
        # np.vstack would fail on an empty list with an unhelpful message.
        raise ValueError(
            "No queries selected: q_idx={!r} matches none of the {} available queries".format(
                q_idx, len(q_codes)
            )
        )

    q_codes_return = np.vstack(q_codes_return)
    results = dict(f1_micro=f1_micro, f1_macro=f1_macro)
    timings = dict(ind_time=ind_time, inf_time=inf_time)

    return q_idx_return, q_codes_return, results, timings

# Flows

## Model-Building

In [None]:
def flow_build_mercs_model(dataset, model_keyword="default", **fit_config):
    """Fit a Mercs model for ``dataset`` and store it on disk.

    Args:
        dataset: Dataset identifier passed to ``fit_mercs`` and ``save_mercs``.
        model_keyword: Keyword under which ``save_mercs`` stores the model.
        **fit_config: Overrides for the built-in default fit settings.

    Returns:
        None.
    """
    # Built-in defaults; anything supplied by the caller wins.
    settings = {
        "random_state": 42,
        "classifier_algorithm": "DT",
        "regressor_algorithm": "DT",
        "clf_criterion": "gini",
        "rgr_criterion": "mse",
        "selection_algorithm": "base",
        "nb_targets": 1,
        "fraction_missing": 0.2,
        "nb_iterations": 1,
        "min_samples_leaf": 2,
        "max_depth": None,
    }
    settings.update(fit_config)

    # Build the model, then persist it.
    model = fit_mercs(dataset, **settings)
    save_mercs(dataset, model, keyword=model_keyword)

### Test

In [None]:
flow_build_mercs_model("nltcs", model_keyword="default", **fit_config)  # test run: fit and save the "default" model for the "nltcs" dataset

## Predictions

In [None]:
def flow_mercs_predict(
    dataset,
    exp_idx=0,
    qry_idx=None,
    model_keyword="default",
    exp_keyword=None,
    **predict_config
):
    """Load a stored Mercs model, run the queries, and write the results CSV.

    Args:
        dataset: Dataset identifier.
        exp_idx: Experiment index forwarded to ``collect_results`` and
            ``filename_results``.
        qry_idx: Query selection forwarded to ``predict_mercs`` as ``q_idx``
            and to ``experiment_suffix``.
        model_keyword: Keyword of the stored model to load via ``load_mercs``.
        exp_keyword: Identifier for the results. When ``None`` it defaults to
            ``"mercs-<prediction_algorithm>"``.
        **predict_config: Overrides for the default prediction settings;
            forwarded to ``predict_mercs``.

    Returns:
        The loaded classifier.
    """
    default_predict_config = dict(prediction_algorithm="mi")

    predict_config = {**default_predict_config, **predict_config}

    # Load model
    clf = load_mercs(dataset, keyword=model_keyword)

    # Do predictions. predict_mercs returns four values; unpacking only three
    # raised ValueError on every call. The timings are not persisted here.
    q_idx_return, q_codes_return, results, _timings = predict_mercs(
        dataset, clf, q_idx=qry_idx, **predict_config
    )

    # Get and save results
    if exp_keyword is None:
        exp_keyword = "mercs-{}".format(predict_config.get("prediction_algorithm", "UNKNOWN"))

    df = collect_results(
        dataset,
        q_codes_return,
        results,
        q_idx=q_idx_return,
        identifier=exp_keyword,
        exp_idx=exp_idx,
    )

    suffix = experiment_suffix(qry_idx)
    fn_res = filename_results(exp_keyword, suffix=suffix, exp_idx=exp_idx)

    df.to_csv(fn_res)

    return clf
    

In [None]:
clf = flow_mercs_predict("voting", exp_keyword="mercs-rw", prediction_algorithm="rw", max_steps=8, model_keyword="md4-base")

df.head()  # NOTE(review): no `df` is assigned at notebook level in the visible cells (flow_mercs_predict returns the classifier, not the results frame); confirm where `df` should come from

In [None]:
clf.model_data['ind_time']  # inspect the 'ind_time' entry of the loaded model; presumably the induction (training) time (confirm)

In [None]:
a = 5  # scratch value used by the next cell

In [None]:
a = 3 if a is None else a  # scratch check of the default-if-None idiom: `a` is replaced by 3 only when it is None
a