# MERCS Exp Def

MERCS Experiment

# Preliminaries

## Imports

In [1]:
import pandas as pd
import arff
import os
import numpy as np
import aaai20
import PxW

from os.path import dirname

from aaai20.io import filename_dataset, filename_query
from aaai20.exp import collect_results
from aaai20.wrangling import arff_to_df
from aaai20.exp import collect_results, process_outcomes, save_outcome
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

from modulo.utils.encoding import query_to_code, code_to_query, encode_attribute

RANDOM_STATE = 42

In [2]:
from mercs.core import MERCS

## Helpers

In [3]:
def fn_to_ok_df(filename, return_qry=True, return_nominal=False):
    """
    Convenience function. Preprocess so its ready for sklearn.
    """
    
    df = arff_to_df(filename, return_af=False, encode_nominal=True)
    
    if return_qry:
        qry = qry_from_df(df)

    """
    df_nominal = df.select_dtypes(exclude=['number'])
    df_nominal = df_nominal.apply(LabelEncoder().fit_transform)
    df[df_nominal.columns] = df_nominal
    """
    
    nominal = np.where(df.dtypes==int)[0].tolist()
    
    c1 = return_qry
    c2 = return_nominal
    
    if c1 and c2:
        return df, qry, nominal
    elif c1 and not c2:
        return df, qry
    elif not c1 and c2:
        return df, nominal
    else:
        return df
    
def qry_from_df(df):
    qry = np.zeros(len(df.columns), dtype=int)
    miss_ids = df.columns[df.isna().any()].tolist()
    targ_ids = df.columns[-1]
    
    qry[miss_ids] = -1
    qry[targ_ids] = 1
    return qry

## Functions

In [4]:
def fit_mercs(dataset,
              target_idx=-1,
              max_depth=8,
              min_samples_leaf=10,
              selection_algorithm="base",):
    # Preprocess
    fn_train = filename_dataset(dataset, step=1, suffix='train')

    df, nominal = fn_to_ok_df(fn_train, return_qry=False, return_nominal=True)
    train = df.values
    target_id = list(range(df.shape[1]))[target_idx] # Assumption: Last attribute is target

    nominal_ids = set(list(nominal) + [target_id])

    # Train
    clf = MERCS()
    clf.fit(df,
            ind_max_depth=max_depth,
            ind_min_samples_leaf=min_samples_leaf,
            sel_algo=selection_algorithm,)
    return clf

In [5]:
def predict_mercs(dataset,
                  classifier,
                  target_idx=-1,
                  prediction_algorithm='mi',
                  prediction_iterations=0.1,
                  prediction_parameter=0.95):
    result = []
    
    # Load queries
    fn_qry = filename_query(dataset, suffix="default")
    q_codes = np.load(fn_qry)
    
    
    for q_idx, q_code in enumerate(q_codes):
        fn = filename_dataset(dataset, step=2, suffix='q_{}'.format(str(q_idx).zfill(3)))
        
        df_qry, q_code_2 = fn_to_ok_df(fn, return_qry=True)
        target_id = list(range(df_qry.shape[1]))[target_idx] # Assumption: Last
        
        msg = """
        q_code from file: {}
        q_code from data: {}
        """.format(q_code, q_code_2)
        #print(msg)
        
        assert(np.array_equal(q_code, q_code_2))
        
        test = df_qry.values
        y_true = test[:,target_id].copy()
        y_true = y_true.astype(int)
        
        df_qry.iloc[:, target_id] = np.nan
        
        # Predict
        y_pred = classifier.predict(df_qry,
                                    qry_code=q_code,
                                    pred_algo=prediction_algorithm,
                                    pred_param=prediction_parameter,
                                    pred_its=prediction_iterations)
        
        y_pred = y_pred.astype(int).ravel()
        
        f1 = f1_score(y_true, y_pred, average='micro')
        result.append(f1)
        
    return q_codes, result

# Experiment 01 - Baseline

In [6]:
#FRACTION_MISSING = 0.3
#ITERATIONS = 5
MAX_DEPTH = 8
MIN_SAMPLES_LEAF = 10

datasets = ['iris',
            'balance-scale',
            'heart-statlog',
            'glass',
            'lymph',
            'diabetes',
            'vehicle',
            'ionosphere',
            'vowel']

print(len(datasets))

9


In [7]:
dataframes = {k:[] for k in datasets}

for ds in datasets:
    print(ds)
    clf = fit_mercs(ds,
                    target_idx=-1,
                    selection_algorithm="Base",
                    min_samples_leaf=MIN_SAMPLES_LEAF,
                    max_depth=MAX_DEPTH)
    
    q_codes, results = predict_mercs(ds,
                                     clf,
                                     prediction_algorithm='mi',
                                     prediction_iterations=0.1,
                                     prediction_parameter=0.95)
    
    dataframes[ds] = collect_results(ds, q_codes, results, algorithm='mercs-mi')
    
df = process_outcomes(dataframes)
save_outcome(df, filename='mercs-mi')

iris
4
[0, 1, 2, 3, 4]


TypeError: '(slice(None, None, None), [0, 1, 2, 3, 4])' is an invalid key

# Experiment 02 - SL

In [None]:
dataframes_SL = {k:[] for k in datasets}

for ds in datasets:
    print(ds)
    clf = fit_mercs(ds,
                    target_idx=-1,
                    selection_algorithm="Base",
                    min_samples_leaf=MIN_SAMPLES_LEAF,
                    max_depth=MAX_DEPTH)
    
    q_codes, results = predict_mercs(ds,
                                     clf,
                                     prediction_algorithm='MAFI',
                                     prediction_iterations=0.1,
                                     prediction_parameter=0.95)
    
    dataframes[ds] = collect_results(ds, q_codes, results, algorithm='mercs-mrai')
    
df = process_outcomes(dataframes)
save_outcome(df, filename='mercs-mrai')

# Experiment 03 - ML

In [None]:
dataframes_ML = {k:[] for k in datasets}

for ds in datasets:
    print(ds)
    clf = fit_modulo(ds,
                     target_idx=-1,
                     random_state=RANDOM_STATE,
                     prediction_algorithm='it',
                     clf_criterion="entropy",
                     rgr_criterion="mae",
                     selection_algorithm="random",
                     nb_iterations=2,
                     fraction_missing=[0, 0.1, 0.3],
                     min_samples_leaf=10,
                     max_steps=3)
    
    q_codes, results = predict_modulo(ds, clf)
    
    dataframes_ML[ds] = collect_results(ds, q_codes, results, algorithm='it')
    
df = process_outcomes(dataframes_ML)
save_outcome(df, filename='ML')