# Modulo Exp Def

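Final notebook that runs the Modulo experiment: for each dataset it fits a Modulo (MERCS) model on the training split, answers the stored queries with the `mi`, `mrai`, `it` and `rw` prediction algorithms, and saves the resulting F1 scores.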

# Preliminaries

## Imports

In [1]:
import pandas as pd
import arff
import os
import numpy as np
import aaai20
#import PxW

from xgboost import XGBClassifier, XGBRegressor, XGBRFRegressor

from os.path import dirname
from aaai20.io import filename_dataset, filename_query, filename_results
from aaai20.exp import collect_results
from aaai20.wrangling import arff_to_df
from aaai20.exp import collect_results, process_outcomes, save_outcome

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score


# NOTE: this value is reassigned to 98 in the "Experiment" configuration cell
# further down, which runs before `run_experiment` reads it.
RANDOM_STATE = 42

In [2]:
from mercs.core import Mercs as Modulo
from mercs.utils.encoding import query_to_code, code_to_query, encode_attribute

In [3]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
# Silence sklearn's UndefinedMetricWarning for the rest of the session.
# NOTE(review): presumably triggered by the f1_score calls in predict_modulo
# when a class is never predicted -- confirm before relying on the scores.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [4]:
from xgboost import XGBClassifier

In [5]:
import mercs

In [6]:
# Debugging probe: display the path of the mercs package the kernel imported.
mercs.__file__

'/cw/dtailocal/repos/mercs/src/mercs/__init__.py'

## Helpers

In [7]:
def fn_to_ok_df(filename, encode=True):
    """
    Load an ARFF file as a DataFrame and derive its query code.

    Parameters
    ----------
    filename :
        Path of the ARFF file; handed to `arff_to_df` unchanged.
    encode : bool, default True
        When True, every non-float column is replaced by its integer
        label-encoding (one `LabelEncoder` per column).

    Returns
    -------
    tuple
        `(df, qry)` when `encode` is False.
        `(df, qry, nominal, label_encoders)` when `encode` is True, where
        `nominal` is the array of labels of the encoded columns and
        `label_encoders` maps each of those labels to its fitted encoder.

    Notes
    -----
    The query code is computed before encoding, i.e. on the raw frame.
    """
    df = arff_to_df(filename, return_af=False, encode_nominal=False)
    qry = qry_from_df(df)

    if not encode:
        return df, qry

    # Take an explicit copy: the encoded values are written into this frame
    # column by column, and writing into the bare result of `select_dtypes`
    # leaves it ambiguous whether `df` is touched (and can raise
    # SettingWithCopyWarning).
    df_nominal = df.select_dtypes(exclude=['float']).copy()

    label_encoders = {}
    for c in df_nominal.columns:
        encoder = LabelEncoder()
        encoder.fit(df_nominal[c])
        df_nominal[c] = encoder.transform(df_nominal[c])
        label_encoders[c] = encoder

    # Write the encoded columns back into the full frame in one step.
    df[df_nominal.columns] = df_nominal
    nominal = df_nominal.columns.values

    return df, qry, nominal, label_encoders
    
def qry_from_df(df):
    """
    Build the query code implied by a DataFrame.

    The result holds one integer per column:

    *  1 for the last column (treated as the target),
    * -1 for every other column containing at least one missing value,
    *  0 for all remaining columns.

    Columns are addressed by position, not by label, so the frame may carry
    any column labels (the integers produced by `header=None`, attribute
    names, or a non-contiguous subset).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least one column.

    Returns
    -------
    numpy.ndarray
        Integer array of length `len(df.columns)`.

    Raises
    ------
    IndexError
        If `df` has no columns.
    """
    qry = np.zeros(len(df.columns), dtype=int)

    # Boolean mask over column positions: True where the column has a NaN.
    has_missing = df.isna().any().to_numpy(dtype=bool)

    qry[has_missing] = -1
    # Assigned last so the target stays 1 even when it contains missing values.
    qry[-1] = 1
    return qry

In [8]:
def detect_nominal(df):
    """
    Return the positions of all columns of `df` that are not float-typed.

    A column counts as nominal when `select_dtypes(exclude=['float'])` keeps
    it. The result is a list of 0-based column positions in column order.
    """
    non_float_labels = df.select_dtypes(exclude=['float']).columns

    positions = []
    for position, label in enumerate(df.columns):
        if label in non_float_labels:
            positions.append(position)

    return positions

## Functions

In [9]:
def fit_modulo(
    dataset,
    target_idx=-1,
    random_state=42,
    prediction_algorithm="mi",
    classifier_algorithm="DT",
    regressor_algorithm="DT",
    clf_criterion="gini",
    rgr_criterion="mse",
    selection_algorithm="base",
    nb_targets=1,
    fraction_missing=0.2,
    nb_iterations=1,
    min_samples_leaf=2,
    min_impurity_decrease=0.0,
    max_steps=8,
    max_depth=None,
    n_estimators=10,
):
    """
    Fit a Modulo model on the training split of `dataset`.

    The training data is read from the header-less CSV located by
    `filename_dataset(dataset, step=1, suffix="train", extension="csv")`.
    Every non-float column (see `detect_nominal`) plus the target column is
    declared nominal when fitting.

    Parameters
    ----------
    dataset :
        Dataset identifier understood by `filename_dataset`.
    target_idx : int, default -1
        Position of the target column; negative values count from the end.
    random_state, prediction_algorithm, classifier_algorithm,
    regressor_algorithm, clf_criterion, rgr_criterion, selection_algorithm,
    nb_targets, fraction_missing, nb_iterations, min_samples_leaf,
    min_impurity_decrease, max_steps, max_depth, n_estimators :
        Forwarded unchanged to the `Modulo` constructor.

    Returns
    -------
    The fitted `Modulo` instance.

    Raises
    ------
    IndexError
        If `target_idx` is out of range for the training data.
    """
    # Load the training data.
    fn_train = filename_dataset(dataset, step=1, suffix="train", extension="csv")
    df = pd.read_csv(fn_train, header=None, index_col=None)
    train = df.values

    nominal = detect_nominal(df)

    # Resolve a possibly negative index to its absolute column position.
    # Assumption: Last attribute is target.
    target_id = range(df.shape[1])[target_idx]
    # The target is always treated as nominal, whatever its dtype.
    nominal_ids = set(nominal) | {target_id}

    # Train
    clf = Modulo(
        random_state=random_state,
        nb_targets=nb_targets,
        classifier_algorithm=classifier_algorithm,
        regressor_algorithm=regressor_algorithm,
        prediction_algorithm=prediction_algorithm,
        clf_criterion=clf_criterion,
        rgr_criterion=rgr_criterion,
        selection_algorithm=selection_algorithm,
        fraction_missing=fraction_missing,
        nb_iterations=nb_iterations,
        min_samples_leaf=min_samples_leaf,
        min_impurity_decrease=min_impurity_decrease,
        max_depth=max_depth,
        max_steps=max_steps,
        n_estimators=n_estimators,
    )

    clf.fit(train, nominal_attributes=nominal_ids)

    return clf

In [10]:
def predict_modulo(dataset, classifier, target_idx=-1, prediction_algorithm=None, **prediction_kwargs):
    """
    Answer every stored query of `dataset` with `classifier` and score it.

    Query codes are loaded from `filename_query(dataset, suffix="default")`.
    For query number `i` the matching test data is read from the header-less
    CSV `filename_dataset(dataset, step=2, suffix='q_<i, 3 digits>')`. The
    target column is blanked out with NaN before prediction and the
    predictions are compared to the original target values.

    Parameters
    ----------
    dataset :
        Dataset identifier understood by `filename_query`/`filename_dataset`.
    classifier :
        Fitted model exposing `predict(X, q_code=..., ...)`.
    target_idx : int, default -1
        Position of the target column; negative values count from the end.
    prediction_algorithm : optional
        When given, passed to `classifier.predict` together with
        `prediction_kwargs`. When None, `predict` is called with the query
        code only and `prediction_kwargs` are not forwarded.
    **prediction_kwargs :
        Extra keyword arguments for `classifier.predict`.

    Returns
    -------
    (q_codes, result)
        `q_codes` is a copy of the loaded query codes; `result` is a dict
        with keys `'f1_micro'` and `'f1_macro'`, each a list holding one
        score per query.

    Raises
    ------
    AssertionError
        If a stored query code disagrees with the code derived from its
        data file.
    """
    f1_micro = []
    f1_macro = []

    # Load queries
    fn_qry = filename_query(dataset, suffix="default")
    q_codes = np.load(fn_qry)
    # Copy up front: the rows handed to `predict` below are views of q_codes.
    q_codes_return = q_codes.copy()

    for q_idx, q_code in enumerate(q_codes):
        fn = filename_dataset(dataset, step=2, suffix='q_{}'.format(str(q_idx).zfill(3)), extension='csv')
        df_qry = pd.read_csv(fn, header=None, index_col=None)

        # The missing-value pattern of the data must match the stored query.
        q_code_from_data = qry_from_df(df_qry)
        assert np.array_equal(q_code, q_code_from_data), (
            "Query {} of dataset {!r} is inconsistent: "
            "q_code from file: {}, q_code from data: {}".format(
                q_idx, dataset, q_code, q_code_from_data
            )
        )

        # Resolve a possibly negative index. Assumption: last column is target.
        target_id = range(df_qry.shape[1])[target_idx]

        test = df_qry.values
        y_true = test[:, target_id].copy()
        if np.issubdtype(test.dtype, np.integer):
            # An integer array cannot hold NaN; without this cast the
            # assignment below raises ValueError for all-integer query files.
            test = test.astype(float)
        test[:, target_id] = np.nan

        if prediction_algorithm is None:
            y_pred = classifier.predict(test, q_code=q_code)
        else:
            y_pred = classifier.predict(test, q_code=q_code, prediction_algorithm=prediction_algorithm, **prediction_kwargs)

        f1_micro.append(f1_score(y_true, y_pred, average='micro'))
        f1_macro.append(f1_score(y_true, y_pred, average='macro'))

    result = {'f1_micro': f1_micro, 'f1_macro': f1_macro}

    return q_codes_return, result

In [11]:
def run_experiment(ds):
    """
    Fit one Modulo model on dataset `ds` and evaluate four prediction algorithms.

    The model settings come from the module-level constants RANDOM_STATE,
    ITERATIONS, FRACTION_MISSING and MAX_DEPTH, which are read at call time.
    The query codes returned by the first prediction run are reused when
    collecting the results of all runs.

    Returns the collected results as a tuple `(mi, mrai, it, rw)`.
    """
    clf = fit_modulo(
        ds,
        target_idx=-1,
        random_state=RANDOM_STATE,
        classifier_algorithm="DT",
        regressor_algorithm="DT",
        nb_targets=1,
        selection_algorithm="random",
        nb_iterations=ITERATIONS,
        fraction_missing=FRACTION_MISSING,
        max_depth=MAX_DEPTH,
    )

    # (prediction algorithm, extra predict kwargs, identifier for collect_results)
    # NOTE(review): the 'mi' run is tagged 'sklearn' rather than 'mi' -- confirm
    # this is intended.
    plan = (
        ('mi', {}, 'sklearn'),
        ('mrai', {}, 'mrai'),
        ('it', {'max_steps': 10, 'stepsize': 0.2}, 'it'),
        ('rw', {'max_steps': 10, 'nb_walks': 40}, 'rw'),
    )

    q_codes = None
    collected = []
    for algorithm, extra_kwargs, identifier in plan:
        codes, results = predict_modulo(ds, clf, prediction_algorithm=algorithm, **extra_kwargs)
        if q_codes is None:
            # Only the codes of the first run are kept.
            q_codes = codes
        collected.append(collect_results(ds, q_codes, results, identifier=identifier))
        print("{} done".format(algorithm))

    mi, mrai, it, rw = collected
    return mi, mrai, it, rw

# Experiment

In [12]:
# Settings read by run_experiment when it fits each model.
FRACTION_MISSING = [0.3]
ITERATIONS = 5
MAX_DEPTH = 8

# Replaces the RANDOM_STATE = 42 assigned in the imports cell.
RANDOM_STATE = 98

# Datasets the experiment is run on, one model per entry.
datasets = [
    'glass',
    'credit-g',
    'ionosphere',
    'lymph',
    'vehicle',
    'iris',
    'splice',
    'sonar',
    'vowel',
    'segment',
    'zoo',
    'heart-statlog',
    'waveform-5000',
    'kr-vs-kp',
    'diabetes',
    'letter',
    'balance-scale',
]

print(len(datasets))

 m_list = Parallel(n_jobs=n_jobs, verbose=verbose)(delayed(_learn_model)(*t, **k) for t, k in parameters)

17


In [13]:
from joblib import Parallel, delayed

# Run one experiment per dataset on 8 workers; `dfs` receives one
# run_experiment result per entry of `datasets`.
dfs = Parallel(n_jobs=8, verbose=51)(
    delayed(run_experiment)(name) for name in datasets
)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:   13.3s
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   50.1s
[Parallel(n_jobs=8)]: Done   3 out of  17 | elapsed:   59.8s remaining:  4.7min
[Parallel(n_jobs=8)]: Done   4 out of  17 | elapsed:  1.1min remaining:  3.4min
[Parallel(n_jobs=8)]: Done   5 out of  17 | elapsed:  1.3min remaining:  3.2min
[Parallel(n_jobs=8)]: Done   6 out of  17 | elapsed:  1.4min remaining:  2.5min
[Parallel(n_jobs=8)]: Done   7 out of  17 | elapsed:  1.6min remaining:  2.3min
[Parallel(n_jobs=8)]: Done   8 out of  17 | elapsed:  1.6min remaining:  1.8min
[Parallel(n_jobs=8)]: Done   9 out of  17 | elapsed:  1.6min remaining:  1.4min
[Parallel(n_jobs=8)]: Done  10 out of  17 | elapsed:  1.6min remaining:  1.1min
[Parallel(n_jobs=8)]: Done  11 out of  17 | elapsed:  1.6min remaining:   52.0s
[Parallel(n_jobs=8)]: Done  12 out of  17 | elapsed:  1.6min remaining:   39.8s
[Pa

KeyboardInterrupt: 

In [None]:
# One {dataset: outcome} mapping per prediction algorithm. Every dataset starts
# with an empty list, so a dataset without an entry in `dfs` keeps that default.
dfs_01, dfs_02, dfs_03, dfs_04 = [{k: [] for k in datasets} for _ in range(4)]

# Each element of `dfs` is the (mi, mrai, it, rw) tuple of one dataset.
for (mi, mrai, it, rw), ds in zip(dfs, datasets):
    dfs_01[ds], dfs_02[ds], dfs_03[ds], dfs_04[ds] = mi, mrai, it, rw

# Aggregate per algorithm and write one results CSV each.
outcomes_by_algo = {'mi': dfs_01, 'mrai': dfs_02, 'it': dfs_03, 'rw': dfs_04}
for algo, dataframes in outcomes_by_algo.items():
    df = process_outcomes(dataframes)

    fn = filename_results(exp_dname='mercs-vs-weka', exp_fname=algo)
    df.to_csv(fn, index=False)

In [None]:
# Debugging probe: display the path of the aaai20 package the kernel imported.
aaai20.__file__