# Modulo Exp Def

This notebook runs the final Modulo experiment. It is set up so that the results are paper-ready.

# Preliminaries

## Imports

In [1]:
import pandas as pd
import arff
import os
import numpy as np
import aaai20
import PxW

from os.path import dirname
from aaai20.io import filename_dataset, filename_query
from aaai20.exp import collect_results
from aaai20.wrangling import arff_to_df
from aaai20.exp import collect_results, process_outcomes, save_outcome
from sklearn.model_selection import train_test_split

from modulo.core import Modulo
from modulo.utils.encoding import query_to_code, code_to_query, encode_attribute

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

RANDOM_STATE = 42

In [2]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

## Helpers

In [3]:
def fn_to_ok_df(filename, encode=True):
    """
    Convenience function. Load an ARFF file and preprocess it so that
    it is ready for sklearn.

    Parameters
    ----------
    filename:
        Location of the ARFF file; handed to `arff_to_df` unchanged.
    encode: bool, default True
        When True, every non-float column is replaced by the integer
        codes of a `LabelEncoder` fitted on that column.

    Returns
    -------
    (df, qry)
        When `encode` is False.
    (df, qry, nominal, label_encoders)
        When `encode` is True. `nominal` is the array of labels of the
        encoded columns and `label_encoders` maps each of those labels
        to its fitted `LabelEncoder`.

    Note that the query code is derived before any encoding happens.
    """

    df = arff_to_df(filename, return_af=False, encode_nominal=False)
    qry = qry_from_df(df)

    if not encode:
        return df, qry

    # Encode an explicit copy. Writing into the frame returned by
    # `select_dtypes` directly is a write on a derived frame and can
    # trigger pandas' SettingWithCopyWarning.
    df_nominal = df.select_dtypes(exclude=['float']).copy()

    label_encoders = {}
    for c in df_nominal.columns:
        encoder = LabelEncoder()
        df_nominal[c] = encoder.fit_transform(df_nominal[c])
        label_encoders[c] = encoder

    # Write the encoded columns back into the full frame.
    df[df_nominal.columns] = df_nominal
    nominal = df_nominal.columns.values

    return df, qry, nominal, label_encoders
    
def qry_from_df(df):
    """
    Derive a query code from the contents of a DataFrame.

    The code holds one integer per column, in column order:

    * -1 for a column that contains at least one missing value,
    *  1 for the last column (assumed to be the target),
    *  0 for every other column.

    The target marker is written last, so the last column is coded 1
    even when it contains missing values.

    Returns
    -------
    np.ndarray of dtype int and length `len(df.columns)`.

    Raises
    ------
    IndexError
        If `df` has no columns.
    """
    qry = np.zeros(len(df.columns), dtype=int)

    # Address columns by position, not by label. Labels are only valid
    # array indices for headerless frames (labels 0..n-1); using them
    # breaks as soon as the columns carry names.
    has_missing = df.isna().any().to_numpy(dtype=bool)

    qry[has_missing] = -1
    qry[-1] = 1
    return qry

In [4]:
def detect_nominal(df):
    """
    Return the positions of all columns of `df` that are not float-typed.

    Columns are selected with `select_dtypes(exclude=['float'])`; the
    result is a list of positional indices in column order.
    """
    non_float_columns = df.select_dtypes(exclude=['float']).columns

    positions = []
    for position, column in enumerate(df.columns):
        if column in non_float_columns:
            positions.append(position)

    return positions

## Functions

In [5]:
def fit_modulo(dataset,
               target_idx=-1,
               random_state=42,
               prediction_algorithm='mi',
               clf_criterion="gini",
               rgr_criterion="mse",
               selection_algorithm="base",
               fraction_missing=0.2,
               nb_iterations=1,
               min_samples_leaf=2,
               min_impurity_decrease=0.,
               max_steps=8,
               max_depth=None):
    """
    Fit a Modulo model on the training split of `dataset`.

    The training data is read from the headerless CSV file located via
    `filename_dataset(dataset, step=1, suffix='train', extension='csv')`.
    Every non-float column (see `detect_nominal`) plus the target column
    is declared nominal when fitting.

    Parameters
    ----------
    dataset:
        Dataset name used to locate the training file.
    target_idx: int, default -1
        Position of the target column; negative values count from the
        end. The default assumes the last attribute is the target.
    random_state, prediction_algorithm, clf_criterion, rgr_criterion,
    selection_algorithm, fraction_missing, nb_iterations,
    min_samples_leaf, min_impurity_decrease, max_steps, max_depth:
        Passed unchanged to the `Modulo` constructor.

    Returns
    -------
    The fitted `Modulo` instance.

    Raises
    ------
    IndexError
        If `target_idx` is out of range for the number of columns.
    """

    # Preliminaries
    fn_train = filename_dataset(dataset, step=1, suffix='train', extension='csv')
    df = pd.read_csv(fn_train, header=None, index_col=None)
    train = df.values

    # Resolve a possibly negative target index to its absolute position.
    target_id = range(df.shape[1])[target_idx]

    # The target is always treated as nominal, whatever its dtype.
    nominal_ids = set(detect_nominal(df)) | {target_id}

    # Train
    clf = Modulo(
        random_state=random_state,
        prediction_algorithm=prediction_algorithm,
        clf_criterion=clf_criterion,
        rgr_criterion=rgr_criterion,
        selection_algorithm=selection_algorithm,
        fraction_missing=fraction_missing,
        nb_iterations=nb_iterations,
        min_samples_leaf=min_samples_leaf,
        min_impurity_decrease=min_impurity_decrease,
        max_depth=max_depth,
        max_steps=max_steps,
    )

    clf.fit(train, nominal_attributes=nominal_ids)

    return clf

In [6]:
def predict_modulo(dataset, classifier, target_idx=-1, prediction_algorithm=None, **prediction_kwargs):
    """
    Evaluate a fitted classifier on every stored query of `dataset`.

    The query codes are loaded from the file given by
    `filename_query(dataset, suffix="default")`. For each query, the
    matching headerless CSV file (step 2, suffix `q_<zero-padded index>`)
    is read, the target column is blanked out with NaN, and the
    classifier's predictions are scored against the original target
    values with the macro-averaged F1 score.

    Parameters
    ----------
    dataset:
        Dataset name used to locate the query and data files.
    classifier:
        Fitted model exposing `predict(test, q_code=..., ...)`.
    target_idx: int, default -1
        Position of the target column; negative values count from the
        end. The default assumes the last attribute is the target.
    prediction_algorithm: optional
        When given, it is forwarded to `classifier.predict` together
        with `prediction_kwargs`. When None, `predict` is called with
        the query code only and `prediction_kwargs` is not forwarded.
    **prediction_kwargs:
        Extra keyword arguments for `classifier.predict`.

    Returns
    -------
    (q_codes, result)
        The loaded query codes and the list of F1 scores, one per query.

    Raises
    ------
    AssertionError
        If a stored query code does not match the code derived from the
        corresponding data file.
    """
    result = []

    # Load queries
    fn_qry = filename_query(dataset, suffix="default")
    q_codes = np.load(fn_qry)

    for q_idx, q_code in enumerate(q_codes):
        fn = filename_dataset(dataset, step=2, suffix='q_{}'.format(str(q_idx).zfill(3)), extension='csv')
        df_qry = pd.read_csv(fn, header=None, index_col=None)

        # Sanity check: the stored code must agree with the data on disk.
        # Raised explicitly (not via `assert`) so that the check survives
        # `python -O`; the exception type is kept for existing callers.
        q_code_2 = qry_from_df(df_qry)
        if not np.array_equal(q_code, q_code_2):
            msg = (
                "Query {} of dataset {!r} is inconsistent: "
                "q_code from file: {}, q_code from data: {}"
            ).format(q_idx, dataset, q_code, q_code_2)
            raise AssertionError(msg)

        # Resolve a possibly negative target index to its absolute position.
        target_id = range(df_qry.shape[1])[target_idx]

        test = df_qry.values
        # Copy the ground truth before it is blanked out below.
        y_true = test[:, target_id].copy()
        test[:, target_id] = np.nan

        if prediction_algorithm is None:
            y_pred = classifier.predict(test, q_code=q_code)
        else:
            y_pred = classifier.predict(test, q_code=q_code, prediction_algorithm=prediction_algorithm, **prediction_kwargs)

        f1 = f1_score(y_true, y_pred, average='macro')
        result.append(f1)

    return q_codes, result

# Experiment

In [7]:
# Settings handed to `fit_modulo` for every dataset in the experiment.
FRACTION_MISSING = [0.1, 0.3]
ITERATIONS = 1
MAX_DEPTH = 16
MIN_SAMPLES_LEAF = 10

# Names of the datasets the experiment iterates over, in run order.
datasets = [
    'glass', 'credit-g', 'ionosphere', 'lymph',
    'vehicle', 'iris', 'splice', 'sonar',
    'vowel', 'segment', 'zoo', 'heart-statlog',
    'waveform-5000', 'kr-vs-kp', 'diabetes', 'letter',
    'balance-scale',
]

# For a quick single-dataset run, override with: datasets = ['vehicle']

print(len(datasets))

17


In [8]:
# One result container per prediction algorithm, keyed by dataset name.
dfs_01 = {name: [] for name in datasets}
dfs_02 = {name: [] for name in datasets}
dfs_03 = {name: [] for name in datasets}

# Each entry: (container, prediction algorithm, label given to
# `collect_results`, extra keyword arguments for prediction).
# NOTE(review): the 'mi' run is labelled 'sklearn' -- confirm this is intended.
prediction_runs = (
    (dfs_01, 'mi', 'sklearn', {}),
    (dfs_02, 'mrai', 'mrai', {}),
    (dfs_03, 'it', 'it', {'max_steps': 4}),
)

for ds in datasets:
    print(ds)

    # Fit once per dataset; the same model serves all prediction runs.
    clf = fit_modulo(
        ds,
        target_idx=-1,
        random_state=RANDOM_STATE,
        clf_criterion="gini",
        rgr_criterion="friedman_mse",
        selection_algorithm="random",
        nb_iterations=ITERATIONS,
        fraction_missing=FRACTION_MISSING,
        min_samples_leaf=MIN_SAMPLES_LEAF,
        max_depth=MAX_DEPTH,
    )

    for container, predictor, label, extra_kwargs in prediction_runs:
        q_codes, results = predict_modulo(ds, clf, prediction_algorithm=predictor, **extra_kwargs)
        container[ds] = collect_results(ds, q_codes, results, algorithm=label)
        print("{} done".format(predictor))

# Aggregate the per-dataset results and save one outcome per algorithm.
for dfs, algo in zip((dfs_01, dfs_02, dfs_03), ('mi', 'mrai', 'it')):
    df = process_outcomes(dfs)
    save_outcome(df, filename=algo)

glass
mi done
mrai done
it done
credit-g
mi done
mrai done
it done
ionosphere
mi done
mrai done
it done
lymph
mi done
mrai done
it done
vehicle
mi done
mrai done
it done
iris
mi done
mrai done
it done
splice


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


mi done


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


mrai done


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


it done
sonar
mi done
mrai done
it done
vowel
mi done
mrai done
it done
segment
mi done
mrai done
it done
zoo
mi done


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


mrai done


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


it done
heart-statlog
mi done
mrai done
it done
waveform-5000
mi done
mrai done
it done
kr-vs-kp
mi done
mrai done
it done
diabetes
mi done
mrai done
it done
letter
mi done
mrai done
it done
balance-scale
mi done
mrai done
it done
