# Modulo Exp Def

This is the final Modulo experiment. The goal is to produce results that are paper-ready.

# Preliminaries

## Imports

In [1]:
import pandas as pd
import arff
import os
import numpy as np
import aaai20
import PxW

from os.path import dirname
from aaai20.io import filename_dataset, filename_query
from aaai20.exp import collect_results
from aaai20.wrangling import arff_to_df
from aaai20.exp import collect_results, process_outcomes, save_outcome
from sklearn.model_selection import train_test_split

from modulo.core import Modulo
from modulo.utils.encoding import query_to_code, code_to_query, encode_attribute

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

RANDOM_STATE = 42



In [2]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Globally silence scikit-learn's UndefinedMetricWarning for the rest of the session.
# NOTE(review): presumably emitted by the macro-averaged f1_score call in
# predict_modulo — confirm before relying on the reported scores.
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

## Helpers

In [3]:
def fn_to_ok_df(filename, encode=True):
    """
    Convenience function. Load an ARFF file and prepare it for sklearn.

    Parameters
    ----------
    filename :
        Passed unchanged to `arff_to_df`.
    encode : bool, default True
        When True, every column whose dtype is not float is replaced by the
        integer codes produced by its own `LabelEncoder`.

    Returns
    -------
    tuple
        `(df, qry, nominal, label_encoders)` when `encode` is True, otherwise
        `(df, qry)`. `qry` is computed by `qry_from_df` on the frame *before*
        encoding; `nominal` holds the labels of the encoded columns and
        `label_encoders` maps each of those labels to its fitted encoder.
    """

    df = arff_to_df(filename, return_af=False, encode_nominal=False)
    qry = qry_from_df(df)

    if not encode:
        return df, qry

    nominal_columns = df.select_dtypes(exclude=['float']).columns

    # Encode straight into `df`: assigning into the frame returned by
    # select_dtypes can raise SettingWithCopyWarning on some pandas versions
    # and needed an extra copy back into `df`.
    label_encoders = {}
    for c in nominal_columns:
        encoder = LabelEncoder()
        df[c] = encoder.fit_transform(df[c])
        label_encoders[c] = encoder

    nominal = nominal_columns.values

    return df, qry, nominal, label_encoders
    
def qry_from_df(df):
    """
    Derive a query code from a DataFrame.

    The code has one integer per column: -1 for a column that contains at
    least one missing value, 1 for the last column (the target) and 0 for
    everything else. The target marker is written last, so it wins when the
    last column also contains missing values.

    Columns are addressed by position, so the result does not depend on the
    column labels (integer labels from a header-less CSV and string labels
    give the same code).

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    numpy.ndarray of int, one entry per column (empty for a frame without
    columns).
    """
    n_columns = len(df.columns)
    qry = np.zeros(n_columns, dtype=int)

    if n_columns == 0:
        # Nothing to mark; there is no last column to flag as target.
        return qry

    # Positional boolean mask instead of column labels: labels are only valid
    # array indices when they happen to be 0..n-1.
    has_missing = np.asarray(df.isna().any(axis=0).values, dtype=bool)

    qry[has_missing] = -1
    qry[-1] = 1
    return qry

In [4]:
def detect_nominal(df):
    """
    Return the positional indices of the columns of `df` that are not float.

    The non-float columns are found with `select_dtypes(exclude=['float'])`;
    the result is a list of positions in the original column order.
    """
    non_float_labels = df.select_dtypes(exclude=['float']).columns

    nominal = []
    for position, label in enumerate(df.columns):
        if label in non_float_labels:
            nominal.append(position)

    return nominal

## Functions

In [5]:
def fit_modulo(dataset,
               target_idx=-1,
               random_state=42,
               prediction_algorithm='mi',
               clf_criterion="gini",
               rgr_criterion="mse",
               selection_algorithm="base",
               fraction_missing=0.2,
               nb_iterations=1,
               min_samples_leaf=2,
               min_impurity_decrease=0.,
               max_steps=8,
               max_depth=None):
    """
    Fit a Modulo model on the stored training split of `dataset`.

    The training CSV (step 1, suffix 'train') is read without a header, its
    non-float columns are detected with `detect_nominal`, and the target
    column is added to that set before fitting.

    Parameters
    ----------
    dataset :
        Dataset name handed to `filename_dataset`.
    target_idx : int, default -1
        Position of the target column (the last column by default).
    random_state, prediction_algorithm, clf_criterion, rgr_criterion,
    selection_algorithm, fraction_missing, nb_iterations, min_samples_leaf,
    min_impurity_decrease, max_steps, max_depth :
        Forwarded unchanged to the `Modulo` constructor.

    Returns
    -------
    The fitted `Modulo` instance.
    """
    # Read the header-less training split.
    train_path = filename_dataset(dataset, step=1, suffix='train', extension='csv')
    train_df = pd.read_csv(train_path, header=None, index_col=None)

    nominal = detect_nominal(train_df)
    print("""
    Nominal attributes detected in dataset: {}
    Nominal: {}
    """.format(dataset, nominal))

    # The target column is always passed as nominal, whatever its dtype.
    target_id = range(train_df.shape[1])[target_idx]
    nominal_ids = set(nominal) | {target_id}

    hyper_params = dict(
        random_state=random_state,
        prediction_algorithm=prediction_algorithm,
        clf_criterion=clf_criterion,
        rgr_criterion=rgr_criterion,
        selection_algorithm=selection_algorithm,
        fraction_missing=fraction_missing,
        nb_iterations=nb_iterations,
        min_samples_leaf=min_samples_leaf,
        min_impurity_decrease=min_impurity_decrease,
        max_depth=max_depth,
        max_steps=max_steps,
    )
    model = Modulo(**hyper_params)
    model.fit(train_df.values, nominal_attributes=nominal_ids)

    return model

In [6]:
def predict_modulo(dataset, classifier, target_idx=-1):
    """
    Score a fitted classifier on every stored query of `dataset`.

    For each query code in the dataset's 'default' query file, the matching
    query CSV (step 2, suffix 'q_XXX') is loaded, the target column is saved
    as ground truth and then masked with NaN, the classifier predicts it, and
    the macro-averaged F1 score is recorded.

    Parameters
    ----------
    dataset :
        Dataset name handed to `filename_query` and `filename_dataset`.
    classifier :
        Object exposing `predict(X, q_code=...)`.
    target_idx : int, default -1
        Position of the target column (the last column by default).

    Returns
    -------
    q_codes :
        The query codes as loaded with `np.load`.
    result : list of float
        One macro-F1 score per query, in the same order as `q_codes`.

    Raises
    ------
    AssertionError
        If a stored query code differs from the one derived from the query
        data with `qry_from_df`.
    """
    result = []

    # Load queries
    fn_qry = filename_query(dataset, suffix="default")
    q_codes = np.load(fn_qry)

    for q_idx, q_code in enumerate(q_codes):
        fn = filename_dataset(dataset, step=2, suffix='q_{}'.format(str(q_idx).zfill(3)), extension='csv')
        df_qry = pd.read_csv(fn, header=None, index_col=None)

        q_code_2 = qry_from_df(df_qry)

        target_id = list(range(df_qry.shape[1]))[target_idx] # Assumption: Last

        # Consistency check between the stored code and the data on disk.
        assert np.array_equal(q_code, q_code_2), (
            "Query code mismatch for dataset {!r}, query {}: "
            "from file {}, from data {}".format(dataset, q_idx, q_code, q_code_2)
        )

        test = df_qry.values
        y_true = test[:, target_id].copy()
        y_true = y_true.astype(int)

        # An all-integer query file yields an integer array, which cannot
        # hold NaN; promote it to float before masking the target.
        if np.issubdtype(test.dtype, np.integer):
            test = test.astype(float)
        test[:, target_id] = np.nan

        y_pred = classifier.predict(test, q_code=q_code)
        y_pred = y_pred.astype(int).ravel()
        f1 = f1_score(y_true, y_pred, average='macro')
        result.append(f1)

    return q_codes, result

# Experiment 01 - Baseline

In [7]:
# Settings passed to fit_modulo by every experiment loop in this notebook.
FRACTION_MISSING = [0.1]
ITERATIONS = 1
MAX_DEPTH = 2
MIN_SAMPLES_LEAF = 2

# Benchmark datasets, in the order in which the experiment loops process them.
datasets = [
    'glass', 'credit-g', 'ionosphere', 'lymph',
    'vehicle', 'iris', 'splice', 'sonar',
    'vowel', 'segment', 'zoo', 'heart-statlog',
    'waveform-5000', 'kr-vs-kp', 'diabetes', 'letter',
    'balance-scale',
]

print(len(datasets))

17


In [8]:
# Experiment 01: for every dataset, fit Modulo with prediction_algorithm='mi',
# score all stored queries (macro-F1 per query, see predict_modulo) and collect
# the results under the algorithm label 'sklearn'.
# NOTE(review): FRACTION_MISSING is a one-element list, while fit_modulo's own
# default for fraction_missing is the scalar 0.2 — confirm Modulo accepts a list.
# NOTE(review): `clf`, `q_codes`, `results` and `df` are notebook globals; after
# the loop they hold the values for the last dataset only.
dataframes = {k:[] for k in datasets}

for ds in datasets:
    print(ds)
    clf = fit_modulo(ds,
                     target_idx=-1,
                     random_state=RANDOM_STATE,
                     prediction_algorithm='mi',
                     clf_criterion="gini",
                     rgr_criterion="friedman_mse",
                     selection_algorithm="random",
                     nb_iterations=ITERATIONS,
                     fraction_missing=FRACTION_MISSING,
                     min_samples_leaf=MIN_SAMPLES_LEAF,
                     max_depth=MAX_DEPTH)
    
    q_codes, results = predict_modulo(ds, clf)
    
    # Replaces the empty-list placeholder created above for this dataset.
    dataframes[ds] = collect_results(ds, q_codes, results, algorithm='sklearn')
    
# Combine the per-dataset results and store them under the name 'sklearn'.
df = process_outcomes(dataframes)
save_outcome(df, filename='sklearn')

glass

    Nominal attributes detected in dataset: glass
    Nominal: [9]
    
credit-g

    Nominal attributes detected in dataset: credit-g
    Nominal: [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19, 20]
    
ionosphere

    Nominal attributes detected in dataset: ionosphere
    Nominal: [34]
    


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


lymph

    Nominal attributes detected in dataset: lymph
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
    
vehicle

    Nominal attributes detected in dataset: vehicle
    Nominal: [18]
    
iris

    Nominal attributes detected in dataset: iris
    Nominal: [4]
    
splice

    Nominal attributes detected in dataset: splice
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]
    


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


sonar

    Nominal attributes detected in dataset: sonar
    Nominal: [60]
    


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


vowel

    Nominal attributes detected in dataset: vowel
    Nominal: [0, 1, 2, 13]
    
segment

    Nominal attributes detected in dataset: segment
    Nominal: [19]
    


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


zoo

    Nominal attributes detected in dataset: zoo
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
    
heart-statlog

    Nominal attributes detected in dataset: heart-statlog
    Nominal: [13]
    
waveform-5000

    Nominal attributes detected in dataset: waveform-5000
    Nominal: [40]
    


  fi = fi / norm


kr-vs-kp

    Nominal attributes detected in dataset: kr-vs-kp
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]
    


  fi = fi / norm
  fi = fi / norm


diabetes

    Nominal attributes detected in dataset: diabetes
    Nominal: [8]
    
letter

    Nominal attributes detected in dataset: letter
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
    
balance-scale

    Nominal attributes detected in dataset: balance-scale
    Nominal: [4]
    


# Experiment 02 - SL

In [9]:
# Experiment 02: same loop as the baseline cell, but with
# prediction_algorithm='mrai'; results are collected under the label 'mrai'
# and saved under the name 'SL'.
# NOTE(review): this cell is a near-copy of the Experiment 01 loop; only the
# prediction algorithm, the result label and the output name differ.
dataframes_SL = {k:[] for k in datasets}

for ds in datasets:
    print(ds)
    clf = fit_modulo(ds,
                     target_idx=-1,
                     random_state=RANDOM_STATE,
                     prediction_algorithm='mrai',
                     clf_criterion="gini",
                     rgr_criterion="friedman_mse",
                     selection_algorithm="random",
                     nb_iterations=ITERATIONS,
                     fraction_missing=FRACTION_MISSING,
                     min_samples_leaf=MIN_SAMPLES_LEAF,
                     max_depth=MAX_DEPTH)
    
    q_codes, results = predict_modulo(ds, clf)
    
    # Replaces the empty-list placeholder created above for this dataset.
    dataframes_SL[ds] = collect_results(ds, q_codes, results, algorithm='mrai')
    
# Overwrites the global `df` with the combined results of this experiment.
df = process_outcomes(dataframes_SL)
save_outcome(df, filename='SL')

glass

    Nominal attributes detected in dataset: glass
    Nominal: [9]
    
credit-g

    Nominal attributes detected in dataset: credit-g
    Nominal: [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19, 20]
    


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


ionosphere

    Nominal attributes detected in dataset: ionosphere
    Nominal: [34]
    
lymph

    Nominal attributes detected in dataset: lymph
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
    
vehicle

    Nominal attributes detected in dataset: vehicle
    Nominal: [18]
    
iris

    Nominal attributes detected in dataset: iris
    Nominal: [4]
    
splice

    Nominal attributes detected in dataset: splice
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]
    


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


sonar

    Nominal attributes detected in dataset: sonar
    Nominal: [60]
    


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


vowel

    Nominal attributes detected in dataset: vowel
    Nominal: [0, 1, 2, 13]
    
segment

    Nominal attributes detected in dataset: segment
    Nominal: [19]
    


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


zoo

    Nominal attributes detected in dataset: zoo
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
    
heart-statlog

    Nominal attributes detected in dataset: heart-statlog
    Nominal: [13]
    
waveform-5000

    Nominal attributes detected in dataset: waveform-5000
    Nominal: [40]
    


  fi = fi / norm


kr-vs-kp

    Nominal attributes detected in dataset: kr-vs-kp
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]
    


  fi = fi / norm
  fi = fi / norm


diabetes

    Nominal attributes detected in dataset: diabetes
    Nominal: [8]
    
letter

    Nominal attributes detected in dataset: letter
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
    
balance-scale

    Nominal attributes detected in dataset: balance-scale
    Nominal: [4]
    


# Experiment 03 - ML

In [10]:
# Keep a handle on the `fi` attribute of the current `clf` and display it.
# NOTE(review): `clf` is whatever the most recently executed experiment loop
# left behind, and the execution counts in this section are out of order —
# confirm which model this matrix belongs to.
FI = clf.fi
FI

array([[0.        , 0.        , 0.15801984, 0.        , 0.84198016],
       [0.11936633, 0.        , 0.        , 0.08058187, 0.8000518 ],
       [0.        , 0.17489482, 0.46006017, 0.365045  , 0.        ],
       [0.04406213, 0.        , 0.10383953, 0.        , 0.85209834],
       [0.08341903, 0.        , 0.        , 0.07965067, 0.83693029]])

In [22]:
# Display the dimensions of FI.
FI.shape

(5, 5)

In [24]:
# Column indicator vector with ones at positions 1 and 2.
avl_desc = np.array([0,1,1,0,0]).reshape(-1,1)
# Multiply by the vector defined in this cell. The previous version used
# `desc_ids`, which is only defined in a later cell, so the cell failed on a
# fresh kernel; the recorded output matches FI . avl_desc.
desc_appr = np.dot(FI, avl_desc)

desc_appr

array([[0.15801984],
       [0.        ],
       [0.634955  ],
       [0.10383953],
       [0.        ]])

In [25]:
# Column indicator vector with ones at positions 3 and 4, multiplied by the
# classifier's `targ_ids` attribute; the product is displayed below.
avl_targ = np.array([0, 0, 0 ,1,  1]).reshape(-1, 1)
targ_appr = np.dot(clf.targ_ids, avl_targ)

targ_appr

array([[0],
       [0],
       [1],
       [1],
       [0]])

In [32]:
# Element-wise product of the two column vectors, restricted to rows 3 and 4.
(desc_appr * targ_appr)[[3,4], :]

array([[0.10383953],
       [0.        ]])

In [45]:
# Take the element at index 4 of the classifier's `g_list`.
g = clf.g_list[4]

In [47]:
# Read the 'id' entry of the `graph` mapping attached to `g`.
g_ids = g.graph['id']

In [16]:
# The original assignment had no right-hand side (a syntax error). The value
# below is reconstructed from the recorded outputs: shape (5, 1) after the
# reshape, and a desc_appr equal to FI . [0, 1, 1, 0, 0] — TODO confirm.
desc_ids = np.array([0, 1, 1, 0, 0])
desc_ids = desc_ids.reshape(-1,1)
desc_ids.shape

(5, 1)

In [21]:
# NOTE(review): same statement as in cell In [25]; it needs `avl_targ` from
# that cell to be defined first.
targ_appr = np.dot(clf.targ_ids, avl_targ)


array([[0],
       [0],
       [1],
       [1],
       [0]])

array([[0.15801984],
       [0.        ],
       [0.634955  ],
       [0.10383953],
       [0.        ]])

In [15]:
# Experiment 03: same loop as the other experiment cells, but with
# prediction_algorithm='it' and max_steps=2 (the other loops leave max_steps
# at fit_modulo's default of 8); results are collected under the label 'it'
# and saved under the name 'ML'.
# NOTE(review): this cell is a near-copy of the Experiment 01 and 02 loops.
dataframes_ML = {k:[] for k in datasets}

for ds in datasets:
    print(ds)
    clf = fit_modulo(ds,
                     target_idx=-1,
                     random_state=RANDOM_STATE,
                     prediction_algorithm='it',
                     clf_criterion="gini",
                     rgr_criterion="friedman_mse",
                     selection_algorithm="random",
                     nb_iterations=ITERATIONS,
                     fraction_missing=FRACTION_MISSING,
                     min_samples_leaf=MIN_SAMPLES_LEAF,
                     max_depth=MAX_DEPTH,
                     max_steps=2)
    
    q_codes, results = predict_modulo(ds, clf)
    
    # Replaces the empty-list placeholder created above for this dataset.
    dataframes_ML[ds] = collect_results(ds, q_codes, results, algorithm='it')
    
# Overwrites the global `df` with the combined results of this experiment.
df = process_outcomes(dataframes_ML)
save_outcome(df, filename='ML')

glass

    Nominal attributes detected in dataset: glass
    Nominal: [9]
    
credit-g

    Nominal attributes detected in dataset: credit-g
    Nominal: [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19, 20]
    
ionosphere

    Nominal attributes detected in dataset: ionosphere
    Nominal: [34]
    
lymph

    Nominal attributes detected in dataset: lymph
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
    
vehicle

    Nominal attributes detected in dataset: vehicle
    Nominal: [18]
    
iris

    Nominal attributes detected in dataset: iris
    Nominal: [4]
    
splice

    Nominal attributes detected in dataset: splice
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]
    


  fi = fi / norm


sonar

    Nominal attributes detected in dataset: sonar
    Nominal: [60]
    
vowel

    Nominal attributes detected in dataset: vowel
    Nominal: [0, 1, 2, 13]
    
segment


  fi = fi / norm



    Nominal attributes detected in dataset: segment
    Nominal: [19]
    


  fi = fi / norm
  fi = fi / norm
  fi = fi / norm


zoo

    Nominal attributes detected in dataset: zoo
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
    


  fi = fi / norm


heart-statlog

    Nominal attributes detected in dataset: heart-statlog
    Nominal: [13]
    
waveform-5000

    Nominal attributes detected in dataset: waveform-5000
    Nominal: [40]
    
kr-vs-kp

    Nominal attributes detected in dataset: kr-vs-kp
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]
    


  fi = fi / norm
  fi = fi / norm


diabetes

    Nominal attributes detected in dataset: diabetes
    Nominal: [8]
    
letter

    Nominal attributes detected in dataset: letter
    Nominal: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
    
balance-scale

    Nominal attributes detected in dataset: balance-scale
    Nominal: [4]
    
