# Preprocess

This notebook will evolve into the preprocess script.

In [1]:
import pandas as pd
import arff
import os
import numpy as np
from os.path import dirname

from sklearn.model_selection import train_test_split

RANDOM_STATE = 42

In [42]:
def original_filename(basename, extension='arff'):
    """
    Path of an original (raw) UCI dataset file.

    The file is looked up under
    ``<parent of cwd>/data/raw/datasets-UCI/UCI/<basename>.<extension>``
    and the result is returned relative to the current working directory.
    """
    project_root = dirname(os.getcwd())
    raw_file = "{}.{}".format(basename, extension)
    absolute_path = os.path.join(project_root, 'data', 'raw', 'datasets-UCI', 'UCI', raw_file)

    return os.path.relpath(absolute_path)

def filename(basename, step=1, prefix="", suffix="", seperator="-", extension="arff", check=True):
    """
    Build the path of a data file belonging to one step of this experiment.

    The file name is assembled by `build_filename`; the file is placed in
    ``<parent of cwd>/data/step-XX`` (step number zero-padded to two digits),
    expressed relative to the current working directory.

    When `check` is true, the step directory is created if it does not
    exist yet.
    """
    file_name = build_filename(basename, prefix=prefix, suffix=suffix, separator=seperator, extension=extension)

    # Directory that holds all files of this step
    data_root = os.path.relpath(os.path.join(dirname(os.getcwd()), 'data'))
    step_dir = os.path.join(data_root, "step-{}".format(str(step).zfill(2)))

    # Create the step directory on demand
    if check and not os.path.exists(step_dir):
        os.makedirs(step_dir)

    return os.path.join(step_dir, file_name)

def filename_query(basename, prefix="", suffix="", seperator="-", extension="npy", check=True):
    """
    Filename generator of the query files of this experiment.

    The file name is assembled by `build_filename`; the file is placed in
    ``<parent of cwd>/config/query``, expressed relative to the current
    working directory.

    Parameters
    ----------
    basename : str
        Core of the file name (e.g. the dataset name).
    prefix, suffix : str
        Optional parts put before/after `basename`; empty strings are skipped.
    seperator : str
        String used to join prefix, basename and suffix.
    extension : str
        File extension, without the leading dot.
    check : bool
        If true, create the query directory when it does not exist.

    Returns
    -------
    str
        Path of the query file.
    """
    file_name = build_filename(basename, prefix=prefix, suffix=suffix, separator=seperator, extension=extension)

    # FS things
    root_dir = dirname(os.getcwd())
    conf_dir = os.path.relpath(os.path.join(root_dir, 'config'))
    # The query directory is built from `conf_dir`; the previous code joined on
    # an undefined `data_dir`, which raised NameError on every call.
    qry_dir = os.path.join(conf_dir, "query")

    # If dir does not exist, make it
    if check and not os.path.exists(qry_dir):
        os.makedirs(qry_dir)

    return os.path.join(qry_dir, file_name)


def build_filename(basename, prefix="", suffix="", separator="-", extension="csv"):
    """
    Assemble ``<prefix><sep><basename><sep><suffix>.<extension>``.

    Empty parts are left out, so no dangling separators are produced.
    """
    parts = []
    for part in (prefix, basename, suffix):
        if len(part) > 0:
            parts.append(part)

    stem = separator.join(parts)
    return "{}.{}".format(stem, extension)


def arff_to_df(filename, encode_nominal=False, return_af=True):
    """
    Read an ARFF file into a pandas DataFrame.

    The file is parsed with `arff.load` (`encode_nominal` is forwarded) and
    the DataFrame is built from the parsed object's ``'data'`` entry.

    Returns ``(df, af)`` when `return_af` is true, where `af` is the parsed
    ARFF object; otherwise only ``df``.
    """
    with open(filename, 'r') as handle:
        af = arff.load(handle, encode_nominal=encode_nominal)

    df = pd.DataFrame(af['data'])

    return (df, af) if return_af else df

## 01 - Train-Test Splits

In [3]:
# load dataset
# Reads the raw ARFF file of `dataset` into a DataFrame (`df`) and keeps the
# parsed ARFF object (`af`) next to it.
dataset = 'iris'
fn_inn = original_filename(dataset)  # path of the raw file, relative to the cwd

# encode_nominal is forwarded to arff.load; return_af=True also returns the parsed object
df, af = arff_to_df(fn_inn, encode_nominal=False, return_af=True)
df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# filenames
# Output paths of the two splits; both live in the step-01 data directory.
fn_train = filename(dataset, step=1, suffix='train')
fn_test = filename(dataset, step=1, suffix='test')

# split train and test
# 80/20 split, stratified on the last column, seeded with RANDOM_STATE.
train, test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE, stratify=df.iloc[:, -1:])

# Reuse the parsed ARFF object of the full dataset and only swap its 'data' entry.
# NOTE(review): presumably `af` is a plain dict, so `.copy()` is shallow — confirm.
af_train = af.copy()
af_train['data'] = train.values

with open(fn_train, 'w') as f:
    arff.dump(af_train, f)
    
# Same procedure for the test split.
af_test = af.copy()
af_test['data'] = test.values

with open(fn_test, 'w') as f:
    arff.dump(af_test, f)

### Intermediate test

Now we check whether these files can in fact be used to train a Weka model, or whether something fishy is going on.

In [5]:
# Display the path of the training file.
fn_train

'../data/step-01/iris-train.arff'

In [6]:
import PxW

# Check if the package gets installed correctly.
# NOTE(review): J48 is presumably PxW's wrapper around Weka's J48 classifier — confirm.
clf = PxW.J48()

# The classifier is fitted directly from the path of the training ARFF file.
clf.fit(fn_train, verbose=False)

[2019-08-29 11:10:04,839] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'
[2019-08-29 11:10:04,841] INFO - prefect.FlowRunner | Starting flow run.
[2019-08-29 11:10:04,848] INFO - prefect.TaskRunner | Task 'Constant[str]': Starting task run...
[2019-08-29 11:10:04,850] INFO - prefect.TaskRunner | Task 'Constant[str]': finished task run for task with final state: 'Success'
[2019-08-29 11:10:04,851] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...
[2019-08-29 11:10:05,321] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'
[2019-08-29 11:10:05,325] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


<Success: "All reference tasks succeeded.">

In [7]:
# Predict from the path of the test ARFF file; `out` is scored with f1_weka afterwards.
out = clf.predict(fn_test, verbose=True)

[2019-08-29 11:10:05,338] INFO - prefect.FlowRunner | Beginning Flow run for 'predict'
[2019-08-29 11:10:05,339] INFO - prefect.FlowRunner | Starting flow run.
[2019-08-29 11:10:05,343] INFO - prefect.TaskRunner | Task 'Constant[str]': Starting task run...
[2019-08-29 11:10:05,344] INFO - prefect.TaskRunner | Task 'Constant[str]': finished task run for task with final state: 'Success'
[2019-08-29 11:10:05,345] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...
[2019-08-29 11:10:05,579] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'
[2019-08-29 11:10:05,580] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


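Cf. [this Stack Overflow question](https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn) for some insights on label encoding across multiple columns. Here the exact encoding doesn't matter, since I just want an F1 score anyway — as long as the `actual` and `predicted` columns share the same label-to-integer mapping.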

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score

def f1_weka(out, average='macro'):
    """
    F1 score of a set of predictions.

    Parameters
    ----------
    out :
        Object indexable by ``'actual'`` and ``'predicted'``, holding the true
        and predicted labels.
        NOTE(review): presumably the DataFrame returned by `clf.predict` — confirm.
    average : str
        Forwarded to `sklearn.metrics.f1_score`.

    Returns
    -------
    The value returned by `f1_score` for the encoded labels.
    """
    actual, predicted = out['actual'], out['predicted']

    # Fit ONE encoder on the labels of both columns. Encoding each column with
    # its own encoder gives inconsistent integer codes whenever the two columns
    # do not contain exactly the same set of labels (e.g. a class that is never
    # predicted), which silently corrupts the score.
    encoder = LabelEncoder().fit(list(actual) + list(predicted))

    return f1_score(encoder.transform(actual), encoder.transform(predicted), average=average)


In [9]:
# F1 (default average='macro') of the predictions currently held in `out`.
f1_weka(out)

0.9665831244778613

## Generate Queries

In [10]:
# inputs
fn_test = filename(dataset, step=1, suffix='test')

# Query files go to the step-02 directory; the query index is zero-padded to three digits.
q_idx = 1
fn_qry = filename(dataset, step=2, suffix='q_{}'.format(str(q_idx).zfill(3)))
fn_qry

'../data/step-02/iris-q_001.arff'

In [11]:
# Build a single query file: the test set with one column blanked out.
df_test, af_test = arff_to_df(fn_test, encode_nominal=False, return_af=True)

# Columns set to NaN become the missing attributes of the query; only column 3 is active here.
#df_test.iloc[:, 0] = np.nan
#df_test.iloc[:, 1] = np.nan
#df_test.iloc[:, 2] = np.nan
df_test.iloc[:, 3] = np.nan

print(df_test.head()) 

# Reuse the parsed ARFF object of the test set and only swap its 'data' entry.
af_qry = af_test.copy()
af_qry['data'] = df_test.values

with open(fn_qry, 'w') as f:
    arff.dump(af_qry, f)

     0    1    2   3                4
0  4.4  3.0  1.3 NaN      Iris-setosa
1  6.1  3.0  4.9 NaN   Iris-virginica
2  4.9  2.4  3.3 NaN  Iris-versicolor
3  5.0  2.3  3.3 NaN  Iris-versicolor
4  4.4  3.2  1.3 NaN      Iris-setosa


### Intermediate Test

In [12]:
# Check if it works
# Refit on the training file, then predict on the query file (the test set with a column set to NaN).
clf = PxW.J48()
clf.fit(fn_train, verbose=False)
out = clf.predict(fn_qry, verbose=True)

# Macro-averaged F1 on the query predictions.
f1_weka(out, average='macro')

[2019-08-29 11:10:05,679] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'
[2019-08-29 11:10:05,681] INFO - prefect.FlowRunner | Starting flow run.
[2019-08-29 11:10:05,687] INFO - prefect.TaskRunner | Task 'Constant[str]': Starting task run...
[2019-08-29 11:10:05,688] INFO - prefect.TaskRunner | Task 'Constant[str]': finished task run for task with final state: 'Success'
[2019-08-29 11:10:05,690] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...
[2019-08-29 11:10:06,140] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'
[2019-08-29 11:10:06,142] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded
[2019-08-29 11:10:06,145] INFO - prefect.FlowRunner | Beginning Flow run for 'predict'
[2019-08-29 11:10:06,147] INFO - prefect.FlowRunner | Starting flow run.
[2019-08-29 11:10:06,153] INFO - prefect.TaskRunner | Task 'Constant[str]': Starting task run...
[2019-08-29 11:10:06,154] INFO - pref

0.2501414827391058

## Generate Query File

In [13]:
def transfer_contents(list_one, list_two, nb_items_to_transfer=1):
    """
    Move randomly chosen items from `list_one` to the end of `list_two`.

    Positions are sampled without replacement through `np.random.choice`
    (global NumPy random state). The inputs are not modified; new lists are
    returned.

    Parameters
    ----------
    list_one : list
        Source list.
    list_two : list
        Destination list.
    nb_items_to_transfer : int
        Number of items to move. Sampling is without replacement, so it may
        not exceed ``len(list_one)``.

    Returns
    -------
    (list, list)
        ``list_one`` without the selected items, and ``list_two`` followed by
        the selected items. Both keep the original relative order.
    """
    idx_to_transfer = set(
        np.random.choice(range(len(list_one)), nb_items_to_transfer, replace=False).tolist()
    )

    # Partition by POSITION. Removing by value (`list.remove`) drops the first
    # equal element instead of the sampled one, which scrambles the order of
    # the remaining items when the list contains duplicates.
    remaining = [e for idx, e in enumerate(list_one) if idx not in idx_to_transfer]
    transferred = [e for idx, e in enumerate(list_one) if idx in idx_to_transfer]

    return remaining, list(list_two) + transferred

In [14]:
def generate_qry(nb_atts, targ_idx=-1, nb_qry=10):
    """
    Generate a sequence of queries with a growing set of missing attributes.

    Every query is described by three lists of attribute indices:
    descriptive, target and missing. The first query has no missing
    attributes; for each following query, attributes are moved from the
    descriptive to the missing list by `transfer_contents`.

    Returns the three parallel lists ``(q_desc, q_targ, q_miss)``.
    """
    all_ids = list(range(nb_atts))
    targ_ids = [all_ids[targ_idx]]  # last attribute by default
    desc_ids = [att for att in all_ids if att not in targ_ids]
    miss_ids = []

    # First query: every non-target attribute is descriptive, nothing is missing
    q_desc, q_targ, q_miss = [desc_ids], [targ_ids], [miss_ids]

    # Start query building: number of missing attributes per query,
    # and how many attributes to move between consecutive queries
    missing_counts = np.linspace(0, nb_atts-1, nb_qry, endpoint=False, dtype=int)

    for increment in np.ediff1d(missing_counts):
        desc_ids, miss_ids = transfer_contents(desc_ids, miss_ids, nb_items_to_transfer=increment)

        q_desc.append(desc_ids)
        q_targ.append(targ_ids)
        q_miss.append(miss_ids)

    return q_desc, q_targ, q_miss

In [15]:
from modulo.utils.encoding import query_to_code

In [16]:
def save_queries(filename, queries):
    """Placeholder for saving queries: does nothing yet and returns None."""
    return None

## Multiple Queries Flow

In [38]:
# inputs
fn_test = filename(dataset, step=1, suffix='test')

# Queries
df_test, af_test = arff_to_df(fn_test, encode_nominal=False, return_af=True)
nb_atts = len(df_test.columns)
nb_qry = 4

# Parallel lists of descriptive / target / missing attribute indices, one entry per query.
q_desc, q_targ, q_miss = generate_qry(nb_atts, targ_idx=-1, nb_qry=nb_qry)

q_codes = []

# For every query: store its code and write a copy of the test set in which
# the missing attributes are set to NaN.
for q_idx, miss_ids in enumerate(q_miss):
    # NOTE(review): judging from the recorded output, the code seems to use
    # 0 = descriptive, -1 = missing, 1 = target — confirm against query_to_code.
    q_codes.append(query_to_code(q_desc[q_idx], q_targ[q_idx], q_miss[q_idx]))
    
    # Work on copies so that every query starts from the unmodified test set.
    df_qry = df_test.copy()
    af_qry = af_test.copy()
    fn_qry = filename(dataset, step=2, suffix='q_{}'.format(str(q_idx).zfill(3)))
    
    df_qry.iloc[:, miss_ids] = np.nan
    af_qry['data'] = df_qry.values

    with open(fn_qry, 'w') as f:
        arff.dump(af_qry, f)
        
q_codes = np.r_[q_codes] # Convert to proper np.ndarray



In [40]:
# Inspect the query codes as one array (one row per query).
np.r_[q_codes]

array([[ 0,  0,  0,  0,  1],
       [-1,  0,  0,  0,  1],
       [-1,  0, -1,  0,  1],
       [-1,  0, -1, -1,  1]])

In [18]:
# Descriptive attribute indices of the first query.
q_desc[0]

[0, 1, 2, 3]

In [28]:
# Scratch: length-5 vector of zeros.
a = np.zeros(5)

In [36]:
# NOTE(review): scratch experiment — as the recorded output shows, this raises
# ValueError because `None` and the length-5 vector `a` have mismatching dimensions.
np.c_[None, a]

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [27]:
# Code of a single query (index 3), for inspection.
q_idx = 3
query_to_code(q_desc[q_idx], q_targ[q_idx], q_miss[q_idx])

array([-1,  0, -1, -1,  1])

In [21]:
# Path of the ARFF file of query 1 (step-02 directory, index zero-padded to three digits).
q_idx = 1
fn_qry = filename(dataset, step=2, suffix='q_{}'.format(str(q_idx).zfill(3)))
fn_qry

'../data/step-02/iris-q_001.arff'