# Preprocess

This notebook will evolve into the preprocess script.

In [15]:
import pandas as pd
import arff
import os
import numpy as np
from os.path import dirname

from sklearn.model_selection import train_test_split

RANDOM_STATE = 42

In [2]:
def original_filename(basename, extension='arff'):
    
    root_dir = dirname(os.getcwd())
    true_path = os.path.join(root_dir, 'data', 'raw', 'datasets-UCI', 'UCI', "{}.{}".format(basename, extension))

    return os.path.relpath(true_path)

def filename(basename, step=1, prefix="", suffix="", extension="arff", check=True):
    
    filename = "-".join([x for x in (prefix, basename, suffix) if len(x) > 0])+".{}".format(extension)
    
    root_dir = dirname(os.getcwd())
    data_dir = os.path.relpath(os.path.join(root_dir, 'data'))
    step_dir = os.path.join(data_dir, "step-"+str(step).zfill(2))

    if check:
        if not os.path.exists(step_dir):
            os.makedirs(step_dir)
    

    return os.path.join(step_dir, filename)


def arff_to_df(filename, encode_nominal=False, return_af=True):
    with open(filename, 'r') as f:
        af = arff.load(f, encode_nominal=encode_nominal)
    
    df = pd.DataFrame(af['data'])
    
    if return_af:
        return df, af
    else:
        return df

## 01 - Train-Test Splits

In [3]:
# load dataset
dataset = 'iris'
fn_inn = original_filename(dataset)

df, af = arff_to_df(fn_inn, encode_nominal=False, return_af=True)
df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# filenames
fn_train = filename(dataset, step=1, suffix='train')
fn_test = filename(dataset, step=1, suffix='test')

# split train and test
train, test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE, stratify=df.iloc[:, -1:])

af_train = af.copy()
af_train['data'] = train.values

with open(fn_train, 'w') as f:
    arff.dump(af_train, f)
    
af_test = af.copy()
af_test['data'] = test.values

with open(fn_test, 'w') as f:
    arff.dump(af_test, f)

### Intermediate test

Now we should see if these in fact can be used to train a weka model, or if there are some fishy things going on.

In [5]:
import PxW

# Check if the package gets installed correctly.
clf = PxW.J48()

clf.fit(fn_train)

[2019-08-27 15:30:32,246] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'
[2019-08-27 15:30:32,248] INFO - prefect.FlowRunner | Starting flow run.
[2019-08-27 15:30:32,254] INFO - prefect.TaskRunner | Task 'Constant[str]': Starting task run...
[2019-08-27 15:30:32,255] INFO - prefect.TaskRunner | Task 'Constant[str]': finished task run for task with final state: 'Success'
[2019-08-27 15:30:32,256] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...
[2019-08-27 15:30:32,819] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'
[2019-08-27 15:30:32,823] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded



Options: -C 0.25 -M 2 

=== Classifier model (full training set) ===

J48 pruned tree
------------------

petalwidth <= 0.6: Iris-setosa (40.0)
petalwidth > 0.6
|   petalwidth <= 1.6
|   |   petallength <= 4.9: Iris-versicolor (38.0)
|   |   petallength > 4.9: Iris-virginica (4.0/1.0)
|   petalwidth > 1.6: Iris-virginica (38.0/1.0)

Number of Leaves  : 	4

Size of the tree : 	7


Time taken to build model: 0.24 seconds

Time taken to test model on training data: 0.01 seconds

=== Error on training data ===

Correctly Classified Instances         118               98.3333 %
Incorrectly Classified Instances         2                1.6667 %
Kappa statistic                          0.975 
Mean absolute error                      0.0192
Root mean squared error                  0.0979
Relative absolute error                  4.3092 %
Root relative squared error             20.7586 %
Total Number of Instances              120     


=== Detailed Accuracy By Class ===

                 TP Ra

<Success: "All reference tasks succeeded.">

In [6]:
out = clf.predict(fn_test)

[2019-08-27 15:30:32,837] INFO - prefect.FlowRunner | Beginning Flow run for 'predict'
[2019-08-27 15:30:32,838] INFO - prefect.FlowRunner | Starting flow run.
[2019-08-27 15:30:32,843] INFO - prefect.TaskRunner | Task 'Constant[str]': Starting task run...
[2019-08-27 15:30:32,847] INFO - prefect.TaskRunner | Task 'Constant[str]': finished task run for task with final state: 'Success'
[2019-08-27 15:30:32,848] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...
[2019-08-27 15:30:33,079] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'
[2019-08-27 15:30:33,080] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


Cf. [this stackoverflow](https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn) to get some insights on encoding. Here it don't matter since I just want an F1 score anyway.

In [24]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score

def f1_weka(out, average='macro'):
    out = out.apply(LabelEncoder().fit_transform)
    f1 = f1_score(out['actual'], out['predicted'], average=average)
    return f1


In [25]:
f1_weka(out)

0.9665831244778613

## Generate Queries

In [10]:
# inputs
fn_test = filename(dataset, step=1, suffix='test')

q_idx = 1
fn_qry = filename(dataset, step=2, suffix='q_{}'.format(str(q_idx).zfill(3)))
fn_qry

'../data/step-02/iris-q_001.arff'

In [45]:
df_test, af_test = arff_to_df(fn_test, encode_nominal=False, return_af=True)

#df_test.iloc[:, 0] = np.nan
#df_test.iloc[:, 1] = np.nan
#df_test.iloc[:, 2] = np.nan
df_test.iloc[:, 3] = np.nan

print(df_test.head())

af_qry = af_test.copy()
af_qry['data'] = df_test.values

with open(fn_qry, 'w') as f:
    arff.dump(af_qry, f)
    

# Check if the package gets installed correctly.
clf = PxW.J48()
clf.fit(fn_train)
out = clf.predict(fn_qry)

f1_weka(out, average='macro')

[2019-08-27 15:42:30,112] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'
[2019-08-27 15:42:30,113] INFO - prefect.FlowRunner | Starting flow run.
[2019-08-27 15:42:30,119] INFO - prefect.TaskRunner | Task 'Constant[str]': Starting task run...
[2019-08-27 15:42:30,124] INFO - prefect.TaskRunner | Task 'Constant[str]': finished task run for task with final state: 'Success'
[2019-08-27 15:42:30,126] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...


     0    1    2   3                4
0  4.4  3.0  1.3 NaN      Iris-setosa
1  6.1  3.0  4.9 NaN   Iris-virginica
2  4.9  2.4  3.3 NaN  Iris-versicolor
3  5.0  2.3  3.3 NaN  Iris-versicolor
4  4.4  3.2  1.3 NaN      Iris-setosa


[2019-08-27 15:42:30,630] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'
[2019-08-27 15:42:30,631] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded
[2019-08-27 15:42:30,633] INFO - prefect.FlowRunner | Beginning Flow run for 'predict'
[2019-08-27 15:42:30,634] INFO - prefect.FlowRunner | Starting flow run.
[2019-08-27 15:42:30,639] INFO - prefect.TaskRunner | Task 'Constant[str]': Starting task run...
[2019-08-27 15:42:30,642] INFO - prefect.TaskRunner | Task 'Constant[str]': finished task run for task with final state: 'Success'
[2019-08-27 15:42:30,643] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...



Options: -C 0.25 -M 2 

=== Classifier model (full training set) ===

J48 pruned tree
------------------

petalwidth <= 0.6: Iris-setosa (40.0)
petalwidth > 0.6
|   petalwidth <= 1.6
|   |   petallength <= 4.9: Iris-versicolor (38.0)
|   |   petallength > 4.9: Iris-virginica (4.0/1.0)
|   petalwidth > 1.6: Iris-virginica (38.0/1.0)

Number of Leaves  : 	4

Size of the tree : 	7


Time taken to build model: 0.22 seconds

Time taken to test model on training data: 0.01 seconds

=== Error on training data ===

Correctly Classified Instances         118               98.3333 %
Incorrectly Classified Instances         2                1.6667 %
Kappa statistic                          0.975 
Mean absolute error                      0.0192
Root mean squared error                  0.0979
Relative absolute error                  4.3092 %
Root relative squared error             20.7586 %
Total Number of Instances              120     


=== Detailed Accuracy By Class ===

                 TP Ra

[2019-08-27 15:42:30,902] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'
[2019-08-27 15:42:30,903] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded
  'precision', 'predicted', average, warn_for)


0.2501414827391058

### Intermediate test

Let us see if Weka now can handle this dataset with the missing values.

In [26]:
x


[2019-08-27 15:38:41,700] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'
[2019-08-27 15:38:41,702] INFO - prefect.FlowRunner | Starting flow run.
[2019-08-27 15:38:41,711] INFO - prefect.TaskRunner | Task 'Constant[str]': Starting task run...
[2019-08-27 15:38:41,713] INFO - prefect.TaskRunner | Task 'Constant[str]': finished task run for task with final state: 'Success'
[2019-08-27 15:38:41,716] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...
[2019-08-27 15:38:42,205] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'
[2019-08-27 15:38:42,207] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded
[2019-08-27 15:38:42,211] INFO - prefect.FlowRunner | Beginning Flow run for 'predict'
[2019-08-27 15:38:42,212] INFO - prefect.FlowRunner | Starting flow run.
[2019-08-27 15:38:42,219] INFO - prefect.TaskRunner | Task 'Constant[str]': Starting task run...
[2019-08-27 15:38:42,220] INFO - pref


Options: -C 0.25 -M 2 

=== Classifier model (full training set) ===

J48 pruned tree
------------------

petalwidth <= 0.6: Iris-setosa (40.0)
petalwidth > 0.6
|   petalwidth <= 1.6
|   |   petallength <= 4.9: Iris-versicolor (38.0)
|   |   petallength > 4.9: Iris-virginica (4.0/1.0)
|   petalwidth > 1.6: Iris-virginica (38.0/1.0)

Number of Leaves  : 	4

Size of the tree : 	7


Time taken to build model: 0.21 seconds

Time taken to test model on training data: 0 seconds

=== Error on training data ===

Correctly Classified Instances         118               98.3333 %
Incorrectly Classified Instances         2                1.6667 %
Kappa statistic                          0.975 
Mean absolute error                      0.0192
Root mean squared error                  0.0979
Relative absolute error                  4.3092 %
Root relative squared error             20.7586 %
Total Number of Instances              120     


=== Detailed Accuracy By Class ===

                 TP Rate 

[2019-08-27 15:38:42,451] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'
[2019-08-27 15:38:42,452] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


0.9665831244778613