# StarAI datasets Preprocessing.

The flow will be as follows:

    1. raw -> step-01
        Convert .data to .csv
        Drop constant columns
    2. step-01 -> step-02
        Write a plain copy and a PxS alternative (same data plus an `att_<column>` header row)

# Preliminaries

## Imports

In [1]:
import pandas as pd
import os
import numpy as np

from os.path import dirname
from aaai20.io import filename_dataset, filename_query, original_filename
from aaai20.wrangling import arff_to_df

RANDOM_STATE = 250

## Helpers

In [2]:
def detect_constant_columns(dfs):
    """Return the labels of columns that hold fewer than two distinct values.

    Parameters
    ----------
    dfs : pandas.DataFrame or iterable
        Either a single dataframe, or an iterable (possibly nested) of
        dataframes.

    Returns
    -------
    list
        Column labels for which ``nunique()`` is below 2. When an iterable
        is passed, the per-dataframe results are concatenated in iteration
        order, so a label that is constant in several dataframes appears
        once per dataframe.
    """
    # Dispatch on the type explicitly. Probing with iter() and catching
    # TypeError only worked by accident for integer column labels: iterating
    # a DataFrame yields its labels, and string labels are themselves
    # iterable, which sent the recursion into an endless loop.
    if isinstance(dfs, pd.DataFrame):
        return [col for col in dfs if dfs[col].nunique() < 2]

    result = []
    for df in dfs:
        result.extend(detect_constant_columns(df))
    return result

In [3]:
def headers_pxs(df):
    """Build the PxS header names for a dataframe.

    Each column label of ``df`` is turned into the string ``att_<label>``;
    the result is a list in column order.
    """
    make_name = "att_{}".format
    return list(map(make_name, df.columns.values))

# Functions

In [4]:
def raw_to_step_01(ds):
    """Convert the raw StarAI train/test files of dataset ``ds`` to step-01 CSVs.

    The raw ``.data`` files are read with ``pd.read_csv`` without a header
    row. Every column reported by ``detect_constant_columns`` for either
    split is dropped from both splits, and the results are written to the
    step-01 locations without index or header. Returns ``None``.
    """
    splits = ("train", "test")

    # Resolve both raw paths first, then load them into memory.
    raw_paths = [
        original_filename(
            ds, category="starai", extension="data", train_or_test=split
        )
        for split in splits
    ]
    frames = [pd.read_csv(path, header=None) for path in raw_paths]

    # Drop the same set of constant columns from both splits so that their
    # layouts stay aligned.
    to_drop = detect_constant_columns(frames)
    frames = [frame.drop(to_drop, axis=1) for frame in frames]
    assert frames[0].columns.equals(frames[1].columns)

    # Resolve the output paths, then write train followed by test.
    out_paths = [
        filename_dataset(ds, step=1, suffix=split, extension="csv")
        for split in splits
    ]
    for frame, path in zip(frames, out_paths):
        frame.to_csv(path, index=False, header=False)

    return None

In [5]:
def step_01_to_step_02(ds):
    """Produce the step-02 files of dataset ``ds`` from its step-01 CSVs.

    Two variants are written per split: a plain copy (no index, no header)
    and a "pxs" copy whose header row is the ``att_<column>`` names from
    ``headers_pxs``. Returns ``None``.
    """
    csv = "csv"
    splits = ("train", "test")

    # Load the step-01 outputs; they were written without a header row.
    step1_paths = [
        filename_dataset(ds, step=1, suffix=split, extension=csv)
        for split in splits
    ]
    frames = [pd.read_csv(path, header=None) for path in step1_paths]
    train_frame, test_frame = frames

    # Both splits must yield the same PxS header.
    pxs_headers = headers_pxs(train_frame)
    assert pxs_headers == headers_pxs(test_frame)

    # Plain variant: same content, still headerless.
    plain_paths = [
        filename_dataset(ds, step=2, suffix=split, extension=csv)
        for split in splits
    ]
    for frame, path in zip(frames, plain_paths):
        frame.to_csv(path, index=False, header=False)

    # PxS variant: rename the columns, then write with the header row.
    pxs_tag = "pxs"
    pxs_paths = [
        filename_dataset(ds, step=2, suffix=[split, pxs_tag], extension=csv)
        for split in splits
    ]
    for frame in frames:
        frame.columns = pxs_headers
    for frame, path in zip(frames, pxs_paths):
        frame.to_csv(path, index=False)

    return None

# Raw -> step-01

In [6]:
# NOTE(review): the return value of this call is discarded; presumably a
# leftover sanity check of the path helper -- confirm whether it is needed.
original_filename("accidents", category="starai", extension="data")


# Dataset used for the single-dataset trial run of both steps.
ds = "nltcs"

# Run step 1 on that one dataset before processing all of them.
raw_to_step_01(ds)

# step-01 -> step-02

Create the PxS headers and write both the plain and the PxS variant of each file.

In [7]:
# Run step 2 on the dataset selected above (`ds`); reads the step-01 output.
step_01_to_step_02(ds)

# Do all

In [8]:
from joblib import Parallel, delayed

# List every entry of the directory two levels above the raw 'nltcs' file.
# Presumably each entry is one StarAI dataset folder (the printed output
# shows dataset names) -- os.listdir does not filter out stray files and
# returns entries in arbitrary order.
starai_datasets = os.listdir(dirname(dirname(original_filename('nltcs', category='starai'))))
print(starai_datasets)

['cwebkb', 'book', 'bbc', 'kdd', 'ad', 'msnbc', 'tretail', 'msweb', 'jester', 'pumsb_star', 'baudio', 'nltcs', 'plants', 'dna', 'bnetflix', 'voting', 'cr52', 'c20ng', 'kosarek', 'accidents', 'tmovie']


In [11]:
# Run step 1 for every listed dataset, using up to 7 joblib workers.
Parallel(n_jobs=7)(delayed(raw_to_step_01)(ds) for ds in starai_datasets)

print("step 01 done")

step 01 done


In [12]:
# Run step 2 for every listed dataset, using up to 7 joblib workers.
# It reads the step-01 files, so the step-1 cell must have finished first.
Parallel(n_jobs=7)(delayed(step_01_to_step_02)(ds) for ds in starai_datasets)

print("step 02 done")

step 02 done
