# StarAI - Get data and Preprocess

This notebook summarizes the preprocessing of the StarAI datasets. Cf. https://github.com/UCLA-StarAI/Density-Estimation-Datasets

# Preliminaries

In [1]:
# Enable the Black code formatter through the lab_black IPython extension.
%load_ext lab_black

## Constants

In [2]:
# Worker count passed as n_jobs to joblib.Parallel when all datasets are preprocessed.
# Set it to the number of CPU cores you want to use; more workers speed up the run.
N_JOBS = 4
# Fixed seed constant.
# NOTE(review): no cell visible in this notebook reads it -- confirm it is still needed.
RANDOM_STATE = 42

## Imports

In [3]:
import pandas as pd

# import os
import numpy as np
import warnings
from pathlib import Path

from joblib import Parallel, delayed

In [4]:
import elki_interface

from elki_interface.exps import (
    starai_original_filepath,
    dataset_filepath,
    get_starai_dataset_names,
)

## Functions

In [5]:
def load_raw_starai_dfs(name="nltcs"):
    """Read the raw train and test splits of one StarAI dataset.

    Both files are located through ``starai_original_filepath`` and parsed
    as header-less CSV. When the test split has strictly more rows than the
    train split, a warning is issued and the two frames are swapped, so the
    frame with more rows is the one returned as the training set.

    Parameters
    ----------
    name : str
        Dataset name forwarded to ``starai_original_filepath``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)``.
    """
    # Resolve both paths before reading anything, then read train before test.
    paths = [
        starai_original_filepath(name=name, kind=kind) for kind in ("train", "test")
    ]
    train_frame, test_frame = [pd.read_csv(path, header=None) for path in paths]

    if len(test_frame) > len(train_frame):
        warnings.warn(
            "Test set larger than training set. We assume this is a mistake and we swap them."
        )
        train_frame, test_frame = test_frame, train_frame

    return train_frame, test_frame


def drop_constant_columns(df_train, df_test):
    """Remove from both splits every column flagged as constant in either one.

    The column labels are collected by ``_detect_constant_columns`` over the
    train and the test frame; the same labels are then dropped from both
    frames, so the two results keep identical columns.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(df_train, df_test)`` frames; the inputs are left untouched.
    """
    to_drop = _detect_constant_columns([df_train, df_test])
    trimmed_train = df_train.drop(columns=to_drop)
    trimmed_test = df_test.drop(columns=to_drop)
    return trimmed_train, trimmed_test


def _detect_constant_columns(dfs, constant_column_uvalues=1):
    """Ugly, but works"""
    try:
        result = []
        for df in iter(dfs):
            result += _detect_constant_columns(df)
        return result
    except TypeError:
        # We assume a single dataframe was passed
        return [col for col in dfs if dfs[col].nunique() <= constant_column_uvalues]


def add_headers(df_train, df_test):
    """Give both splits the positional column names produced by ``_headers_pxs``.

    The names are computed for each frame separately and must be equal,
    otherwise an ``AssertionError`` is raised. The ``columns`` attribute of
    both input frames is overwritten in place.

    Returns
    -------
    tuple of pandas.DataFrame
        The same ``(df_train, df_test)`` objects that were passed in.
    """
    train_headers = _headers_pxs(df_train)
    test_headers = _headers_pxs(df_test)
    assert (
        train_headers == test_headers
    ), "Headers for train and test set differ. That is not supposed to happen!"

    df_train.columns = train_headers
    df_test.columns = train_headers
    return df_train, df_test


def _headers_pxs(df):
    return ["att_{}".format(x) for x in range(len(df.columns))]


def save_starai_dfs(df_train, df_test, name="nltcs", step=1):
    """Write the train and test frames of one dataset to CSV.

    The target paths come from ``dataset_filepath``, called with
    ``check=False`` (presumably because the files need not exist yet --
    confirm against ``dataset_filepath``). Both frames are written without
    their index.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to persist.
    name : str
        Dataset name forwarded to ``dataset_filepath``.
    step : int
        Preprocessing step forwarded to ``dataset_filepath``.

    Returns
    -------
    bool
        Always ``True`` once both files have been written.
    """
    # Resolve both targets before the first write, train before test.
    targets = [
        dataset_filepath(name=name, step=step, kind=kind, check=False)
        for kind in ("train", "test")
    ]
    for frame, target in zip((df_train, df_test), targets):
        frame.to_csv(target, index=False)
    return True

In [6]:
def workfow_raw_to_s01(name, verbose=True):
    """Run the raw -> step 01 preprocessing for a single StarAI dataset.

    The raw splits are loaded, constant columns are dropped, positional
    headers are assigned, and the result is saved as step 1.

    Parameters
    ----------
    name : str
        Name of the dataset to process.
    verbose : bool, default True
        Print a message when processing starts and when it has finished.

    Returns
    -------
    bool
        The value returned by ``save_starai_dfs``.
    """
    if verbose:
        print("Start on dataset {}".format(name))

    df_train, df_test = load_raw_starai_dfs(name=name)
    df_train, df_test = drop_constant_columns(df_train, df_test)
    df_train, df_test = add_headers(df_train, df_test)

    # Save before announcing completion, so that "Done" is never printed for a
    # dataset whose files could not be written.
    saved = save_starai_dfs(df_train, df_test, name=name, step=1)

    if verbose:
        print("Done with dataset {}".format(name))

    return saved

# Get Data

In [7]:
%%bash
# Start from a clean slate: delete any previous checkout, then clone the
# StarAI density-estimation datasets into the raw data directory.
# NOTE(review): the remote uses the SSH form of the GitHub URL, which presumably
# needs an SSH key registered with GitHub -- confirm this suits all users.
rm -rf ../../data/raw/datasets-starai
git clone git@github.com:UCLA-StarAI/Density-Estimation-Datasets.git ../../data/raw/datasets-starai

Cloning into '../../data/raw/datasets-starai'...
Checking out files: 100% (100/100), done.


In [8]:
%%bash
# Stop at the first failing command: without this, a failed `cd` would let
# `git checkout` run in the notebook's own working directory instead.
set -e
cd ../../data/raw/datasets-starai
# Pin the datasets to one fixed commit so every run preprocesses the same files.
git checkout 44c51c50c43686c889de21dcd68cb80820abc9b9

Note: checking out '44c51c50c43686c889de21dcd68cb80820abc9b9'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

HEAD is now at 44c51c5 Merge pull request #1 from arranger1044/master


# Preprocess: Raw -> Step 01

This is currently the only step we need. If more preprocessing becomes necessary in the future, it can simply be added as the next step. In that way, the layout and the logic remain flexible.

## Demo

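Sanity check: verify that the raw `nltcs` training file exists, then run the workflow on this single dataset.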

In [9]:
# Resolve the raw training file of the "nltcs" dataset, show the path, and
# fail early with a readable message if the file is not there.
nltcs_filepath = starai_original_filepath(name="nltcs", kind="train")
print(nltcs_filepath)
assert nltcs_filepath.exists(), "This path ({}) does not exist".format(nltcs_filepath)

/home/zissou/repos/missmercs/data/raw/datasets-starai/datasets/nltcs/nltcs.train.data


In [10]:
# Run the complete raw -> step 01 workflow on one dataset; the cell output is its return value.
workfow_raw_to_s01(name="nltcs")

Start on dataset nltcs
Done with dataset nltcs


True

## Preprocess all datasets

Repeat the procedure for all datasets at once.

In [11]:
# Collect the names of all StarAI datasets known to elki_interface.
starai_dataset_names = get_starai_dataset_names()
# Bare last expression: display the list as the cell output.
starai_dataset_names

['accidents',
 'ad',
 'adult',
 'baudio',
 'bbc',
 'bnetflix',
 'book',
 'c20ng',
 'connect4',
 'cr52',
 'cwebkb',
 'dna',
 'jester',
 'kdd',
 'kosarek',
 'moviereview',
 'msnbc',
 'msweb',
 'mushrooms',
 'nips',
 'nltcs',
 'ocr_letters',
 'plants',
 'pumsb_star',
 'rcv1',
 'tmovie',
 'tretail',
 'voting']

In [13]:
# Apply the raw -> step 01 workflow to every dataset, spread over N_JOBS joblib workers.
Parallel(n_jobs=N_JOBS, verbose=51)(
    delayed(workfow_raw_to_s01)(dataset_name) for dataset_name in starai_dataset_names
)

print("Preprocessing StarAI done")

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    3.8s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    4.4s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    4.8s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    5.9s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    7.6s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:    7.8s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    7.9s
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:    8.1s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    8.4s
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:    9.6s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:    9.8s
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:   11.0s
[Parallel(

lgtm