# StarAI - Queries

Generating queries for the `STARAI` datasets.

# Preliminaries

In [1]:
# Black Codeformatter
%load_ext lab_black

## Constants

In [2]:
RANDOM_STATE = 42  # Base seed handed to `generate_queries` for every dataset.
N_JOBS = 4  # Number of parallel joblib workers; set to the number of cores on your CPU.

## Imports

In [3]:
import pandas as pd
import os
import numpy as np
from joblib import Parallel, delayed
import fileinput
import warnings

from sklearn.model_selection import train_test_split

import mercs
from mercs.utils.encoding import query_to_code, code_to_query

In [4]:
import elki_interface

from elki_interface.exps import (
    query_filepath,
    dataset_filepath,
    get_starai_dataset_names,
)

## Functions

In [5]:
def generate_query(nb_atts, targ_idx=-1, nb_qry=10, random_state=42):
    """Build a chain of queries with a growing set of missing attributes.

    The first query uses every non-target attribute as descriptive and has
    no missing attributes. Each following query is derived from the previous
    one by moving some descriptive attributes to the missing set.

    Parameters
    ----------
    nb_atts : int
        Total number of attributes; attribute ids are ``0 .. nb_atts - 1``.
    targ_idx : int, default -1
        Index of the single target attribute (the last attribute by default).
    nb_qry : int, default 10
        Number of points on the "attributes made missing" schedule.
    random_state : int, default 42
        Seed forwarded unchanged to every ``_transfer_contents`` call.

    Returns
    -------
    tuple of (list, list, list)
        ``(q_desc, q_targ, q_miss)``: parallel lists holding, per query, the
        descriptive, target and missing attribute ids.
    """
    all_ids = list(range(nb_atts))
    targ_ids = [all_ids[targ_idx]]
    desc_ids = [att for att in all_ids if att not in targ_ids]
    miss_ids = []

    # Query 0: nothing is missing yet.
    q_desc, q_targ, q_miss = [desc_ids], [targ_ids], [miss_ids]

    # Cumulative count of missing attributes per query; the consecutive
    # differences say how many attributes to move at each step.
    cumulative_missing = np.linspace(
        0, nb_atts - 1, nb_qry, endpoint=False, dtype=int
    )

    for step_size in np.ediff1d(cumulative_missing):
        desc_ids, miss_ids = _transfer_contents(
            desc_ids,
            miss_ids,
            nb_items_to_transfer=step_size,
            random_state=random_state,
        )

        q_desc.append(desc_ids)
        q_targ.append(targ_ids)
        q_miss.append(miss_ids)

    return q_desc, q_targ, q_miss


def _transfer_contents(list_one, list_two, nb_items_to_transfer=1, random_state=42):
    np.random.seed(random_state)

    list_one, list_two = list_one.copy(), list_two.copy()

    idx_to_transfer = np.random.choice(
        range(len(list_one)), nb_items_to_transfer, replace=False
    )
    content_to_transfer = [
        e for idx, e in enumerate(list_one) if idx in idx_to_transfer
    ]

    for e in content_to_transfer:
        list_one.remove(e)
        list_two.append(e)

    return list_one, list_two

In [6]:
def generate_queries(dataset, max_nb_queries=10, random_state=42, nb_iterations=10):
    """Generate and save the default query codes for one dataset.

    The number of attributes is read from the dataset's test CSV. Then
    ``nb_iterations`` target attributes are sampled (with replacement), and
    for each target a chain of queries is built with ``generate_query`` and
    encoded with ``query_to_code``. All codes are stacked and written with
    ``np.save`` to the path given by ``query_filepath(dataset,
    keyword="default")``.

    Parameters
    ----------
    dataset : str
        Dataset name, as understood by ``dataset_filepath``/``query_filepath``.
    max_nb_queries : int, default 10
        Upper bound on the number of queries per target; the actual number is
        ``min(nb_atts - 1, max_nb_queries)``.
    random_state : int, default 42
        Seed for the target sampling; iteration ``i`` passes
        ``random_state + i`` to ``generate_query``.
    nb_iterations : int, default 10
        Number of target attributes to sample.

    Returns
    -------
    None
        The result is written to disk.
    """
    q_codes = []

    # Derive parameters from the test split of the dataset
    fn = dataset_filepath(dataset, step=1, kind="test", extension="csv")
    df = pd.read_csv(fn, header=None, index_col=None)

    nb_atts = len(df.columns)
    nb_qry = min(nb_atts - 1, max_nb_queries)

    # Sample the targets from a dedicated, seeded generator. Using the global
    # `np.random.choice` here made the chosen targets depend on whatever state
    # the (possibly reused) worker process happened to be in, so the saved
    # queries were not reproducible from `random_state`.
    rng = np.random.RandomState(random_state)
    targ_ids = rng.choice(nb_atts, nb_iterations, replace=True)

    for i, target_idx in enumerate(targ_ids):
        # One chain of queries per sampled target, each with its own seed
        q_desc, q_targ, q_miss = generate_query(
            nb_atts, targ_idx=target_idx, nb_qry=nb_qry, random_state=random_state + i
        )

        for q_idx in range(nb_qry):
            q_codes.append(query_to_code(q_desc[q_idx], q_targ[q_idx], q_miss[q_idx]))

    q_codes = np.r_[q_codes]  # Convert to proper np.ndarray

    # Save
    fn_qry = query_filepath(dataset, keyword="default")
    np.save(fn_qry, q_codes)

    return

# Flow

In [7]:
# Dataset names to process; one query file is generated per name in the next cell.
starai_dataset_names = get_starai_dataset_names()
starai_dataset_names

In [11]:
# Generate the query file of every dataset, spread over N_JOBS workers.
Parallel(n_jobs=N_JOBS, verbose=51)(
    delayed(generate_queries)(ds, random_state=RANDOM_STATE)
    for ds in starai_dataset_names
)

print(
    """

Making STARAI queries done."""
)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:    2.7s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:    3.9s
[Parallel(

lgtm