# Generate Queries

A more general approach to query generation. For the Bayesian Networks experiment, we do not generate a custom dataset for each query.

# Preliminaries

In [1]:
import numpy as np
import pandas as pd
import os
from os.path import dirname


In [2]:
from aaai20.io import filename_dataset, filename_query, original_filename
from mercs.utils.encoding import query_to_code, code_to_query

## Helpers

In [3]:
def generate_query(nb_atts, targ_idx=-1, nb_qry=10, random_state=42):
    """Build a series of queries with a growing set of missing attributes.

    Every query is described by three lists of attribute indices:
    descriptive, target and missing. The target is always the single
    attribute at position ``targ_idx`` (the last one by default). The first
    query uses all remaining attributes as descriptive ones; each following
    query moves some descriptive attributes of the previous query to its
    missing set.

    Parameters
    ----------
    nb_atts
        Total number of attributes.
    targ_idx
        Position of the target attribute within ``range(nb_atts)``.
    nb_qry
        Number of steps used to spread the amount of missing attributes
        between 0 and ``nb_atts - 1`` (upper bound excluded).
    random_state
        Seed handed to every ``transfer_contents`` call.

    Returns
    -------
    tuple
        ``(q_desc, q_targ, q_miss)``, three lists of equal length holding
        the descriptive, target and missing indices of each query.
    """
    all_ids = list(range(nb_atts))
    targ_ids = [all_ids[targ_idx]]
    current_desc = [att for att in all_ids if att not in targ_ids]
    current_miss = []

    # The first query has nothing missing.
    q_desc, q_targ, q_miss = [current_desc], [targ_ids], [current_miss]

    # Cumulative number of missing attributes per query; consecutive
    # differences give how many attributes to move at each step.
    missing_counts = np.linspace(0, nb_atts - 1, nb_qry, endpoint=False, dtype=int)

    for step_size in np.ediff1d(missing_counts):
        current_desc, current_miss = transfer_contents(
            current_desc,
            current_miss,
            nb_items_to_transfer=step_size,
            random_state=random_state,
        )

        q_desc.append(current_desc)
        q_targ.append(targ_ids)
        q_miss.append(current_miss)

    return q_desc, q_targ, q_miss

def transfer_contents(list_one, list_two, nb_items_to_transfer=1, random_state=42):
    """Move randomly chosen items from ``list_one`` to ``list_two``.

    The inputs are copied first, so the caller's lists are never modified.
    Items are selected by position, without replacement, and are appended to
    the second list in the order in which they appeared in the first.

    Parameters
    ----------
    list_one
        Source sequence.
    list_two
        Destination sequence.
    nb_items_to_transfer
        Number of items to move.
    random_state
        Seed for the random selection; the same seed always selects the
        same positions.

    Returns
    -------
    tuple
        ``(remaining, extended)``: new lists holding what is left of
        ``list_one`` and ``list_two`` followed by the moved items.

    Raises
    ------
    ValueError
        Raised by NumPy when more items are requested than are available.
    """
    # A private generator yields the same draws as seeding the global NumPy
    # RNG with `random_state`, without clobbering the caller's random state.
    rng = np.random.RandomState(random_state)

    source, destination = list(list_one), list(list_two)

    drawn = rng.choice(range(len(source)), nb_items_to_transfer, replace=False)
    # Set of plain ints: O(1) membership tests instead of scanning an array.
    chosen_positions = set(np.atleast_1d(drawn).tolist())

    remaining = []
    for position, item in enumerate(source):
        if position in chosen_positions:
            destination.append(item)
        else:
            remaining.append(item)

    return remaining, destination

# Functions

In [4]:
def query_batch(nb_attributes, nb_queries, target_idx=-1, random_state=42):
    """Generate the queries for one target attribute and encode them.

    The descriptive/target/missing index lists produced by
    ``generate_query`` are converted one by one with ``query_to_code`` and
    stacked into a single array.

    Parameters
    ----------
    nb_attributes
        Total number of attributes.
    nb_queries
        Number of queries to generate and encode.
    target_idx
        Position of the target attribute.
    random_state
        Seed forwarded to ``generate_query``.

    Returns
    -------
    numpy.ndarray
        The stacked query codes.
    """
    desc_sets, targ_sets, miss_sets = generate_query(
        nb_attributes, targ_idx=target_idx, nb_qry=nb_queries, random_state=random_state
    )

    encoded = [
        query_to_code(desc_sets[i], targ_sets[i], miss_sets[i])
        for i in range(nb_queries)
    ]

    # Stack the individual codes into one np.ndarray.
    return np.vstack([encoded])

In [5]:
def generate_queries(dataset, max_nb_queries=10, nb_targets=10, random_state=42):
    """Generate the default query codes of a dataset and save them to disk.

    The number of attributes is taken from the dataset's test CSV. A set of
    distinct target attributes is drawn at random, a batch of queries is
    generated for each of them with ``query_batch``, and all codes are
    written with ``np.save`` to the path given by ``filename_query``.

    Parameters
    ----------
    dataset
        Name of the dataset, as understood by ``filename_dataset`` and
        ``filename_query``.
    max_nb_queries
        Upper bound on the number of queries per target; at most
        ``nb_atts - 1`` are generated.
    nb_targets
        Number of distinct target attributes; capped at the number of
        attributes so that sampling without replacement cannot fail.
    random_state
        Seed for the target selection, also forwarded to ``query_batch``.

    Returns
    -------
    None
    """
    # Derive parameters. Only the column count is needed, so reading a
    # single row avoids parsing the whole test set.
    fn_test = filename_dataset(dataset, step=2, suffix='test', extension="csv")
    df_head = pd.read_csv(fn_test, header=None, nrows=1)

    nb_atts = len(df_head.columns)
    nb_qry = min(nb_atts - 1, max_nb_queries)
    # Drawing without replacement raises ValueError when more targets are
    # requested than there are attributes.
    nb_targets = min(nb_targets, nb_atts)

    # A local generator gives the same draws as seeding the global NumPy RNG
    # with `random_state`, without modifying global random state.
    rng = np.random.RandomState(random_state)
    target_indices = rng.choice(np.arange(nb_atts), nb_targets, replace=False)

    q_codes = [
        query_batch(nb_atts, nb_qry, target_idx=target_idx, random_state=random_state)
        for target_idx in target_indices
    ]

    # One row per query, one column per attribute.
    q_codes = np.vstack([q_codes])
    q_codes = q_codes.reshape(-1, nb_atts)

    # Save
    fn_qry = filename_query(dataset, suffix="default")
    np.save(fn_qry, q_codes)

# Sandbox

In [6]:
from joblib import Parallel, delayed

# List the StarAI datasets: take the original file of one known dataset
# ('nltcs'), go two directory levels up and list what is in there.
nltcs_file = original_filename('nltcs', category='starai')
starai_root = dirname(dirname(nltcs_file))
starai_datasets = os.listdir(starai_root)
print(starai_datasets)

['cwebkb', 'book', 'bbc', 'kdd', 'ad', 'msnbc', 'tretail', 'msweb', 'jester', 'pumsb_star', 'baudio', 'nltcs', 'plants', 'dna', 'bnetflix', 'voting', 'cr52', 'c20ng', 'kosarek', 'accidents', 'tmovie']


In [7]:
# Generate and save the default queries of every listed dataset, with the
# work spread over 7 joblib jobs.
Parallel(n_jobs=7)(
    delayed(generate_queries)(
        dataset_name, max_nb_queries=10, nb_targets=10, random_state=42
    )
    for dataset_name in starai_datasets
)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]