# Generate queries

This notebook generates queries. It has not been turned into a command-line script yet, but that seems to be OK for now.

## Prelims

In [1]:
# imports
import joblib
import json
import pandas as pd
import os
import sys

from inspect import signature
from os.path import dirname
from joblib import Parallel, delayed

In [2]:
# Custom imports

root_dir = dirname(dirname(os.getcwd()))
src_dir = os.path.join(root_dir, 'src')
sys.path.append(src_dir)

from exp.query.generation import *
from exp.query.encoding import *
from exp.utils.extra import generate_keychain
from exp.utils import filesystem as fs

In [3]:
from exp.runner.RunExp import RunExp
from exp.runner.RunMercs import RunMercs

## Methods

### Config

#### IO

In [4]:
# Experiment runner instance; used below to obtain its default directory configuration.
exp = RunExp()

In [5]:
# Module-level alias of the runner's bound method, so it can be called like a plain function.
default_io_config_dirs = exp.default_io_config_dirs

In [6]:
def default_io_config_file(dirs=None, dataset=None, name=None):
    """
    Build the dict of filenames used for query generation.

    Parameters
    ----------
    dirs: dict
        Directory config. 'resc-data-ds' is always read; 'resc-query-ds-codes'
        and 'resc-query-ds-config' are read only when query filenames are built.
    dataset:
        Dataset identifier, forwarded to fs.gen_derived_fnames.
    name: str
        Query name, embedded in the generated filenames.

    Returns
    -------
    dict
        Always holds 'train_data'. Holds 'qry-codes' and 'qry-config' as well
        when both `name` and `dataset` are given.
    """
    file_config = {
        'train_data': fs.collect_fnames_from_folder(dirs['resc-data-ds'],
                                                    criteria=['Train'],
                                                    indexed=True),
    }

    # Query filenames can only be derived when both pieces are known.
    if name is None or dataset is None:
        return file_config

    assert isinstance(name, str)

    # Query codes (generated without a file extension).
    codes_basename = generate_keychain(['qry', 'codes', name], sep='_')
    codes_fnames = fs.gen_derived_fnames(dataset,
                                         name=codes_basename,
                                         extension='',
                                         dname=dirs['resc-query-ds-codes'],
                                         indexed=False)
    file_config['qry-codes'] = codes_fnames[0]

    # Query config (json).
    config_basename = generate_keychain(['qry', 'config', name], sep='_')
    config_fnames = fs.gen_derived_fnames(dataset,
                                          name=config_basename,
                                          extension='json',
                                          dname=dirs['resc-query-ds-config'],
                                          indexed=False)
    file_config['qry-config'] = config_fnames[0]

    return file_config

#### Other

In [7]:
def default_query_parameters():
    """
    Return a fresh dict with the default query parameters.
    """
    return dict(qry_mode='basic', random_seed=997)

In [8]:
def extract_nb_atts(io_config):
    """
    Count the attributes (columns) of the first training file.

    Parameters
    ----------
    io_config: dict
        io_config['file']['train_data'] must be a sequence whose first entry
        holds the path of a CSV file at position 1.

    Returns
    -------
    int
        Number of columns of that CSV file.
    """
    data_fname = io_config['file']['train_data'][0][1]

    # Only the header row is needed to count columns; nrows=0 avoids parsing
    # the whole (possibly large) dataset.
    header = pd.read_csv(data_fname, nrows=0)
    return len(header.columns)

### Summary

In [9]:
def save_outputs(qry_codes, qry_config, io_config):
    """
    Save query-codes and query-config to files.

    Parameters
    ----------
    qry_codes:
        Query codes, written with np.save.
    qry_config: dict
        Query configuration, written as indented JSON (so it must be
        JSON-serializable).
    io_config: dict
        Provides the target paths under io_config['file']['qry-codes'] and
        io_config['file']['qry-config'].
    """
    # Imported locally: this file never imports numpy explicitly, so `np` was
    # only in scope if one of the star-imports happened to export it.
    import numpy as np

    qry_codes_fname = io_config['file']['qry-codes']
    qry_config_fname = io_config['file']['qry-config']

    # Make sure both target folders exist before writing.
    fs.ensure_dir(os.path.dirname(qry_codes_fname), empty=False)
    fs.ensure_dir(os.path.dirname(qry_config_fname), empty=False)

    # Save both files.
    # N.b., np.save appends '.npy' when the filename does not end in it.
    np.save(qry_codes_fname, qry_codes)

    with open(qry_config_fname, 'w') as f:
        json.dump(qry_config, f, indent=4)
    return

In [10]:
def generate_qry_codes(query_config):
    """
    Compile the queries described by `query_config` and encode them as codes.

    `query_config` is unpacked as keyword arguments into compile_queries; of
    its five return values only the last three are passed on to
    queries_to_codes.
    """
    _unused_a, _unused_b, desc, targ, miss = compile_queries(**query_config)
    return queries_to_codes(desc, targ, miss)

In [11]:
def build_config(**kwargs):
    """
    Assemble the full config (io + query) for one query-generation run.

    Required keyword arguments: 'root_dir', 'dataset' and 'mode'.
    Optional: 'name' (falls back to the value of 'mode').

    Returns a dict with:
        'io':  {'dirs': ..., 'file': ...}
        'qry': nb_atts, the default query parameters, and every keyword
               argument that was passed in (later entries override earlier).
    """
    root_dir, dataset, qry_mode = (kwargs['root_dir'],
                                   kwargs['dataset'],
                                   kwargs['mode'])
    qry_name = kwargs.get('name', qry_mode)

    # IO part: directories first, since the filenames are derived from them.
    io_dirs = default_io_config_dirs(root_dir=root_dir, dataset=dataset)
    io_files = default_io_config_file(dirs=io_dirs,
                                      dataset=dataset,
                                      name=qry_name)
    io_config = {'dirs': io_dirs, 'file': io_files}

    # Query part: defaults are layered first, explicit kwargs win.
    # NOTE(review): the defaults use the key 'qry_mode' while callers pass
    # 'mode', so both keys end up in the result -- confirm which one
    # compile_queries actually reads.
    qry_config = {'nb_atts': extract_nb_atts(io_config)}
    qry_config.update(default_query_parameters())
    qry_config.update(kwargs)

    return {'io': io_config, 'qry': qry_config}

In [12]:
def main(config):
    """
    Generate the query codes for `config` and write codes + config to disk.

    `config` must hold the sub-dicts 'io' and 'qry'.
    """
    io_config, qry_config = config['io'], config['qry']

    save_outputs(generate_qry_codes(qry_config), qry_config, io_config)

## Factory-Floor

In [13]:
# Trial run of the config builder on a single dataset.
ds = 'nltcs'
query_mode = 'iterative'
config = build_config(root_dir=root_dir, dataset=ds, mode=query_mode, name='it-S', nb_diff_configs=10)
# Uncomment to inspect the generated filenames:
#config['io']['file']

In [14]:
# Generate the codes for the trial config directly, i.e. without going through save_outputs.
qry_config = config['qry']
qry_codes = generate_qry_codes(qry_config)

In [15]:
qry_codes[0:11]

array([[ 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, -1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, -1, -1,  1,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, -1, -1,  1,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0],
       [ 0, -1, -1, -1,  1,  0,  0,  0, -1,  0,  0, -1,  0, -1,  0,  0],
       [-1, -1, -1, -1,  1,  0,  0,  0, -1,  0,  0, -1,  0, -1,  0,  0],
       [-1, -1, -1, -1,  1,  0,  0, -1, -1,  0,  0, -1,  0, -1, -1,  0],
       [-1, -1, -1, -1,  1,  0,  0, -1, -1,  0,  0, -1,  0, -1, -1, -1],
       [-1, -1, -1, -1,  1,  0,  0, -1, -1,  0, -1, -1, -1, -1, -1, -1],
       [-1, -1, -1, -1,  1,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0]])

In [16]:
len(qry_codes)

100

### Extract datasets

In [17]:
# Directory config requested without a dataset.
dirs = default_io_config_dirs(root_dir=root_dir)
# presumably every entry under 'resc-data' is one dataset folder -- verify
all_datasets = os.listdir(dirs['resc-data'])

In [18]:
all_datasets 

['adult',
 'cwebkb',
 'book',
 'bbc',
 'kdd',
 'ad',
 'msnbc',
 'tretail',
 'msweb',
 'jester',
 'pumsb_star',
 'baudio',
 'nltcs',
 'plants',
 'dna',
 'bnetflix',
 'voting',
 'cr52',
 'c20ng',
 'kosarek',
 'accidents',
 'tmovie']

In [19]:
dirs

{'cmd': '/cw/dtailocal/Dropbox/Files/KUL/research/codebases/homework/cmd/',
 'prod': '/cw/dtailocal/Dropbox/Files/KUL/research/codebases/homework/prod/',
 'resc': '/cw/dtailocal/Dropbox/Files/KUL/research/codebases/homework/resc/',
 'resc-data': '/cw/dtailocal/Dropbox/Files/KUL/research/codebases/homework/resc/data/tidy/',
 'resc-models': '/cw/dtailocal/Dropbox/Files/KUL/research/codebases/homework/resc/models/',
 'resc-query': '/cw/dtailocal/Dropbox/Files/KUL/research/codebases/homework/resc/query/',
 'root': '/cw/dtailocal/Dropbox/Files/KUL/research/codebases/homework'}

### Loop

In [21]:
all_datasets

['adult',
 'cwebkb',
 'book',
 'bbc',
 'kdd',
 'ad',
 'msnbc',
 'tretail',
 'msweb',
 'jester',
 'pumsb_star',
 'baudio',
 'nltcs',
 'plants',
 'dna',
 'bnetflix',
 'voting',
 'cr52',
 'c20ng',
 'kosarek',
 'accidents',
 'tmovie']

In [23]:
# Generate and save the 'it-S' iterative queries for every dataset.
for ds in all_datasets:
    query_name, query_mode = 'it-S', 'iterative'

    config = build_config(root_dir=root_dir,
                          dataset=ds,
                          mode=query_mode,
                          name=query_name,
                          nb_diff_configs=10)
    main(config)

    # Progress report, one per dataset.
    msg = (
        "\n"
        "    Done with query generation for dataset:    {}\n"
        "    Query mode was: {}\n"
        "    "
    ).format(ds, query_mode)
    print(msg)


    Done with query generation for dataset:    adult
    Query mode was: iterative
    

    Done with query generation for dataset:    cwebkb
    Query mode was: iterative
    

    Done with query generation for dataset:    book
    Query mode was: iterative
    

    Done with query generation for dataset:    bbc
    Query mode was: iterative
    

    Done with query generation for dataset:    kdd
    Query mode was: iterative
    

    Done with query generation for dataset:    ad
    Query mode was: iterative
    

    Done with query generation for dataset:    msnbc
    Query mode was: iterative
    

    Done with query generation for dataset:    tretail
    Query mode was: iterative
    

    Done with query generation for dataset:    msweb
    Query mode was: iterative
    

    Done with query generation for dataset:    jester
    Query mode was: iterative
    

    Done with query generation for dataset:    pumsb_star
    Query mode was: iterative
    

    Done with query