# Collect results

This notebook checks whether the outputs of an experiment run (results, timings, query codes and model configurations) can be collected into tidy DataFrames.

## Preliminaries

In [1]:
# Imports
import os
import numpy as np
import pandas as pd
import json
import sys
import pickle as pkl
import warnings

from os.path import dirname

In [2]:
# Custom

# The project root is taken to be two directory levels above the current
# working directory, so this cell depends on where the kernel was started.
root_dir = dirname(dirname(os.getcwd()))
src_dir = os.path.join(root_dir, 'src')
# Make <root>/src importable for the imports below (presumably where the
# `exp` package lives -- confirm against the repository layout).
sys.path.append(src_dir)

import exp
from exp.utils.extra import mem_usage
from exp.runner.RunExp import RunExp
from exp.runner.RunMercs import RunMercs

## Methods

In [None]:
def load_or_make_runner(idx=None, kind='RunExp', root_dir=None, **kwargs):
    """
    Load a pickled runner from disk, falling back to a freshly made helper.

    A helper of the requested kind is always built and configured first,
    because the helper is what provides the filename under which the actual
    runner is expected to be pickled.

    Parameters
    ----------
    idx:
        Forwarded to ``helper.make_config``.
    kind: str
        'RunExp' or 'RunMercs'. Selects the helper class and is also passed
        to ``helper.get_fname`` to obtain the pickle filename.
    root_dir:
        Forwarded to ``helper.make_config``.
    **kwargs:
        Forwarded to ``helper.make_config``.

    Returns
    -------
    The unpickled runner when its file exists, otherwise the configured
    helper itself (a warning is emitted in that case).

    Raises
    ------
    ValueError
        If ``kind`` is neither 'RunExp' nor 'RunMercs'.
    """

    if kind in {'RunExp'}:
        helper = RunExp()
    elif kind in {'RunMercs'}:
        helper = RunMercs()
    else:
        msg = """
        Did not recognize kind:    {}
        """.format(kind)
        raise ValueError(msg)

    helper.make_config(idx=idx, root_dir=root_dir, **kwargs)
    runner_fname = helper.get_fname(kind)

    if os.path.isfile(runner_fname):
        # NOTE(security): unpickling executes arbitrary code; only load
        # runner files that were produced by this project.
        with open(runner_fname, 'rb') as f:
            runner = pkl.load(f)
    else:
        msg = """
        Could not load the actual runner. Using the self-made helper.
        """
        # Fixed: this used to reference the misspelled name `warninngs`,
        # which raised NameError instead of issuing the warning.
        warnings.warn(msg)
        runner = helper
    return runner

In [None]:
def examine_RunExp(runner):
    """
    Pull the child name, its folds and the explored indices out of a runner.

    The child name and folds are read from ``runner.config``; the indices
    come from the 'idx' entry of the JSON file that
    ``runner.get_fname('exploration')`` points to.

    Returns
    -------
    (child, folds, idxs)
    """
    cfg = runner.config
    child = cfg['child']
    folds = cfg[child]['folds']

    with open(runner.get_fname('exploration'), 'r') as handle:
        idxs = json.load(handle)['idx']

    return child, folds, idxs

In [None]:
def add_multindex(df, idx, f_idx):
    """
    Return a copy of df indexed by ('idx', 'f_idx'[, 'q_idx']).

    The scalar ``idx`` and ``f_idx`` are broadcast into columns of the same
    name and moved into the index. When df already carries a 'q_idx' column
    it becomes the third index level.

    The input frame is left untouched: the columns are added with ``assign``
    rather than written into the caller's object, so the same source frame
    can safely be re-used for several (idx, f_idx) pairs.
    """
    tagged = df.assign(idx=idx, f_idx=f_idx)

    levels = ['idx', 'f_idx']
    if 'q_idx' in tagged.columns:
        levels.append('q_idx')

    return tagged.set_index(levels)

def extract_unique_fnames_kind(df, kind=None):
    """
    Find the column of df that belongs to `kind` and its distinct values.

    The column is the first one whose name contains `kind` as a substring.
    Returning the distinct filenames lets the caller open each file once.

    Returns
    -------
    (column, uniq_fnames)
    """
    matching = [name for name in df.columns if kind in name]
    column = matching[0]
    return column, df[column].unique()

In [None]:
# All things queries
def count_attributes(q_codes, encoding):
    """
    For every row of q_codes, count how many entries equal `encoding`.
    """
    matches = (q_codes == encoding)
    return np.count_nonzero(matches, axis=1)

def extract_attributes(q_codes, encoding):
    """
    For every row of q_codes, list the column positions equal to `encoding`.

    Returns a list with one tuple per row; a row without any match yields an
    empty tuple.
    """
    n_rows = q_codes.shape[0]
    # Each hit is a (row, column) pair, in row-major order.
    hits = np.argwhere(q_codes == encoding)

    per_query = [[] for _ in range(n_rows)]
    for row, col in hits:
        per_query[row].append(col)

    return list(map(tuple, per_query))

def transform_q_codes(q_codes, targ_encoding=1, miss_encoding=-1):
    """
    Summarise a 2D array of query codes as one DataFrame row per query.

    Parameters
    ----------
    q_codes:
        2D array; its shape is unpacked as (number of queries, number of
        attributes).
    targ_encoding:
        Code that marks a target attribute. Defaults to 1, the value that
        used to be hard-coded here.
    miss_encoding:
        Code that marks a missing attribute. Defaults to -1, the value that
        used to be hard-coded here.

    Returns
    -------
    DataFrame with columns
        targ        tuple of column positions equal to `targ_encoding`
        perc_miss   count of `miss_encoding` entries divided by the number
                    of attributes (a fraction, not a percentage)
        q_idx       row position of the query, 0..nb_qrys-1
    """
    nb_qrys, nb_atts = q_codes.shape

    df = pd.DataFrame()
    df['targ'] = extract_attributes(q_codes, targ_encoding)
    df['perc_miss'] = count_attributes(q_codes, miss_encoding) / nb_atts
    df['q_idx'] = np.arange(nb_qrys)

    return df

In [None]:
# All things config
def transform_cfg(cfg):
    """
    Flatten a config dict into a single-row DataFrame.

    The row holds the dataset name followed by every entry of the child's
    sub-config, i.e. of ``cfg[cfg['child']]``, in that dict's own order.

    Parameters
    ----------
    cfg: dict
        Must contain 'child', 'dataset' and a sub-dict stored under the key
        named by ``cfg['child']``.

    Returns
    -------
    DataFrame with one row and columns ('dataset', *sub-config keys).
    """
    # Prelims
    child = cfg['child']
    dataset = cfg['dataset']
    mod_cfg = cfg[child]

    # Actual transformation: header and row are built in the same order so
    # that names and values line up.
    head_tuple = ('dataset', *mod_cfg.keys())
    data_tuple = (dataset, *mod_cfg.values())

    return pd.DataFrame.from_records([data_tuple], columns=head_tuple)

## Collect DFs

In [None]:
# General methods

def _read_single_output(fn, kind):
    """Read one output file of the given kind and return it as a DataFrame."""
    if kind in {'results', 'timings'}:
        return pd.read_csv(fn)                     # Reading csv

    if kind in {'qry_codes'}:
        q_codes = np.load(fn)                      # Reading npy
        return transform_q_codes(q_codes)          # Transformation

    if kind in {'mod_config'}:
        with open(fn, 'r') as f:
            cfg = json.load(f)                     # Reading json
        return transform_cfg(cfg)                  # Transformation

    msg = """
            Did not recognize kind:\t{}
            """.format(kind)
    raise ValueError(msg)


def aggregate_outputs(df_fns, kind='results'):
    """
    Collect in such a way that no single file is accessed more than once.

    Parameters
    ----------
    df_fns: DataFrame
        Needs the columns 'idx' and 'f_idx' plus one column whose name
        contains `kind` and which holds the filename of that output.
    kind: str
        One of 'results', 'timings', 'qry_codes', 'mod_config'.

    Returns
    -------
    DataFrame
        Every file's content, repeated once per (idx, f_idx) pair that
        refers to it and indexed via ``add_multindex``. An empty DataFrame
        is returned when df_fns lists no files.

    Raises
    ------
    ValueError
        If a file has to be read for a `kind` that is not recognised.
    """
    column, uniq_fnames = extract_unique_fnames_kind(df_fns, kind=kind)

    # Pieces are gathered in a list and concatenated once at the end;
    # concatenating inside the loop re-copies the accumulated frame on every
    # iteration, which is quadratic in the number of pieces.
    pieces = []
    for fn in uniq_fnames:
        single_df = _read_single_output(fn, kind)

        # Every (idx, f_idx) pair that points at this file gets its own
        # indexed copy of the file's content.
        pairs = df_fns.loc[df_fns[column] == fn, ['idx', 'f_idx']]
        for idx, f_idx in pairs.itertuples(index=False, name=None):
            pieces.append(add_multindex(single_df, idx, f_idx))

    if not pieces:
        # pd.concat refuses an empty list; keep the empty-frame result.
        return pd.DataFrame()
    return pd.concat(pieces, sort=False)

### Collect fnames

In [None]:
# Load the pickled RunExp runner for idx=1, or fall back to a freshly
# configured helper when no pickle exists (see load_or_make_runner).
runner = load_or_make_runner(idx=1, kind='RunExp', root_dir=root_dir)

In [None]:
# NOTE(review): this is the runner's own `aggregate_outputs` method, not the
# notebook-level function of the same name. Presumably save=True persists the
# aggregated outputs -- confirm against RunExp.
runner.aggregate_outputs(save=True)

In [None]:
# NOTE(review): `df` is bound directly to runner.qry_codes; nothing in this
# notebook copies it, so later in-place edits of `df` may also change the
# runner's data -- confirm.
df = runner.qry_codes
# Presumably reports the memory footprint of the frame -- see exp.utils.extra.
mem_usage(df)

In [None]:
# The return value is not bound to a name; it is only shown as cell output.
runner.load_output('qry_codes')

In [None]:
# Scratch experiment: add a constant column to both frames, in place.
# NOTE(review): `df2` is never assigned in the visible notebook, so this cell
# raises NameError on a fresh kernel unless it is defined elsewhere. It may
# have been meant to hold the result of runner.load_output('qry_codes') --
# confirm.
df['elia'] = 'cool'
df2['elia'] = 'cool'
df.head()

In [None]:
# Scratch experiment: overwrite every value of `df` with NaN, in place.
# Re-running earlier cells that display `df` will not restore the old values.
df[:] = np.nan
df.head()

In [None]:
# Stack the two scratch frames row-wise; sort=False keeps the column order.
# NOTE(review): depends on `df2`, which the visible notebook never assigns.
df3 = pd.concat([df,df2], sort=False)
df3

### Collect Outputs

In [None]:
# NOTE(review): `df_fns` is never assigned in the visible notebook; this cell
# fails on a fresh kernel unless it is defined elsewhere.
aggregate_outputs(df_fns, kind='results').head()

In [None]:
# Same aggregation for the timing files (read as csv).
# NOTE(review): `df_fns` is never assigned in the visible notebook.
aggregate_outputs(df_fns, kind='timings').head()

In [None]:
# Same aggregation for the query codes (npy files run through transform_q_codes).
# NOTE(review): `df_fns` is never assigned in the visible notebook.
aggregate_outputs(df_fns, kind='qry_codes').head()

In [None]:
# Rebinds `df` (previously runner.qry_codes) to the aggregated model configs.
# NOTE(review): `df_fns` is never assigned in the visible notebook; the bare
# `df` below displays the whole frame rather than a preview.
df = aggregate_outputs(df_fns, kind='mod_config')
df

In [None]:
# Group on the first index level (the 'idx' level when the index was built by
# add_multindex).
gb = df.groupby(level=0)

In [None]:
# Print every group key followed by the rows that belong to it.
for n,g in gb:
    print(n)
    print(g)