# Visualization sandbox

First notebook in which I try to visualize my new data structures.

## Imports

In [1]:
# Imports
import os
import numpy as np
import pandas as pd
import json
import sys
import pickle as pkl
import warnings

from os.path import dirname

In [2]:
# Custom: make the project's `src` directory importable, then import from it.

# `root_dir` is two directory levels above the current working directory, so
# this only resolves correctly when the kernel's working directory sits two
# levels below the project root (presumably the notebook's own folder -- TODO confirm).
root_dir = dirname(dirname(os.getcwd()))
src_dir = os.path.join(root_dir, 'src')
sys.path.append(src_dir)

# Project-local imports; these require the sys.path tweak above.
import exp
from exp.utils.extra import mem_usage
from exp.runner.RunExp import RunExp
from exp.runner.RunMercs import RunMercs

## Methods

In [3]:
def merge_aggregated_outputs_multiple_exps(exp_idxs, **kwargs):
    """
    Merge aggregated outputs from multiple experiments.

    Parameters
    ----------
    exp_idxs : iterable
        Experiment identifiers. Each one is passed as `exp_idx` to
        `collect_aggregated_outputs_from_exp`.
    **kwargs
        Forwarded unchanged to `collect_aggregated_outputs_from_exp`.

    Returns
    -------
    dict
        Maps every output kind produced by at least one experiment to the
        row-wise concatenation (`sort=False`) of that kind's frames, in the
        order of `exp_idxs`. An empty `exp_idxs` yields an empty dict.
    """

    # Gather the frames per kind first and concatenate once at the end.
    # Rebuilding the merged dict from only the current experiment's keys would
    # silently drop a kind that a later experiment does not provide, and
    # concatenating inside the loop copies the accumulated data every time.
    frames_per_kind = {}
    for exp_idx in exp_idxs:
        outputs = collect_aggregated_outputs_from_exp(exp_idx, **kwargs)
        for kind, frame in outputs.items():
            frames_per_kind.setdefault(kind, []).append(frame)

    return {kind: pd.concat(frames, sort=False)
            for kind, frames in frames_per_kind.items()}

def collect_aggregated_outputs_from_exp(exp_idx, **kwargs):
    """
    Load every aggregated output of one experiment.

    The experiment is loaded with `RunExp.load(idx=exp_idx, **kwargs)`; for
    each entry of its `aggr_outputs` the matching output is fetched with
    `load_output(kind=...)`.

    Returns a dict mapping each output kind to whatever `load_output`
    returned for it.
    """

    runner = RunExp.load(idx=exp_idx, **kwargs)
    return {kind: runner.load_output(kind=kind)
            for kind in runner.aggr_outputs}

In [4]:
# Transformation 0
def insert_category_dtype(df):
    """
    Cast every non-numeric column of `df` to the `category` dtype.

    Numeric columns are left untouched. The frame is modified in place and
    is also returned, so the call works both as a statement and as an
    expression.
    """
    is_numeric = pd.api.types.is_numeric_dtype

    for column in df:
        if is_numeric(df[column].dtype):
            continue
        df[column] = df[column].astype('category')

    return df

In [5]:
# Transformation 1
def build_f_dict(df):
    """
    Build the per-column aggregation spec used to collapse groups of `df`.

    Numeric columns are averaged (`'mean'`); every other column keeps the
    first value of its group (`'first'`).

    The aggregations are given by name rather than as callables: handing
    `np.mean` to a groupby `.agg` is deprecated in recent pandas releases,
    while the string form selects the same built-in mean without a warning.

    Returns a dict `{column: aggregation name}` suitable for
    `df.groupby(...).agg(...)`.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    return {col: 'mean' if is_numeric(df[col]) else 'first'
            for col in df}

In [6]:
# Transformation 2
def insert_targ_idx_column(df):
    """
    Add a `t_idx` column holding the integer category code of `targ`.

    `targ` is cast to `category` before its codes are read, so the function
    also works when the column is not categorical (for example after an
    aggregation returned it as plain `object`). A column that is already
    categorical keeps its existing categories, and therefore its codes.
    The `targ` column itself is not changed.

    The frame is modified in place and returned.
    """
    df['t_idx'] = df['targ'].astype('category').cat.codes
    return df


def insert_base_perf_column(df, column_name='macro_f1', baseline_threshold=0.01):
    """
    Add a `base_perf` column: the most recent baseline score, as a percentage.

    A row is a baseline row when its `perc_miss` is at most
    `baseline_threshold`. For those rows `base_perf` is
    `df[column_name] * 100`; every other row takes the value of the closest
    baseline row above it. Rows that precede the first baseline row stay NaN.

    Only `base_perf` is forward-filled. Missing values in the other columns
    are left as they are instead of being overwritten with the previous
    row's value.

    NOTE(review): the fill follows row order only, so a baseline can carry
    over into the rows of the next experiment if that experiment does not
    start with a baseline row -- confirm the callers' row ordering.

    Parameters
    ----------
    df : DataFrame with a `perc_miss` column and a `column_name` column.
    column_name : name of the score column to scale.
    baseline_threshold : largest `perc_miss` still counted as a baseline row.

    Returns
    -------
    The same frame, modified in place.
    """
    is_baseline = df['perc_miss'] <= baseline_threshold

    # Vectorised on purpose: a row-wise apply is slower and does not produce
    # a column-shaped result for an empty frame.
    baseline_score = (df[column_name] * 100).where(is_baseline)

    df['base_perf'] = baseline_score.ffill()
    return df


def insert_name_column(df, **format_key_kwargs):
    """
    Add a `name` column with the canonical name of every row.

    Each row is handed to `derive_name` with `drop_dataset=True`; any extra
    keyword arguments are passed through to it as well. The frame is
    modified in place and returned.
    """

    def _name_of(row):
        return derive_name(row, drop_dataset=True, **format_key_kwargs)

    df['name'] = df.apply(_name_of, axis=1)
    return df


# Helpers - Insert Name Column
def format_key(k, retain=2, delimiter='.', **kwargs):
    """
    Shorten a delimited key to its last `retain` segments.

    Parameters
    ----------
    k : str
        Key to shorten, e.g. ``'a.b.c'``.
    retain : int
        Number of trailing segments to keep. Keys with at most `retain`
        segments are returned unchanged; `retain=0` yields an empty string.
        A non-negative value is expected.
    delimiter : str
        Separator used both to split `k` and to re-join the kept segments.
    **kwargs
        Accepted and ignored, so callers can pass a shared keyword dict.

    Returns
    -------
    str
        The kept segments joined by `delimiter`.
    """
    parts = k.split(delimiter)

    if len(parts) > retain:
        # `parts[-0:]` is the whole list, not an empty one, so keeping zero
        # segments has to be handled separately.
        parts = parts[-retain:] if retain else []

    return delimiter.join(parts)


def dict_to_str(d):
    """
    Render a dict as ``key=value`` pairs joined by ``|``.

    Example: ``{'ind.type': 'DT', 'sel.its': 4}`` becomes
    ``'ind.type=DT|sel.its=4'``. An empty dict gives an empty string.

    The string is built from the items themselves rather than by editing
    ``str(d)``. Editing the dict's text form corrupts any key or value that
    happens to contain ``", "``, ``": "``, braces or quotes: a value such as
    ``'a, b'`` would be split into what looks like a second pair.
    """
    return "|".join("{}={}".format(key, value) for key, value in d.items())


def derive_name(row, drop_dataset=True, **format_key_kwargs):
    """
    Build the canonical name string for one row.

    Every index label of `row` is shortened with `format_key` (receiving
    `format_key_kwargs`) and mapped to the row's value. When `drop_dataset`
    is true, the entry whose shortened key is ``'dataset'`` is removed. The
    remaining mapping is rendered with `dict_to_str`.

    If two labels shorten to the same key, the later one wins.
    """
    labels = row.index.values.tolist()
    values = row.values.tolist()

    shortened = {}
    for label, value in zip(labels, values):
        shortened[format_key(label, **format_key_kwargs)] = value

    if drop_dataset and 'dataset' in shortened:
        del shortened['dataset']

    return dict_to_str(shortened)


## Factory Floor

Testing Area

In [7]:
# Experiments to visualize; `dfs` maps each aggregated output kind to the
# merged DataFrame over all listed experiments.
exp_idxs = [1]
dfs = merge_aggregated_outputs_multiple_exps(exp_idxs)

### Transformation Zero

Convert all non-numeric columns to categories (after all, that is what they are).

In [8]:
# For every output frame: report memory use, cast the non-numeric columns to
# `category`, and report memory use again (see the paired figures in the output).
for k,v in dfs.items():
    msg = """
    Inserting category dtypes into: {}
    """.format(k)
    print(msg)
    mem_usage(v)
    # Only existing keys are reassigned, so the dict does not change size
    # while it is being iterated.
    dfs[k] = insert_category_dtype(v)
    mem_usage(dfs[k])


    Inserting category dtypes into: timings
    

    6.72 kiloB
    

    6.72 kiloB
    

    Inserting category dtypes into: mod_config
    

    1.60 kiloB
    

    0.77 kiloB
    

    Inserting category dtypes into: results
    

    4.15 kiloB
    

    4.15 kiloB
    

    Inserting category dtypes into: qry_codes
    

    17.04 kiloB
    

    8.92 kiloB
    


### Transformation One

Average over folds, which removes the fold level (`f_idx`) from the MultiIndex.

In [9]:
# Per-fold results; the preview below shows the index levels (idx, f_idx, q_idx).
df_res = dfs['results']
mem_usage(df_res)
df_res.head()


    4.15 kiloB
    


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,macro_f1
idx,f_idx,q_idx,Unnamed: 3_level_1
10,0,0,0.768758
10,0,1,0.775196
10,0,2,0.775028
10,0,3,0.8164
10,0,4,0.813912


In [10]:
# Collapse the fold level: group on (idx, q_idx) and aggregate each column
# according to the spec from build_f_dict.
f_dict = build_f_dict(df_res)
df_res = df_res.groupby(level=['idx', 'q_idx']).agg(f_dict)

mem_usage(df_res)
df_res.head()


    1.15 kiloB
    


Unnamed: 0_level_0,Unnamed: 1_level_0,macro_f1
idx,q_idx,Unnamed: 2_level_1
10,0,0.770414
10,1,0.776917
10,2,0.779902
10,3,0.812266
10,4,0.815355


In [11]:
# Per-fold query codes; columns shown below are `targ` and `perc_miss`.
df_qry = dfs['qry_codes']
mem_usage(df_qry)
df_qry.head()


    8.92 kiloB
    


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,targ,perc_miss
idx,f_idx,q_idx,Unnamed: 3_level_1,Unnamed: 4_level_1
10,0,0,"(0,)",0.0
10,0,1,"(1,)",0.0
10,0,2,"(2,)",0.0
10,0,3,"(3,)",0.0
10,0,4,"(4,)",0.0


In [12]:
# Column dtypes before the fold aggregation: `targ` is categorical here.
df_qry.dtypes

targ         category
perc_miss     float64
dtype: object

In [13]:
# Collapse the fold level of the query frame: group on (idx, q_idx); the
# non-numeric `targ` column keeps the first value of each group.
f_dict = build_f_dict(df_qry)
df_qry = df_qry.groupby(level=['idx', 'q_idx']).agg(f_dict)

mem_usage(df_qry)
df_qry.head()


    3.73 kiloB
    


Unnamed: 0_level_0,Unnamed: 1_level_0,targ,perc_miss
idx,q_idx,Unnamed: 2_level_1,Unnamed: 3_level_1
10,0,"(0,)",0.0
10,1,"(1,)",0.0
10,2,"(2,)",0.0
10,3,"(3,)",0.0
10,4,"(4,)",0.0


In [14]:
# Column dtypes after the fold aggregation: `targ` is reported as `object`,
# i.e. the category dtype did not survive the groupby.
df_qry.dtypes

targ          object
perc_miss    float64
dtype: object

In [15]:
# Per-fold model configuration, indexed by (idx, f_idx) as shown below.
df_cfg = dfs['mod_config']
mem_usage(df_cfg)
df_cfg.head()


    0.77 kiloB
    


Unnamed: 0_level_0,Unnamed: 1_level_0,dataset,ind.type,ind.max_depth,sel.its,sel.param
idx,f_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,0,nltcs,DT,8,4,2
10,1,nltcs,DT,8,4,2
10,2,nltcs,DT,8,4,2
10,3,nltcs,DT,8,4,2
10,4,nltcs,DT,8,4,2


In [16]:
# Collapse the fold level of the config frame: group on `idx` only, leaving
# one configuration row per experiment index.
f_dict = build_f_dict(df_cfg)
df_cfg = df_cfg.groupby(level=['idx']).agg(f_dict)

mem_usage(df_cfg)
df_cfg.head()


    0.30 kiloB
    


Unnamed: 0_level_0,dataset,ind.type,ind.max_depth,sel.its,sel.param
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,nltcs,DT,8,4,2
11,jester,DT,8,4,2


### Transformation Two

Add the target-index column (`t_idx`) to the query DataFrame.

In [17]:
# Re-apply the category cast to the aggregated query frame. The return value
# is not needed because insert_category_dtype modifies its argument in place.
mem_usage(df_qry)
insert_category_dtype(df_qry)
mem_usage(df_qry)


    3.73 kiloB
    

    5.67 kiloB
    


In [18]:
# Add `t_idx`, the integer category code of `targ`.
df_qry = insert_targ_idx_column(df_qry)
mem_usage(df_qry)
df_qry.head()


    5.74 kiloB
    


Unnamed: 0_level_0,Unnamed: 1_level_0,targ,perc_miss,t_idx
idx,q_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,0,"(0,)",0.0,0
10,1,"(1,)",0.0,1
10,2,"(2,)",0.0,2
10,3,"(3,)",0.0,3
10,4,"(4,)",0.0,4


### Transformation Four

Add name column to df_cfg.

In [19]:
# Config frame before the `name` column is added.
df_cfg.head()

Unnamed: 0_level_0,dataset,ind.type,ind.max_depth,sel.its,sel.param
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,nltcs,DT,8,4,2
11,jester,DT,8,4,2


In [20]:
# Add the `name` column: one string per row built from the configuration
# columns, with the `dataset` entry left out.
df_cfg = insert_name_column(df_cfg)
mem_usage(df_cfg)
df_cfg.head()


    0.51 kiloB
    


Unnamed: 0_level_0,dataset,ind.type,ind.max_depth,sel.its,sel.param,name
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,nltcs,DT,8,4,2,ind.type=DT|ind.max_depth=8|sel.its=4|sel.param=2
11,jester,DT,8,4,2,ind.type=DT|ind.max_depth=8|sel.its=4|sel.param=2


In [21]:
# Cast the non-numeric columns (now including `name`) to category, in place.
insert_category_dtype(df_cfg)
mem_usage(df_cfg)


    0.59 kiloB
    


### Merge

Merge the query (`df_qry`), configuration (`df_cfg`) and results (`df_res`) frames into a single DataFrame for plotting.

In [22]:
def build_df_plt(df_res, df_qry, df_cfg):
    """
    Combine results, query codes and configuration into one plotting frame.

    Steps:
      1. Inner-join `df_res` with the `t_idx` and `perc_miss` columns of
         `df_qry` on the levels ('idx', 'q_idx').
      2. Add the `base_perf` column, derived from `macro_f1`.
      3. Inner-join with the `dataset` and `name` columns of `df_cfg` on 'idx'.
      4. Re-index on ('idx', 'name', 'q_idx') and sort by that index.

    Expects `df_qry` to provide `t_idx` and `perc_miss`, `df_cfg` to provide
    `dataset` and `name`, and `df_res` to provide `macro_f1`.

    Returns the combined DataFrame; `dataset` stays a regular column.
    """
    
    # Filter df_qry, df_cfg: keep only the columns needed downstream
    df_qry_f = df_qry[['t_idx', 'perc_miss']]
    df_cfg_f = df_cfg[['dataset', 'name']]
    
    # Join df_res, df_qry
    idx_names = df_res.index.names
    join_idx_names = ['idx', 'q_idx']
    
    df_1 = df_res.join(df_qry_f, how='inner', on=join_idx_names)
    # Re-apply the original index level names after the join
    # (presumably the join does not always preserve them -- TODO confirm).
    df_1.index.set_names(idx_names, inplace=True)
    
    # Add Base Performance (modifies df_1 in place and returns it)
    df_1 = insert_base_perf_column(df_1, column_name='macro_f1')
    
    # Join df_1, df_cfg (i.e. add dataset column)
    idx_names = df_1.index.names
    join_idx_names = ['idx']
    
    df_2 = df_1.join(df_cfg_f, how='inner', on=join_idx_names)
    df_2.index.set_names(idx_names, inplace=True)
    
    # New Indices: move `name` into the index, between `idx` and `q_idx`
    idx_names = ['idx', 'name', 'q_idx']
    df_2.reset_index(inplace=True)
    df_2.set_index(idx_names, inplace=True)
    df_2.sort_index(inplace=True)
    
    df_plt = df_2
    return df_plt

In [23]:
# Build the combined plotting frame from the three aggregated frames.
df_plt = build_df_plt(df_res, df_qry, df_cfg)
mem_usage(df_plt)
df_plt.head()


    2.83 kiloB
    


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,macro_f1,t_idx,perc_miss,base_perf,dataset
idx,name,q_idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10,ind.type=DT|ind.max_depth=8|sel.its=4|sel.param=2,0,0.770414,0,0.0,77.04138,nltcs
10,ind.type=DT|ind.max_depth=8|sel.its=4|sel.param=2,1,0.776917,1,0.0,77.69168,nltcs
10,ind.type=DT|ind.max_depth=8|sel.its=4|sel.param=2,2,0.779902,2,0.0,77.9902,nltcs
10,ind.type=DT|ind.max_depth=8|sel.its=4|sel.param=2,3,0.812266,3,0.0,81.22664,nltcs
10,ind.type=DT|ind.max_depth=8|sel.its=4|sel.param=2,4,0.815355,4,0.0,81.53552,nltcs


In [24]:
# Cast non-numeric columns to category, in place. Only columns are visited,
# so the index levels (including `name`) are not converted by this call.
insert_category_dtype(df_plt)
mem_usage(df_plt)


    2.83 kiloB
    


In [25]:
# Final column dtypes of the plotting frame.
df_plt.dtypes

macro_f1      float64
t_idx            int8
perc_miss     float64
base_perf     float64
dataset      category
dtype: object