# Generate models - sandbox

Notebook that generates models. These models are meant to serve as baselines (standard models) that reduce training time and increase reproducibility.

## Preliminaries

In [1]:
# imports

import json
import pandas as pd
import pickle as pkl
import os
import sys
from inspect import signature

from os.path import dirname

In [2]:
# Custom imports

# Locate the project root two directory levels above the current working
# directory, then make its `src` and `libs` folders importable.
root_dir = dirname(dirname(os.getcwd()))
src_dir, libs_dir = (os.path.join(root_dir, sub) for sub in ('src', 'libs'))
sys.path.extend([src_dir, libs_dir])

from exp.runner.RunExp import RunExp
from exp.utils import filesystem as fs
from exp.utils.extra import generate_keychain

import mercs
import PxS as pxs

## Methods

### Config

#### IO

In [3]:
# Instantiate the project's experiment runner. In this part of the notebook it
# is only used as the provider of the directory-configuration builder below.
exp = RunExp()
# Module-level handle to the runner's directory-config builder; it is called
# further down with a `root_dir` (and optionally a `dataset`) keyword argument.
default_io_config_dirs = exp.default_io_config_dirs

In [4]:
def default_io_config_file(dirs=None, dataset=None, mod_type=None, mod_kw=None):
    """
    Assemble the file-name section of the IO configuration.

    The training files found in `dirs['resc-data-ds']` are always collected.
    File names for the models and for the model configuration are only
    derived when `dataset`, `mod_type` and `mod_kw` are all given.

    Parameters
    ----------
    dirs: mapping
        Directory lookup. Reads the keys 'resc-data-ds' and, when model
        file names are derived, 'resc-models-ds-models' and
        'resc-models-ds-config'. Despite the `None` default it is required,
        because it is subscripted unconditionally.
    dataset: str, optional
        Dataset name, used to derive the model-config file name.
    mod_type: str, optional
        Model type, becomes part of the derived file names.
    mod_kw: str, optional
        Model keyword, becomes part of the derived file names.

    Returns
    -------
    dict
        Always holds 'train_data'; additionally holds 'mod' and 'mod-config'
        when all three optional identifiers were supplied.

    Raises
    ------
    AssertionError
        If all three identifiers are supplied but one of them is not a `str`.
    """
    training_files = fs.collect_fnames_from_folder(dirs['resc-data-ds'],
                                                   criteria=['Train'],
                                                   indexed=True)
    file_config = {'train_data': training_files}

    # Without a complete (dataset, mod_type, mod_kw) triple there is nothing
    # model-related to derive.
    if dataset is None or mod_type is None or mod_kw is None:
        return file_config

    for identifier in (dataset, mod_type, mod_kw):
        assert isinstance(identifier, str)

    # One model file per training file
    model_stem = generate_keychain(['mod', mod_type, mod_kw], sep='_')
    file_config['mod'] = fs.gen_derived_fnames(
        training_files,
        name=model_stem,
        extension='pkl',
        dname=dirs['resc-models-ds-models'],
        indexed=True)

    # A single model-configuration file, derived from the dataset name
    config_stem = generate_keychain(['mod', 'config', mod_type, mod_kw], sep='_')
    config_files = fs.gen_derived_fnames(
        dataset,
        name=config_stem,
        extension='json',
        dname=dirs['resc-models-ds-config'],
        indexed=False)
    file_config['mod-config'] = config_files[0]

    return file_config

In [5]:
def load_train_data(io_config, fold):
    """
    Load the training set of a single fold.

    Parameters
    ----------
    io_config: dict
        IO configuration. `io_config['file']['train_data']` must be an
        iterable of indexable entries whose item 0 is the fold identifier
        and whose item 1 is the path of the CSV file for that fold.
    fold:
        Identifier of the fold to load. When several entries match, the
        first one is used.

    Returns
    -------
    pandas.DataFrame
        Contents of the fold's training file, read with `pd.read_csv`.

    Raises
    ------
    IndexError
        If no training file is registered for `fold`.
    """
    entries = list(io_config['file']['train_data'])
    matches = [entry[1] for entry in entries if entry[0] == fold]

    if not matches:
        # Same exception type as the bare `[...][0]` lookup would raise, but
        # with a message that tells the user what went wrong.
        known_folds = [entry[0] for entry in entries]
        msg = "No training data registered for fold {!r}. Available folds: {}".format(
            fold, known_folds)
        raise IndexError(msg)

    return pd.read_csv(matches[0])

In [6]:
def save_outputs(mod, mod_config, io_config, fold):
    """
    Persist a trained model together with its configuration.

    Parameters
    ----------
    mod:
        Trained model; pickled to the file registered for `fold` in
        `io_config['file']['mod']` (entries are read as item 0 = fold,
        item 1 = path).
    mod_config:
        JSON-serialisable model configuration; written with an indent of 4
        to `io_config['file']['mod-config']`.
    io_config: dict
        IO configuration holding the target file names.
    fold:
        Identifier of the fold the model belongs to.

    Raises
    ------
    IndexError
        If no model file is registered for `fold` (raised before anything
        is written).
    """
    file_config = io_config['file']
    candidates = [entry[1] for entry in file_config['mod'] if entry[0] == fold]
    model_path = candidates[0]
    config_path = file_config['mod-config']

    # Configuration goes first, as JSON.
    # NOTE(review): `fs.ensure_dir` presumably creates the folder when missing
    # and, with empty=False, keeps existing content - confirm in exp.utils.
    fs.ensure_dir(os.path.dirname(config_path), empty=False)
    with open(config_path, 'w') as handle:
        json.dump(mod_config, handle, indent=4)

    # Then the pickled model.
    fs.ensure_dir(os.path.dirname(model_path), empty=False)
    with open(model_path, 'wb') as handle:
        pkl.dump(mod, handle)

#### Induction

In [7]:
def induction(train_data, mod_type, mod_config):
    """
    Create a model of the requested type and fit it on the training data.

    Parameters
    ----------
    train_data:
        Training data, handed to the model's `fit` method unchanged.
    mod_type: str
        Model type. Only MERCS is supported; the name is matched
        case-insensitively ('Mercs', 'mercs', ...).
    mod_config: dict
        Keyword arguments forwarded to `fit`. Must not contain the key
        'delimiter', because `delimiter='.'` is always passed explicitly and
        a duplicate keyword makes the call raise `TypeError`.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If `mod_type` is not a recognized model type.
    """
    # Match case-insensitively: `build_config` defaults `mod_type` to 'mercs'
    # whereas the notebook passes 'Mercs'; both have to select the same model.
    if isinstance(mod_type, str) and mod_type.lower() == 'mercs':
        model = mercs.MERCS()
    else:
        msg = """
        Did not recognize model type: {}
        """.format(mod_type)
        raise ValueError(msg)

    model.fit(train_data, **mod_config, delimiter='.')

    return model

In [8]:
def generate_model(config, fold):
    """
    Train the model described by `config` on the training data of one fold.

    Parameters
    ----------
    config: dict
        Full configuration; the keys 'io', 'mod_type' and 'mod' are read.
    fold:
        Identifier of the fold whose training data is used.

    Returns
    -------
    The model returned by `induction`.
    """
    # Pull everything out of the config before any data is loaded
    io_config, mod_type, mod_config = config['io'], config['mod_type'], config['mod']

    return induction(load_train_data(io_config, fold), mod_type, mod_config)

### Summary

In [9]:
def build_config(**kwargs):
    """
    Build the complete configuration dictionary for a model-generation run.

    Keyword arguments
    -----------------
    root_dir: str, default ''
        Project root, forwarded to `default_io_config_dirs`.
    dataset: str, default 'nursery'
        Name of the dataset.
    mod_type: str, default 'mercs'
        Model type.
    mod_kw: str, default 'basic'
        Keyword identifying this model variant.
    mod_cfg: dict, default {}
        Model configuration, stored under 'mod'.
    folds: iterable, optional
        Folds to process. Must be a subset of the folds for which training
        data exists; when omitted, all of those folds are used.

    Returns
    -------
    dict
        With keys 'io' (holding 'dirs' and 'file'), 'mod', 'mod_kw',
        'mod_type' and 'folds'.

    Raises
    ------
    AssertionError
        If `folds` contains a fold without training data.
    """
    get = kwargs.get
    root_dir = get('root_dir', '')
    dataset = get('dataset', 'nursery')
    mod_type = get('mod_type', 'mercs')
    mod_kw = get('mod_kw', 'basic')

    config = {
        'io': {},
        'mod': get('mod_cfg', {}),
        'mod_kw': mod_kw,
        'mod_type': mod_type,
    }

    # IO section: directories first, the file names are derived from them
    io_section = config['io']
    io_section['dirs'] = default_io_config_dirs(root_dir=root_dir,
                                                dataset=dataset)
    io_section['file'] = default_io_config_file(dirs=io_section['dirs'],
                                                dataset=dataset,
                                                mod_type=mod_type,
                                                mod_kw=mod_kw)

    # Folds: default to every fold that has training data
    available_folds = [entry[0] for entry in io_section['file']['train_data']]
    requested_folds = get('folds', None)

    if requested_folds is not None:
        assert set(requested_folds) <= set(available_folds)

    config['folds'] = available_folds if requested_folds is None else requested_folds

    return config

In [10]:
def main(config):
    """
    Train and store one model for every fold listed in the configuration.

    Parameters
    ----------
    config: dict
        Full configuration; the keys 'folds', 'io' and 'mod' are read here
        and the whole dict is handed on to `generate_model`.
    """
    folds, io_config, mod_config = config['folds'], config['io'], config['mod']

    for current_fold in folds:
        trained_model = generate_model(config, current_fold)
        save_outputs(trained_model, mod_config, io_config, current_fold)

## Factory-Floor

In [11]:
# Configuration for a single run: dataset 'nltcs', model type 'Mercs',
# model keyword 'default'.
ds, mod_type, mod_kw = 'nltcs', 'Mercs', 'default'

config = build_config(
    root_dir=root_dir,
    dataset=ds,
    mod_type=mod_type,
    mod_kw=mod_kw,
)

## Loop over datasets

In [12]:
# Build the directory configuration without naming a dataset, so that the
# folder stored under 'resc-data' can be listed.
dirs = default_io_config_dirs(root_dir=root_dir)
# Presumably one entry per dataset; the listing order is whatever
# `os.listdir` returns (arbitrary).
all_datasets = os.listdir(dirs['resc-data'])
# Bare expression: displayed as the cell's output in the notebook.
all_datasets

['adult',
 'cwebkb',
 'book',
 'bbc',
 'kdd',
 'ad',
 'msnbc',
 'tretail',
 'msweb',
 'nursery',
 'jester',
 'pumsb_star',
 'baudio',
 'nltcs',
 'plants',
 'dna',
 'bnetflix',
 'voting',
 'cr52',
 'c20ng',
 'kosarek',
 'accidents',
 'moviereview',
 'tmovie']

In [13]:
# The model settings are the same for every dataset, so fix them once.
mod_type = 'Mercs'
mod_kw = 'default'

# Generate and store the models dataset by dataset.
for ds in ['nltcs']:
    config = build_config(
        root_dir=root_dir,
        dataset=ds,
        mod_type=mod_type,
        mod_kw=mod_kw,
    )
    main(config)
    print("DONE: {}".format(ds))



DONE: nltcs




DONE: bnetflix
