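This notebook generates one YAML config file per hyperparameter configuration (via grid search or random search) together with a bash run file that launches `run_experiment.py` once per config.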

In [1]:
import pandas as pd
import numpy as np
import os
import datetime
from sklearn.model_selection import ParameterGrid,ParameterSampler
from scipy.stats.distributions import uniform
import sys
import yaml
import pytz

from run_experiment import DEFAULT_ARGS
sys.path.insert(0,"../src/")

## Utils

In [2]:
def make_date_dir(dir_path = ".", pattern = "", tz = pytz.timezone("Asia/Taipei")):
    """Create the directory ``dir_path/YYYY-MM-DD_hh-mm-ss_pattern/`` and return its path.

    Parameters
    ----------
    dir_path : str
        Parent directory. Missing intermediate directories are created too.
    pattern : str
        Suffix appended to the timestamp, separated by an underscore.
    tz : tzinfo
        Time zone used to compute the timestamp.

    Returns
    -------
    str
        Path of the newly created directory, ending with "/".

    Raises
    ------
    Exception
        If the target directory already exists.
    """
    # Use an explicit format instead of slicing str(datetime) at the first ".":
    # when microsecond == 0 the string has no "." and the UTC offset
    # (e.g. "+08:00") would leak into the directory name.
    now = datetime.datetime.now(tz = tz).strftime("%Y-%m-%d_%H-%M-%S")
    dir_path = dir_path + "/" + now + "_" + pattern + "/"

    if os.path.isdir(dir_path):
        raise Exception("directory exists!")

    os.makedirs(dir_path)
    return dir_path


In [3]:
def generate_run_file(search_args,
                      search_method = "grid_search",
                      how_many = 10,
                      config_dir = "./configs/debug",
                      config_dir_pattern = "debug",
                     ):
    """Generate one YAML config file per configuration and a bash run file.

    The configurations are enumerated from ``search_args`` either exhaustively
    (``"grid_search"``) or by sampling (``"random_search"``). Each config is
    written to a freshly created, timestamped sub-directory of ``config_dir``;
    the run file ``run_<directory name>.sh`` is written to the current working
    directory and contains one ``python run_experiment.py --config=...`` line
    per config.

    Parameters
    ----------
    search_args : dict
        Search space handed to ``ParameterGrid`` / ``ParameterSampler``.
        The resulting configurations must provide a ``run_name`` string
        (directly or through ``DEFAULT_ARGS``).
    search_method : str
        ``"grid_search"`` or ``"random_search"``.
    how_many : int
        Number of configurations requested from ``ParameterSampler``;
        only used for ``"random_search"``.
    config_dir : str
        Parent directory for the timestamped config directory.
    config_dir_pattern : str
        Suffix of the timestamped config directory name.

    Raises
    ------
    ValueError
        If ``search_method`` is not one of the supported values.
    KeyError
        If a configuration ends up without a ``run_name`` entry.
    """
    # Build the list of configurations first, so that an invalid search_method
    # (or search space) fails before any directory or file is created.
    if search_method == "grid_search":
        args_sampler = list(ParameterGrid(search_args))
    elif search_method == "random_search":
        # fixed random_state keeps the sampled configurations reproducible
        args_sampler = list(ParameterSampler(search_args,how_many,random_state=1234))
    else:
        raise ValueError(
            "unknown search_method {!r}; expected 'grid_search' or 'random_search'".format(search_method)
        )

    # set output directory for config files
    out_dir = make_date_dir(config_dir,config_dir_pattern)
    # bash runnable file, named after the timestamped directory
    # (out_dir ends with "/", so the directory name is the second-to-last piece)
    file_name = "run_{}.sh".format(out_dir.split("/")[-2])

    # zero-pad the numbering so every run id has the same length
    width = len(str(len(args_sampler)))
    with open(file_name,"w") as run_file:
        for index,sampled_args in enumerate(args_sampler, start=1):
            # start from the defaults and let the sampled values override them
            args = dict(DEFAULT_ARGS)
            args.update(sampled_args)
            run_id = str(index).zfill(width)
            # make the run name unique by appending the run id
            args["run_name"] += ("_" + run_id)
            # save config file
            with open(out_dir + "config_{}.yml".format(run_id),"w") as f:
                yaml.dump(args,f)
            # write run file
            command = "python run_experiment.py --config={}config_{}.yml\n".format(out_dir,run_id)
            run_file.write(command)

In [4]:
# Show the defaults that generate_run_file fills in for keys the search does not set
DEFAULT_ARGS

{'store_artifacts': False,
 'random_state': 1234,
 'cv_state': 1234,
 'n_splits': 5,
 'tags': {}}

## Parameters - EDITABLE

In [5]:
# Feature names forwarded to the experiment through data_args
features = [
    'contratto',
    'ore_settimana',
    'dimensioni_azienda',
    'settore',
    'qualifica',
    'titolo_studio',
    'tipo_laurea',
    'tipo_diploma',
    'regione',
    'ampiezza_comune',
    'anni_da_primo_lavoro',
    'anni_da_lavoro_corrente',
    'anni_da_edu',
    'anni_contributi',
    'n_esp_lavorative',
]
len(features)

15

In [6]:
# How the search space is explored: "random_search" or "grid_search"
search_method = "random_search"
# Number of sampled configurations (used only for random_search)
how_many = 100
# Only the part after "./configs/" is meant to be edited
config_dir = "./configs/" + "hparam_search/"
config_pattern = "probnn"

search_args = {
    # Fixed args
    "action": ["cross_validate"],
    # third path component of config_dir, i.e. the part after "./configs/"
    "experiment": [config_dir.split("/")[2]],
    "run_name": [config_pattern],
    # Varying args
    "store_artifacts": [False],
    "n_splits": [5],
    "model": ["ProbNN"],
    # every combination of the model hyperparameters is one candidate value
    "model_args": list(ParameterGrid({
        "epochs": [25, 50, 100, 150, 200],
        "dropout": [0, 0.1, 0.2, 0.3, 0.4],
        "lr": [0.03, 0.01, 0.003, 0.001, 0.0003],
        "batch_size": [256],
        "num_nodes": [[32], [32, 32], [32, 32, 32]],
        "embedding_size": [4, 8, 16],
        "distr": ["normal"],
    })),
    "data_args": [{
        "features": features,
        "alpha": 0.01,
    }],
}

## Generate run file

In [7]:
# Write the config files and the matching run file for the search defined above
generate_run_file(
    search_args,
    search_method=search_method,
    how_many=how_many,
    config_dir=config_dir,
    config_dir_pattern=config_pattern,
)