## Create condor scheduling jobs

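This cell generates one `condor_submit` command per enabled experiment config and task type, writes the commands to a shell script, and uploads that script to the server.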

In [None]:
from notebook_prelude import *

In [None]:
from utils import time_utils, git_utils

# Collect every experiment config (<config folder>/<experiment>/<file>.yaml),
# leaving out any whose path contains the '.disabled.' marker.
all_experiments = [
    config_path
    for config_path in glob('{}/*/*.yaml'.format(EXPERIMENT_CONFIG_FOLDER))
    if '.disabled.' not in config_path
]

# Experiment names that may be scheduled, ordered from most to least
# important: the job priority is derived from the position in this list
# (100 - index), and experiments whose name is not listed here are skipped
# with a "Missing priority" message. Commented-out entries are disabled.
priorities = [
    #'min_df',
    'dummy',
    'text',
    'graphs',
    'split_multi_words',
    'edge_labels',
    'content_vs_structure',
    'use_directed',
    'remove_unseen_nodes',
    'relabel',
    'remove_infrequent',
    'node_weights',
    'graph_extra',
    'combined',
#    'ngrams',
    'fast_wl_normalization',
#    'dimensionality_reduction',
]

# Maps a task type to the experiments that belong to that task type only.
# An experiment listed under one key is skipped for every other task type;
# experiments listed nowhere run for all non-exclusive task types.
#
# The graph task types are keyed by the TYPE_* constants themselves. Writing
# dict(TYPE_CONCEPT_MAP=[...]) would create the literal string key
# 'TYPE_CONCEPT_MAP', which is then compared against the constant's value
# when scheduling, so the concept-map-only experiments would be treated as
# belonging to a different type and be skipped for every task type.
experiment_filter = {
    'text': [
        'min_df',
        'ngrams',
        'dummy',
        'text',
    ],
    TYPE_COOCCURRENCE: [],
    TYPE_CONCEPT_MAP: [
        'remove_infrequent',
        'remove_unseen_nodes',
        'split_multi_words',
        'edge_labels',
        'use_directed',
        'relabel',
    ],
    'graph_extra': [
        'graph_extra',
    ],
}

# Task types that only run the experiments explicitly listed for them in
# `experiment_filter`.
exclusive = ['text', 'graph_extra']

# Values passed to `tmpl.format(...)`. The template below only consumes
# `extra`; the other three are accepted by `format` and ignored.
verbose, cores = 1, 16
create_predictions = 'true'
# Extra command-line arguments appended to every job (e.g. '--use_nested').
extra = ''

# One condor_submit invocation; the placeholders are filled per job.
tmpl = (
    'condor_submit priority="{prio}" batch_name="{task_name}__{name}" '
    'Args="--task_name {task_name} '
    '--experiment_config /home/david/bachelor-thesis/code/{experiment} {extra}" '
    'classification_job.condor'
)



# Build one (priority, condor_submit command) pair for every combination of
# task type and experiment config that passes the filters below.
outs = []
# Experiment names already reported as missing, so the warning is printed
# once per experiment instead of once per task type.
reported_missing = set()
for task_type in [TYPE_CONCEPT_MAP, TYPE_COOCCURRENCE, 'text', 'graph_extra']:
    for experiment in sorted(all_experiments):
        # Paths look like <config folder>/<experiment name>/<file>.yaml.
        # Taking the last two components works regardless of how many
        # components the config folder itself has.
        name, config_file = experiment.split('/')[-2:]
        if name not in priorities:
            if name not in reported_missing:
                reported_missing.add(name)
                print('Missing priority for experiment: "{}". Skipping.'.format(name))
            continue

        # Exclusive task types only run the experiments listed for them.
        if task_type in exclusive and name not in experiment_filter[task_type]:
            continue
        # Experiments listed under a different task type are not run here.
        listed_for_other_type = any(
            name in names and filter_type != task_type
            for filter_type, names in experiment_filter.items()
        )
        if listed_for_other_type:
            continue
        # Text experiments run for the 'text' task type and nowhere else.
        is_in_text = name in experiment_filter['text']
        if (task_type == 'text') != is_in_text:
            continue
        # Kept in addition to the exclusive check so the restriction still
        # holds if 'graph_extra' is ever removed from `exclusive`.
        if task_type == 'graph_extra' and name not in experiment_filter[task_type]:
            continue

        # Earlier entries in `priorities` get a higher job priority.
        prio = 100 - priorities.index(name)
        if task_type == TYPE_COOCCURRENCE:
            # Co-occurrence jobs are pushed back by a fixed offset.
            prio -= 30
        cmd = tmpl.format(
            name=config_file,
            cores=cores,
            verbose=verbose,
            experiment=experiment,
            prio=prio,
            extra=extra,
            create_predictions=create_predictions,
            task_name=task_type,
        )
        outs.append((prio, cmd))

PRELUDE = '''#!/usr/bin/env bash

# Created: {}
# Commit:  {}

{}
'''
outs = sorted(outs, key=lambda x: x[0], reverse=True)
cmds = ';\n\n'.join([cmd for prio, cmd in outs])
with open('tmp/start_classifaction_jobs.sh', 'w') as f:
    f.write(PRELUDE.format(time_utils.get_time_formatted(), git_utils.get_current_commit(), cmds))

print('# Jobs: {}'.format(len(outs)))
print('Uploading')
!chmod +x tmp/start_classifaction_jobs.sh
!scp tmp/start_classifaction_jobs.sh pe:condor_scripts/
print('Finished')

In [None]:
# Show the generated (priority, command) pairs, highest priority first.
outs

In [None]:
# Print the generated job script for a final visual check.
!cat tmp/start_classifaction_jobs.sh

## Save experiment configs with all parameters

In [None]:
# Fetch the per-task-type parameters and the list of all tasks from the
# project helpers.
# NOTE(review): `experiments` must still be the object providing
# `get_all_tasks()` here; a later cell rebinds the name to a defaultdict, so
# re-running this cell after that one fails until the name is restored.
task_params = experiment_helper.get_all_task_type_params()
all_tasks = experiments.get_all_tasks()

In [None]:
# Keep one representative task per task type: the first one encountered.
# The dict is keyed by `task.type`, so the "already present" test has to use
# the type as well; testing `task.name` against type keys does not
# deduplicate and lets later tasks silently replace earlier ones.
tasks = {}
for task in all_tasks:
    tasks.setdefault(task.type, task)

In [None]:
# For every param-grid config, compute the merged parameter grid of each task
# type that the config provides parameters for.
all_experiments = experiment_helper.get_all_param_grid_config_files()
# NOTE: this rebinds `experiments`, which an earlier cell uses as
# `experiments.get_all_tasks()`.
experiments = collections.defaultdict(dict)
for config_name, config in all_experiments.items():
    # Configs whose name contains '/all' are left out.
    if '/all' in config_name:
        continue
    print(config_name)
    for task in tasks.values():
        if task.type in config['params_per_type']:
            # task.fn() yields four values; only the last (the params) is used.
            _, _, _, params = task.fn()
            grid = experiment_helper.prepare_param_grid(task, params, config)
            experiments[config_name][task.type] = grid
    print('-' * 100)


In [None]:
# Persist the experiment parameters through the project helper. It is called
# without arguments, so it does not receive the `experiments` dict built in
# the previous cell.
# NOTE(review): presumably it recomputes the grids itself; confirm.
experiment_helper.save_all_experiment_params()