In [None]:
from notebook_prelude import *

In [None]:
# Download logs from server
# Copies everything matching `logs/*` on the remote host `pe` into the local
# directory `tmp/logs`. rsync flags: -a archive mode, -v verbose output,
# -P keep partially transferred files and show progress.
!rsync pe:logs/* tmp/logs -avP

In [None]:
# Download condor job history
# Runs `condor_history -forwards` on the remote host `pe`, drops every line that
# contains " X ", keeps only lines that mention "script_serial", and writes the
# remaining lines to the local file tmp/condor_history.txt.
!ssh pe 'condor_history -forwards | grep -v " X " | grep "script_serial"' > tmp/condor_history.txt

In [None]:
# Load the condor history dump downloaded to tmp/condor_history.txt.
# NOTE(review): `get_text_file_lines` is not defined in this notebook; presumably it
# comes from `notebook_prelude` and returns one string per line — confirm.
condor_history_file = get_text_file_lines('tmp/condor_history.txt')

def f(x):
    """Predicate used to filter condor history lines.

    Returns a truthy NumPy bool when `x` contains 'start_script_serial' and does
    not contain ' X ' (surrounded by single spaces); falsy otherwise.
    """
    lacks_x_marker = ' X ' not in x
    mentions_start_script = 'start_script_serial' in x
    return np.all([lacks_x_marker, mentions_start_script])


def get_parsed_line(x):
    """Split one condor history line into a flat list of string fields.

    The line is first split on double spaces and must yield exactly nine
    (whitespace-stripped) chunks, otherwise an AssertionError is raised.
    Chunk 7 is reduced to its first space-separated token, and the last chunk
    is split on single spaces into at most three pieces, so a well-formed line
    produces the 11 values that `headers` names.
    """
    fields = [chunk.strip() for chunk in x.split('  ')]
    assert len(fields) == 9
    # Only the first token of chunk 7 is kept; anything after the first space is dropped.
    fields[7] = fields[7].split(' ')[0]
    *leading, trailing = fields
    # maxsplit=2: the third piece keeps any remaining spaces intact.
    return leading + trailing.split(' ', 2)

# Column names for the 11 fields returned by `get_parsed_line`; the
# underscore-only names label columns that are not used further in this notebook.
headers = [
    'job_id', 'user', '_', '__', '___', '____',
    'commit_time', 'run_time', 'end_date', 'end_time', 'cmd',
]

# Keep only the history lines accepted by the predicate `f`.
condor_history_file = [line for line in condor_history_file if f(line)]

df = pd.DataFrame(
    [get_parsed_line(line) for line in condor_history_file],
    columns=headers,
)
df.tail(1)

In [None]:
# Keep only jobs whose command line mentions 'experiment_config' and '--n_jobs 16'.
# `.copy()` makes the filtered result an independent frame, so the column
# assignments below do not write into a slice of the previous `df`
# (which triggers pandas' SettingWithCopyWarning).
df = df[df.cmd.str.contains('experiment_config') & (df.cmd.str.contains('--n_jobs 16'))].copy()
# File name of the experiment config: the text after 'experiment_config ',
# reduced to its last '/'-separated component.
df['experiment_file'] = df.cmd.str.split('experiment_config ').str.get(1).str.split('/').str.get(-1)
# True when the command line contains 'use_nest'.
df['nested'] = df.cmd.str.contains('use_nest')

In [None]:
# Show only the columns of interest for the selected jobs.
df.loc[:, ['experiment_file', 'run_time', 'nested']]

In [None]:
import re

# Directory the server logs were downloaded into.
LOG_DIR = 'tmp/logs'
# All '*.log' files in LOG_DIR, sorted by path.
logs = sorted(glob('{}/*.log'.format(LOG_DIR)))
# Sample log (fifth entry from the end of the sorted list) for ad-hoc inspection.
# Guarded so the cell still runs when fewer than five logs exist; the
# summary loop at the end of this cell rebinds `log` for every file anyway.
log = logs[-5] if len(logs) >= 5 else None

def get_log(log):
    """Read the text file at path `log` and return its content split on newline characters.

    A file that ends with a newline yields a trailing empty string.
    """
    with open(log) as handle:
        content = handle.read()
    return content.split('\n')
    
def get_finish_times(log_lines):
    """Extract the "Finished (time=...)" entries from a list of log lines.

    Only lines containing '(time=' exactly once are considered. Returns a list
    of 3-tuples of strings, one per match of the pattern
    '<digit>: <a> - <b> - Finished (time=<c>)'.
    """
    pattern = re.compile(r'\d: (.+?) +- (.+?) +- Finished \(time=(.+?)\)')
    candidates = []
    for line in log_lines:
        if line.count('(time=') == 1:
            candidates.append(line.strip())
    # The pattern cannot match across '\n', so searching the joined text
    # yields at most one tuple per candidate line.
    return pattern.findall('\n'.join(candidates))


def time_str_2_seconds(x):
    """Convert a colon-separated duration string (e.g. '1:02:03') into seconds.

    Components are weighted from the right: the last one by 60**0, the one
    before it by 60**1, and so on. Each component must parse with `int`,
    otherwise a ValueError is raised.
    """
    components = x.split(':')[::-1]
    return sum(
        np.power(60, position) * int(component)
        for position, component in enumerate(components)
    )

def get_finish_times_from_log(log_file):
    """Build a DataFrame of the "Finished" entries found in one log file.

    Args:
        log_file: path of the log file to parse.

    Returns:
        A DataFrame with the string columns 'type', 'name' and 'time' (as
        extracted by `get_finish_times`) plus:
          - 'dataset': `filename_utils.get_dataset_from_filename` applied to 'name',
          - 'time_as_seconds': 'time' converted by `time_str_2_seconds`,
          - 'time_as_minutes': whole minutes, truncated.
    """
    # Parse the file passed in. The previous code read the global `log` here
    # and so only worked when a global of that name happened to match the argument.
    df = pd.DataFrame(get_finish_times(get_log(log_file)), columns=['type', 'name', 'time'])
    df['dataset'] = df['name'].apply(filename_utils.get_dataset_from_filename)
    df['time_as_seconds'] = df['time'].apply(time_str_2_seconds)
    df['time_as_minutes'] = df['time_as_seconds'].apply(lambda x: int(x / 60))
    return df

# For every log file that contains at least one "Finished" entry, print the
# file name and show the maximum finish time (in minutes) per dataset and type.
for log in sorted(logs):
    df = get_finish_times_from_log(log)
    if len(df) == 0:
        continue
    print(log.rsplit('/', 1)[-1])
    max_minutes = df.groupby(['dataset', 'type']).time_as_minutes.max()
    display(max_minutes.to_frame())

## WL run times

In [None]:
from time import time
from memory_profiler import memory_usage
import tempfile

def get_object_size(obj):
    """Return the number of bytes `obj` occupies when serialized with `pickle.dump`.

    The object is pickled into an anonymous temporary file that is removed
    automatically when the `with` block exits. This replaces the deprecated,
    race-prone `tempfile.mktemp`, which also left one file behind per call.
    """
    with tempfile.TemporaryFile() as f:
        pickle.dump(obj, f)
        # The write position after dumping equals the number of bytes written.
        return f.tell()

# Value passed as `h` to the WL transformer; it is later recorded per dataset
# under the 'iterations' key.
H = 4

# Estimators under test. They are replaced by unfitted clones after each run.
trans = transformers.FastWLGraphKernelTransformer(
    h=H,
    use_early_stopping=False,
    same_label=False,
)
clf = sklearn.svm.LinearSVC(random_state=42)
phi_picker = transformers.PhiPickerTransformer(use_zeroth=True)
text_trans = sklearn.feature_extraction.text.TfidfVectorizer()

# Measurements per input type: each key maps to a list with one entry per dataset.
data = {kind: collections.defaultdict(list) for kind in ('text', 'graph')}
# The helpers below do not depend on the loop variable, so they are defined
# once instead of being re-created on every iteration.

def cleanup():
    """Replace the global estimators with unfitted clones and release the current dataset."""
    global clf, phi_picker, trans, text_trans, X, Y
    clf = sklearn.base.clone(clf)
    phi_picker = sklearn.base.clone(phi_picker)
    trans = sklearn.base.clone(trans)
    text_trans = sklearn.base.clone(text_trans)
    # Drop the dataset references first so the collection below can actually
    # reclaim them (previously gc.collect() ran before the `del`).
    del X, Y
    gc.collect()


def get_mem_usage():
    """Return the mean of the memory samples taken by `memory_usage` for the
    current process (proc=-1) and its children over a 1 s window at 0.2 s intervals.

    NOTE: currently not called by `measure`; memory sampling is disabled.
    """
    return np.mean(memory_usage(-1, interval=.2, timeout=1, include_children=True))


def measure(name, type_, fn):
    """Call `fn()` and record wall-clock timestamps around it.

    The timestamps are appended to `data[type_]['start_' + name]` and
    `data[type_]['end_' + name]`. Returns whatever `fn()` returns.
    """
    print('\t\t', name)
    data[type_]['start_' + name].append(time())
    out = fn()
    data[type_]['end_' + name].append(time())
    return out


for dataset in log_progress_nb(dataset_helper.get_dataset_names_with_concept_map()):
    print(dataset)
    gc.collect()

    # Text
    # 'start'/'end' bracket the whole text run, including loading the dataset.
    print('\tText')
    data['text']['type'].append('text')
    data['text']['start'].append(time())
    X, Y = dataset_helper.get_dataset(dataset)
    data['text']['dataset'].append(dataset)
    data['text']['num_classes'].append(len(set(Y)))
    data['text']['num_els'].append(len(X))
    phi = measure('tfidf', 'text', lambda: text_trans.fit_transform(X))
    _ = measure('clf_fit', 'text', lambda: clf.fit(phi, Y))
    _ = measure('clf_predict', 'text', lambda: clf.predict(phi))
    data['text']['end'].append(time())
    data['text']['num_features'].append(clf.coef_.shape[1])
    data['text']['estimator_size'].append(get_object_size(clf) + get_object_size(text_trans))

    cleanup()

    # Graph
    # 'start'/'end' bracket the whole graph run, including loading the concept maps.
    print('\tGraph')
    data['graph']['type'].append('graph')
    data['graph']['start'].append(time())
    X, Y = dataset_helper.get_concept_map_for_dataset(dataset)
    X = graph_helper.get_graphs_only(X)
    all_nodes = graph_helper.get_all_node_labels(X)
    data['graph']['dataset'].append(dataset)
    # Unlike the text run (which records len(X)), 'num_els' here is the number
    # of node labels returned by `get_all_node_labels`.
    data['graph']['num_els'].append(len(all_nodes))
    data['graph']['num_classes'].append(len(set(Y)))
    data['graph']['iterations'].append(trans.h)
    _ = measure('wl', 'graph', lambda: trans.fit(X))
    phi = measure('phi_picker', 'graph', lambda: phi_picker.transform(trans.phi_list))
    _ = measure('clf_fit', 'graph', lambda: clf.fit(phi, Y))
    _ = measure('clf_predict', 'graph', lambda: clf.predict(phi))
    data['graph']['end'].append(time())
    data['graph']['num_features'].append(clf.coef_.shape[1])
    # Only the transformer's `label_lookups` are measured, not the whole
    # transformer object.
    data['graph']['estimator_size'].append(get_object_size(clf) + get_object_size(trans.label_lookups))

    cleanup()

In [None]:
from utils import time_utils
# Stack the text and graph measurements into one frame with a fresh 0..n-1 index.
# `pd.concat` replaces `DataFrame.append`, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; `ignore_index=True` also makes the former
# reset_index() / drop-'index'-column steps unnecessary.
df = pd.concat([pd.DataFrame(data['text']), pd.DataFrame(data['graph'])], ignore_index=True)
# Wall-clock seconds between the recorded 'start' and 'end' timestamps of each run.
df['runtime_total'] = df['end'] - df['start']

# Feature extraction time per input type: WL fit through phi picking for
# graphs, the TF-IDF fit_transform for text.
for x, (start_attr, end_attr) in [('graph', ('start_wl', 'end_phi_picker')), ('text', ('start_tfidf', 'end_tfidf'))]:
    is_type = df['type'] == x
    df.loc[is_type, 'runtime_feature_extraction'] = df.loc[is_type, end_attr] - df.loc[is_type, start_attr]

df[['dataset', 'type', 'num_features', 'runtime_total', 'runtime_feature_extraction']]

In [None]:
# Estimator sizes converted from bytes to MiB: one row per dataset, one column per input type.
df_mem = (df.pivot(index='dataset', columns='type', values='estimator_size') / 1024 / 1024).rename(columns=dict(graph='graph_estimator_size', text='text_estimator_size'))
df_runtime = df.pivot(index='dataset', columns='type', values='runtime_total')

# Every additional metric is pivoted the same way so that it is aligned with
# `df_mem` by dataset label. Assigning `.values` positionally (as before) pairs
# rows in `df`'s insertion order with `df_mem`'s sorted dataset index and can
# silently attach numbers to the wrong dataset.
for suffix, pivoted in [
    ('runtime', df_runtime),
    ('runtime_feature_extraction', df.pivot(index='dataset', columns='type', values='runtime_feature_extraction')),
    # Number of features is reported in thousands.
    ('num_features', df.pivot(index='dataset', columns='type', values='num_features') / 1000),
]:
    for x in ['graph', 'text']:
        df_mem['{}_{}'.format(x, suffix)] = pivoted[x]

print(df_mem.to_latex(float_format='%.0f'))
df_mem

In [None]:
# Display the current `df` in full (all rows and columns).
df

In [None]:
# List every column name of `df`.
pprint(df.columns.tolist())

In [None]:
# Classifier time: from the start of `clf.fit` to the end of `clf.predict`.
df['runtime_clf'] = df['end_clf_predict'] - df['start_clf_fit']

In [None]:
# Classifier runtime against the number of features, one point per run.
df.plot.scatter(x='num_features', y='runtime_clf')