In [38]:
import wandb
from collections import defaultdict
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from IPython import embed
from IPython.core import display
from math import ceil

In [2]:
# W&B public API client; later cells use it to query runs (api.runs) and sweeps (api.sweep).
api = wandb.Api()

# Get best runs from sweeps

In [3]:
# Tags that identify the set of sweeps covered by this analysis
IN_TAGS = ["feats8-noise1", "feats8-noise2", "feats8-noise3"]


def _fetch_finished_runs(pairwise_mode, order):
    """Query finished runs tagged with any of IN_TAGS for the given `config.pairwise_mode`."""
    return api.runs(
        "dhdhagar/prob-ent-resolution",
        filters={
            "tags": {"$in": IN_TAGS},
            "state": {"$eq": "finished"},
            "config.pairwise_mode": {"$eq": pairwise_mode},
        },
        order=order,
    )


# Non-pairwise ("non-MLP") runs, ordered on the dev B3 F1 summary metric.
# NOTE(review): the leading "-" is presumably "descending" per the wandb API - confirm.
runs = _fetch_finished_runs(pairwise_mode=False,
                            order="-summary_metrics.best_dev_b3_f1")
print(f"Non-MLP runs: {len(runs)}")

# Pairwise ("MLP") runs, ordered on the dev AUROC summary metric.
runs_mlp = _fetch_finished_runs(pairwise_mode=True,
                                order="-summary_metrics.best_dev_auroc")
print(f"MLP runs: {len(runs_mlp)}")
print(f"Total runs: {len(runs) + len(runs_mlp)}")

# Method names; run_key() looks for one of these among a run's tags to identify its method.
methods = {'e2e', 'e2e-nosdp', 'frac', 'frac-nosdp', 'mlp'}

def make_key(noise, model, dataset, seed):
    """Return the identifier "noise<noise>_<model>_<dataset>_<seed>" for one sweep."""
    return "noise{}_{}_{}_{}".format(noise, model, dataset, seed)
def run_key(run):
    """Build the make_key() identifier for a W&B run.

    Dataset, dataset seed and noise level are read from the run's config; the
    method is whichever of the run's tags also appears in `methods`.
    """
    cfg = run._attrs['config']
    dataset, dataset_seed = cfg['dataset'], cfg['dataset_random_seed']
    # pop() raises KeyError when no tag matches, and picks an arbitrary tag
    # when several match.
    method = set(run.tags).intersection(methods).pop()
    return make_key(cfg['noise_std'], method, dataset, dataset_seed)
def details_from_key(key):
    """Split a "noise<N>_<model>_<dataset>_<seed>" key into its four components.

    The inverse of make_key(); every component is returned as a string.

    Args:
        key: identifier of the form "noise<N>_<model>_<dataset>_<seed>".

    Returns:
        Tuple (noise, model, dataset, seed). `noise` is the text after the
        "noise" prefix, so multi-character levels such as "10" or "0.5"
        survive the round trip.

    Raises:
        ValueError: if `key` has fewer than four "_"-separated fields.
    """
    fields = key.split('_')
    if len(fields) < 4:
        raise ValueError(
            f"Expected 'noise<N>_<model>_<dataset>_<seed>', got {key!r}")
    # The first field is the noise, the second the model and the last the seed;
    # everything in between is the dataset name, which may contain underscores.
    # Assumes the model name has no underscore (true for the names in `methods`).
    noise_field, model, *dataset_fields, seed = fields
    prefix = "noise"
    if noise_field.startswith(prefix):
        noise = noise_field[len(prefix):]
    else:
        noise = noise_field
    return noise, model, '_'.join(dataset_fields), seed

# For every sweep key: keep the first run encountered (both queries request an
# ordering on a dev metric via their `order` argument), remember its sweep
# name, and count how many usable finished runs the sweep has.
best_runs = {}
finished_runs = defaultdict(int)
sweep_ids = {}
for _runs in (runs, runs_mlp):
    for run in tqdm(_runs):
        # Runs with fewer than 10 summary entries are ignored entirely.
        if len(dict(run.summary)) >= 10:
            key = run_key(run)
            finished_runs[key] += 1
            if key not in best_runs:
                best_runs[key] = run
                sweep_ids[key] = run.sweepName

print(f"Best runs found for {len(best_runs)} sweeps")
print(f"Total finished runs across sweeps = {sum(finished_runs.values())}")

Non-MLP runs: 13758
MLP runs: 5300
Total runs: 19058


  0%|          | 0/13758 [00:00<?, ?it/s]

  0%|          | 0/5300 [00:00<?, ?it/s]

Best runs found for 225 sweeps
Total finished runs across sweeps = 19056


# Manage sweeps

In [40]:
# Prefix common to the sweeps handled here; the helpers below append the noise
# level to it before passing it to the shell scripts.
_SWEEP_PREFIX="feats8-noise"
# Flags included in every launch command: keep feature indices 0-5, 14 and 15.
# The backslash-newlines are inside the string literal, so this is one logical
# line; the continuation lines' leading spaces are part of the string.
_SWEEP_FEATS8_FLAGS = "--keep_feat_idxs=0 --keep_feat_idxs=1 --keep_feat_idxs=2 \
    --keep_feat_idxs=3 --keep_feat_idxs=4 --keep_feat_idxs=5 \
    --keep_feat_idxs=14 --keep_feat_idxs=15"

# Add agents to sweeps with fewer completed runs

def add_agents(max_run_count, launched, finished_runs, sweep_ids, n_agents=1, partition="cpu"):
    for k, v in finished_runs.items():
        if k in launched:
            continue
        if v < max_run_count:
            n_max_runs = ceil((max_run_count - v) / n_agents)
            noise, model, dataset, seed = details_from_key(k)
            sweep_id = sweep_ids[k]
            SWEEP_FLAGS = f"{_SWEEP_FEATS8_FLAGS} --noise_std={noise} --wandb_max_runs={n_max_runs}"
            SWEEP_PREFIX = f"{_SWEEP_PREFIX}{noise}"
            print(f'./add_agent.sh {dataset} {seed} {model} dhdhagar/prob-ent-resolution/{sweep_id} {n_agents} {partition} "{SWEEP_FLAGS}" {SWEEP_PREFIX}')
            !./add_agent.sh $dataset $seed $model dhdhagar/prob-ent-resolution/$sweep_id $n_agents $partition "$SWEEP_FLAGS" $SWEEP_PREFIX
            launched.add(k)

def add_agents_by_list(keys_to_add, launched, sweep_ids, n_agents=1, partition="cpu"):
    for k in keys_to_add:
        noise, model, dataset, seed = details_from_key(k)
        sweep_id = sweep_ids[k]
        SWEEP_FLAGS = f"{_SWEEP_FEATS8_FLAGS} --noise_std={noise}"
        SWEEP_PREFIX = f"{_SWEEP_PREFIX}{noise}"
        print(f'./add_agent.sh {dataset} {seed} {model} dhdhagar/prob-ent-resolution/{sweep_id} {n_agents} {partition} "{SWEEP_FLAGS}" {SWEEP_PREFIX}')
        !./add_agent.sh $dataset $seed $model dhdhagar/prob-ent-resolution/$sweep_id $n_agents $partition "$SWEEP_FLAGS" $SWEEP_PREFIX
        launched.add(k)

def add_sweeps_by_list(keys_to_add, launched, partition="cpu"):
    for k in keys_to_add:
        noise, model, dataset, seed = details_from_key(k)
        SWEEP_FLAGS = f"{_SWEEP_FEATS8_FLAGS} --noise_std={noise}"
        SWEEP_PREFIX = f"{_SWEEP_PREFIX}{noise}"
        print(f'./run_sweep.sh {dataset} {seed} {seed} {model} {partition} "{SWEEP_FLAGS}" {SWEEP_PREFIX}')
        !./run_sweep.sh $dataset $seed $seed $model $partition "$SWEEP_FLAGS" $SWEEP_PREFIX
        launched.add(k)

def resubmit_pending_agents(launched, sweep_ids, n_agents=1, partition="cpu"):
    pending = !sacct --format="JobID,JobName%50,Partition,State" | grep PENDING
    print('\n'.join(pending))
    _pending_agents = set()
    _pending_sweep_job_ids = []
    for pen in pending:
        pen_split = pen.split()
        SWEEP_PREFIX, model, dataset, seed, _ = pen_split[1].split('_')
        assert _SWEEP_PREFIX in SWEEP_PREFIX
        noise = SWEEP_PREFIX[-1]
        seed_split = seed.split('-')  # remove "agentX"
        seed = seed_split[0][-1]
        if len(seed_split) > 1:
            # Sweep agent job is pending
            _pending_agents.add(make_key(noise, model, dataset, seed))
            _pending_sweep_job_ids.append(pen_split[0])
    print(f"PENDING agents: {_pending_agents}\n")
    if len(_pending_agents) > 0:
        # Cancel pending
        _pending_sweep_job_ids = ' '.join(_pending_sweep_job_ids)
        print(f"!scancel {_pending_sweep_job_ids}")
        !scancel $_pending_sweep_job_ids
        # Relaunch
        add_agents_by_list(keys_to_add=_pending_agents, launched=launched,
                           sweep_ids=sweep_ids, n_agents=n_agents, partition=partition)

def resubmit_pending_sweeps(launched, partition="cpu"):
    pending = !sacct --format="JobID,JobName%50,Partition,State" | grep PENDING
    print('\n'.join(pending))
    _pending_agents = set()
    _pending_sweep_job_ids = []
    for pen in pending:
        pen_split = pen.split()
        SWEEP_PREFIX, model, dataset, seed, _ = pen_split[1].split('_')
        assert _SWEEP_PREFIX in SWEEP_PREFIX
        noise = SWEEP_PREFIX[-1]
        seed_split = seed.split('-')  # remove "agentX"
        seed = seed_split[0][-1]
        if len(seed_split) == 1:
            # Sweep init job is pending
            _pending_agents.add(make_key(noise, model, dataset, seed))
            _pending_sweep_job_ids.append(pen_split[0])
    print(f"PENDING agents: {_pending_agents}\n")
    if len(_pending_agents) > 0:
        # Cancel pending
        _pending_sweep_job_ids = ' '.join(_pending_sweep_job_ids)
        print(f"!scancel {_pending_sweep_job_ids}")
        !scancel $_pending_sweep_job_ids
        # Relaunch
        add_sweeps_by_list(keys_to_add=_pending_agents, launched=launched, partition=partition)
        
# Stop sweeps that have reached 120 runs
def stop_finished_sweeps(sweep_ids, finished_runs, finished_sweeps=set(), max_runs=120):
    for _sweep, _sweep_id in sweep_ids.items():
        if finished_runs[_sweep] >= max_runs:
            _fetched = api.sweep(f'dhdhagar/prob-ent-resolution/{_sweep_id}')
            if _fetched.state.lower() != 'finished':
                !wandb sweep --stop dhdhagar/prob-ent-resolution/{_sweep_id}
            finished_sweeps.add(_sweep)
    return finished_sweeps

In [5]:
# Sweep keys already acted on in this session; add_agents() skips keys found
# here and the add_* helpers add every key they process.
launched = set()
# Initial empty value; reassigned from stop_finished_sweeps() in a later cell.
finished_sweeps = set()

In [None]:
# Cancel sweep-init jobs that sacct lists as PENDING and relaunch them via run_sweep.sh.
resubmit_pending_sweeps(launched)

In [41]:
# Call add_agent.sh (with n_agents=3) for every sweep that has fewer than 120
# finished runs and is not already in `launched`.
add_agents(max_run_count=120, launched=launched, finished_runs=finished_runs, sweep_ids=sweep_ids, n_agents=3)

In [None]:
# Cancel agent jobs that sacct lists as PENDING and relaunch them via add_agent.sh (n_agents=2).
resubmit_pending_agents(launched=launched, sweep_ids=sweep_ids, n_agents=2)

In [36]:
# Stop (if not already finished) every sweep with at least 120 finished runs
# and collect the keys of all such sweeps.
finished_sweeps = stop_finished_sweeps(sweep_ids, finished_runs, max_runs=120)
print(f'Finished {len(finished_sweeps)} sweeps')

Finished 74 sweeps


# Analyze results

In [29]:
def get_result_dfs(best_runs):
    """Aggregate the selected runs' summary metrics across seeds.

    Runs are grouped by their key with the trailing "_<seed>" removed, so all
    seeds of one (noise, model, dataset) combination are pooled together,
    whatever the length of the seed.

    Args:
        best_runs: mapping from run key ("..._<seed>") to an object whose
            `summary` converts to a dict holding every metric named in
            `res_map` below.

    Returns:
        Tuple (means, stds, comb) of dicts keyed by group, each mapping an
        output metric name to, respectively, the mean, the standard deviation
        (np.std with default arguments) and the string "mean±std". Metrics
        whose name contains "time" are reported as-is; all others are
        multiplied by 100. Values are rounded to 2 decimals.

    Raises:
        KeyError: if a run's summary lacks one of the expected metrics.
    """
    # output column name -> key in the run summary
    res_map = {
        'train_time': 'z_run_time',
        'inf_time_hac': 'z_inf_time_hac',
        'inf_time_cc': 'z_inf_time_cc',
        'inf_time_cc-nosdp': 'z_inf_time_cc-nosdp',
        'b3_f1_hac': 'best_test_b3_f1_hac',
        'b3_f1_cc': 'best_test_b3_f1_cc',
        'b3_f1_cc-fixed': 'best_test_b3_f1_cc-fixed',
        'b3_f1_cc-nosdp': 'best_test_b3_f1_cc-nosdp',
        'b3_f1_cc-nosdp-fixed': 'best_test_b3_f1_cc-nosdp-fixed',
        'vmeasure_hac': 'best_test_vmeasure_hac',
        'vmeasure_cc': 'best_test_vmeasure_cc',
        'vmeasure_cc-fixed': 'best_test_vmeasure_cc-fixed',
        'vmeasure_cc-nosdp': 'best_test_vmeasure_cc-nosdp',
        'vmeasure_cc-nosdp-fixed': 'best_test_vmeasure_cc-nosdp-fixed'
    }
    # group -> metric name -> one value per seed
    final = defaultdict(lambda: defaultdict(list))
    for run_id, run in best_runs.items():
        # Drop only the trailing "_<seed>"; slicing off two characters would
        # split multi-digit seeds into a separate group.
        group = run_id.rsplit('_', 1)[0]
        res = dict(run.summary)
        for out_key, in_key in res_map.items():
            final[group][out_key].append(float(res[in_key]))
    means, stds, comb = {}, {}, {}
    for group, metrics in final.items():
        means[group], stds[group], comb[group] = {}, {}, {}
        for name, values in metrics.items():
            # Times keep their unit; every other metric is scaled to a percentage.
            scale = 1 if 'time' in name else 100
            means[group][name] = round(np.mean(values) * scale, 2)
            stds[group][name] = round(np.std(values) * scale, 2)
            comb[group][name] = f"{means[group][name]}±{stds[group][name]}"
    return means, stds, comb



def get_df_by_dataset(res, dataset, noise, to_latex=False):
    new_res = {}
    for _r in res:
        if dataset in _r and f'noise{noise}' in _r:
            _new_r = _r.replace(f"{dataset}_", '').replace(f"_{dataset}", '')
            _new_r = _new_r.replace(f"_noise{noise}", '').replace(f"noise{noise}_", '')
            new_res[_new_r] = res[_r]
    if to_latex:
        print(pd.DataFrame(new_res).T.style.to_latex())
    outdf = pd.DataFrame(new_res).T.sort_index()
    def highlight_max(s):
        if s.dtype == object:
            is_max = [False for _ in range(s.shape[0])]
            if '±' in s[0]:
                nums = np.array(list(map(lambda x: float(x.split('±')[0]), s)))
                is_max = nums == nums.max()
        else:
            is_max = s == s.max()
        return ['color: green' if cell else '' for cell in is_max]
    
    if outdf[outdf.keys()[0]].dtype == object:
        return outdf.style.apply(highlight_max), outdf
    return outdf.style.format('{:.2f}').apply(highlight_max), outdf

def display_all_dfs(df_dict, dataset, noise=(1, 2, 3)):
    """Display the styled results table of `dataset` for each noise level.

    Args:
        df_dict: results mapping accepted by get_df_by_dataset (its `res`).
        dataset: dataset name to select.
        noise: iterable of noise levels; one table is shown per level. The
            default is a tuple so that no mutable default is shared.
    """
    # IPython.display is the public import path; importing display from
    # IPython.core.display has been deprecated since IPython 7.14.
    from IPython.display import display as _display
    for n in noise:
        print(f"dataset={dataset}, noise={n}")
        _display(get_df_by_dataset(res=df_dict, dataset=dataset, noise=n)[0])

In [16]:
means, stds, comb = get_result_dfs(best_runs)

In [32]:
display_all_dfs(df_dict=comb, dataset='arnetminer', noise=[1,2,3])

dataset=arnetminer, noise=1


Unnamed: 0,train_time,inf_time_hac,inf_time_cc,inf_time_cc-nosdp,b3_f1_hac,b3_f1_cc,b3_f1_cc-fixed,b3_f1_cc-nosdp,b3_f1_cc-nosdp-fixed,vmeasure_hac,vmeasure_cc,vmeasure_cc-fixed,vmeasure_cc-nosdp,vmeasure_cc-nosdp-fixed
e2e,740.2±261.72,31.6±13.92,49.6±75.06,0.8±0.4,58.66±15.68,53.74±13.85,52.04±13.97,54.06±16.29,52.34±15.99,74.7±6.28,74.2±5.94,73.67±5.86,74.49±6.34,73.93±6.1
e2e-nosdp,71.0±24.31,34.4±15.78,34.6±39.27,1.6±2.06,59.72±13.28,59.96±11.8,60.64±11.04,59.5±12.36,60.72±11.6,75.14±5.69,75.37±5.85,74.8±5.8,75.33±5.96,74.74±5.74
frac,945.6±325.63,45.0±29.99,45.4±47.88,0.6±0.49,59.68±14.74,56.98±13.28,56.94±13.11,56.26±14.38,55.9±13.77,74.52±5.93,75.16±5.9,72.3±6.66,74.88±6.13,72.01±6.77
frac-nosdp,100.0±5.18,68.2±30.51,49.6±68.8,2.2±1.72,63.6±11.85,59.92±12.6,50.82±14.87,59.76±12.74,50.0±15.7,71.96±10.58,72.15±10.22,72.76±6.12,72.09±10.33,72.46±5.86
mlp,176.4±29.86,31.6±13.4,37.6±38.6,1.0±0.89,65.58±9.54,61.6±10.24,60.26±11.42,61.64±10.3,59.8±11.67,79.32±5.76,78.09±5.26,76.72±4.46,78.38±5.33,76.65±4.52


dataset=arnetminer, noise=2


Unnamed: 0,train_time,inf_time_hac,inf_time_cc,inf_time_cc-nosdp,b3_f1_hac,b3_f1_cc,b3_f1_cc-fixed,b3_f1_cc-nosdp,b3_f1_cc-nosdp-fixed,vmeasure_hac,vmeasure_cc,vmeasure_cc-fixed,vmeasure_cc-nosdp,vmeasure_cc-nosdp-fixed
e2e,657.4±304.23,34.0±13.91,33.4±48.69,1.8±1.94,58.18±15.28,58.12±15.13,53.9±17.03,58.1±15.12,54.02±16.86,74.06±6.03,74.27±6.11,74.51±4.36,74.12±5.97,73.28±5.21
e2e-nosdp,68.0±24.4,192.2±202.9,25.2±28.13,0.4±0.49,58.52±15.16,59.06±15.37,51.72±18.1,57.98±15.12,51.84±18.36,74.56±5.91,74.96±6.01,73.77±5.71,73.98±6.0,73.89±5.84
frac,636.0±125.81,70.6±83.34,55.6±75.12,1.4±1.85,57.8±14.89,58.38±15.04,41.58±12.32,57.92±15.09,42.54±12.38,73.55±5.44,74.4±5.97,71.08±5.72,73.76±5.84,70.9±5.6
frac-nosdp,50.4±8.09,73.0±37.92,35.6±34.27,0.8±0.75,60.22±13.18,63.98±10.86,60.9±14.49,63.64±10.88,60.88±14.41,75.33±6.25,76.15±6.61,73.7±6.24,76.01±6.58,73.11±6.63
mlp,106.6±8.33,32.8±13.11,57.2±70.65,0.6±0.49,63.7±9.69,50.16±9.62,48.18±9.6,49.18±9.68,47.48±9.39,76.87±5.11,76.22±4.83,75.38±4.72,75.68±4.98,74.85±4.82


dataset=arnetminer, noise=3


Unnamed: 0,train_time,inf_time_hac,inf_time_cc,inf_time_cc-nosdp,b3_f1_hac,b3_f1_cc,b3_f1_cc-fixed,b3_f1_cc-nosdp,b3_f1_cc-nosdp-fixed,vmeasure_hac,vmeasure_cc,vmeasure_cc-fixed,vmeasure_cc-nosdp,vmeasure_cc-nosdp-fixed
e2e,471.0±101.6,29.6±13.32,24.6±28.71,0.8±0.75,57.62±14.78,57.78±14.93,37.64±10.7,58.14±14.98,40.48±9.57,73.82±5.61,74.21±5.95,66.08±4.57,74.15±5.88,66.65±4.53
e2e-nosdp,68.2±15.39,203.6±165.85,27.6±35.85,1.0±1.1,58.34±14.5,58.34±14.85,51.84±17.28,58.36±14.88,52.0±17.53,74.38±5.38,74.13±5.39,72.88±5.54,74.11±5.44,72.98±5.56
frac,543.2±97.91,29.2±12.42,26.0±32.36,0.4±0.49,58.28±14.88,62.92±11.7,53.46±10.9,62.78±11.58,53.62±12.58,74.44±6.62,75.6±7.27,71.53±3.84,75.48±7.19,70.51±5.45
frac-nosdp,64.8±16.68,32.6±13.92,25.2±26.67,1.2±1.94,58.98±13.98,63.54±11.14,62.36±11.8,63.06±11.15,61.9±11.72,74.92±6.1,75.87±6.69,72.31±7.49,75.49±6.6,71.95±7.29
mlp,103.0±13.97,29.8±11.57,26.0±23.23,0.8±0.75,62.66±11.36,47.02±9.38,45.88±9.51,45.98±8.81,45.8±9.45,75.4±6.69,73.93±4.11,71.57±5.64,73.56±4.26,71.47±5.68
