In [163]:
import sys
import os
# Work from the project checkout: the relative paths used later in this
# notebook ('./scripts', 'corpora/...') are resolved against it.
# The original machine-specific location stays the default; set the
# RNNG_DEV_ROOT environment variable to run from a different checkout.
os.chdir(os.environ.get('RNNG_DEV_ROOT', '/home/dfried/projects/rnng-dev'))
# Guarded so that re-running this cell does not keep appending duplicates.
if './scripts' not in sys.path:
    sys.path.append('./scripts')

In [22]:
from __future__ import print_function
import numpy as np
import json
import sys
from evaluate import eval_b
from remove_dev_unk import get_tags_tokens_lowercase
from collections import namedtuple

In [9]:
def flatten(lol):
    """Concatenate the items of every inner iterable of `lol` into one list."""
    flat = []
    for inner in lol:
        flat.extend(inner)
    return flat


def parse_candidate_line(line):
    """Split one '|||'-delimited line into an (int, float, str) triple.

    The first three fields are returned in order (the caller in this file
    names them index, score and parse). Whitespace around each field is
    stripped and any fields after the third are ignored.
    """
    fields = [field.strip() for field in line.strip().split('|||')]
    return int(fields[0]), float(fields[1]), fields[2]


def parse_candidate_file(fname):
    """Yield the candidates in `fname` grouped by sentence index.

    Each line is parsed with `parse_candidate_line`. Consecutive lines that
    share the same index form one group, yielded as a list of
    (index, score, parse) tuples. Blank lines are skipped and an empty file
    yields nothing.

    Once the file is exhausted, a warning is written to stderr if the groups
    do not all contain the same number of candidates.
    """
    parse_counts = []
    last_ix = None
    sent_parses = []
    # Context manager so the handle is closed deterministically instead of
    # being left to the garbage collector.
    with open(fname) as f:
        for line in f:
            if not line.strip():
                # A blank line (e.g. a stray trailing newline) carries no
                # candidate; parsing it would fail in int('').
                continue
            ix, score, parse = parse_candidate_line(line)
            if last_ix is not None and ix != last_ix:
                parse_counts.append(len(sent_parses))
                yield sent_parses
                sent_parses = []
            sent_parses.append((ix, score, parse))
            last_ix = ix
    if sent_parses:
        # Record the final group too, so the uniformity check below covers
        # every sentence rather than all but the last one.
        parse_counts.append(len(sent_parses))
        yield sent_parses
    if any(x != parse_counts[0] for x in parse_counts):
        sys.stderr.write("warning: not all sents have same number of parses!\n")

In [10]:
# def rescore(all_indices_proposals_and_parses, all_scores1, all_scores2, lambda1, lambda2, ref_file, out_file):
#     with open(out_file, 'w') as f:
#         for ipp, scores1, scores2 in zip(all_indices_proposals_and_parses, all_scores1, all_scores2):
#             (ix, proposal_score, parse), score1, score2 = max(zip(ipp, scores1, scores2),
#                         key=lambda ((ix, proposal_score, parse), score1, score2): (1 - lambda1 - lambda2) * proposal_score + lambda1 * score1 + lambda2 * score2)
#             f.write(parse)
#             f.write("\n")
#     return eval_b(ref_file, out_file)

In [28]:
def parse_pairs(lst):
    """Build a dict from a flat [key, value, key, value, ...] sequence.

    Raises AssertionError if the sequence has odd length or a key repeats.
    """
    assert len(lst) % 2 == 0
    mapping = {}
    # Even positions hold the keys, odd positions the matching values.
    for key, value in zip(lst[0::2], lst[1::2]):
        assert key not in mapping
        mapping[key] = value
    return mapping


def same_length(xs):
    """Return True when every item of `xs` has the same len().

    `xs` may be any iterable (it is materialised first). An empty or
    single-item input is trivially True.
    """
    items = list(xs)
    head, rest = items[:1], items[1:]
    # Compare every later item against the first; nothing to compare when
    # `rest` is empty, so the first item's length is never taken in that case.
    mismatched = [item for item in rest if len(item) != len(head[0])]
    return not mismatched

In [15]:
# Flat list of alternating entries: a model name followed by the path of that
# model's candidate file. It is converted into a dict with parse_pairs.
model_names_and_candidate_files = [
    'topdown', '../rescore/topdown_bert_large_bs=32_lr=2e-5_adam_patience=2_seed=2_best-epoch-12_it-5394-f1-95.59_model-dev-100.txt',
    'inorder', '../rescore/inorder_bert_large_bs=32_lr=2e-5_adam_patience=2_best-epoch-14_it-6121-f1-95.71_model-dev-100.txt'
]

In [14]:
# Extra model names that have no candidate file of their own here; they are
# merged into model_names, and the loading loop reads their scores from
# "<candidate file>.<model name>.scores".
additional_rescoring_models = ['chartlarge', 'chartensemble']

In [16]:
# Dict: model name -> path of the candidate file produced for that model.
candidate_files = parse_pairs(model_names_and_candidate_files)

In [18]:
# Alphabetically sorted names of every model that contributes a score column.
model_names = sorted(list(candidate_files) + additional_rescoring_models)

In [19]:
model_names

['chartensemble', 'chartlarge', 'inorder', 'topdown']

In [21]:
import pandas

In [64]:
import tqdm

In [65]:
tqdm.tqdm?

In [70]:
# Build one flat record per (generating model, sentence, candidate position).
# Each record stores the candidate's parse string plus one score per model in
# model_names; the list is turned into a DataFrame in a later cell.
all_candidates = []

# Number of sentence groups read from every file. Files are indexed with
# range(num_sentences), so each must contain at least this many groups.
num_sentences = 1700

for this_model, this_base_fname in candidate_files.items():
    # model name -> list (one entry per sentence) of candidate lists.
    # this_model's entry is read from its own candidate file ...
    this_candidates = {
        this_model: list(parse_candidate_file(this_base_fname))
    }
    # ... and every other model's entry from "<candidate file>.<model>.scores",
    # which is parsed with the same line format.
    for other_model in [model for model in model_names if model != this_model]:
        assert other_model not in this_candidates
        other_fname = "{}.{}.scores".format(this_base_fname, other_model)
        this_candidates[other_model] = list(parse_candidate_file(other_fname))
    # Every file must contain the same number of sentence groups.
    assert same_length(list(this_candidates.values()))
    for sentence_index in tqdm.tqdm(range(num_sentences), desc=this_model, ncols=80):
        sentence_candidates_for_all_models = {
            model: model_candidates[sentence_index] 
            for model, model_candidates in this_candidates.items()
        }
        # Every model must list the same number of candidates for this sentence.
        assert same_length(sentence_candidates_for_all_models.values())
        # NOTE(review): only referenced by the commented-out token check below.
        this_tokens = None
        for candidate_index in range(len(next(iter(sentence_candidates_for_all_models.values())))):
            record = {
                'source': (this_model, candidate_index),
                'sentence_index': sentence_index,
            }
            cand_parse = None
            for model, cands in sentence_candidates_for_all_models.items():
                ix, score, parse = cands[candidate_index]
                assert ix == sentence_index
                record[model] = score
                # NOTE(review): cand_tokens is never read now that the check
                # below is disabled; the call may still act as a sanity check
                # on the parse string -- confirm before removing it.
                cand_tokens = get_tags_tokens_lowercase(parse)[1]
#                 if this_tokens is not None:
#                     assert cand_tokens == this_tokens
#                 else:
#                     this_tokens = cand_tokens
                # All files must carry the identical parse string at this
                # (sentence, candidate) position.
                if cand_parse is not None:
                    assert parse == cand_parse
                else:
                    cand_parse = parse
            record['parse'] = cand_parse
            all_candidates.append(record)

topdown: 100%|██████████████████████████████| 1700/1700 [01:07<00:00, 25.02it/s]
inorder: 100%|██████████████████████████████| 1700/1700 [01:07<00:00, 25.05it/s]


In [71]:
import pandas

In [195]:
# One row per record in all_candidates; the columns come from the record keys
# ('source', 'sentence_index', 'parse' and one score column per model).
candidates_frame = pandas.DataFrame(all_candidates)

In [126]:
# Row mask: True where at least one model's score column holds -inf.
bad_candidates = (candidates_frame[model_names] == float('-inf')).any(axis=1)

In [196]:
# Report how many rows are flagged, then keep only the unflagged ones.
print("removing {} candidates which some model couldn't score".format(int(bad_candidates.sum())))
# `~` is the boolean-mask inversion operator; unary minus only works because
# pandas special-cases boolean data. The .copy() makes the filtered frame
# independent of the unfiltered one, because later cells assign columns on it
# in place.
candidates_frame = candidates_frame[~bad_candidates].copy()

removing 1 candidates which some model couldn't score


In [128]:
def min_max_norm_col(df, col_name):
    """Min-max normalise column `col_name` of `df` in place.

    Values are mapped to (x - min) / (max - min), i.e. onto [0, 1]. A column
    whose values are all equal has a zero range; it is set to 0.0 rather than
    being turned into NaN by a 0/0 division. Returns None; `df` is modified.
    """
    column = df[col_name]
    low, high = column.min(), column.max()
    span = high - low
    if span == 0:
        # Constant column: every value sits at the minimum.
        df[col_name] = 0.0
    else:
        df[col_name] = (column - low) / span

In [204]:
# Rescale every model's score column to [0, 1] in place -- presumably so the
# weighted sum computed in rank_candidates mixes scores on a comparable scale.
for model in model_names:
    min_max_norm_col(candidates_frame, model)

In [130]:
#candidates_frame.set_index(['sentence_index', 'source'])

In [198]:
def rank_candidates(frame, model_weights, name='aggregate_score'):
    """Pick the best-scoring candidate row for every sentence.

    Args:
        frame: DataFrame with a 'sentence_index' column and one score column
            per key of `model_weights`.
        model_weights: dict mapping model (column) name -> weight.
        name: column that receives the weighted sum of the model scores.

    Returns:
        A DataFrame with one row per sentence_index (the top-ranked one),
        ordered by sentence_index.

    Side effect: column `name` is written to `frame` itself.
    """
    frame[name] = sum(frame[model] * weight for model, weight in model_weights.items())
    # Rank by the aggregate first; ties fall back to the individual model
    # scores, taken in alphabetical model order.
    keys = [name] + sorted(model_weights.keys())
    # Sort the frame that was passed in, not the notebook-global one.
    ranked = frame.sort_values(keys, ascending=[False] * len(keys))
    # After the descending sort, the first row kept per sentence is its best.
    return ranked.drop_duplicates('sentence_index').sort_values('sentence_index')

In [199]:
def evaluate_candidates(ref_file, candidates, out_file):
    """Write `candidates` to `out_file` and score that file against `ref_file`.

    The candidates are written newline-separated (no trailing newline) and
    the result of eval_b(ref_file, out_file) is returned.
    """
    with open(out_file, 'w') as sink:
        serialized = '\n'.join(candidates)
        sink.write(serialized)
    scores = eval_b(ref_file, out_file)
    return scores

In [203]:
def evaluate_weights(weights):
    """Rank candidates_frame with `weights` and evaluate the selected parses.

    The chosen parses are written to '/tmp/ranked.out' and evaluated against
    'corpora/english/dev.stripped' through evaluate_candidates.
    """
    selected_parses = rank_candidates(candidates_frame, weights)['parse']
    return evaluate_candidates('corpora/english/dev.stripped', selected_parses, '/tmp/ranked.out')

In [210]:
import numpy as np
import itertools

In [224]:
# Grid resolution: weights take the values 0, 1/num_ticks, ..., 1.
num_ticks = 10
weight_ticks = np.array([tick / float(num_ticks) for tick in range(num_ticks + 1)])

In [225]:
# Grid search over model weights that sum to 1. The weights of
# model_names[1:] are enumerated from weight_ticks, and model_names[0]
# receives the remainder. One result row per evaluated combination is
# collected in `acc`.
acc = []
weight_prod = list(itertools.product(weight_ticks, repeat=len(model_names) - 1))
for other_weights in tqdm.tqdm(weight_prod, ncols=80):
    weight_sum = sum(other_weights)
    # Summing floats can overshoot 1.0 by a few ulps. The small tolerance
    # keeps valid combinations from being dropped by rounding noise.
    if weight_sum > 1.0 + 1e-9:
        continue
    weights = {
        # Round away float noise so the remainder lands on a grid value
        # (e.g. exactly 0.0), and clamp so it can never be negative.
        model_names[0]: max(0.0, round(float(1.0 - weight_sum), 9))
    }
    assert len(model_names[1:]) == len(other_weights)
    for other_model, other_weight in zip(model_names[1:], other_weights):
        weights[other_model] = other_weight
    results = evaluate_weights(weights)
    # One row: the weights used plus every field of the evaluation result.
    data = dict(weights)
    data.update(results._asdict())
    acc.append(data)

100%|███████████████████████████████████████| 1331/1331 [04:06<00:00,  5.40it/s]


In [226]:
pandas.DataFrame(acc).sort_values('f1')

Unnamed: 0,chartensemble,chartlarge,complete_match,f1,inorder,precision,recall,topdown
285,0.0,1.0,14.18,94.29,0.0,92.79,95.84,0.0
282,0.1,0.9,14.47,94.31,0.0,92.79,95.88,0.0
251,0.4,0.6,14.29,94.31,0.0,92.77,95.91,0.0
166,0.7,0.3,13.76,94.32,0.0,92.75,95.94,0.0
266,0.3,0.7,14.41,94.34,0.0,92.80,95.93,0.0
230,0.5,0.5,14.41,94.34,0.0,92.79,95.94,0.0
202,0.6,0.4,14.35,94.34,0.0,92.79,95.94,0.0
276,0.2,0.8,14.47,94.35,0.0,92.82,95.93,0.0
121,0.8,0.2,13.76,94.35,0.0,92.79,95.98,0.0
0,1.0,0.0,13.94,94.38,0.0,92.82,96.00,0.0


In [227]:
len(acc)

286

In [229]:
# Persist the grid-search rows, sorted by ascending 'f1', as CSV. The row
# index is written as the first column.
pandas.DataFrame(acc).sort_values('f1').to_csv("/tmp/grid_search_ensemble.csv")