In [1]:
# Evaluate the baseline model -- simply the prior probability of the predicted word being correct
# How well can we do with just this simple model?

# THOUGHTS:
# - the assumption is that the word confidence prediction model was trained on data that is similar to the 
#   data used to train the system producing the translations

In [2]:
from __future__ import print_function, division

import cPickle
import os

import codecs
import numpy as np

from nn_imt.evaluation import imt_f1

Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 5103)


In [3]:
# Directory of the en-de word-prediction-accuracy experiment whose priors are evaluated here.
EXPERIMENT_DIR = '/media/1tb_drive/imt_models/word_prediction_accuracy_experiments/en-de/exp_1'
# Pickle file inside EXPERIMENT_DIR; it is unpickled further down and used as a
# word -> prior lookup by the pruning code.
word_acc_priors_file = os.path.join(EXPERIMENT_DIR, 'word_accuracy_priors.pkl')

In [4]:
# Evaluation set selector. Only one newstest set is active at a time; the 2014 and
# 2015 directory layouts differ only by the year, so switch sets by changing
# EVAL_YEAR (2014 or 2015) instead of keeping two copies of the path block.
EVAL_YEAR = 2015

BASEDIR = '/media/1tb_drive/imt_models/newstest_{}_evaluation/'.format(EVAL_YEAR)
_FILE_PREFIX = 'newstest{}.de.500.bpe'.format(EVAL_YEAR)

hyps_file = os.path.join(BASEDIR, _FILE_PREFIX + '.imt-hyps.out')
glimpse_file = os.path.join(BASEDIR, _FILE_PREFIX + '.imt-glimpses.out')
source_file = os.path.join(BASEDIR, _FILE_PREFIX + '.imt-sources.out')
# The reference file carries the same name in both evaluation directories.
refs_file = os.path.join(BASEDIR, 'reference_suffixes.generated')

In [5]:
def get_lines(filename, lower=False, cutoff=None):
    """Read a UTF-8 text file as a list of whitespace-tokenized lines.

    The whole file content is stripped of leading/trailing whitespace before
    it is split on '\\n', so blank lines at the very start or end of the file
    are dropped, while blank lines in the middle come back as empty lists.
    An empty file therefore yields ``[[]]``.

    Args:
        filename: path of the file to read.
        lower: when true, every token is lowercased.
        cutoff: keep only the first ``cutoff`` lines; ``None`` keeps all.

    Returns:
        A list with one list of tokens per line.
    """
    with codecs.open(filename, encoding='utf8') as handle:
        content = handle.read()

    rows = content.strip().split('\n')
    tokenized = [row.split() for row in rows][:cutoff]
    if not lower:
        return tokenized
    return [[token.lower() for token in row] for row in tokenized]

In [6]:
def avg_imtF1(hyps, refs):
    """Average the per-sentence ``imt_f1`` scores over a set of hypotheses.

    ``hyps`` and ``refs`` are paired positionally with ``zip``, so pairing
    stops at the shorter of the two. Each pair is scored with ``imt_f1``,
    which must return three values per pair; each of the three is averaged
    with ``np.mean`` across all pairs.

    Returns:
        A 3-tuple of means, in the order ``imt_f1`` returns its values
        (presumably F1, precision, recall -- confirm against nn_imt).
    """
    per_sentence = [imt_f1(hyp, ref) for hyp, ref in zip(hyps, refs)]
    first_col, second_col, third_col = zip(*per_sentence)
    return np.mean(first_col), np.mean(second_col), np.mean(third_col)

In [7]:
# Load hypotheses, references and sources as lists of token lists (one per line).
hyp_lines = get_lines(hyps_file)
ref_lines = get_lines(refs_file)
source_lines = get_lines(source_file)
# glimpses = np.load(glimpse_file)  # disabled: not used by the prior-based pruning

# Materialize the (source, hyp, ref) triples as a list: trans_tups is iterated
# once per threshold further down, and a bare zip() is a one-shot iterator under
# Python 3 that would be empty after its first use. On Python 2 this is a no-op.
# NOTE: zip stops at the shortest input, so files of unequal length are
# silently truncated to the shortest one.
trans_tups = list(zip(source_lines, hyp_lines, ref_lines))

In [66]:
# Read each hyp left to right; stop at the first word whose prior accuracy is below the threshold
def prune_hyps_with_prior(threshold, tups, prior_prob_index):
    """Cut each hypothesis at the first word whose prior is too low.

    Every hypothesis is scanned left to right. A word that is present in
    ``prior_prob_index`` is kept as long as its prior is ``>= threshold``;
    the first indexed word that fails this test ends the hypothesis, and
    that word plus everything after it is dropped. Words that are missing
    from ``prior_prob_index`` are always kept.

    Args:
        threshold: minimum prior a known word needs in order to be kept.
        tups: iterable of ``(source, hyp, ref)`` triples; only ``hyp`` (a
            sequence of words) is used.
        prior_prob_index: mapping from word to its prior value.

    Returns:
        A list with one pruned hypothesis (list of words) per input triple,
        in input order.
    """
    def _keep_prefix(words):
        kept = []
        for word in words:
            # Words without a prior are never a reason to stop.
            if word in prior_prob_index and not prior_prob_index[word] >= threshold:
                break
            kept.append(word)
        return kept

    return [_keep_prefix(hyp) for _, hyp, _ in tups]

In [67]:
# Load the word -> prior lookup. The file is opened in binary mode (pickles are
# byte streams) and inside a `with` block so the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code -- only load prior files you trust.
with open(word_acc_priors_file, 'rb') as priors_in:
    word_prior_map = cPickle.load(priors_in)

In [77]:
# Prune every hypothesis at a single threshold of 0.1; `t` holds one pruned
# token list per (source, hyp, ref) triple.
t = prune_hyps_with_prior(0.1, trans_tups, word_prior_map)

In [79]:
# Average scores of the pruned hypotheses in `t` against the references.
# Stored under pruned_* so they are not mistaken for the unpruned raw_*
# baseline that is computed from hyp_lines.
pruned_f1s, pruned_ps, pruned_rs = avg_imtF1(t, ref_lines)
pruned_f1s

0.16798375890368175

In [78]:
# Eyeball the first ten pruned hypotheses.
t[:10]

[[u'Indien', u'und', u'Japan'],
 [],
 [u'Indiens',
  u'und',
  u'Japans',
  u'treffen',
  u'sich',
  u'in',
  u'Tokio',
  u'</S>'],
 [u'und', u'Japans', u'treffen', u'sich', u'in', u'Tokio', u'</S>'],
 [u'Japans', u'treffen', u'sich', u'in', u'Tokio', u'</S>'],
 [u'treffen', u'sich', u'in', u'Tokio', u'</S>'],
 [u'sich', u'in', u'Tokio', u'</S>'],
 [u'in', u'Tokio', u'</S>'],
 [u'Tokio', u'</S>'],
 [u'</S>']]

In [80]:
# Baseline for comparison: the same averaged scores on the unpruned hypotheses.
raw_f1s, raw_ps, raw_rs = avg_imtF1(hyp_lines, ref_lines)
raw_f1s

0.16918903995065657

In [10]:
# 50 evenly spaced thresholds from 0.0 to 1.0, both ends included.
thresholds = np.linspace(0.0, 1.0, num=50)

# One pruned copy of the full hypothesis set per threshold, in threshold order.
pruned_sets = [prune_hyps_with_prior(thresh, trans_tups, word_prior_map) for thresh in thresholds]
# Scoring of every pruned set is currently disabled:
# t_f1s, t_ps, t_rs = zip(*[avg_imtF1(pruned_set, ref_lines) for pruned_set in pruned_sets])

In [11]:
# Averaged scores of the unpruned hypotheses (the reference point for the sweep).
raw_f1s, raw_ps, raw_rs = avg_imtF1(hyp_lines, ref_lines)

In [12]:
# Display the first of the three averaged baseline scores.
raw_f1s

0.16918903995065657

In [5]:
# Spot-check the stored prior for a single token.
word_prior_map['der']

0.5324615526291555