In [1]:
# load a dataset of word-level predictions, and do some visualization to see what's going on

In [1]:
# now graph f1, precision, recall over threshold
from __future__ import division

%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

import pylab
pylab.rcParams['figure.figsize'] = (14.0, 12.0)
pylab.rcParams['axes.linewidth'] = 2. #set the value globally
params = {'legend.fontsize': 20}
#           'legend.linewidth': 2}
pylab.rcParams.update(params)

In [2]:
import codecs
import numpy
import os

from collections import defaultdict, Counter

In [3]:
OUTPUT_DIR = '/media/1tb_drive/imt_models/word_prediction_accuracy_experiments/en-de/exp_1'

source_output = os.path.join(OUTPUT_DIR,'sources.en')
prefix_output = os.path.join(OUTPUT_DIR,'prefixes.de')
suffix_output = os.path.join(OUTPUT_DIR,'suffixes.de')
prediction_output = os.path.join(OUTPUT_DIR,'predictions.de')
correct_output = os.path.join(OUTPUT_DIR,'prefix_word_prediction_acc.out')

In [4]:
def load_text_file(filename):
    with codecs.open(filename, encoding='utf8') as inp:
        return [l.split() for l in inp.read().strip().split('\n')]

In [5]:
sources = load_text_file(source_output)
prefixes = load_text_file(prefix_output)
suffixes = load_text_file(suffix_output)
predictions = load_text_file(prediction_output)
correct = load_text_file(correct_output)

In [6]:
word_occs = defaultdict(list)

# compute prior probability of word being correct when we predict it
for pred_words, correct_tags in zip(predictions, correct):
    assert len(pred_words) == len(correct_tags)
    for p_w, tag in zip(pred_words, correct_tags):
        word_occs[p_w].append(tag)

In [7]:
word_in_wrong_position = defaultdict(int)

# compute prior probability of word being correct when we predict it
for pred_words, suffix, correct_tags in zip(predictions, suffixes, correct):
    assert len(pred_words) == len(correct_tags) == len(suffix)
    ref_words = set(suffix)
    for p_w, tag in zip(pred_words, correct_tags):
        if p_w in ref_words and tag == 'False':
            word_in_wrong_position[p_w] += 1

In [8]:
word_tag_counts = {k: Counter(v) for k,v in word_occs.items()}
filtered_word_tag_counts = {k: v for k,v in word_tag_counts.items() if sum(v.values()) >= 500}

In [9]:
p_word_in_wrong_position = {}

for p_w, num_wrong_pos in word_in_wrong_position.items():
    if p_w in filtered_word_tag_counts:
        p_word_in_wrong_position[p_w] = num_wrong_pos / float(sum(filtered_word_tag_counts[p_w].values()))
    

In [11]:
w_pred_misprediction_lists = defaultdict(list)
w_ref_misprediction_lists = defaultdict(list)

for pred_words, suffix, correct_tags in zip(predictions, suffixes, correct):
    assert len(pred_words) == len(correct_tags) == len(suffix)
    for p_w, r_w, tag in zip(pred_words, suffix, correct_tags):
        w_pred_misprediction_lists[p_w].append(r_w)
        w_ref_misprediction_lists[r_w].append(p_w)

In [12]:
w_pred_misprediction_counts = {k: Counter(v) for k,v in w_pred_misprediction_lists.items()}
w_ref_misprediction_counts = {k: Counter(v) for k,v in w_ref_misprediction_lists.items()}

In [14]:
import json

In [19]:
for p_w, tag_counts in filtered_word_tag_counts.items()[:10]:
    num_instances = sum(tag_counts.values())
    print('WORD: {}'.format(p_w))
    print(json.dumps(w_pred_misprediction_counts[p_w].most_common()[:20], indent=2))
    print(json.dumps(w_ref_misprediction_counts[p_w].most_common()[:20], indent=2))
    

WORD: kann
[
  [
    "kann", 
    473
  ], 
  [
    "k\u00f6nnen", 
    89
  ], 
  [
    "ist", 
    44
  ], 
  [
    "soll", 
    40
  ], 
  [
    "sind", 
    40
  ], 
  [
    ".", 
    39
  ], 
  [
    "darf", 
    34
  ], 
  [
    "konnte", 
    26
  ], 
  [
    "l\u00e4\u00dft", 
    24
  ], 
  [
    "l\u00e4sst", 
    22
  ], 
  [
    "mag", 
    18
  ], 
  [
    ",", 
    14
  ], 
  [
    "nie", 
    14
  ], 
  [
    "registri@@", 
    12
  ], 
  [
    "hat", 
    11
  ], 
  [
    "haben", 
    8
  ], 
  [
    "und", 
    8
  ], 
  [
    "messen", 
    7
  ], 
  [
    "sch\u00f6pfen", 
    7
  ], 
  [
    "bietet", 
    6
  ]
]
[
  [
    "kann", 
    473
  ], 
  [
    "ist", 
    94
  ], 
  [
    ",", 
    42
  ], 
  [
    "wird", 
    38
  ], 
  [
    "k\u00f6nnte", 
    36
  ], 
  [
    "auch", 
    20
  ], 
  [
    "von", 
    19
  ], 
  [
    ".", 
    18
  ], 
  [
    "m\u00f6chte", 
    13
  ], 
  [
    "im", 
    9
  ], 
  [
    "k\u00f6nnen", 
    9
  ], 
  [
    "sehen"

In [40]:
len(filtered_word_tag_counts)

77

In [23]:
word_tag_accs = {k: v['True'] / float(sum(v.values())) for k,v in word_tag_counts.items()}

In [29]:
import cPickle

# now pickle an object with the word priors
cPickle.dump(word_tag_accs, open(os.path.join(OUTPUT_DIR, 'word_accuracy_priors.pkl'), 'w'))
print('DUMPED word accuracy priors')

DUMPED word accuracy priors


In [24]:
[(i, sum(filtered_word_tag_counts[i[0]].values())) for i in sorted(word_tag_accs.items(), key=lambda x: x[1])][-500:]

KeyError: u'Sam@@'

In [None]:

# NOTES:
# how often was the word in the reference, just not in the position we predicted it at?
# look at which words have low recall -- are these being mistaken for some other word?
# significance values are important -- some words are very rare, so precision will be misleading

# EXPERIMENTS
# what is the performance of the prior word model on new data?
# (1) BASELINE: for every (source, prefix, prediction, suffix), check which word I predicted, then label it as True/False,
#   based upon which word I predicted first after the suffix
# (2) BASELINE 2: for every (source, prefix, prediction, suffix), look at the softmax layer output. When the probability
#   of the argmax word is close to the probablility of the next-best word, the translation model is implicitly not confident