In [1]:
import codecs
import json
import os

import numpy as np
from sklearn.metrics import f1_score

from sequence_qe.evaluation import non_matching_words_are_bad, reduce_to_binary_labels

In [2]:
# Location of the JSON dev log to evaluate. Set the QE_DEV_LOG environment variable to
# point at a different file; the original machine-specific path is kept as the fallback
# so existing runs behave exactly as before.
json_dev_log = os.environ.get(
    'QE_DEV_LOG',
    '/home/chris/Desktop/Dropbox/data/qe/experiments/sample_json_logs/dev_6500.out',
)

In [3]:
# Read the JSON log inside a context manager so the file handle is closed as soon as
# the content has been parsed (a bare codecs.open(...).read() leaves it open).
with codecs.open(json_dev_log, encoding='utf8') as dev_log_file:
    dev_results = json.loads(dev_log_file.read())

# Each result stores whitespace-separated token strings under 'pred', 'mt' and 'output';
# split them into per-sentence token lists. 'output' is used as the reference below.
dev_preds = [result['pred'].split() for result in dev_results]
dev_mt = [result['mt'].split() for result in dev_results]
dev_true = [result['output'].split() for result in dev_results]

In [4]:
# Cut every (prediction, MT, reference) triple at the position of the first EOS token
# found in the MT sequence. Triples whose MT sequence has no EOS token are kept as-is.
global_eos_token = u'</S>'

no_eos_preds, no_eos_mt, no_eos_true = [], [], []
for pred_seq, mt_seq, true_seq in zip(dev_preds, dev_mt, dev_true):
    try:
        cut = mt_seq.index(global_eos_token)
    except ValueError:
        # No EOS in the MT tokens: leave all three sequences untouched.
        pass
    else:
        # The MT-derived index is applied to all three sequences alike.
        pred_seq, mt_seq, true_seq = pred_seq[:cut], mt_seq[:cut], true_seq[:cut]
    no_eos_preds.append(pred_seq)
    no_eos_mt.append(mt_seq)
    no_eos_true.append(true_seq)

# From here on the dev_* names refer to the EOS-trimmed sequences.
dev_preds, dev_mt, dev_true = no_eos_preds, no_eos_mt, no_eos_true
    

In [5]:
def accuracy(preds, truth, eos_token=u'</S>'):
    """Return the token-level accuracy of ``preds`` measured against ``truth``.

    Sequences are compared pairwise. If a reference sequence contains
    ``eos_token``, both it and its prediction are truncated just before the
    first occurrence, so the EOS token and anything after it are not scored.

    Args:
        preds: iterable of predicted token sequences (lists).
        truth: iterable of reference token sequences, aligned with ``preds``.
        eos_token: token marking the end of a reference sequence.

    Returns:
        Fraction of scored positions where prediction and reference agree,
        as a float. Returns 0.0 when there are no positions to score (empty
        input, or every reference starts with ``eos_token``) instead of
        raising ``ZeroDivisionError``.

    Raises:
        AssertionError: if a prediction and its reference differ in length.
    """
    correct = 0
    total = 0
    for p, t in zip(preds, truth):
        assert len(p) == len(t), \
            'prediction/reference length mismatch: %d != %d' % (len(p), len(t))
        if eos_token in t:
            eos_idx = t.index(eos_token)
            p = p[:eos_idx]
            t = t[:eos_idx]

        correct += sum(1 for p_w, t_w in zip(p, t) if p_w == t_w)
        total += len(p)

    # Nothing was scored: avoid dividing by zero.
    if total == 0:
        return 0.0
    return float(correct) / float(total)



In [6]:
# Token-level accuracy of the EOS-trimmed predictions against the reference sequences.
accuracy(dev_preds, dev_true)

0.6730038022813688

In [7]:
# Post-process the predictions using the MT tokens.
# NOTE(review): judging by the helper's name, predicted tokens that do not match the
# corresponding MT token are presumably relabelled as BAD — confirm in sequence_qe.evaluation.
mapped_preds = non_matching_words_are_bad(dev_mt, dev_preds)

In [8]:
# Accuracy of the post-processed (mapped) predictions against the same references.
accuracy(mapped_preds, dev_true)

0.6869455006337135

In [9]:
# Apply the same label reduction to predictions and references so both use one tag set.
# NOTE(review): presumably this collapses every token to OK/BAD (the tag_map used later
# only has those two keys) — confirm in sequence_qe.evaluation.
reduced_preds = reduce_to_binary_labels(mapped_preds)
reduced_true = reduce_to_binary_labels(dev_true)

In [10]:
# Accuracy computed on the reduced label sequences.
accuracy(reduced_preds, reduced_true)

0.6869455006337135

In [11]:
# Integer encoding of the two tags; any other tag raises KeyError during flattening.
tag_map = {u'OK': 0, u'BAD': 1}

# Flatten the per-sentence tag lists into one long list of integer labels,
# keeping sentence order and token order within each sentence.
flat_preds = []
for pred_sentence in reduced_preds:
    flat_preds.extend([tag_map[tag] for tag in pred_sentence])

flat_true = []
for true_sentence in reduced_true:
    flat_true.extend([tag_map[tag] for tag in true_sentence])

In [12]:
# average=None makes f1_score return one F1 value per class instead of a single
# aggregate. With tag_map's encoding the classes are 0 (OK) and 1 (BAD).
f1_scores = f1_score(flat_true, flat_preds, average=None)
f1_scores

array([ 0.79226241,  0.36503856])

In [13]:
# Combine the per-class F1 scores into a single figure by multiplying them.
# np.prod is used because np.product was a deprecated alias that NumPy 2.0 removed.
f1_product = np.prod(f1_scores)
f1_product

0.28920632792889395