In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [201]:
import ast
import json
import os
import sys
from collections import defaultdict
from glob import glob

import numpy as np
from IPython.core.display import HTML

from metrics import accuracy, mean_multiset_dice
from tuna import TunaCorpus
from utils import confidence_interval

In [7]:
def load_predictions(run_dir):
    """Load every ``predictions.*.jsons`` file found directly in ``run_dir``.

    Each file is read as one JSON object per line.

    Args:
        run_dir: directory containing the prediction files.

    Returns:
        A list with one entry per file; each entry is the list of decoded
        JSON objects from that file.  Files are visited in sorted filename
        order so that the i-th split of one run lines up with the i-th split
        of another run when callers ``zip`` them together.  The order is
        lexicographic, so ``predictions.10.jsons`` sorts before
        ``predictions.2.jsons``.  An empty list is returned when no file
        matches.
    """
    result = []
    # glob() returns matches in arbitrary, filesystem-dependent order; sort
    # to make the split order deterministic and comparable across runs.
    for filename in sorted(glob(os.path.join(run_dir, 'predictions.*.jsons'))):
        with open(filename, 'r') as infile:
            result.append([json.loads(line) for line in infile])
    return result

In [314]:
def evaluate(preds, metrics):
    """Score each split separately and summarise every metric across splits.

    Args:
        preds: list of splits; each split is a list of dicts carrying
            'prediction' and 'gold' entries.
        metrics: callables taking (predictions, golds) for one split and
            returning a number.

    Returns:
        A comma-separated string with one 'mean (low--high)' entry per
        metric, where low/high are the two values produced by
        ``confidence_interval`` on the per-split scores.
    """
    # Phase 1: one array of per-split scores for each metric.
    per_metric_scores = []
    for metric in metrics:
        split_scores = []
        for split in preds:
            predicted = [ex['prediction'] for ex in split]
            expected = [ex['gold'] for ex in split]
            split_scores.append(metric(predicted, expected))
        per_metric_scores.append(np.array(split_scores))

    # Phase 2: render mean and interval bounds for each metric.
    rendered = []
    for scores in per_metric_scores:
        center = scores.mean()
        bounds = tuple(confidence_interval(scores))
        rendered.append('%.3f (%.3f--%.3f)' % ((center,) + bounds))
    return ', '.join(rendered)

def evaluate_mean(preds, metrics):
    """Score all splits pooled together and format each metric's result.

    Args:
        preds: list of splits; each split is a list of dicts carrying
            'prediction' and 'gold' entries.
        metrics: callables taking (predictions, golds) over the pooled
            examples and returning a 3-tuple that fits '%d / %d (%6.3f)'.

    Returns:
        A comma-separated string with one formatted entry per metric.
    """
    results = []
    for metric in metrics:
        # Flatten the splits afresh for every metric so each one receives
        # its own lists.
        pooled_predictions = []
        pooled_golds = []
        for split in preds:
            for ex in split:
                pooled_predictions.append(ex['prediction'])
        for split in preds:
            for ex in split:
                pooled_golds.append(ex['gold'])
        results.append(metric(pooled_predictions, pooled_golds))

    return ', '.join(['%d / %d (%6.3f)' % triple for triple in results])

def is_loc(gold):
    """Return True if any attribute in ``gold`` mentions 'dimension'.

    Args:
        gold: either a sequence of attribute strings, or the string form of
            a Python list literal of such strings
            (e.g. "['colour:green', 'y-dimension:1']").

    Returns:
        bool: True when at least one attribute contains 'dimension'.

    Raises:
        ValueError / SyntaxError: if ``gold`` is a string containing '[' that
            is not a valid Python literal.
    """
    if '[' in gold:
        # The serialized form is a plain list literal; literal_eval parses it
        # without executing arbitrary code the way eval() would.
        gold = ast.literal_eval(gold)
    return any('dimension' in a for a in gold)

In [227]:
# Ids of every corpus trial whose condition is '+LOC'.  The corpus is expected
# to contain only '+LOC' and '-LOC' trials; anything else trips the assert.
PLUSLOC_TRIALS = set()
for trial in TunaCorpus(filenames=glob('../TUNA/corpus/*/*/*.xml')).iter_trials():
    if not trial.condition == '+LOC':
        assert trial.condition == '-LOC'
        continue
    PLUSLOC_TRIALS.add(trial.id)

def filter_loc(preds, loc=[True, False]):
    """Keep only the examples whose +LOC membership is listed in ``loc``.

    Args:
        preds: list of splits; each split is a list of dicts with an 'id'.
        loc: membership values to keep.  ``[True]`` keeps examples whose id
            is in PLUSLOC_TRIALS, ``[False]`` keeps the others, and the
            default keeps everything.

    Returns:
        A new list of splits with the same split structure as ``preds``.
    """
    filtered = []
    for split in preds:
        kept = []
        for example in split:
            in_plusloc = example['id'] in PLUSLOC_TRIALS
            if in_plusloc in loc:
                kept.append(example)
        filtered.append(kept)
    return filtered

In [228]:
# Per-split predictions for the two domains, plus their split-wise union.
furniture = load_predictions('runs/306')
people = load_predictions('runs/307')
combined = [f + p for f, p in zip(furniture, people)]

# The same three sets restricted to +LOC and to non-+LOC trials.
furniture_loc, furniture_no_loc = (filter_loc(furniture, [True]),
                                   filter_loc(furniture, [False]))
people_loc, people_no_loc = (filter_loc(people, [True]),
                             filter_loc(people, [False]))
combined_loc, combined_no_loc = (filter_loc(combined, [True]),
                                 filter_loc(combined, [False]))

# One output line per subset: accuracy and mean multiset Dice.
# (A parenthesised single-argument print behaves the same as the statement form.)
for preds in [furniture, people,
              furniture_loc, people_loc,
              furniture_no_loc, people_no_loc,
              combined,
              combined_loc, combined_no_loc]:
    if any(len(s) for s in preds):
        print(evaluate(preds, [accuracy, mean_multiset_dice]))
    else:
        print('no examples')

0.281 (0.217--0.324), 0.812 (0.793--0.825)
0.178 (0.094--0.211), 0.730 (0.716--0.746)
0.286 (0.227--0.337), 0.811 (0.795--0.832)
0.106 (0.055--0.175), 0.716 (0.680--0.736)
0.280 (0.201--0.342), 0.814 (0.788--0.825)
0.237 (0.109--0.294), 0.741 (0.704--0.759)
0.233 (0.197--0.267), 0.774 (0.758--0.786)
0.206 (0.160--0.248), 0.768 (0.743--0.781)
0.259 (0.221--0.302), 0.780 (0.769--0.791)


In [332]:
from error_analysis import predicted_proper_subset_stats, what_get_left_out_stats

def stats(predictions, golds, stats_func=predicted_proper_subset_stats):
    """Run an error-analysis function over prediction/gold pairs.

    Each prediction and gold is the string form of a Python literal (the
    prediction files store lists such as "['colour:green', 'type:desk']").
    They are parsed and wrapped in the log structure passed to
    ``stats_func``: one ``{'agent': {'prediction': ..., 'actual': ...}}``
    dict per pair.

    Args:
        predictions: sequence of serialized predictions.
        golds: sequence of serialized gold values, aligned with
            ``predictions``.
        stats_func: callable invoked as ``stats_func(log, agentname='agent')``.

    Returns:
        Whatever ``stats_func`` returns.

    Raises:
        ValueError / SyntaxError: if an entry is not a valid Python literal.
    """
    # literal_eval parses the serialized lists without executing arbitrary
    # code the way eval() would.
    log = [{'agent': {'prediction': ast.literal_eval(pred),
                      'actual': ast.literal_eval(gold)}}
           for pred, gold in zip(predictions, golds)]
    return stats_func(log, agentname='agent')

def proper_subset_frac(predictions, golds):
    """Metric wrapper: apply ``stats`` with its default statistics function.

    Takes the (predictions, golds) pair expected of a metric and returns the
    result of ``stats`` unchanged.
    """
    subset_stats = stats(predictions, golds)
    return subset_stats

def predicted_proper_superset_stats(log, agentname):
    """Call ``predicted_proper_subset_stats`` with ``overproduce=True``."""
    return predicted_proper_subset_stats(log, agentname, overproduce=True)
def proper_superset_frac(predictions, golds):
    """Metric wrapper: apply ``stats`` with the superset statistics function.

    Takes the (predictions, golds) pair expected of a metric and returns the
    result of ``stats`` unchanged.
    """
    return stats(predictions, golds,
                 stats_func=predicted_proper_superset_stats)

def left_out(predictions, golds):
    """Run ``what_get_left_out_stats`` through ``stats`` and discard its result.

    The function is called only for its side effects (presumably the printed
    attribute counts shown in the cell output; verify in error_analysis).
    The fixed (0, 0, 0.0) return value exists so the function can be passed
    as a metric to ``evaluate_mean``, which formats a 3-tuple.
    """
    placeholder = (0, 0, 0.0)
    stats(predictions, golds, stats_func=what_get_left_out_stats)
    return placeholder

In [333]:
# Pairs of run ids: [furniture run, people run].
runs = [
    ['275', '276'],
    ['306', '307'],
    ['251', '252'],
    ['308', '309'],
]

# One output row per run pair: furniture | people | combined.
for furn_run, people_run in runs:
    furniture = load_predictions('runs/%s' % furn_run)
    people = load_predictions('runs/%s' % people_run)
    combined = [f + p for f, p in zip(furniture, people)]

    scores = []
    for preds in [furniture, people, combined]:
        if any(len(s) for s in preds):
            # Pass [accuracy, mean_multiset_dice] here instead to report the
            # standard metrics.
            scores.append(evaluate_mean(preds, [proper_superset_frac]))
        else:
            scores.append('no examples')
    # A parenthesised single-argument print behaves the same as the statement form.
    print(' | '.join(('%22s' % e) for e in scores))

      0 / 353 ( 0.000) |       0 / 326 ( 0.000) |       0 / 679 ( 0.000)
      0 / 302 ( 0.000) |       0 / 296 ( 0.000) |       0 / 598 ( 0.000)
      0 / 323 ( 0.000) |       0 / 317 ( 0.000) |       0 / 640 ( 0.000)
      0 / 304 ( 0.000) |       0 / 279 ( 0.000) |       0 / 583 ( 0.000)


In [335]:
# For every run, report the left-out statistics of each domain separately.
for furn_run, people_run in runs:
    furniture = load_predictions('runs/%s' % furn_run)
    people = load_predictions('runs/%s' % people_run)

    for name, run, preds in [('furniture', furn_run, furniture),
                             ('people', people_run, people)]:
        # A parenthesised single-argument print behaves the same as the statement form.
        print('------')
        print('%s (%s)' % (name, run))
        if any(len(s) for s in preds):
            # left_out produces its report as a side effect; the formatted
            # return value of evaluate_mean is intentionally discarded.
            evaluate_mean(preds, [left_out])
        else:
            print('no examples')

------
furniture (275)
y-dimension:3 41
size:large 38
y-dimension:1 37
size:small 23
orientation:back 21
------
people (276)
hairColour:light 63
hasBeard:1 62
hasGlasses:1 48
hasHair:1 33
y-dimension:3 24
------
furniture (306)
y-dimension:1 39
y-dimension:3 32
y-dimension:2 28
size:large 24
orientation:left 21
------
people (307)
hasBeard:1 71
hairColour:light 40
hasHair:1 30
y-dimension:1 26
y-dimension:3 25
------
furniture (251)
y-dimension:3 48
size:large 40
y-dimension:1 40
y-dimension:2 31
orientation:right 28
------
people (252)
type:person 103
hairColour:light 26
y-dimension:2 24
hairColour:dark 22
y-dimension:3 20
------
furniture (308)
y-dimension:3 50
y-dimension:1 46
size:large 35
orientation:front 33
orientation:right 31
------
people (309)
y-dimension:1 30
hasHair:1 28
hairColour:dark 28
hairColour:light 24
y-dimension:3 23


In [247]:
# Load the per-split predictions of run 275.
p = load_predictions('runs/275')
# Index the first example of the first split.  This is not the cell's last
# expression, so its value is not displayed; it only checks that the run has
# at least one example.
p[0][0]

def mapify(preds):
    """Flatten a list of splits into a dict keyed by example id.

    Args:
        preds: list of splits; each split is a list of dicts with an 'id'.

    Returns:
        dict mapping each id to its example dict.  When an id occurs more
        than once, the last occurrence wins.
    """
    by_id = {}
    for split in preds:
        for example in split:
            by_id[example['id']] = example
    return by_id

# Index run 275's examples by id and display a single example (the last
# expression of the cell is what the notebook renders).
mp = mapify(p)
mp['s40t6']

{u'gold': u"['colour:green', 'orientation:front', 'size:small', 'type:desk']",
 u'id': u's40t6',
 u'input': u"['colour:green', 'orientation:front', 'size:small', 'type:desk', 'x-dimension:1', 'y-dimension:1']",
 u'prediction': u"['colour:green', 'type:desk', 'y-dimension:1']"}

In [271]:
from scipy.stats import wilcoxon

# Placeholders; nothing in this cell fills them.
furn_p, people_p, combined_p = [], [], []

# Sorted example ids of the first furniture run.
example_ids = sorted(mapify(load_predictions('runs/%s' % runs[0][0])).keys())

# For each run pair: [furniture map, people map, union of both], each keyed
# by example id.
all_maps = []
for furn_run, people_run in runs:
    furniture = mapify(load_predictions('runs/%s' % furn_run))
    people = mapify(load_predictions('runs/%s' % people_run))

    combined = furniture.copy()
    combined.update(people)

    all_maps.append([furniture, people, combined])

def safe_wilcoxon(a, b):
    """Two-sided Wilcoxon signed-rank p-value, with a guard for identical inputs.

    Args:
        a, b: paired samples (NumPy arrays or plain sequences).

    Returns:
        1.0 when the two samples are element-wise identical (this avoids
        calling ``wilcoxon`` on all-zero differences), otherwise the p-value
        reported by ``scipy.stats.wilcoxon(a, b)``.
    """
    # np.array_equal also handles plain lists and mismatched shapes, where
    # `a == b` collapses to a single bool that all() cannot iterate over.
    if np.array_equal(a, b):
        return 1.0
    return wilcoxon(a, b)[1]

def _examples_in_id_order(example_map):
    """Return the examples of an id-keyed map, ordered by sorted id."""
    return [example_map[ex_id] for ex_id in sorted(example_map)]


# For each metric and each subset (furniture / people / all), print the mean
# per-example score of every run and the matrix of pairwise Wilcoxon p-values.
# (A parenthesised single-argument print behaves the same as the statement form.)
for metric in [accuracy, mean_multiset_dice]:
    print('')
    print(metric.__name__)

    for subset in range(0, 3):
        print(['furniture', 'people', 'all'][subset])

        subset_runs = [(r[0], r[1], r)[subset] for r in runs]
        # The Wilcoxon test is paired, so position i must refer to the same
        # example in every run.  dict.values() order is not guaranteed to
        # match between separately built maps, so score in sorted-id order.
        subset_evals = [np.array([metric([e['prediction']], [e['gold']])
                                  for e in _examples_in_id_order(m[subset])])
                        for m in all_maps]
        print('                    ' + ' '.join('%10s (%5.3f)' % (run, subset_eval.mean())
                                                for run, subset_eval in zip(subset_runs, subset_evals)))

        for run_1, subset_eval_1 in zip(subset_runs, subset_evals):
            header = '%10s (%5.3f)' % (run_1, subset_eval_1.mean())
            print(header + ' '.join('%18.3g' % safe_wilcoxon(subset_eval_1, subset_eval_2)
                                    for _, subset_eval_2 in zip(subset_runs, subset_evals)))


accuracy
furniture
                           275 (0.160)        306 (0.281)        251 (0.231)        308 (0.276)
       275 (0.160)                 1           4.56e-08           0.000796            2.8e-07
       306 (0.281)          4.56e-08                  1             0.0196              0.808
       251 (0.231)          0.000796             0.0196                  1            0.00179
       308 (0.276)           2.8e-07              0.808            0.00179                  1
people
                           276 (0.094)        307 (0.178)        252 (0.119)        309 (0.225)
       276 (0.094)                 1            6.1e-05              0.189           5.55e-09
       307 (0.178)           6.1e-05                  1            0.00717             0.0378
       252 (0.119)             0.189            0.00717                  1           2.11e-08
       309 (0.225)          5.55e-09             0.0378           2.11e-08                  1
all
                    ['275

In [222]:
# Check of confidence_interval on a tiny sample; the cell displays the
# two-element array it returns.
confidence_interval([2.0, 3.6, 2.0, 3.0])

array([ 2. ,  3.2])

In [65]:
class ListTable(list):
    """A list of rows (each row itself a list) that IPython Notebook renders
    as an HTML table, e.g. ListTable([[1, 2, 3], [4, 5, 6]]).

    Cell values are inserted into the markup as-is, so a cell may contain
    HTML of its own (such as '<br/>').
    """

    def _repr_html_(self):
        """Return the table markup: one <tr> per row, one <td> per cell."""
        rendered_rows = []
        for row in self:
            cells = ''.join("<td>{0}</td>".format(col) for col in row)
            rendered_rows.append("<tr>" + cells + "</tr>")
        return "<table>" + ''.join(rendered_rows) + "</table>"

In [113]:
def acc(x, y):
    """Format the ratio x / y with three decimals; '0.000' when y is zero."""
    return '0.000' if y == 0 else '%0.3f' % (x * 1.0 / y)

def collate_example_stats(*experiments):
    """Build a per-example comparison table across one or more experiments.

    Args:
        *experiments: each experiment is a list of splits; each split is a
            list of dicts with 'id', 'input', 'gold' and 'prediction'.

    Returns:
        (rows, header).  Each row is
        [id, (correct, total, acc, predictions) per experiment..., input, gold]
        where ``predictions`` joins every prediction made for that id with
        '<br/>'.  With a single experiment the rows are sorted by ascending
        accuracy; otherwise by ascending (last experiment's accuracy minus
        first experiment's accuracy).  ``header`` holds the column names.
    """
    n_experiments = len(experiments)
    totals = [defaultdict(int) for _ in experiments]
    hits = [defaultdict(int) for _ in experiments]
    produced = [defaultdict(list) for _ in experiments]
    examples = {}  # id -> (input, gold)

    # Tally evaluations, exact matches and raw outputs per example id.
    for idx, predictions in enumerate(experiments):
        for split in predictions:
            for example in split:
                ex_id = example['id']
                examples[ex_id] = (example['input'], example['gold'])
                produced[idx][ex_id].append(example['prediction'])
                totals[idx][ex_id] += 1
                if example['prediction'] == example['gold']:
                    hits[idx][ex_id] += 1

    # One row per example id, with a four-column group per experiment.
    rows = []
    for ex_id in examples:
        row = [ex_id]
        for idx in range(n_experiments):
            n_correct = hits[idx][ex_id]
            n_total = totals[idx][ex_id]
            row.extend([n_correct, n_total, acc(n_correct, n_total),
                        '<br/>'.join(str(o) for o in produced[idx][ex_id])])
        row.extend(examples[ex_id])
        rows.append(row)

    if n_experiments == 1:
        def sortkey(row):
            return float(row[3])
    else:
        # Column 3 is the first experiment's accuracy; 4 * n - 1 is the last's.
        last_acc_col = 4 * n_experiments - 1

        def sortkey(row):
            return float(row[last_acc_col]) - float(row[3])

    header = ['id']
    header.extend(['correct', 'total', 'acc', 'predictions'] * n_experiments)
    header.extend(['utt', 'gold'])
    return sorted(rows, key=sortkey), header

In [230]:
# The two people-domain runs to compare example by example; 'good' is passed
# first and 'bad' second to collate_example_stats below.
good = load_predictions('runs/309')
bad = load_predictions('runs/307')

In [231]:
# Sanity check: number of splits per run, then size of the first split.
# (A parenthesised single-argument print behaves the same as the statement form.)
print('%d %d' % (len(good), len(bad)))
print('%d %d' % (len(good[0]), len(bad[0])))

5 5
72 72


In [232]:
# Build the per-example comparison table (good run first, bad run last) and
# display how many distinct example ids it covers.
table, rownames = collate_example_stats(good, bad)
len(table)

360

In [233]:
def display_html(table, row_names):
    """Wrap a table in a ListTable, with ``row_names`` as its first row.

    Args:
        table: iterable of rows.
        row_names: the header row (column names) placed above the data rows.

    Returns:
        A ListTable, which the notebook renders as an HTML table.
    """
    html_table = ListTable([row_names])
    html_table.extend(table)
    return html_table

# Render the comparison table; as the cell's last expression, the returned
# ListTable is displayed through its HTML repr.
display_html(table, rownames)

0,1,2,3,4,5,6,7,8,9,10
id,correct,total,acc,predictions,correct,total,acc,predictions,utt,gold
s312t24,1,1,1.000,"['hasGlasses:1', 'type:person']",0,1,0.000,"['age:old', 'hasGlasses:1', 'type:person']","['age:old', 'hairColour:light', 'hasBeard:0', 'hasGlasses:1', 'hasHair:1', 'hasShirt:1', 'hasSuit:0', 'hasTie:0', 'orientation:left', 'type:person', 'x-dimension:5', 'y-dimension:2']","['hasGlasses:1', 'type:person']"
s312t22,1,1,1.000,"['hasBeard:1', 'type:person']",0,1,0.000,"['type:person', 'x-dimension:1', 'y-dimension:2']","['age:young', 'hairColour:dark', 'hasBeard:1', 'hasGlasses:0', 'hasHair:1', 'hasShirt:1', 'hasSuit:0', 'hasTie:0', 'orientation:front', 'type:person', 'x-dimension:1', 'y-dimension:2']","['hasBeard:1', 'type:person']"
s28t22,1,1,1.000,"['hasBeard:1', 'type:person']",0,1,0.000,"['type:person', 'x-dimension:5', 'y-dimension:3']","['age:young ', 'hairColour:dark', 'hasBeard:1', 'hasGlasses:0', 'hasHair:1', 'hasShirt:1', 'hasSuit:0', 'hasTie:0', 'orientation:front', 'type:person', 'x-dimension:5', 'y-dimension:3']","['hasBeard:1', 'type:person']"
s97t25,1,1,1.000,"['hairColour:light', 'hasBeard:1', 'type:person']",0,1,0.000,"['type:person', 'x-dimension:4', 'y-dimension:2']","['age:old ', 'hairColour:light', 'hasBeard:1', 'hasGlasses:0', 'hasHair:0', 'hasShirt:1', 'hasSuit:0', 'hasTie:0', 'orientation:left', 'type:person', 'x-dimension:4', 'y-dimension:2']","['hairColour:light', 'hasBeard:1', 'type:person']"
s323t21,1,1,1.000,"['hasBeard:1', 'hasGlasses:1', 'type:person']",0,1,0.000,"['hasBeard:1', 'type:person', 'y-dimension:3']","['age:old', 'hairColour:light', 'hasBeard:1', 'hasGlasses:1', 'hasHair:1', 'hasShirt:1', 'hasSuit:1', 'hasTie:1', 'orientation:front', 'type:person', 'x-dimension:3', 'y-dimension:3']","['hasBeard:1', 'hasGlasses:1', 'type:person']"
s323t22,1,1,1.000,"['hasBeard:1', 'type:person']",0,1,0.000,"['type:person', 'x-dimension:5', 'y-dimension:2']","['age:young', 'hairColour:dark', 'hasBeard:1', 'hasGlasses:0', 'hasHair:1', 'hasShirt:1', 'hasSuit:0', 'hasTie:0', 'orientation:front', 'type:person', 'x-dimension:5', 'y-dimension:2']","['hasBeard:1', 'type:person']"
s177t24,1,1,1.000,"['hasGlasses:1', 'type:person']",0,1,0.000,"['hasGlasses:1', 'type:person', 'x-dimension:4', 'y-dimension:3']","['age:old', 'hairColour:light', 'hasBeard:0', 'hasGlasses:1', 'hasHair:1', 'hasShirt:1', 'hasSuit:0', 'hasTie:0', 'orientation:left', 'type:person', 'x-dimension:4', 'y-dimension:3']","['hasGlasses:1', 'type:person']"
s149t23,1,1,1.000,"['hasGlasses:1', 'type:person']",0,1,0.000,"['hairColour:light', 'hasBeard:1', 'hasGlasses:1', 'type:person']","['age:old ', 'hairColour:light', 'hasBeard:1', 'hasGlasses:1', 'hasHair:1', 'hasShirt:0', 'hasSuit:0', 'hasTie:0', 'orientation:right', 'type:person', 'x-dimension:5', 'y-dimension:3']","['hasGlasses:1', 'type:person']"
s53t23,1,1,1.000,"['hairColour:light', 'hasBeard:1', 'hasGlasses:1', 'type:person']",0,1,0.000,"['hasGlasses:1', 'type:person']","['age:old ', 'hairColour:light', 'hasBeard:1', 'hasGlasses:1', 'hasHair:1', 'hasShirt:0', 'hasSuit:0', 'hasTie:0', 'orientation:right', 'type:person', 'x-dimension:4', 'y-dimension:2']","['hairColour:light', 'hasBeard:1', 'hasGlasses:1', 'type:person']"


In [238]:
import training_instances
from sgd_lsl import LSLTrainer

def load_weights(run_dir, split):
    """Load the JSON parameters saved for one split of a run.

    Args:
        run_dir: directory of the run.
        split: integer split index; selects ``params.<split>.json``.

    Returns:
        The decoded JSON content of that file.
    """
    params_path = os.path.join(run_dir, 'params.%d.json' % split)
    with open(params_path, 'r') as infile:
        weights = json.load(infile)
    return weights

def _find_example(run_dir, ex_id):
    """Search the prediction files of ``run_dir`` for the example ``ex_id``.

    Returns:
        (example, split): the decoded example and the integer split index
        taken from its file name, or (None, None) when no file contains it.
    """
    for filename in glob(os.path.join(run_dir, 'predictions.*.jsons')):
        with open(filename, 'r') as infile:
            for line in infile:
                ex = json.loads(line)
                if ex['id'] == ex_id:
                    # Parse the index from the base name only, so a '.' in
                    # run_dir (e.g. '../runs/309') cannot shift the fields.
                    split = int(os.path.basename(filename).split('.')[1])
                    return ex, split
    return None, None


def analyze(run_dir, ex_id, instances):
    """Re-run prediction verbosely for one example of a finished run.

    Finds the example in the run's prediction files, loads the parameters of
    the split it belongs to, looks up the matching corpus instance, and calls
    ``LSLTrainer.predict`` on it with ``verbose=2``.

    Args:
        run_dir: directory holding ``predictions.*.jsons`` and
            ``params.*.json``.
        ex_id: id of the example to analyze.
        instances: callable invoked as ``instances(filenames=...)`` that
            yields 6-tuples whose first element is the example id.

    Returns:
        None; ``predict`` is called for its verbose output.

    Raises:
        ValueError: if ``ex_id`` is not found in any prediction file, or if
            the dataset does not contain exactly one instance with that id.
    """
    example, split = _find_example(run_dir, ex_id)
    if example is None:
        # Fail early with a clear message instead of a TypeError further down.
        raise ValueError('example %r not found in any predictions file under %r'
                         % (ex_id, run_dir))

    dataset = instances(filenames=glob('../TUNA/corpus/*/*/*.xml'))
    # dataset = training_instances.add_relevant_alts(dataset)
    (to_analyze,) = [d for d in dataset if d[0] == ex_id]
    # Only domain and alts are needed; the other fields are unpacked to check
    # the tuple has six elements.
    _ex_id, _x, _y, domain, _attrs, alts = to_analyze

    w = load_weights(run_dir, split)
    t = LSLTrainer(samples_x=10)
    t.predict(example['input'], w, domain, alts, verbose=2)


analyze('runs/309', 's320t23', training_instances.get_generation_instances)

NameError: global name 'LSLTrainer' is not defined