In [1]:
# prototype constrained decoding with INMT models

# build the simplest possible interface to a trained NMT model
# define the payload for NMT hypotheses

# the interface between NMT and constrained decoding needs to know how to create ConstrainedHypothesis objects


import copy

import numpy as np
from collections import defaultdict, OrderedDict

from constrained_decoding import ConstraintHypothesis
from constrained_decoding.translation_model import AbstractConstrainedTM

from nn_imt import IMTPredictor
from neural_mt.machine_translation.configurations import get_config

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
IMT_CONFIGURATION_FILE = '/home/chris/projects/neural_imt/experiments/configs/demos/en-de/en-de_interactive_demo.yaml'

In [4]:
class NeuralTranslationModel(AbstractConstrainedTM):
    
    def __init__(self, config_file):
        """Intitialize the model according to user provided configuration
        
        - follow the style of BeamSearch, but remove the search logic
        - build the graph and load the parameters (i.e. create a Predictor and expose the right functions)
        """
        
        self.imt_model = IMTPredictor(get_config(config_file))
        self.imt_beam_search = self.imt_model.beam_search
        self.eos_token = u'</S>'
        
    def build_input_representations(self, source_tokens, constraint_token_seqs):
        """Encode the input sequences using the source and target word-->idx maps"""
        # TODO: add tokenization, subword encoding

        source_seq = self.imt_model.map_idx_or_unk(source_tokens,
                                                   self.imt_model.src_vocab,
                                                   self.imt_model.unk_idx)

        # Note: we assume that constraints are in the target language
        constraint_seqs = []
        for token_seq in constraint_token_seqs:
            token_idxs = self.imt_model.map_idx_or_unk(token_seq,
                                                       self.imt_model.trg_vocab,
                                                       self.imt_model.unk_idx)
            constraint_seqs.append(token_idxs)
        
        source_seq = np.tile(source_seq, (1, 1))
        
        # TODO: we'll need to tile constraint_seqs up to beam_size for NMT models that take constraints as inputs
        #input_ = numpy.tile(seq, (self.exp_config['beam_size'], 1))
    
        return (source_seq, constraint_seqs)
        
    # TODO: remove target_prefix from args (see below)
    def start_hypothesis(self, source_seq, target_prefix, constraints, coverage=None):
        """
        Build the start hyp for a neural translation model.
        Models may or may not use constraints. I.e. by modeling
        the probability of generating vs. copying from the constraints. 
        
        """
        
        # TODO: there SHOULD BE no self.target_sampling_input because we don't use the prefix representation in constrained
        # input_values = {:class:`~theano.Variable`: :class:`~numpy.ndarray`}
        input_values = {
            self.imt_model.source_sampling_input: source_seq,
            self.imt_model.target_sampling_input: target_prefix
        }
        
        # Note that the initial input of an NMT model is currently implicit (i.e. Readout.initial_input)
        # TODO: how to explicitly encode the NMT start hypothesis
        contexts, states, beam_size = self.imt_beam_search.compute_initial_states_and_contexts(inputs=input_values)

        # Note: explicit initialization of coverage
        coverage = [np.zeros(l, dtype='int16') for l in [len(s) for s in constraints]]
    
        # the payload contains everything that the next timestep will need to generate another output
        payload = {
            'contexts': contexts,
            'states': states,
            # input_values is here because of a bug in getting beam-size from the graph
            'input_values': input_values
        }
    
        start_hyp = ConstraintHypothesis(
            token=None,
            score=None, 
            coverage=coverage,
            constraints=constraints,
            payload=payload,
            backpointer=None,
            constraint_index=None,
            unfinished_constraint=False
        )
        
        return start_hyp
        
    def generate(self, hyp, n_best):
        """
        Note: the `n_best` parameter here is only used to limit the number of hypothesis objects that are generated
        from the input hyp, the beam implementation may specify a different `n_best`
        
        
        """
        
        # if we already generated EOS, theres only one option -- just continue it and copy the cost
        if hyp.token == self.eos_token:
            new_hyp = ConstraintHypothesis(
                token=self.eos_token,
                score=hyp.score, 
                coverage=copy.deepcopy(hyp.coverage),
                constraints=hyp.constraints,
                payload=hyp.payload,
                backpointer=hyp,
                constraint_index=None,
                unfinished_constraint=False
            )
            return [new_hyp]
            
        logprobs = self.imt_beam_search.compute_logprobs(hyp.payload['input_values'],
                                                         hyp.payload['contexts'],
                                                         hyp.payload['states'])
        
        assert len(logprobs) == 1, 'NMT logprob logic depends upon logprobs only having one row'
        n_best_outputs = np.argsort(logprobs.flatten())[:n_best]
        chosen_costs = logprobs.flatten()[n_best_outputs]
        
        # generate n_best ConstrainedHypothesis for each item on the beam, return them all
        # argsort logprobs
        payload = hyp.payload
        
        # Note: it's critical to use the OrderedDict here, otherwise args will get out of order in theano funcs
        tiled_payload = defaultdict(OrderedDict)
        tiled_payload['contexts']['attended'] = np.tile(payload['contexts']['attended'], (1, n_best, 1))
        tiled_payload['contexts']['attended_mask'] = np.tile(payload['contexts']['attended_mask'], (1, n_best))

#         [(k, start_hyp.payload['contexts'][k].shape) for k in start_hyp.payload['contexts'].keys()]
#         [('attended', (6, 1, 2000)), ('attended_mask', (6, 1))]

        tiled_payload['states']['outputs'] = np.tile(payload['states']['outputs'], n_best)
        tiled_payload['states']['states'] = np.tile(payload['states']['states'], (n_best, 1))
        tiled_payload['states']['weights'] = np.tile(payload['states']['weights'], (n_best, 1))
        tiled_payload['states']['weighted_averages'] = np.tile(payload['states']['weighted_averages'], (n_best, 1))

# [(k, start_hyp.payload['states'][k].shape) for k in start_hyp.payload['states'].keys()]
# [('outputs', (1,)),
#  ('states', (1, 1000)),
#  ('weights', (1, 6)),
#  ('weighted_averages', (1, 2000))]

        tiled_payload['input_values'][self.imt_model.source_sampling_input] = np.tile(payload['input_values'][self.imt_model.source_sampling_input], 
                                                                                      (n_best, 1))
        tiled_payload['input_values'][self.imt_model.target_sampling_input] = np.tile(payload['input_values'][self.imt_model.target_sampling_input], 
                                                                                      (n_best, 1))

# [(k, start_hyp.payload['input_values'][k].shape) for k in start_hyp.payload['input_values'].keys()]
# [(sampling_input, (1, 6)), (sampling_target_prefix, (1, 2))]


        # Now we need to tile the previous hyp values to make this work
        next_states = self.imt_beam_search.compute_next_states(tiled_payload['input_values'],
                                                               tiled_payload['contexts'],
                                                               tiled_payload['states'],
                                                               n_best_outputs)

        # create ContstrainedHypothesis objects from these states (tile back down to one)
        new_hyps = []
        for hyp_idx in range(n_best):
            new_payload = defaultdict(OrderedDict)
            new_payload['contexts'] = payload['contexts']

            new_payload['states']['outputs'] = np.atleast_1d(next_states['outputs'][hyp_idx])
            new_payload['states']['states'] = np.atleast_2d(next_states['states'][hyp_idx])
            new_payload['states']['weights'] = np.atleast_2d(next_states['weights'][hyp_idx])
            new_payload['states']['weighted_averages'] = np.atleast_2d(next_states['weighted_averages'][hyp_idx])
# [('outputs', (1,)),
#  ('states', (1, 1000)),
#  ('weights', (1, 6)),
#  ('weighted_averages', (1, 2000))]
            
            
            new_payload['input_values'] = hyp.payload['input_values']
                
            # TODO: account for EOS continuations -- i.e. make other costs infinite
            if hyp.score is not None:
                next_score = hyp.score + chosen_costs[hyp_idx]
            else:
                # hyp.score is None for the start hyp
                next_score = chosen_costs[hyp_idx]
                
            new_hyp = ConstraintHypothesis(
                token=self.imt_model.trg_ivocab[n_best_outputs[hyp_idx]],
                score=next_score, 
                coverage=copy.deepcopy(hyp.coverage),
                constraints=hyp.constraints,
                payload=new_payload,
                backpointer=hyp,
                constraint_index=None,
                unfinished_constraint=False
            )

            new_hyps.append(new_hyp)
            
        return new_hyps
        
        
        # The additional dim (`None`) is needed to maintain 2d, and to
        # make the broadcasting of `logprobs * all_masks[-1, :, None] work
#         next_costs = (all_costs[-1, :, None] +
#                       logprobs * all_masks[-1, :, None])
#         (finished,) = numpy.where(all_masks[-1] == 0)
        
        # WORKING: see if we can generate one timestep from start hyp
        
        # we always start from the beginning of the sequence, so we always need initial states
        # the initial states and contexts are the payload of the "start_hyp"
        # states['outputs']
        # states['weights']
        
        # This array will store all generated outputs, including those from
        # previous step and those from already finished sequences.
        #all_outputs = states['outputs'][None, :]
        #all_masks = numpy.ones_like(all_outputs, dtype=config.floatX)
        #all_costs = numpy.zeros_like(all_outputs, dtype=config.floatX)

        # Chris: get the glimpse weights as well
        #prev_glimpses = states['weights'][None, :]
        #all_glimpses = numpy.zeros_like(prev_glimpses, dtype=config.floatX)

        # Note: confidence at timestep zero is always = 1
        #all_confidences = numpy.ones_like(all_outputs, dtype=config.floatX)
            
    def generate_constrained(self, hyp):
        """Use hyp.constraints and hyp.coverage to return new hypothesis which start constraints
        that are not yet covered by this hypothesis.
        
        """
        assert hyp.unfinished_constraint is not True, 'hyp must not be part of an unfinished constraint'
        
        new_constraint_hyps = []
        available_constraints = hyp.constraint_candidates()
        
        # TODO: if the model knows about constraints, getting the score from the model must be done differently
        # TODO: currently, according to the model, there is no difference between generating and choosing from constraints 
        logprobs = self.imt_beam_search.compute_logprobs(hyp.payload['input_values'],
                                                         hyp.payload['contexts'],
                                                         hyp.payload['states']).flatten()
        for idx in available_constraints:
            # start new constraints
            constraint_idx = hyp.constraints[idx][0]

            next_states = self.imt_beam_search.compute_next_states(hyp.payload['input_values'],
                                                                   hyp.payload['contexts'],
                                                                   hyp.payload['states'],
                                                                   np.atleast_1d(constraint_idx))

            new_payload = defaultdict(OrderedDict)
            new_payload['contexts'] = hyp.payload['contexts']

            new_payload['states'] = next_states
            
            new_payload['input_values'] = hyp.payload['input_values']
            
            
            # get the score for this token from the logprobs
            if hyp.score is not None:
                next_score = hyp.score + logprobs[constraint_idx]
            else:
                # hyp.score is None for the start hyp
                next_score = logprobs[constraint_idx]
                
            coverage = copy.deepcopy(hyp.coverage)
            coverage[idx][0] = 1
            
            if len(coverage[idx]) > 1:
                unfinished_constraint = True
            else:
                unfinished_constraint = False
            
            # TODO: if the model knows about constraints, getting the score from the model must be done differently
            new_hyp = ConstraintHypothesis(token=self.imt_model.trg_ivocab[constraint_idx],
                                           score=next_score,
                                           coverage=coverage,
                                           constraints=hyp.constraints,
                                           payload=new_payload,
                                           backpointer=hyp,
                                           constraint_index=(idx, 0),
                                           unfinished_constraint=unfinished_constraint
                                          )
            new_constraint_hyps.append(new_hyp)
        
        return new_constraint_hyps 
    
    def continue_constrained(self, hyp):
        assert hyp.unfinished_constraint is True, 'hyp must be part of an unfinished constraint'
        
        # TODO: if the model knows about constraints, getting the score from the model must be done differently
        # TODO: currently, according to the model, there is no difference between generating and choosing from constraints 
        logprobs = self.imt_beam_search.compute_logprobs(hyp.payload['input_values'],
                                                         hyp.payload['contexts'],
                                                         hyp.payload['states']).flatten()
        

        constraint_row_index = hyp.constraint_index[0]
        # the index of the next token in the constraint
        constraint_tok_index = hyp.constraint_index[1] + 1
        constraint_index = (constraint_row_index, constraint_tok_index)
        
        continued_constraint_token = hyp.constraints[constraint_index[0]][constraint_index[1]]

        # get the score for this token from the logprobs
        if hyp.score is not None:
            next_score = hyp.score + logprobs[continued_constraint_token]
        else:
            # hyp.score is None for the start hyp
            next_score = logprobs[continued_constraint_token]
        
        coverage = copy.deepcopy(hyp.coverage)
        coverage[constraint_row_index][constraint_tok_index] = 1

        if len(hyp.constraints[constraint_row_index]) > constraint_tok_index + 1:
            unfinished_constraint = True
        else:
            unfinished_constraint = False

        next_states = self.imt_beam_search.compute_next_states(hyp.payload['input_values'],
                                                               hyp.payload['contexts'],
                                                               hyp.payload['states'],
                                                               np.atleast_1d(continued_constraint_token))
   
        new_payload = defaultdict(OrderedDict)
        new_payload['contexts'] = hyp.payload['contexts']

        new_payload['states'] = next_states
            
        new_payload['input_values'] = hyp.payload['input_values']

        new_hyp = ConstraintHypothesis(token=self.imt_model.trg_ivocab[continued_constraint_token],
                                       score=next_score,
                                       coverage=coverage,
                                       constraints=hyp.constraints,
                                       payload=new_payload,
                                       backpointer=hyp,
                                       constraint_index=constraint_index,
                                       unfinished_constraint=unfinished_constraint)

        return new_hyp
        
        

In [5]:
# the mask performs several functions, which serve to tell us the point when a hypothesis starts to end with <EOS>

In [6]:
imt_tm = NeuralTranslationModel(IMT_CONFIGURATION_FILE)


Your function uses a non-shared variable other than those given by scan explicitly. That can significantly slow down `tensor.grad` call. Did you forget to declare it in `contexts`?
INFO:machine_translation.checkpoint:Note that the delimeter for parameter loading is currently hacked
INFO:machine_translation.checkpoint: Loaded to CG (1000,)        : /bidirectionalencoder/bidirectionalwmt15/backward.initial_state
INFO:machine_translation.checkpoint: Loaded to CG (2000,)        : /bidirectionalencoder/back_fork/fork_gate_inputs.b
INFO:machine_translation.checkpoint: Loaded to CG (300, 2000)    : /bidirectionalencoder/back_fork/fork_gate_inputs.W
INFO:machine_translation.checkpoint: Loaded to CG (1000,)        : /bidirectionalencoder/back_fork/fork_inputs.b
INFO:machine_translation.checkpoint: Loaded to CG (300, 1000)    : /bidirectionalencoder/back_fork/fork_inputs.W
INFO:machine_translation.checkpoint: Loaded to CG (1000,)        : /bidirectionalencoder/bidirectionalwmt15/forward.initial

loss function is: cross_entropy


  self.inputs + self.contexts + self.input_states + next_outputs, next_states, on_unused_input='warn')
  self.inputs + self.contexts + self.input_states + next_outputs, next_states, on_unused_input='warn')
  self.inputs + self.contexts + self.input_states + next_outputs, next_states, on_unused_input='warn')


In [15]:
source_input = u'<S> This is a test .</S>'.split()
target_prefix = u'<S>'.split()
constraint_2 = u'gewesen'.split()
constraint_1 = u'wichtiger'.split()

constraint_seq = [constraint_1, constraint_2]

source_, constraints_ = imt_tm.build_input_representations(source_input, constraint_seq)

# TODO: this is a hack until we remove the target_prefix completely from the graph
target_prefix_ = imt_tm.imt_model.map_idx_or_unk(target_prefix,
                                                 imt_tm.imt_model.trg_vocab,
                                                 imt_tm.imt_model.unk_idx)

target_prefix_ = np.tile(target_prefix_, (1, 1))

In [16]:
len(source_input)
source_

array([[   0,   60,   10,    9, 1188,    1]])

In [17]:
# TODO: directly use NMT decoder, or remove target_prefix from IMT graph via config
# TODO: this is a dependency question -- which repo will we rely on??
start_hyp = imt_tm.start_hypothesis(source_seq=source_, target_prefix=target_prefix_, 
                                       constraints=constraints_)

In [18]:
# try full constrained decoding
from constrained_decoding import ConstrainedDecoder, Beam

decoder = ConstrainedDecoder(hyp_generation_func=imt_tm.generate,
                             constraint_generation_func=imt_tm.generate_constrained,
                             continue_constraint_func=imt_tm.continue_constrained,
                             beam_implementation=Beam)





In [19]:
search_grid = decoder.search(start_hyp=start_hyp, constraints=constraints_, max_source_len=10, beam_size=5)

TIME: 2
Dies
<type 'unicode'>
Das
<type 'unicode'>
Es
<type 'unicode'>
Dieser
<type 'unicode'>
Dieses
<type 'unicode'>
index: (1, 0)
index: (1, 1)
2
1
3
TIME: 3
ist
<type 'unicode'>
wird
<type 'unicode'>
stellt
<type 'unicode'>
geht
<type 'unicode'>
war
<type 'unicode'>
ist
<type 'unicode'>
wird
<type 'unicode'>
hier
<type 'unicode'>
geht
<type 'unicode'>
war
<type 'unicode'>
ist
<type 'unicode'>
handelt
<type 'unicode'>
geht
<type 'unicode'>
wird
<type 'unicode'>
gibt
<type 'unicode'>
Test
<type 'unicode'>
Grundsatz
<type 'unicode'>
Bericht
<type 'unicode'>
Prozess
<type 'unicode'>
Test@@
<type 'unicode'>
ist
<type 'unicode'>
Verfahren
<type 'unicode'>
Thema
<type 'unicode'>
Test
<type 'unicode'>
Prinzip
<type 'unicode'>
index: (2, 0)
,
<type 'unicode'>
ist
<type 'unicode'>
dies
<type 'unicode'>
.
<type 'unicode'>
das
<type 'unicode'>
ist
<type 'unicode'>
Punkt
<type 'unicode'>
denn
<type 'unicode'>
sein
<type 'unicode'>
sind
<type 'unicode'>
index: (2, 1)
index: (2, 2)
3
3
6
TIME: 4


In [20]:
top_row = max(k[1] for k in search_grid.keys())
top_row

2

In [21]:
# TODO: carry over score after EOS

output_beams = [search_grid[k] for k in search_grid.keys() if k[1] == top_row]
output_hyps = [h for beam in output_beams for h in beam]

# getting the true length of each hypothesis
eos_token = u'</S>'
true_lens = [h.sequence.index(eos_token) if eos_token in h.sequence else len(h.sequence)
             for h in output_hyps]
true_lens = [float(l) for l in true_lens]

output_seqs = [(h.sequence, h.score / true_len) for h, true_len in zip(output_hyps, true_lens)]
output_seqs = sorted(output_seqs, key=lambda x: x[1])

In [22]:
output_seqs

[([None, u'Dies', u'ist', u'ein', u'wichtiger', u'Test', u'gewesen', u'.'],
  2.6524608135223389),
 ([None,
   u'Dies',
   u'ist',
   u'ein',
   u'wichtiger',
   u'Test',
   u'gewesen',
   u'.',
   u'</S>'],
  2.6524860858917236),
 ([None,
   u'Dies',
   u'ist',
   u'ein',
   u'wichtiger',
   u'Test',
   u'gewesen',
   u'.',
   u'</S>',
   u'</S>'],
  2.6524860858917236),
 ([None,
   u'Dies',
   u'ist',
   u'ein',
   u'wichtiger',
   u'Test',
   u'gewesen',
   u'.',
   u'</S>',
   u'</S>',
   u'</S>'],
  2.6524860858917236),
 ([None, u'Das', u'ist', u'ein', u'wichtiger', u'Test', u'gewesen', u'.'],
  2.6638033390045166),
 ([None,
   u'Das',
   u'ist',
   u'ein',
   u'wichtiger',
   u'Test',
   u'gewesen',
   u'.',
   u'</S>'],
  2.6638233661651611),
 ([None,
   u'Das',
   u'ist',
   u'ein',
   u'wichtiger',
   u'Test',
   u'gewesen',
   u'.',
   u'</S>',
   u'</S>'],
  2.6638233661651611),
 ([None,
   u'Das',
   u'ist',
   u'ein',
   u'wichtiger',
   u'Test',
   u'gewesen',
   u'.',
  

In [None]:
[k for k in search_grid.keys()]

In [None]:
c = imt_tm.generate_constrained(start_hyp)

In [None]:
cont = imt_tm.continue_constrained(c[0])

In [None]:
cont.sequence

In [None]:
c[0].sequence

In [None]:
c = imt_tm.continue_constrained(start_hyp)

In [None]:
t = imt_tm.generate(start_hyp, n_best=10)
nts = [imt_tm.generate(h, n_best=10) for h in t]
nnts = [[imt_tm.generate(h, n_best=10) for h in t] for t in nts]

In [None]:
[[[h.sequence for h in stack3] for stack3 in stack2] for stack2 in nnts]

In [None]:
start_hyp.payload