In [7]:
# prototype constrained decoding with INMT models

# build the simplest possible interface to a trained NMT model
# define the payload for NMT hypotheses

# the interface between NMT and constrained decoding needs to know how to create ConstraintHypothesis objects

import os

import numpy as np

from constrained_decoding import ConstraintHypothesis
from constrained_decoding.translation_model import AbstractConstrainedTM

from nn_imt import IMTPredictor
from neural_mt.machine_translation.configurations import get_config

In [8]:
# Path to the YAML experiment configuration used to build the IMT model.
# The default is the original author's local path; set the IMT_CONFIGURATION_FILE
# environment variable to point at a different file without editing the notebook.
IMT_CONFIGURATION_FILE = os.environ.get(
    'IMT_CONFIGURATION_FILE',
    '/home/chris/projects/neural_imt/experiments/configs/demos/en-de/en-de_interactive_demo.yaml'
)

In [13]:
class NeuralTranslationModel(AbstractConstrainedTM):
    """Adapter exposing a trained IMT/NMT model to the constrained decoder.

    Wraps an ``IMTPredictor`` and creates the ``ConstraintHypothesis`` objects
    that constrained decoding operates on. The generation methods
    (``generate``, ``generate_constrained``, ``continue_constrained``) are
    still unimplemented placeholders that return ``None``.
    """

    def __init__(self, config_file):
        """Initialize the model according to user provided configuration

        - follow the style of BeamSearch, but remove the search logic
        - build the graph and load the parameters (i.e. create a Predictor and expose the right functions)

        :param config_file: path handed to ``get_config`` to obtain the experiment configuration
        """

        self.imt_model = IMTPredictor(get_config(config_file))
        # keep a direct handle on the predictor's beam search object; it is used
        # to compute the initial states and contexts in ``start_hypothesis``
        self.imt_beam_search = self.imt_model.beam_search

    def build_input_representations(self, source_tokens, constraint_token_seqs):
        """Encode the input sequences using the source and target word-->idx maps

        :param source_tokens: sequence of source-language tokens
        :param constraint_token_seqs: iterable of token sequences; constraints are
            assumed to be in the target language
        :returns: tuple ``(source_seq, constraint_seqs)`` -- the mapped source after
            ``np.tile(..., (1, 1))`` and a list holding one mapped sequence per constraint
        """
        # TODO: add tokenization, subword encoding
        predictor = self.imt_model

        source_seq = predictor.map_idx_or_unk(source_tokens,
                                              predictor.src_vocab,
                                              predictor.unk_idx)

        # we assume that constraints are in the target language
        constraint_seqs = [
            predictor.map_idx_or_unk(token_seq, predictor.trg_vocab, predictor.unk_idx)
            for token_seq in constraint_token_seqs
        ]

        # create the input representations from the model
        # TODO: tile source_seq (and constraint_seqs) up to beam_size, aka n_best list size,
        # TODO: e.g. numpy.tile(seq, (self.exp_config['beam_size'], 1)) after mapping OOVs to unk
        # for now only a single row is produced
        source_seq = np.tile(source_seq, (1, 1))

        return (source_seq, constraint_seqs)

    # TODO: remove target_prefix from args (see below)
    def start_hypothesis(self, source_seq, target_prefix, constraints, coverage=None):
        """
        Build the start hyp for a neural translation model.
        Models may or may not use constraints. I.e. by modeling
        the probability of generating vs. copying from the constraints.

        :param source_seq: value fed to the model's ``source_sampling_input``
        :param target_prefix: value fed to the model's ``target_sampling_input``
        :param constraints: sequence of constraints; each must support ``len()``
        :param coverage: optional pre-built coverage. When ``None`` (the default), one
            zero-filled ``int16`` vector is created per constraint, with that
            constraint's length
        :returns: a ``ConstraintHypothesis`` with no token, score or backpointer, whose
            payload holds the initial ``contexts`` and ``states``
        """

        # TODO: there SHOULD BE no self.target_sampling_input because we don't use the prefix representation in constrained
        input_values = {
            self.imt_model.source_sampling_input: source_seq,
            self.imt_model.target_sampling_input: target_prefix
        }

        # Note that the initial input of an NMT model is currently implicit (i.e. Readout.initial_input)
        # TODO: how to explicitly encode the NMT start hypothesis
        # the third returned value (beam size) is not needed here
        contexts, states, _ = self.imt_beam_search.compute_initial_states_and_contexts(inputs=input_values)

        # Note: explicit initialization of coverage, but only when the caller did not
        # supply one -- previously a user-provided coverage was silently overwritten
        if coverage is None:
            coverage = [np.zeros(len(constraint), dtype='int16') for constraint in constraints]

        # the payload contains everything that the next timestep will need to generate another output
        payload = {
            'contexts': contexts,
            'states': states,
        }

        start_hyp = ConstraintHypothesis(
            token=None,
            score=None,
            coverage=coverage,
            constraints=constraints,
            payload=payload,
            backpointer=None,
            constraint_index=None,
            unfinished_constraint=False
        )

        return start_hyp

    # TODO: if score is None, replace, don't add in log space
    # TODO: if token is None, and this is the start hypothesis, do nothing
    def generate(self, hyp, n_best):
        """Placeholder: not implemented yet, currently returns ``None``.

        :param hyp: the hypothesis to continue
        :param n_best: number of continuations requested
        """
        # input_values = {:class:`~theano.Variable`: :class:`~numpy.ndarray`}

        # WORKING: see if we can generate one timestep from start hyp

        # we always start from the beginning of the sequence, so we always need initial states
        # the initial states and contexts are the payload of the "start_hyp"
        # states['outputs']
        # states['weights']

        # This array will store all generated outputs, including those from
        # previous step and those from already finished sequences.
        #all_outputs = states['outputs'][None, :]
        #all_masks = numpy.ones_like(all_outputs, dtype=config.floatX)
        #all_costs = numpy.zeros_like(all_outputs, dtype=config.floatX)

        # Chris: get the glimpse weights as well
        #prev_glimpses = states['weights'][None, :]
        #all_glimpses = numpy.zeros_like(prev_glimpses, dtype=config.floatX)

        # Note: confidence at timestep zero is always = 1
        #all_confidences = numpy.ones_like(all_outputs, dtype=config.floatX)

        pass

    def generate_constrained(self, hyp):
        """Placeholder: not implemented yet, currently returns ``None``."""
        pass

    def continue_constrained(self, hyp):
        """Placeholder: not implemented yet, currently returns ``None``."""
        pass
        
        

In [14]:
# the mask performs several functions, which serve to tell us the point when a hypothesis starts to end with <EOS>

In [19]:
# Instantiate the wrapper; its __init__ builds an IMTPredictor from the parsed configuration file.
imt_tm = NeuralTranslationModel(IMT_CONFIGURATION_FILE)


Your function uses a non-shared variable other than those given by scan explicitly. That can significantly slow down `tensor.grad` call. Did you forget to declare it in `contexts`?
INFO:machine_translation.checkpoint:Note that the delimeter for parameter loading is currently hacked
INFO:machine_translation.checkpoint: Loaded to CG (1000,)        : /bidirectionalencoder/bidirectionalwmt15/backward.initial_state
INFO:machine_translation.checkpoint: Loaded to CG (2000,)        : /bidirectionalencoder/back_fork/fork_gate_inputs.b
INFO:machine_translation.checkpoint: Loaded to CG (300, 2000)    : /bidirectionalencoder/back_fork/fork_gate_inputs.W
INFO:machine_translation.checkpoint: Loaded to CG (1000,)        : /bidirectionalencoder/back_fork/fork_inputs.b
INFO:machine_translation.checkpoint: Loaded to CG (300, 1000)    : /bidirectionalencoder/back_fork/fork_inputs.W
INFO:machine_translation.checkpoint: Loaded to CG (1000,)        : /bidirectionalencoder/bidirectionalwmt15/forward.initial

loss function is: cross_entropy


In [20]:
# Toy example: a four-token source sentence and a one-token target prefix,
# with the prefix doubling as the only constraint.
source_input = [u'This', u'is', u'a', u'test']
target_prefix = [u'Dies']
constraint_seq = [target_prefix]

# Index-encode the source and the constraints through the model wrapper.
source_, constraints_ = imt_tm.build_input_representations(
    source_tokens=source_input,
    constraint_token_seqs=constraint_seq
)

# TODO: this is a hack until we remove the target_prefix completely from the graph
# Map the prefix with the target-side vocabulary, then tile it with reps (1, 1),
# mirroring what build_input_representations does for the source.
target_prefix_ = np.tile(
    imt_tm.imt_model.map_idx_or_unk(target_prefix,
                                    imt_tm.imt_model.trg_vocab,
                                    imt_tm.imt_model.unk_idx),
    (1, 1)
)



In [21]:
# TODO: directly use NMT decoder, or remove target_prefix from IMT graph via config
# TODO: this is a dependency question -- which repo will we rely on??
# Build the start hypothesis from the `imt_tm` instance created earlier in this notebook
# (the previous `imt_model` name is not defined here and fails on a fresh kernel).
# NOTE(review): the index-encoded `constraints_` returned by build_input_representations
# is unused; confirm whether it should be passed here instead of the raw token sequences.
start_hyp = imt_tm.start_hypothesis(source_seq=source_, target_prefix=target_prefix_,
                                    constraints=constraint_seq)

In [22]:
# Inspect the start hypothesis payload, i.e. the 'contexts' and 'states' stored by start_hypothesis.
start_hyp.payload

{'contexts': OrderedDict([('attended',
               array([[[ 0.08506747, -0.07544328,  0.07403184, ...,  0.07506459,
                        -0.05924538, -0.14137718]],
               
                      [[-0.13921246, -0.05131282,  0.07787244, ..., -0.09373741,
                         0.11951141,  0.23827443]],
               
                      [[ 0.07176154, -0.02079754,  0.15051207, ..., -0.05749089,
                        -0.08633555,  0.02983917]],
               
                      [[-0.09832497, -0.06990915,  0.05883681, ..., -0.03891521,
                         0.07506926, -0.02417796]]], dtype=float32)),
              ('attended_mask', array([[ 1.],
                      [ 1.],
                      [ 1.],
                      [ 1.]], dtype=float32))]),
 'states': {'outputs': array([213]),
  'states': array([[ -2.44982019e-01,   6.69726208e-02,   3.39313149e-02,
           -1.55020922e-01,  -2.52012312e-02,   5.66895381e-02,
            6.70981407e-03,  -8.014

In [None]:
# NOTE(review): unfinished scratch cell. `imt_model` is not defined in the visible cells
# (the model instance is `imt_tm`), and this binds `start_hyp` to the method itself
# rather than to the result of calling it -- confirm intent or delete the cell.
start_hyp = imt_model.start_hypothesis

In [None]:
        if target_prefix is not None:
            logger.info(u'predicting target prefix: {}'.format(target_prefix))
            target_prefix = self.map_idx_or_unk(target_prefix, self.trg_vocab, self.unk_idx)
            if len(target_prefix) == 0:
                target_prefix = [self.trg_vocab[u'<S>']]
            prefix_seq = IMTPredictor.sutils._oov_to_unk(
                target_prefix, self.exp_config['trg_vocab_size'], self.unk_idx)

            prefix_input_ = numpy.tile(prefix_seq, (self.exp_config['beam_size'], 1))
            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, glimpses, word_level_costs, timestep_confidences = \
                self.beam_search.search(
                    input_values={self.source_sampling_input: input_,
                                  self.target_sampling_input: prefix_input_},
                    max_length=max_length, eol_symbol=self.trg_eos_idx,
                    ignore_first_eol=False)

        else:
            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, glimpses, word_level_costs = \
                self.beam_search.search(
                    input_values={self.sampling_input: input_},
                    max_length=max_length, eol_symbol=self.trg_eos_idx,
                    ignore_first_eol=False)

        # normalize costs according to the sequence lengths
        if self.exp_config['normalized_bleu']:
            lengths = numpy.array([len(s) for s in trans])
            costs = costs / lengths


In [None]:
        all_masks = numpy.ones_like(all_outputs, dtype=config.floatX)
        all_costs = numpy.zeros_like(all_outputs, dtype=config.floatX)

        # Chris: get the glimpse weights as well
        prev_glimpses = states['weights'][None, :]
        all_glimpses = numpy.zeros_like(prev_glimpses, dtype=config.floatX)

        # Note: confidence at timestep zero is always = 1
        all_confidences = numpy.ones_like(all_outputs, dtype=config.floatX)

        for i in range(max_length):
            # if every sequence is already finished
            if all_masks[-1].sum() == 0:
                break

            # We carefully hack values of the `logprobs` array to ensure
            # that all finished sequences are continued with `eos_symbol`.
            # logprobs: (beam_size, target_vocab_size)
            #logprobs = self.compute_logprobs(contexts, states)
            logprobs = self.compute_logprobs(input_values, contexts, states)
            # The additional dim (`None`) is needed to maintain 2d, and to
            # make the broadcasting of `logprobs * all_masks[-1, :, None] work
            next_costs = (all_costs[-1, :, None] +
                          logprobs * all_masks[-1, :, None])
            (finished,) = numpy.where(all_masks[-1] == 0)

            # every cost to the left and to the right of the EOL symbol is infinite, so any sequence
            # that is finished will certainly be continued with the EOL symbol
            next_costs[finished, :eol_symbol] = numpy.inf
            next_costs[finished, eol_symbol + 1:] = numpy.inf

            # The `i == 0` is required because at the first step the beam
            # size is effectively only 1.
            (indexes, outputs), chosen_costs = self._smallest(
                next_costs, beam_size, only_first_row=i == 0)