In [1]:
# prototype constrained decoding with INMT models

# build the simplest possible interface to a trained NMT model
# define the payload for NMT hypotheses

# the interface between NMT and constrained decoding needs to know how to create ConstraintHypothesis objects

import copy
import os
from collections import defaultdict, OrderedDict

import numpy as np

from constrained_decoding import ConstraintHypothesis
from constrained_decoding.translation_model import AbstractConstrainedTM

from nn_imt import IMTPredictor
from neural_mt.machine_translation.configurations import get_config

In [2]:
# Path of the demo configuration handed to NeuralTranslationModel.
# The default is machine-specific; set the IMT_CONFIGURATION_FILE environment variable
# to point the notebook at another configuration without editing this cell.
IMT_CONFIGURATION_FILE = os.environ.get(
    'IMT_CONFIGURATION_FILE',
    '/home/chris/projects/neural_imt/experiments/configs/demos/en-de/en-de_interactive_demo.yaml'
)

In [21]:
class NeuralTranslationModel(AbstractConstrainedTM):
    """Constrained-decoding interface to a trained interactive NMT model.

    Wraps an ``IMTPredictor`` and exposes the operations used here to drive it one timestep
    at a time: encoding the inputs, building a start hypothesis, and expanding a hypothesis.

    Every hypothesis created by this class carries a ``payload`` dict with three entries:

    - ``'contexts'``: the contexts returned by ``compute_initial_states_and_contexts``
    - ``'states'``: the decoder states for the hypothesis (always with a leading axis of size one)
    - ``'input_values'``: the ``{theano variable: ndarray}`` inputs of the model
    """

    def __init__(self, config_file):
        """Initialize the model according to user provided configuration

        - follow the style of BeamSearch, but remove the search logic
        - build the graph and load the parameters (i.e. create a Predictor and expose the right functions)

        :param config_file: path of the configuration file, passed to ``get_config``
        """

        self.imt_model = IMTPredictor(get_config(config_file))
        self.imt_beam_search = self.imt_model.beam_search

    def build_input_representations(self, source_tokens, constraint_token_seqs):
        """Encode the input sequences using the source and target word-->idx maps

        :param source_tokens: sequence of source language tokens
        :param constraint_token_seqs: sequence of token sequences; constraints are assumed
          to be in the target language
        :returns: tuple ``(source_seq, constraint_seqs)`` where ``source_seq`` is the index-encoded
          source with a leading axis of size one, and ``constraint_seqs`` is a list holding the
          index-encoded version of each constraint
        """
        # TODO: add tokenization, subword encoding

        source_seq = self.imt_model.map_idx_or_unk(source_tokens,
                                                   self.imt_model.src_vocab,
                                                   self.imt_model.unk_idx)

        # we assume that constraints are in the target language
        constraint_seqs = [
            self.imt_model.map_idx_or_unk(token_seq,
                                          self.imt_model.trg_vocab,
                                          self.imt_model.unk_idx)
            for token_seq in constraint_token_seqs
        ]

        # add the leading axis of size one: (len,) --> (1, len)
        source_seq = np.tile(source_seq, (1, 1))

        # TODO: we'll need to tile constraint_seqs up to beam_size for NMT models
        #input_ = numpy.tile(seq, (self.exp_config['beam_size'], 1))

        return (source_seq, constraint_seqs)

    # TODO: remove target_prefix from args (see below)
    def start_hypothesis(self, source_seq, target_prefix, constraints, coverage=None):
        """
        Build the start hyp for a neural translation model.
        Models may or may not use constraints. I.e. by modeling
        the probability of generating vs. copying from the constraints.

        :param source_seq: index-encoded source, fed to ``source_sampling_input``
        :param target_prefix: index-encoded target prefix, fed to ``target_sampling_input``
        :param constraints: sequence of constraints; only their lengths are used here, to size
          the default coverage
        :param coverage: optional initial coverage; when ``None`` (the default) one all-zero
          ``int16`` vector is created per constraint, with one entry per constraint token
        :returns: a ``ConstraintHypothesis`` with ``token=None`` and ``score=None`` whose payload
          holds the initial contexts, states and input values
        """

        # TODO: there SHOULD BE no self.target_sampling_input because we don't use the prefix representation in constrained
        # input_values = {:class:`~theano.Variable`: :class:`~numpy.ndarray`}
        input_values = {
            self.imt_model.source_sampling_input: source_seq,
            self.imt_model.target_sampling_input: target_prefix
        }

        # Note that the initial input of an NMT model is currently implicit (i.e. Readout.initial_input)
        # TODO: how to explicitly encode the NMT start hypothesis
        contexts, states, beam_size = self.imt_beam_search.compute_initial_states_and_contexts(inputs=input_values)

        # Note: explicit initialization of coverage, unless the caller supplied one
        if coverage is None:
            coverage = [np.zeros(len(constraint), dtype='int16') for constraint in constraints]

        # the payload contains everything that the next timestep will need to generate another output
        payload = {
            'contexts': contexts,
            'states': states,
            # input_values is here because of a bug in getting beam-size from the graph
            'input_values': input_values
        }

        start_hyp = ConstraintHypothesis(
            token=None,
            score=None,
            coverage=coverage,
            constraints=constraints,
            payload=payload,
            backpointer=None,
            constraint_index=None,
            unfinished_constraint=False
        )

        return start_hyp

    def _tile_payload(self, payload, n_copies):
        """Repeat every array of a single-hypothesis payload `n_copies` times along its batch axis.

        The shapes quoted below were observed on the start hypothesis of a 6-token source.
        """
        source_var = self.imt_model.source_sampling_input
        target_var = self.imt_model.target_sampling_input

        # Note: it's critical to use the OrderedDict here, otherwise args will get out of order in theano funcs
        tiled = defaultdict(OrderedDict)

        # 'attended': (6, 1, 2000), 'attended_mask': (6, 1)
        tiled['contexts']['attended'] = np.tile(payload['contexts']['attended'], (1, n_copies, 1))
        tiled['contexts']['attended_mask'] = np.tile(payload['contexts']['attended_mask'], (1, n_copies))

        # 'outputs': (1,), 'states': (1, 1000), 'weights': (1, 6), 'weighted_averages': (1, 2000)
        tiled['states']['outputs'] = np.tile(payload['states']['outputs'], n_copies)
        tiled['states']['states'] = np.tile(payload['states']['states'], (n_copies, 1))
        tiled['states']['weights'] = np.tile(payload['states']['weights'], (n_copies, 1))
        tiled['states']['weighted_averages'] = np.tile(payload['states']['weighted_averages'], (n_copies, 1))

        # sampling_input: (1, 6), sampling_target_prefix: (1, 2)
        tiled['input_values'][source_var] = np.tile(payload['input_values'][source_var], (n_copies, 1))
        tiled['input_values'][target_var] = np.tile(payload['input_values'][target_var], (n_copies, 1))

        return tiled

    # TODO: if token is None, and this is the start hypothesis, do nothing
    # TODO: any tiling logic idiosyncrasies for IMT beam search need to be encapsulated within this class
    def generate(self, hyp, n_best):
        """Expand `hyp` by one timestep into at most `n_best` new hypotheses.

        Note: the `n_best` parameter here is only used to limit the number of hypothesis objects that are generated
        from the input hyp, the beam implementation may specify a different `n_best`

        The outputs with the smallest values returned by ``compute_logprobs`` are selected.
        The score of each new hypothesis is that value added to ``hyp.score``, or the value
        itself when ``hyp.score`` is ``None`` (i.e. for the start hypothesis).

        :param hyp: the hypothesis to expand; its payload must describe exactly one hypothesis
        :param n_best: maximum number of new hypotheses to create
        :returns: list of ``ConstraintHypothesis`` ordered from smallest to largest step value,
          each with `hyp` as its backpointer
        :raises ValueError: if ``compute_logprobs`` returns more than one row
        """

        logprobs = np.asarray(self.imt_beam_search.compute_logprobs(hyp.payload['input_values'],
                                                                    hyp.payload['contexts'],
                                                                    hyp.payload['states']))

        # flattening is only lossless (flat index == output index) when there is a single row
        if logprobs.size != logprobs.shape[-1]:
            raise ValueError(
                'expected logprobs for exactly one hypothesis, got shape {}'.format(logprobs.shape))

        flat_costs = logprobs.flatten()
        n_best_outputs = np.argsort(flat_costs)[:n_best]
        chosen_costs = flat_costs[n_best_outputs]

        # there may be fewer candidates than requested
        n_expansions = len(n_best_outputs)
        if n_expansions == 0:
            return []

        payload = hyp.payload

        # Now we need to tile the previous hyp values to make this work
        tiled_payload = self._tile_payload(payload, n_expansions)
        next_states = self.imt_beam_search.compute_next_states(tiled_payload['input_values'],
                                                               tiled_payload['contexts'],
                                                               tiled_payload['states'],
                                                               n_best_outputs)

        # create ConstraintHypothesis objects from these states (tile back down to one)
        new_hyps = []
        for hyp_idx in range(n_expansions):
            # slicing (rather than indexing) keeps the leading axis of size one, so the new payload
            # has the same layout as the start hypothesis' payload and can be expanded again
            row = slice(hyp_idx, hyp_idx + 1)

            new_payload = defaultdict(OrderedDict)
            new_payload['contexts'] = payload['contexts']

            new_payload['states']['outputs'] = next_states['outputs'][row]
            new_payload['states']['states'] = next_states['states'][row]
            new_payload['states']['weights'] = next_states['weights'][row]
            new_payload['states']['weighted_averages'] = next_states['weighted_averages'][row]

            new_payload['input_values'] = payload['input_values']

            # hyp.score is None for the start hypothesis: replace, don't add
            if hyp.score is None:
                next_score = chosen_costs[hyp_idx]
            else:
                next_score = hyp.score + chosen_costs[hyp_idx]

            new_hyp = ConstraintHypothesis(
                token=self.imt_model.trg_ivocab[n_best_outputs[hyp_idx]],
                score=next_score,
                # each child gets its own copy so that later updates don't leak to its siblings
                coverage=copy.deepcopy(hyp.coverage),
                constraints=hyp.constraints,
                payload=new_payload,
                backpointer=hyp,
                constraint_index=None,
                unfinished_constraint=False
            )

            new_hyps.append(new_hyp)

        return new_hyps

    def generate_constrained(self, hyp):
        """Not implemented yet: currently does nothing and returns None."""
        pass

    def continue_constrained(self, hyp):
        """Not implemented yet: currently does nothing and returns None."""
        pass
        
        

In [22]:
# the mask serves several purposes, all of which come down to telling us the point at which a hypothesis has ended with <EOS>

In [23]:
# Instantiate the model wrapper from the demo configuration.
# The warning and the checkpoint-loading log lines shown below are emitted by this call.
imt_tm = NeuralTranslationModel(IMT_CONFIGURATION_FILE)


Your function uses a non-shared variable other than those given by scan explicitly. That can significantly slow down `tensor.grad` call. Did you forget to declare it in `contexts`?
INFO:machine_translation.checkpoint:Note that the delimeter for parameter loading is currently hacked
INFO:machine_translation.checkpoint: Loaded to CG (1000,)        : /bidirectionalencoder/bidirectionalwmt15/backward.initial_state
INFO:machine_translation.checkpoint: Loaded to CG (2000,)        : /bidirectionalencoder/back_fork/fork_gate_inputs.b
INFO:machine_translation.checkpoint: Loaded to CG (300, 2000)    : /bidirectionalencoder/back_fork/fork_gate_inputs.W
INFO:machine_translation.checkpoint: Loaded to CG (1000,)        : /bidirectionalencoder/back_fork/fork_inputs.b
INFO:machine_translation.checkpoint: Loaded to CG (300, 1000)    : /bidirectionalencoder/back_fork/fork_inputs.W
INFO:machine_translation.checkpoint: Loaded to CG (1000,)        : /bidirectionalencoder/bidirectionalwmt15/forward.initial

loss function is: cross_entropy


In [24]:
# Toy example: a whitespace-tokenized source sentence, and a target prefix which is also
# used as the one and only constraint
source_input = u'<S> This is a test </S>'.split()
target_prefix = u'<S> Dies'.split()
constraint_seq = [target_prefix]
    
# map the source and constraint tokens to vocabulary indices
source_, constraints_ = imt_tm.build_input_representations(source_input, constraint_seq)

# TODO: this is a hack until we remove the target_prefix completely from the graph
# (this is the same word-->idx mapping that build_input_representations applies to each constraint)
target_prefix_ = imt_tm.imt_model.map_idx_or_unk(target_prefix,
                                                 imt_tm.imt_model.trg_vocab,
                                                 imt_tm.imt_model.unk_idx)

# add a leading axis of size one, matching what build_input_representations does for the source
target_prefix_ = np.tile(target_prefix_, (1, 1))



In [25]:
# Sanity check of the encoded source. Only the last expression of a cell is displayed,
# so the output below is `source_` (one index per source token), not the length.
len(source_input)
source_

array([[    0,    60,    10,     9,  1188, 79999]])

In [26]:
# TODO: directly use NMT decoder, or remove target_prefix from IMT graph via config
# TODO: this is a dependency question -- which repo will we rely on??
# NOTE(review): the raw token constraints (`constraint_seq`) are passed here, while the index-encoded
# `constraints_` returned by build_input_representations is never used -- confirm which one is intended
start_hyp = imt_tm.start_hypothesis(source_seq=source_, target_prefix=target_prefix_, 
                                       constraints=constraint_seq)

In [27]:
# Expand the start hypothesis by one timestep, keeping the 10 outputs with the smallest
# values returned by the model's compute_logprobs
t = imt_tm.generate(start_hyp, n_best=10)
# [imt_tm.imt_model.trg_ivocab[idx] for idx in t]

In [28]:
# Token sequence of each new hypothesis; the leading None is the token of the start hypothesis
[h.sequence for h in t]

[[None, 'ist'],
 [None, 'wird'],
 [None, 'stellt'],
 [None, 'geht'],
 [None, 'war'],
 [None, 'bedeutet'],
 [None, 'hier'],
 [None, 'gilt'],
 [None, 'betrifft'],
 [None, 'hat']]

In [11]:
# Inspect the raw payload (contexts, states and input values) stored on the start hypothesis
start_hyp.payload

{'contexts': OrderedDict([('attended',
               array([[[ 0.04528177, -0.0509911 ,  0.09122077, ...,  0.09481169,
                         0.00989652,  0.01458982]],
               
                      [[-0.00767271, -0.06559375,  0.13901421, ...,  0.11239667,
                        -0.03128631, -0.13832098]],
               
                      [[-0.17824191, -0.05932697,  0.05293337, ..., -0.07072181,
                         0.1380036 ,  0.18636586]],
               
                      [[ 0.03807617, -0.02882742,  0.11557579, ..., -0.0431877 ,
                        -0.19011681,  0.04655611]],
               
                      [[-0.10845466, -0.08531244,  0.03143437, ...,  0.02837985,
                         0.09345165,  0.03291582]],
               
                      [[ 0.12777634,  0.02694822, -0.12939385, ..., -0.0445323 ,
                         0.01740642, -0.0358777 ]]], dtype=float32)),
              ('attended_mask', array([[ 1.],
                   