In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

import os
import codecs
import subprocess
from pprint import pprint
from subprocess import Popen, PIPE, STDOUT

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

DEBUG:root:test


In [2]:
import numpy
import codecs
import tempfile
import cPickle
import os
import copy
from collections import OrderedDict
import itertools

from fuel.datasets import H5PYDataset
from picklable_itertools import iter_, chain
from fuel.datasets import Dataset
from fuel.datasets import TextFile
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import (
    Merge, Batch, Filter, Padding, SortMapping, Unpack, Mapping)

import os
import shutil
from collections import Counter
from theano import tensor
from toolz import merge
import numpy
import pickle
from subprocess import Popen, PIPE
import codecs

from blocks.algorithms import (GradientDescent, StepClipping,
                               CompositeRule, Adam, AdaDelta)
from blocks.extensions import FinishAfter, Printing, Timing
from blocks.extensions.monitoring import TrainingDataMonitoring
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_noise, apply_dropout
from blocks.initialization import IsotropicGaussian, Orthogonal, Constant
from blocks.main_loop import MainLoop
from blocks.model import Model
from blocks.select import Selector
from blocks.search import BeamSearch
from blocks_extras.extensions.plot import Plot

from machine_translation.checkpoint import CheckpointNMT, LoadNMT
from machine_translation.model import BidirectionalEncoder, Decoder
from machine_translation.sampling import BleuValidator, Sampler, SamplingBase
from machine_translation.stream import (get_tr_stream, get_dev_stream,
                                        _ensure_special_tokens)


from nnqe.dataset.preprocess import whitespace_tokenize

In [3]:
# make a fuel stream which subclasses text file to create a stream which provides three sources: 
# (source, [samples], and BLEU scores)

# going forward, this may not be the fastest way because the sampling and BLEU score computation can be time consuming
# we should look at Fuel's read-ahead and cacheing capacity

# TODO: stateful transformer which takes a stream and adds the sources ('samples', 'scores')
# class Mapping(Transformer)
# the mapping should be a callable which gets samples, then computes the sentence-level BLEU
# score for each sample with respect to the reference

# use this script to get sentence-level scores(?)
# https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v13a.pl

# parameters
# sample_func: function(num_samples=1) which takes source seq and outputs <num_samples> samples
# score_func: function

# TODO: how do they create simple test streams in the fuel tests?

In [4]:


class MTSampleStreamTransformer:
    """
    Stateful transformer which takes a stream of (source, target) and adds the sources ('samples', 'scores')
    
    Samples are generated by calling the sample func with the source as argument
    
    Scores are generated by comparing each generated sample to the reference
    
    Parameters
    ----------
    sample_func: function(num_samples=1) which takes source seq and outputs <num_samples> samples
    score_func: function

    At call time, we expect a stream providing (sources,) -- i.e. something like a TextFile object

    
    """
    
    
    def __init__(self, sample_func, score_func, num_samples=1):
        self.sample_func = sample_func
        self.score_func = score_func
        self.num_samples = num_samples

    def __call__(self, data):
        print('calling transformer:')
        source = data[0]
        reference = data[1]
        
        print('num_samples: {}'.format(self.num_samples))
        
        # each sample may be of different length
        samples = self.sample_func(numpy.array(source), self.num_samples)
        scores = self._compute_scores(samples, reference)
    
        return (samples, scores)

    # Note that sentence-level BLEU can be computed directly over the indexes (not the strings),
    # So we don't need to map back to a string representation
    def _compute_scores(self, samples, reference):
        """Call the scoring function to compare each sample to the reference"""
        
#         return numpy.ones(len(samples), dtype='float32')
        return self.score_func(samples, reference)


# def _get_true_length(seq, vocab):
#     try:
#         return seq.tolist().index(vocab['</S>']) + 1
#     except ValueError:
#         return len(seq)

In [5]:
def fake_score(source, target):
    return 1.

def fake_sample(source, num_samples=1):
    return numpy.vstack(list(itertools.repeat([2,67,33,778,323,68], num_samples)))

In [6]:
test_transformer = MTSampleStreamTransformer(fake_sample, fake_score, num_samples=5)

In [7]:
fake_data = ([[1,55,75,324,43,0], [6546,24,4123,73,85,13]])

test_transformer(fake_data)

calling transformer:
num_samples: 5


(array([[  2,  67,  33, 778, 323,  68],
        [  2,  67,  33, 778, 323,  68],
        [  2,  67,  33, 778, 323,  68],
        [  2,  67,  33, 778, 323,  68],
        [  2,  67,  33, 778, 323,  68]]), 1.0)

In [8]:
# create the graph which can sample from our model 
# Note that we must sample instead of getting the 1-best or N-best, because we need the randomness to make the expected
# BLEU score make sense

exp_config = {
    'src_vocab_size': 20000,
    'trg_vocab_size': 20000,
    'enc_embed': 300,
    'dec_embed': 300,
    'enc_nhids': 800,
    'dec_nhids': 800,
    'saved_parameters': '/home/chris/projects/neural_mt/archived_models/BERTHA-TEST_Adam_wmt-multimodal_internal_data_dropout'+\
    '0.3_ff_noiseFalse_search_model_en2es_vocab20000_emb300_rec800_batch15/best_bleu_model_1455464992_BLEU31.61.npz',
    'src_vocab': '/home/chris/projects/neural_mt/archived_models/BERTHA-TEST_Adam_wmt-multimodal_internal_data_dropout0'+\
    '.3_ff_noiseFalse_search_model_en2es_vocab20000_emb300_rec800_batch15/vocab.en-de.en.pkl',
    'trg_vocab': '/home/chris/projects/neural_mt/archived_models/BERTHA-TEST_Adam_wmt-multimodal_internal_data_dropout0'+\
    '.3_ff_noiseFalse_search_model_en2es_vocab20000_emb300_rec800_batch15/vocab.en-de.de.pkl',
    'src_data': '/home/chris/projects/neural_mt/archived_models/BERTHA-TEST_Adam_wmt-multimodal_internal_data_dropout'+\
    '0.3_ff_noiseFalse_search_model_en2es_vocab20000_emb300_rec800_batch15/training_data/train.en.tok.shuf',
    'trg_data': '/home/chris/projects/neural_mt/archived_models/BERTHA-TEST_Adam_wmt-multimodal_internal_data_dropout'+\
    '0.3_ff_noiseFalse_search_model_en2es_vocab20000_emb300_rec800_batch15/training_data/train.de.tok.shuf',
    'unk_id':1
}

def get_sampling_model_and_input(exp_config):
    # Create Theano variables
    encoder = BidirectionalEncoder(
        exp_config['src_vocab_size'], exp_config['enc_embed'], exp_config['enc_nhids'])

    decoder = Decoder(
        exp_config['trg_vocab_size'], exp_config['dec_embed'], exp_config['dec_nhids'],
        exp_config['enc_nhids'] * 2)

    # Create Theano variables
    logger.info('Creating theano variables')
    sampling_input = tensor.lmatrix('source')

    # Get beam search
    logger.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)

#     _, samples = VariableFilter(
#         bricks=[decoder.sequence_generator], name="outputs")(
#                  ComputationGraph(generated[1]))  # generated[1] is next_outputs
#     beam_search = BeamSearch(samples=samples)

    # build the model that will let us get a theano function from the sampling graph
    logger.info("Creating Sampling Model...")
    sampling_model = Model(generated)

    # Set the parameters from a trained models
    logger.info("Loading parameters from model: {}".format(exp_config['saved_parameters']))
    # load the parameter values from an .npz file
    param_values = LoadNMT.load_parameter_values(exp_config['saved_parameters'])
    LoadNMT.set_model_parameters(sampling_model, param_values)
    
    return sampling_model, sampling_input

test_model, theano_sampling_input = get_sampling_model_and_input(exp_config)

INFO:root:Creating theano variables
INFO:root:Building sampling model
INFO:root:Creating Sampling Model...
INFO:root:Loading parameters from model: /home/chris/projects/neural_mt/archived_models/BERTHA-TEST_Adam_wmt-multimodal_internal_data_dropout0.3_ff_noiseFalse_search_model_en2es_vocab20000_emb300_rec800_batch15/best_bleu_model_1455464992_BLEU31.61.npz
INFO:machine_translation.checkpoint: Loaded to CG (800,)         : /bidirectionalencoder/bidirectionalwmt15/backward.initial_state
INFO:machine_translation.checkpoint: Loaded to CG (1600,)        : /bidirectionalencoder/back_fork/fork_gate_inputs.b
INFO:machine_translation.checkpoint: Loaded to CG (300, 1600)    : /bidirectionalencoder/back_fork/fork_gate_inputs.W
INFO:machine_translation.checkpoint: Loaded to CG (800,)         : /bidirectionalencoder/back_fork/fork_inputs.b
INFO:machine_translation.checkpoint: Loaded to CG (300, 800)     : /bidirectionalencoder/back_fork/fork_inputs.W
INFO:machine_translation.checkpoint: Loaded to C

In [9]:
# test that we can pull samples from the model
test_sampling_func = test_model.get_theano_function()
trg_vocab = cPickle.load(open(exp_config['trg_vocab']))
trg_vocab_size = exp_config['trg_vocab_size'] - 1

trg_vocab = _ensure_special_tokens(trg_vocab, bos_idx=0,
                                   eos_idx=trg_vocab_size, unk_idx=exp_config['unk_id'])


# close over the sampling func and the trg_vocab to standardize the interface
# TODO: actually this should be a callable class with params (sampling_func, trg_vocab)
# TODO: we may be able to make this function faster by passing multiple sources for sampling at the same damn time
def sampling_func(source_seq, num_samples=1):
    print('sampling_func')
    print('num_samples: {}'.format(num_samples))
    
    def _get_true_length(seq, trg_vocab):
        try:
            return seq.tolist().index(trg_vocab['</S>']) + 1
        except ValueError:
            return len(seq)
    
    samples = []
    for _ in range(num_samples):
        # outputs of self.sampling_fn = outputs of sequence_generator.generate: next_states + [next_outputs] +
        #                 list(next_glimpses.values()) + [next_costs])
        _1, outputs, _2, _3, costs = test_sampling_func(source_seq[None, :])
        # if we are generating a single sample, the length of the output will be len(source_seq)*2
        # see decoder.generate
        # the output is a [seq_len, 1] array
        outputs = outputs.reshape(outputs.shape[0])
        outputs = outputs[:_get_true_length(outputs, trg_vocab)]
        
        samples.append(outputs)
    
    return samples


In [10]:
# make TextFile streams for source data and references, then merge them together

def get_textfile_stream(source_file=None, src_vocab=None, src_vocab_size=30000,
                      unk_id=1):
    """Create a TextFile dataset, and return a stream"""
    
    source_stream = None
   
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict) else
        cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    source_dataset = TextFile([source_file], src_vocab, bos_token=None)
    source_stream = DataStream(source_dataset)
    return source_stream

In [11]:
src_stream = get_textfile_stream(source_file=exp_config['src_data'], src_vocab=exp_config['src_vocab'],
                                         src_vocab_size=exp_config['src_vocab_size'])

# test_source_stream.sources = ('sources',)
trg_stream = get_textfile_stream(source_file=exp_config['trg_data'], src_vocab=exp_config['trg_vocab'],
                                         src_vocab_size=exp_config['trg_vocab_size'])

In [12]:
# Merge them to get a source, target pair
training_stream = Merge([src_stream,
                         trg_stream],
                         ('source', 'target'))

In [13]:
sampling_transformer = MTSampleStreamTransformer(sampling_func, fake_score, num_samples=5)

training_stream = Mapping(training_stream, sampling_transformer, add_sources=('samples', 'scores'))

In [14]:
# SANITY
# load src and target vocabs to validate samples
src_vocab = cPickle.load(open(exp_config['src_vocab']))
src_vocab_size = exp_config['src_vocab_size'] - 1
trg_vocab_size = exp_config['trg_vocab_size'] - 1


src_vocab = _ensure_special_tokens(src_vocab, bos_idx=0,
                                   eos_idx=src_vocab_size, unk_idx=exp_config['unk_id'])

src_ivocab = {v:k for k,v in src_vocab.items()}
trg_ivocab = {v:k for k,v in trg_vocab.items()}

k = 5
test_iter = training_stream.get_epoch_iterator()
for _ in range(k):
    source, reference, samples, scores = test_iter.next()
    print('source: {}'.format(' '.join(src_ivocab[w] for w in source)))
    print('ref: {}'.format(' '.join(trg_ivocab[w] for w in reference)))
    for sample in samples:
        print('sample: {}'.format(' '.join(trg_ivocab[w] for w in sample)))
        print('len sample: {}'.format(len(sample)))
    print('scores: {}'.format(scores))

calling transformer:
num_samples: 5
sampling_func
num_samples: 5
source: Two boys in blue shirts sit on the end of a bench next to a fair ride . </S>
ref: Zwei Jungen in blauen T-Shirts sitzen am Ende einer Bank neben einem Fahrgeschäft . </S>
sample: Zwei Jungen in Bluejeans sitzen am Ende einer Bank neben einem Jahrmarkt . </S>
len sample: 14
sample: Zwei Jungen in blauen Hemden sitzen am Ende einer Treppe neben einem Jahrmarkt . </S>
len sample: 15
sample: Zwei Jungen in blauen Hemden sitzen am Ende einer Holzbank und neben einem Jahrmarkt . </S>
len sample: 16
sample: Zwei Jungen in blauen Hemden sitzen auf einem hergerichteten Schale neben einem Jahrmarkt . </S>
len sample: 15
sample: Zwei Jungen mit blauen Hemden sitzen am Ende einer Bank neben einem Fahrgeschäft in der Stadt . </S>
len sample: 18
scores: 1.0
calling transformer:
num_samples: 5
sampling_func
num_samples: 5
source: A man pointing to the audience area on stage while holding a microphone with banjos behind him . </S

In [15]:
# Now transform the training stream into a stream providing (sources, samples, scores)
# sources is tensor.lmatrix
# targets is tensor.tensor3 - dtype int64 (N target samples for each source)
# scores is tensor.matrix - dtype float32 -- each row contains the N scores for the N samples
# generated by the source

In [16]:
# calling the external scoring script
sentence_level_bleu_script = '/home/chris/projects/neural_mt/scripts/scoring/mteval-v13a.pl'

#     self.multibleu_cmd = ['perl', self.config['bleu_script'],
#                               self.config['val_set_grndtruth'], '<']
    
    
# make temporary files for source, targets, references (sources are required by mteval-v13a.pl)
# TODO: automatically make temporary files that have this data wrapped in SGML so that it can work with moses
# NOTE: remember the sentence level BLEU score python interface in CDEC that we used for QE 2014, 
test_hyps='/home/chris/projects/neural_mt/proto/data_for_testing_sampling/test.target'
test_refs='/home/chris/projects/neural_mt/proto/data_for_testing_sampling/test.ref'
test_srcs='/home/chris/projects/neural_mt/proto/data_for_testing_sampling/test.source'
# test_hyps='/home/chris/projects/neural_mt/scripts/scoring/test.target.de.sgm'
# test_refs='/home/chris/projects/neural_mt/scripts/scoring/test.ref.de.sgm'
# test_srcs='/home/chris/projects/neural_mt/scripts/scoring/test.source.en.sgm'


"""
  "Description:  This Perl script evaluates MT system performance.\n".
    "\n".
    "Required arguments:\n".
    "  -r <ref_file> is a file containing the reference translations for\n".
    "      the documents to be evaluated.\n".
    "  -s <src_file> is a file containing the source documents for which\n".
    "      translations are to be evaluated\n".
    "  -t <tst_file> is a file containing the translations to be evaluated\n".
    "\n".
    "Optional arguments:\n".
    "  -h prints this help message to STDOUT\n".
    "  -c preserves upper-case alphabetic characters\n".
    "  -b generate BLEU scores only\n".
    "  -n generate NIST scores only\n".
    "  -d detailed output flag:\n".
    "         0 (default) for system-level score only\n".
    "         1 to include document-level scores\n".
    "         2 to include segment-level scores\n".
    "         3 to include ngram-level scores\n".
    "  -e enclose non-ASCII characters between spaces\n".
     "  --brevity-penalty ( closest | shortest )\n" .
    "         closest (default) : acts as IBM BLEU (takes the closest reference translation length)\n" .
    "         shortest : acts as previous versions of the script (takes the shortest reference translation length)\n" .
    "  --international-tokenization\n" .
    "         when specified, uses Unicode-based (only) tokenization rules\n" .
    "         when not specified (default), uses default tokenization (some language-dependant rules)\n" .
    "  --metricsMATR : create three files for both BLEU scores and NIST scores:\n" .
    "         BLEU-seg.scr and NIST-seg.scr : segment-level scores\n" .
    "         BLEU-doc.scr and NIST-doc.scr : document-level scores\n" .
    "         BLEU-sys.scr and NIST-sys.scr : system-level scores\n" .
    "  --no-smoothing : disable smoothing on BLEU scores\n" .
    "\n";
"""

# '--international-tokenization' ??
# '-c preserve case' ??
sentence_bleu_cmd = ['perl', sentence_level_bleu_script, '-b', '-r', test_refs, '-s', test_srcs, '-t', test_hyps, 
                     '-d', str(2)]


bleu_score_subprocess = Popen(sentence_bleu_cmd, stdout=PIPE)
# bleu_score_subprocess.stdin.flush()

subprocess.check_output(sentence_bleu_cmd, shell=True)

# bleu_out, bleu_err = bleu_score_subprocess.communicate()


#      # Write to subprocess and file if it exists
#     print(trans_out, file=mb_subprocess.stdin)
            
            
#         # send end of file, read output.
#         mb_subprocess.stdin.close()
#         stdout = mb_subprocess.stdout.readline()
#         logger.info(stdout)
#         out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
#         logger.info("Validation Took: {} minutes".format(
#             float(time.time() - val_start_time) / 60.))
#         assert out_parse is not None

#         # extract the score
#         bleu_score = float(out_parse.group()[6:])
#         self.val_bleu_curve.append(bleu_score)
#         logger.info(bleu_score)
#         mb_subprocess.terminate()


#         return bleu_score

''

In [17]:
# first wrap the source
# perl wrap-xml-modified.perl en ../../proto/data_for_testing_sampling/test.source unbabel src < ../../proto/data_for_testing_sampling/test.source > test.source.en.sgm
# now wrap target and ref
# target hyps
# perl wrap-xml-modified.perl de test.source.en.sgm unbabel tst < ../../proto/data_for_testing_sampling/test.target > test.target.de.sgm
# target refs
# perl wrap-xml-modified.perl de test.source.en.sgm unbabel ref < ../../proto/data_for_testing_sampling/test.ref > test.ref.de.sgm


In [24]:
# from Unbabel evaluation
def wrap_xml(fname, language=None, src_lang=None, flag='tst', engine='moses'):
    '''
    :param fname: The filename that needs to be wrapped around xml tags.
                  The type can be taken from the file's suffix
    :param engine: The MT engine the decoded text comes from
    :param flag: string - source, reference, target (src, ref, tst)
    :param language: string - the language of the text
    :return: fname.flag.language.sgm that is the input file ins SGML format
    '''
    logger.info("Wrapping xml for %s with flag %s" % (fname, flag))
    if flag == 'src':
        """ As an exception, if there is no source for reference,
        we pass the same file name """
        source_sgml = fname
    elif flag == 'ref':
        base_fname = '.'.join(fname.split('.')[:-1])
        source_sgml = "%s.%s.sgm" % (base_fname, src_lang)
    else:
        source_sgml = fname.replace(engine, src_lang) + '.sgm'

    wrap_xml_script = os.path.join(os.path.dirname(os.path.abspath('__file__')), 'scripts',
                                   'wrap-xml-modified.perl')
    # perl ../wrap-xml-modified.perl en 10.unbabel201410.es-en.source.sgm bing tst < 10.unbabel201410.es-en.bing
    # > 10.unbabel201410.es-en.bing.sgm
    wrap_xml_cmd = ['perl', wrap_xml_script, language, source_sgml, engine, flag]
    logger.debug(' '.join(wrap_xml_cmd) + '< ' + fname)
    with open(fname, 'r+') as f:
        # r.readall() gave an error, this is a hack
        fname_stdin = ''.join(f.readlines())

    with open(fname + '.sgm', 'w+') as f:
        p = subprocess.Popen(wrap_xml_cmd, stdout=f, stdin=subprocess.PIPE)
        p.communicate(input=fname_stdin)[0]



def mteval_13(source_file, reference_file, hypothesis_file, src_lang='en', trg_lang='de', engine='nmt'):
    '''
    Calling WMT Perl script for the evaluation
    :param source_file: source SGML file
    :param reference_file: reference SGML file
    :param decoded_file: mt-translated SGML file
    :return: list of sentence level BLEU for each segment
    '''
    
    #     WORKING: incorporate SGML wrapping into temp files and BLEU score computation all into this script
    #     make named temporary files and use these to hold wrap-xml output, delete them after

    wrap_xml_script = '/home/chris/projects/neural_mt/scripts/scoring/wrap-xml-modified.perl'
    
    flags = ['src', 'ref', 'tst']
    inps_and_flags = zip([source_file, reference_file, hypothesis_file], flags)
    
    wrapped_files = []
    for filename, flag in inps_and_flags:
        # the following logic assumes 'src' is the first item in the list, otherwise it will break
        if flag == 'src':
            """For the source, just pass the source filename"""
            source_sgml = filename
            # create the named temporary file that will hold the output sgml file
            source_sgm_file = tempfile.NamedTemporaryFile(delete=False)
            source_sgm_name = source_sgm_file.name
            output_sgm_name = source_sgm_name
            language = src_lang
        
         # TODO: why were they providing different sgml sources for ref and target?
        elif flag == 'ref':
#             base_fname = '.'.join(fname.split('.')[:-1])
#         source_sgml = "%s.%s.sgm" % (base_fname, src_lang)
            source_sgml = source_sgm_name
            temp_file = tempfile.NamedTemporaryFile(delete=False)
            output_sgm_name = temp_file.name
            ref_sgm_name = output_sgm_name
            language = trg_lang

        else:
            source_sgml = source_sgm_name
            temp_file = tempfile.NamedTemporaryFile(delete=False)
            output_sgm_name = temp_file.name
            hyp_sgm_file = output_sgm_name
            language = trg_lang
            
#         write the new sgm file
        wrap_xml_cmd = ['perl', wrap_xml_script, language, source_sgml, engine, flag]
        logger.debug(' '.join(wrap_xml_cmd) + '< ' + filename)
        with open(filename, 'r+') as f:
            fname_stdin = f.read()

        with open(output_sgm_name, 'w+') as f:
            p = subprocess.Popen(wrap_xml_cmd, stdout=f, stdin=subprocess.PIPE)
            p.communicate(input=fname_stdin)[0]
    
    # now compute the segment-level BLEU scores
    mteval_2013_script = '/home/chris/projects/neural_mt/scripts/scoring/mteval-v13a.pl'
#     mteval_2013_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'scripts', 'mteval-v13a.pl')
#     mteval_cmd = ['perl', mteval_2013_script, '-r', reference_file, '-s',
#                   source_file, '-t', decoded_file, '-b', '-d', '2']
    mteval_cmd = ['perl', mteval_2013_script, '-r', ref_sgm_name, '-s',
                  source_sgm_name, '-t', hyp_sgm_file, '-b', '-d', '2']
    logger.debug(' '.join(mteval_cmd))
    mteval_proc = subprocess.Popen(mteval_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
    stdout_data = mteval_proc.communicate()[0]
    
    # now parse stdout_data to get the scores and do asserts to make sure parsing is correct
    per_seg_lines = stdout_data.split('\n\n')[1]
    per_seg_lines = per_seg_lines.split('\n')
        
    # the last two lines are corpus-level info
    per_seg_lines = per_seg_lines[:-2]
    bleu_scores = [l.split()[5] for l in per_seg_lines]
    pprint(bleu_scores)
    
    with open(hypothesis_file) as inp:
        # the substring we're going to count -- we need to do it like this because the
        # moses wrap-xml script adds lines, but each real line is wrapped in <seg id=...></seg> tags 
#         line_substring = 'seg id='
        num_segments = inp.read().strip().split('\n')
    print(num_segments)
    print(len(bleu_scores))
            
    assert len(bleu_scores) == num_segments, "We must get one score for each segment"
    
    return bleu_scores


#     newfile.write("\n".join(["\t".join(lines)
#                              for lines in map(list, zip(*all_lines))]).encode("utf-8"))
#     newfile.flush()
#     temp_file_name = newfile.name

In [25]:
mteval_13(test_srcs, test_refs, test_hyps)

DEBUG:root:perl /home/chris/projects/neural_mt/scripts/scoring/wrap-xml-modified.perl en /home/chris/projects/neural_mt/proto/data_for_testing_sampling/test.source nmt src< /home/chris/projects/neural_mt/proto/data_for_testing_sampling/test.source
DEBUG:root:perl /home/chris/projects/neural_mt/scripts/scoring/wrap-xml-modified.perl de /tmp/tmpbbu9C3 nmt ref< /home/chris/projects/neural_mt/proto/data_for_testing_sampling/test.ref
DEBUG:root:perl /home/chris/projects/neural_mt/scripts/scoring/wrap-xml-modified.perl de /tmp/tmpbbu9C3 nmt tst< /home/chris/projects/neural_mt/proto/data_for_testing_sampling/test.target
DEBUG:root:perl /home/chris/projects/neural_mt/scripts/scoring/mteval-v13a.pl -r /tmp/tmpsdKgbb -s /tmp/tmpbbu9C3 -t /tmp/tmpiGv7BJ -b -d 2


['1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000',
 '1.0000']


AssertionError: We must get one score for each segment

In [26]:
test_hyps

'/home/chris/projects/neural_mt/proto/data_for_testing_sampling/test.target'

In [None]:
bleu_err

mteval script output 
(1:2008)$ perl mteval-v13a.pl -b -d 2 -r test.ref.de.sgm -s test.source.en.sgm -t test.target.de.sgm 
Use of 'Hyphen' in \p{} or \P{} is deprecated because: Supplanted by Line_Break property values; see www.unicode.org/reports/tr14; at mteval-v13a.pl line 951.
MT evaluation scorer began on 2016 Mar 11 at 17:32:48
command line:  mteval-v13a.pl -b -d 2 -r test.ref.de.sgm -s test.source.en.sgm -t test.target.de.sgm
  Evaluation of any-to-de translation using:
    src set "SETID" (1 docs, 20 segs)
    ref set "SETID" (1 refs)
    tst set "SETID" (1 systems)

  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 1 of document "../../proto/data_for_testing_sampling/test.source" (14 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 2 of document "../../proto/data_for_testing_sampling/test.source" (19 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 3 of document "../../proto/data_for_testing_sampling/test.source" (15 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 4 of document "../../proto/data_for_testing_sampling/test.source" (13 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 5 of document "../../proto/data_for_testing_sampling/test.source" (14 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 6 of document "../../proto/data_for_testing_sampling/test.source" (14 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 7 of document "../../proto/data_for_testing_sampling/test.source" (10 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 8 of document "../../proto/data_for_testing_sampling/test.source" (9 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 9 of document "../../proto/data_for_testing_sampling/test.source" (13 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 10 of document "../../proto/data_for_testing_sampling/test.source" (13 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 11 of document "../../proto/data_for_testing_sampling/test.source" (8 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 12 of document "../../proto/data_for_testing_sampling/test.source" (11 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 13 of document "../../proto/data_for_testing_sampling/test.source" (11 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 14 of document "../../proto/data_for_testing_sampling/test.source" (12 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 15 of document "../../proto/data_for_testing_sampling/test.source" (11 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 16 of document "../../proto/data_for_testing_sampling/test.source" (11 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 17 of document "../../proto/data_for_testing_sampling/test.source" (16 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 18 of document "../../proto/data_for_testing_sampling/test.source" (13 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 19 of document "../../proto/data_for_testing_sampling/test.source" (14 words)
  BLEU score using 4-grams = 1.0000 for system "unbabel" on segment 20 of document "../../proto/data_for_testing_sampling/test.source" (9 words)
BLEU score using   4-grams = 1.0000 for system "unbabel" on document "../../proto/data_for_testing_sampling/test.source" (20 segments, 250 words)
length ratio: 1 (250/250), penalty (log): 0


In [None]:
# Model related -----------------------------------------------------------
# Sequences longer than this will be discarded
'seq_len': 40
# Number of hidden units in encoder/decoder GRU
'enc_nhids': &REC_SIZE 800
'dec_nhids': 800

# Dimension of the word embedding matrix in encoder/decoder
'enc_embed': &EMBED_SIZE 300
'dec_embed': 300

# Optimization related ----------------------------------------------------
# Batch size
'batch_size': &BATCH_SIZE 15 

# This many batches will be read ahead and sorted
'sort_k_batches': 15 

# Optimization step rule
'step_rule': &STEP_RULE 'Adam'

# Gradient clipping threshold
'step_clipping': 1.

# Std of weight initialization
'weight_scale': 0.01

# Regularization related --------------------------------------------------

# Weight noise flag for feed forward layers
'weight_noise_ff': &FF_NOISE False

# Weight noise flag for recurrent layers
'weight_noise_rec': False

# Dropout ratio, applied only after readout maxout
'dropout': &DROPOUT 0.3

# Source and target vocabulary sizes, should include bos, eos, unk tokens
'src_vocab_size': &SRC_VOCAB_SIZE 20000
'trg_vocab_size': &TGT_VOCAB_SIZE 20000

# Special tokens and indexes
'unk_id': 1
'bos_token': '<S>'
'eos_token': '</S>'
'unk_token': '<UNK>'

# Root directory for dataset
'datadir': &DATADIR /media/1tb_drive/multilingual-multimodal/flickr30k/train/processed

# the name of the directory where the model will be saved and checkpointed
#'model_save_directory': &SAVEDIR !format_str ['unbabel_data_dropout{}_ff_noise{}_search_model_en2es_vocab{}_emb{}_rec{}_batch{}', *DROPOUT, *FF_NOISE, *SRC_VOCAB_SIZE, *EMBED_SIZE, *REC_SIZE, *BATCH_SIZE]
'model_save_directory': &SAVEDIR !format_str ['BERTHA-TEST_{}_wmt-multimodal_internal_data_dropout0.3_ff_noiseFalse_search_model_en2es_vocab20000_emb300_rec800_batch15', *STEP_RULE]

# Where to save model, this corresponds to 'prefix' in groundhog
'saveto': &OUTPUT_DIR !path_join [*DATADIR, *SAVEDIR]

# Module name of the stream that will be used
# note this requires the stream to be implemented as a module -- there may be a better way
'stream': 'stream'

# Source and target vocabularies
'src_vocab': !path_join [*DATADIR, 'vocab.en-de.en.pkl']
'trg_vocab': !path_join [*DATADIR, 'vocab.en-de.de.pkl']

# Source and target datasets
'src_data': !path_join [*DATADIR, 'train.en.tok.shuf']
'trg_data': !path_join [*DATADIR, 'train.de.tok.shuf']


# Early stopping based on BLEU score on dev set ------------------------------------

# Normalize cost according to sequence length after beam-search
'normalized_bleu': True

# Bleu script that will be used (moses multi-perl in this case)
'bleu_script': !path_join [*DATADIR, 'multi-bleu.perl']

# Validation set source file
'val_set': !path_join [*DATADIR, 'dev.en.tok']

# Validation set gold file
'val_set_grndtruth': !path_join [*DATADIR, 'dev.de.tok']

# Print validation output to file
'output_val_set': True

# Validation output file
'val_set_out': !path_join [*OUTPUT_DIR, 'validation_out.txt']

# Beam-size
'beam_size': 20 

# Timing/monitoring related -----------------------------------------------

# Maximum number of updates
'finish_after': 1000000

# Reload model from files if exist
'reload': True

# Save model after this many updates
'save_freq': 5000

# Show samples from model after this many updates
'sampling_freq': 5000

# Show this many samples at each sampling
'hook_samples': 5

# Validate bleu after this many updates
'bleu_val_freq': 1000 

# Start bleu validation after this many updates
'val_burn_in': 5000

# Using trained models for prediction ------------

# The location of the saved parameters of a trained model as .npz
'saved_parameters': ~

# The location of a test set in the source language
'test_set': ~


In [None]:
class MTSampleStreamGenerator(Dataset):
    """

    Parameters
    ----------
    source_files: list[str] - the files containing the source seqs we'll use to generate samples
    sample_func: function(num_samples=1) which takes source seq and outputs <num_samples> samples
    score_func: function
    
    # source dict only or source + target dict?
    dictionary: dict mapping word-->id -- TODO: switch to parameterized choice

    """

    provides_sources = ('sources','targets', 'scores')
    example_iteration_scheme = None

    def __init__(self, files, dictionary, all_idxs_and_probs, bos_token=None, eos_token=None,
                 unk_token='<UNK>', level='word', preprocess=None):
        self.files = files
        self.dictionary = dictionary
        # use all idxs to choose random substitutions and insertions
        # WORKING: switch to parameterized choice using word freq
        # WORKING pass in two lists -- words and probs (use the p= kwarg from np.random.choice)
        # self.all_idxs = dictionary.values()
        self.all_idxs, self.all_probs = zip(*all_idxs_and_probs.items())
        if bos_token is not None and bos_token not in dictionary:
            raise ValueError
        self.bos_token = bos_token
        if eos_token is not None and eos_token not in dictionary:
            raise ValueError
        self.eos_token = eos_token
        if unk_token not in dictionary:
            raise ValueError
        self.unk_token = unk_token
        if level not in ('word', 'character'):
            raise ValueError
        self.level = level
        self.preprocess = preprocess
        super(RandomTargetGenerator, self).__init__()

    def open(self):
        return chain(*[iter_(open(f)) for f in self.files])

    def get_data(self, state=None, request=None):
        if request is not None:
            raise ValueError
        sentence = next(state)
        if self.preprocess is not None:
            sentence = self.preprocess(sentence)
        pe_sequence = [self.dictionary[self.bos_token]] if self.bos_token else []
        if self.level == 'word':
            pe_sequence.extend(self.dictionary.get(word,
                                            self.dictionary[self.unk_token])
                        for word in sentence.split())
        else:
            pe_sequence.extend(self.dictionary.get(char,
                                            self.dictionary[self.unk_token])
                        for char in sentence.strip())
        if self.eos_token:
            pe_sequence.append(self.dictionary[self.eos_token])

        # now use the pe sequence to generate artificial target_sequence and tag_sequence
        target_sequence, tag_sequence = self._generate_artificial_target_and_tag_seqs(pe_sequence)

        return (target_sequence, tag_sequence)

    # TODO: currently only supports subs, not inserts
    # TODO: tag identities are hard-coded as 0=BAD 1=OK
    def _generate_artificial_target_and_tag_seqs(self, pe_sequence):
        target_sequence = copy.copy(pe_sequence)
        tag_sequence = [1] * len(pe_sequence)

        # TODO: make num_subs distribution configurable
        num_subs_to_make = numpy.random.randint(0, (int(numpy.ceil(len(pe_sequence / 1.3)))))
        sub_idxs = numpy.random.choice(len(pe_sequence), size=num_subs_to_make)
#         print('len orig seq: {}'.format(len(pe_sequence)))
#         print('len orig seq / 2: {}'.format(len(pe_sequence) //2))
#         print('n_subs: {}'.format(num_subs_to_make))
#         print(sub_idxs)
        for idx in sub_idxs:
            new_word = numpy.random.choice(self.all_idxs, p=self.all_probs)
            target_sequence[idx] = new_word
            tag_sequence[idx] = 0

        return target_sequence, tag_sequence
