In [1]:
import numpy
import codecs
import tempfile
import cPickle
import os
import copy
from collections import OrderedDict

from fuel.datasets import H5PYDataset
from picklable_itertools import iter_, chain
from fuel.datasets import Dataset
from fuel.datasets import TextFile
from fuel.transformers import Merge

from nnqe.dataset.preprocess import whitespace_tokenize

In [None]:
# make a fuel stream which subclasses text file to create a stream which provides three sources: 
# (source, [samples], and BLEU scores)

# going forward, this may not be the fastest way because the sampling and BLEU score computation can be time consuming
# we should look at Fuel's read-ahead and cacheing capacity

In [3]:
# TODO: stateful transformer which takes a stream and adds the sources ('samples', 'scores')
# class Mapping(Transformer)
# the mapping should be a callable which gets samples, then computes the sentence-level BLEU
# score for each sample with respect to the reference

# use this script to get sentence-level scores(?)
# https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v13a.pl

# parameters
# sample_func: function(num_samples=1) which takes source seq and outputs <num_samples> samples
# score_func: function

# TODO: how do they create simple test streams in the fuel tests


class MTSampleStreamTransformer(object):
    """
    Stateful transformer which takes a stream and adds the sources ('samples', 'scores')
    
    Parameters
    ----------
    sample_func: function(num_samples=1) which takes source seq and outputs <num_samples> samples
    score_func: function

    At call time, we expect a stream providing (sources,) -- i.e. something like a TextFile object

    
    """
    
    
    
    
    def __init__(self, sample_func, score_func):
        self.sample_func = sample_func
        self.score_func = score_func

    def __call__(self, source_data):
        return ([x for x in source_data[0]])





In [4]:
def fake_score(source, target):
    return 1.

def fake_sample(source):
    return [[2,67,33,778,323,68], [545,5347,432,21,53,68,47,3689,3]]

In [5]:
test_transformer = MTSampleStreamTransformer(fake_sample, fake_score)

In [6]:
fake_source = [1,55,75,324,43,0]

test_transformer(fake_source)

NameError: global name 'sentence_pair' is not defined

In [None]:
class MTSampleStreamGenerator(Dataset):
    """

    Parameters
    ----------
    source_files: list[str] - the files containing the source seqs we'll use to generate samples
    sample_func: function(num_samples=1) which takes source seq and outputs <num_samples> samples
    score_func: function
    
    # source dict only or source + target dict?
    dictionary: dict mapping word-->id -- TODO: switch to parameterized choice

    """

    provides_sources = ('sources','targets', 'scores')
    example_iteration_scheme = None

    def __init__(self, files, dictionary, all_idxs_and_probs, bos_token=None, eos_token=None,
                 unk_token='<UNK>', level='word', preprocess=None):
        self.files = files
        self.dictionary = dictionary
        # use all idxs to choose random substitutions and insertions
        # WORKING: switch to parameterized choice using word freq
        # WORKING pass in two lists -- words and probs (use the p= kwarg from np.random.choice)
        # self.all_idxs = dictionary.values()
        self.all_idxs, self.all_probs = zip(*all_idxs_and_probs.items())
        if bos_token is not None and bos_token not in dictionary:
            raise ValueError
        self.bos_token = bos_token
        if eos_token is not None and eos_token not in dictionary:
            raise ValueError
        self.eos_token = eos_token
        if unk_token not in dictionary:
            raise ValueError
        self.unk_token = unk_token
        if level not in ('word', 'character'):
            raise ValueError
        self.level = level
        self.preprocess = preprocess
        super(RandomTargetGenerator, self).__init__()

    def open(self):
        return chain(*[iter_(open(f)) for f in self.files])

    def get_data(self, state=None, request=None):
        if request is not None:
            raise ValueError
        sentence = next(state)
        if self.preprocess is not None:
            sentence = self.preprocess(sentence)
        pe_sequence = [self.dictionary[self.bos_token]] if self.bos_token else []
        if self.level == 'word':
            pe_sequence.extend(self.dictionary.get(word,
                                            self.dictionary[self.unk_token])
                        for word in sentence.split())
        else:
            pe_sequence.extend(self.dictionary.get(char,
                                            self.dictionary[self.unk_token])
                        for char in sentence.strip())
        if self.eos_token:
            pe_sequence.append(self.dictionary[self.eos_token])

        # now use the pe sequence to generate artificial target_sequence and tag_sequence
        target_sequence, tag_sequence = self._generate_artificial_target_and_tag_seqs(pe_sequence)

        return (target_sequence, tag_sequence)

    # TODO: currently only supports subs, not inserts
    # TODO: tag identities are hard-coded as 0=BAD 1=OK
    def _generate_artificial_target_and_tag_seqs(self, pe_sequence):
        target_sequence = copy.copy(pe_sequence)
        tag_sequence = [1] * len(pe_sequence)

        # TODO: make num_subs distribution configurable
        num_subs_to_make = numpy.random.randint(0, (int(numpy.ceil(len(pe_sequence / 1.3)))))
        sub_idxs = numpy.random.choice(len(pe_sequence), size=num_subs_to_make)
#         print('len orig seq: {}'.format(len(pe_sequence)))
#         print('len orig seq / 2: {}'.format(len(pe_sequence) //2))
#         print('n_subs: {}'.format(num_subs_to_make))
#         print(sub_idxs)
        for idx in sub_idxs:
            new_word = numpy.random.choice(self.all_idxs, p=self.all_probs)
            target_sequence[idx] = new_word
            tag_sequence[idx] = 0

        return target_sequence, tag_sequence
