In [None]:
# we want to extract segments from the WMT data according to a few criteria
# (1) segments should be short because constrained decoding is slow
# (2) segments should contain a minimum of special characters, numbers, and domain jargon
# (3) segments should not match the reference (translations should require some editing)

In [1]:
import codecs
import errno
import io
import itertools
import os
import sys
from subprocess import Popen, PIPE

In [2]:
def mkdir_p(path):
    """Create `path` together with any missing parent directories.

    Behaves like `mkdir -p`: if `path` already exists as a directory the call
    is a no-op. Any other OSError (including `path` existing as a non-directory)
    is re-raised unchanged.
    """
    try:
        os.makedirs(path)
    except OSError as err:  # Python >2.5
        already_a_directory = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise

In [3]:
# Directory containing the input files read by this notebook.
# NOTE(review): absolute, machine-specific path — must be edited to run anywhere else.
DATADIR = '/media/1tb_drive/Dropbox/data/qe/wmt_2017/test/wmt17_qe_test_data/word_level/2016/'

# Note we use the data with no processing applied, because we want to render as realistically as possible in the UI
# The three files are read in lockstep by parallel_iterator (one triple per line).
# presumably .src = source, .mt = machine translation, .pe = post-edited reference — verify against the dataset
SRC_FILE = os.path.join(DATADIR, 'test.src')
MT_FILE = os.path.join(DATADIR, 'test.mt')
PE_FILE = os.path.join(DATADIR, 'test.pe')



def parallel_iterator(src_file, mt_file, pe_file):
    """Lazily yield whitespace-stripped (src, mt, pe) line triples from three UTF-8 files.

    The files are read in lockstep, one line from each per triple. Iteration
    stops as soon as the shortest file is exhausted, so any extra lines in a
    longer file are silently ignored.

    Args:
        src_file: path to the source-side file.
        mt_file: path to the MT-side file.
        pe_file: path to the PE-side file.

    Yields:
        tuple of three unicode strings: (src_line, mt_line, pe_line).
    """
    # itertools.izip only exists on Python 2; on Python 3 the builtin zip is already lazy.
    lazy_zip = getattr(itertools, 'izip', zip)

    # io.open splits lines only on \n, \r and \r\n. codecs.open readers also break on
    # characters such as U+2028 / U+0085 / FS / GS / RS, which would shift one file
    # relative to the others and silently misalign every following triple.
    with io.open(src_file, encoding='utf8') as src, \
            io.open(mt_file, encoding='utf8') as mt, \
            io.open(pe_file, encoding='utf8') as pe:
        for src_l, mt_l, pe_l in lazy_zip(src, mt, pe):
            yield (src_l.strip(), mt_l.strip(), pe_l.strip())

In [4]:
def length_filter(min_length=None, max_length=None):
    """Return a filter function accepting triples whose src length lies in [min_length, max_length].

    The length checked is ``len(triple[0])``, i.e. the length of the first
    element of the triple. Both bounds are inclusive. Either bound may be
    omitted (no lower bound / no upper bound), but not both.

    Args:
        min_length: smallest accepted length, or None for no lower bound.
        max_length: largest accepted length, or None for no upper bound.

    Returns:
        A function ``filter_len(triple) -> bool`` that is True when the triple is kept.

    Raises:
        AssertionError: if both min_length and max_length are None.
    """
    if min_length is None and max_length is None:
        raise AssertionError('You must define at least one of {min_length, max_length}')

    lower = 0 if min_length is None else min_length
    # float('inf') stands in for "no upper bound"; sys.maxint is not available on Python 3
    upper = float('inf') if max_length is None else max_length

    def filter_len(triple):
        return lower <= len(triple[0]) <= upper
    return filter_len

def char_filter(bad_chars):
    """Build a predicate that keeps a triple only when its src is free of `bad_chars`.

    The returned function inspects ``triple[0]`` and returns False as soon as
    any element of `bad_chars` occurs in it, True otherwise.
    """
    def filter_chars(triple):
        source = triple[0]
        for unwanted in bad_chars:
            if unwanted in source:
                return False
        return True
    return filter_chars

In [5]:
# dataset output hyperparams
# NOTE(review): absolute, machine-specific path; running this cell creates the directory via mkdir_p
OUTPUT_DIR = '/home/chris/projects/handycat/app/data/word_level_qe_experiments/experiment_data'
mkdir_p(OUTPUT_DIR)

# number of projects to export, and how many consecutive filtered segments go into each one
NUM_PROJECTS = 5
SEGMENTS_PER_PROJECT = 12

# inclusive bounds handed to length_filter, which compares them against len() of the source string
MIN_SEGMENT_LENGTH = 50
MAX_SEGMENT_LENGTH = 75

# language codes passed to the detokenizer for source-side and target-side (MT, PE) text
SRC_LANG = 'en'
TRG_LANG = 'de'

In [6]:
# keep only triples whose source length is within the configured bounds
filter_1 = length_filter(min_length=MIN_SEGMENT_LENGTH, max_length=MAX_SEGMENT_LENGTH)

# reject triples whose source contains any of these punctuation/symbol characters or any digit
# NOTE(review): u'>' is listed but u'<' is not — confirm whether that is intentional
chars_to_filter = [u'>', u'"', u'(', u')', u'+', u'^', u'®', u':', u';', u'/', u'\\']
# extend() adds each digit as its own one-character unicode string; written as a literal
# because the `unicode` builtin does not exist on Python 3
chars_to_filter.extend(u'0123456789')
filter_2 = char_filter(set(chars_to_filter))

# NOTE(review): the notebook header lists a third criterion (segments should not match the
# reference), but no filter here compares MT against PE — confirm whether that is intended
filters = [filter_1, filter_2]

In [7]:
# stream the (src, mt, pe) triples and keep only those accepted by every filter
pe_triple_iterator = parallel_iterator(SRC_FILE, MT_FILE, PE_FILE)
short_triples = []
for triple in pe_triple_iterator:
    if all(filter_f(triple) for filter_f in filters):
        short_triples.append(triple)

In [8]:
# triple_cols = zip(*short_triples)

In [9]:
# detokenization logic
# NOTE(review): absolute, machine-specific path — presumably the scripts/ directory of a Moses checkout
MOSES_SCRIPTS = '/home/chris/projects/mosesdecoder/scripts/'
# Perl script that detokenize() launches as a subprocess for every string
detokenize_script = os.path.join(MOSES_SCRIPTS, 'tokenizer/detokenizer.perl')
def detokenize(text, lang):
    """Detokenize `text` by piping it through the external detokenizer script.

    A new subprocess running `detokenize_script` is started for every call;
    `text` is sent on its stdin and the detokenized result is read from stdout.

    Args:
        text: the string to detokenize (unicode text, or UTF-8 encoded bytes).
        lang: language code passed to the script's `-l` option.

    Returns:
        The script's output decoded from UTF-8, with trailing whitespace removed.

    Raises:
        OSError: if the script cannot be executed.
        RuntimeError: if the script exits with a non-zero status.
    """
    detokenizer_cmd = [detokenize_script, '-l', lang, '-q', '-']

    # The pipe needs bytes. `bytes` is `str` on Python 2, so unicode input is
    # encoded on both Python 2 and 3 while already-encoded input passes through.
    if not isinstance(text, bytes):
        text = text.encode('utf8')

    detokenizer = Popen(detokenizer_cmd, stdin=PIPE, stdout=PIPE)
    output, _ = detokenizer.communicate(text)

    # Without this check a crashed detokenizer would silently produce empty output rows.
    if detokenizer.returncode != 0:
        raise RuntimeError('detokenizer exited with status {}: {}'.format(
            detokenizer.returncode, ' '.join(detokenizer_cmd)))

    return output.rstrip().decode('utf8')

In [10]:
def write_lines(filename, lines, cutoff=None):
    """Write `lines` to `filename` as UTF-8, newline-separated, with no trailing newline.

    Args:
        filename: path of the file to (over)write.
        lines: the items to write; each is rendered with ``u'{}'.format(item)``.
        cutoff: if given, write at most this many leading items; None writes all.
            Values larger than the number of items write everything; values
            <= 0 write nothing.

    Side effects:
        Creates/truncates `filename` and prints the number of lines written.
    """
    selected = list(lines)
    if cutoff is not None:
        # clamp at 0 so a negative cutoff cannot turn into a "drop from the end" slice
        selected = selected[:max(cutoff, 0)]

    with codecs.open(filename, 'w', encoding='utf8') as out:
        # join puts a newline between items only, so the last line never gets one
        out.write(u'\n'.join(u'{}'.format(l) for l in selected))

    print('Wrote {} lines to {}'.format(len(selected), filename))

# one [src, mt, pe] list of output paths per project; the project index is the file suffix
output_files = [[os.path.join(OUTPUT_DIR, f_name + '.' + str(p_i))
                 for f_name in ['test.src', 'test.mt', 'test.pe']] for p_i in range(NUM_PROJECTS)]

# Validate before writing anything, so a too-small pool of segments cannot leave a
# partial set of project files behind. `<=` because consuming exactly every filtered
# triple still gives each project distinct segments.
assert NUM_PROJECTS * SEGMENTS_PER_PROJECT <= len(short_triples), \
    'We cannot have duplicate segments, number of projects is too big'

project_start_idx = 0
for project_files in output_files:
    project_end_idx = project_start_idx + SEGMENTS_PER_PROJECT
    # take this project's consecutive slice of short_triples and transpose it
    # from rows of (src, mt, pe) into three columns
    output_triple_cols = zip(*short_triples[project_start_idx:project_end_idx])
    project_start_idx = project_end_idx

    # we detokenize so that strings render correctly in the UI
    src_rows, mt_rows, pe_rows = output_triple_cols
    src_rows = [detokenize(row, SRC_LANG) for row in src_rows]
    mt_rows = [detokenize(row, TRG_LANG) for row in mt_rows]
    pe_rows = [detokenize(row, TRG_LANG) for row in pe_rows]

    src_filename, mt_filename, pe_filename = project_files

    write_lines(src_filename, src_rows, cutoff=None)
    write_lines(mt_filename, mt_rows, cutoff=None)
    write_lines(pe_filename, pe_rows, cutoff=None)
    
        


Wrote 12 lines to /home/chris/projects/handycat/app/data/word_level_qe_experiments/experiment_data/test.src.0
Wrote 12 lines to /home/chris/projects/handycat/app/data/word_level_qe_experiments/experiment_data/test.mt.0
Wrote 12 lines to /home/chris/projects/handycat/app/data/word_level_qe_experiments/experiment_data/test.pe.0
Wrote 12 lines to /home/chris/projects/handycat/app/data/word_level_qe_experiments/experiment_data/test.src.1
Wrote 12 lines to /home/chris/projects/handycat/app/data/word_level_qe_experiments/experiment_data/test.mt.1
Wrote 12 lines to /home/chris/projects/handycat/app/data/word_level_qe_experiments/experiment_data/test.pe.1
Wrote 12 lines to /home/chris/projects/handycat/app/data/word_level_qe_experiments/experiment_data/test.src.2
Wrote 12 lines to /home/chris/projects/handycat/app/data/word_level_qe_experiments/experiment_data/test.mt.2
Wrote 12 lines to /home/chris/projects/handycat/app/data/word_level_qe_experiments/experiment_data/test.pe.2
Wrote 12 lines t

In [11]:
# number of triples that passed every filter; the export loop asserts that this pool is
# large enough to give each project its own SEGMENTS_PER_PROJECT segments
len(short_triples)

372

In [None]:
# eyeball the (src, pe) pairs of the first 100 filtered triples
[(src, pe) for src, _mt, pe in short_triples[:100]]

In [None]:
# dump a 20-sentence sample set for testing
# remember: we're going to use the WMT hyps as the MT hyps; a single good APE system with constrained decoding will provide the outputs
# remember: the APE server will need (at least) source + MT as inputs to work

