In [1]:
# Check Phrasal paper to see how they split the corpus
# they only use en-fr from autodesk, not en-de
# according to the Autodesk README, there can be duplicates

In [5]:
import bz2

# GERMAN
# autodesk_file = '/media/1tb_drive/parallel_data/autodesk/deu.mt.bz2'

# FRENCH
# autodesk_file = '/media/1tb_drive/parallel_data/autodesk/fra.mt.bz2'

# PORTUGUESE
autodesk_file = '/media/1tb_drive/parallel_data/autodesk/ptg.mt.bz2'

In [6]:
with bz2.BZ2File(autodesk_file) as inp:
    autodesk_rows = inp.read().strip().decode('utf8').split('\n')
    
autodesk_rows_cols = [l.split(u'\uf8ff') for l in autodesk_rows]

In [7]:
from subprocess import Popen, PIPE

# tokenization funcs
SOURCE_LANG = 'en'
# TARGET_LANG = 'de'
# TARGET_LANG = 'fr'
TARGET_LANG = 'pt'

tokenize_script = '/media/1tb_drive/parallel_data/en-de/chris_en-de_big_corpus/train/processed/tokenizer.perl'
source_tokenizer_cmd = [tokenize_script, '-l', SOURCE_LANG, '-q', '-', '-no-escape', '1']
target_tokenizer_cmd = [tokenize_script, '-l', TARGET_LANG, '-q', '-', '-no-escape', '1']

# NOTE: it's much slower to create a new tokenizer every time, see entity linking for persistent tokenizer
def tokenize(segment, src_trg):
    if src_trg == 'source':
        source_tokenizer = Popen(source_tokenizer_cmd, stdin=PIPE, stdout=PIPE)
        segment, _ = source_tokenizer.communicate(segment.encode('utf-8'))
    else:
        target_tokenizer = Popen(target_tokenizer_cmd, stdin=PIPE, stdout=PIPE)
        segment, _ = target_tokenizer.communicate(segment.encode('utf-8'))
    
    segment = segment.strip().decode('utf-8')
    return segment

In [8]:
# BPE encoding func
from subword_nmt.apply_bpe import BPE

# BPE_CODE_FILE = '/media/1tb_drive/parallel_data/en-de/chris_en-de_big_corpus/train/processed/all_text_both_EN_and_DE.79000.bpe.codes'
# BPE_CODE_FILE = '/media/1tb_drive/parallel_data/en-fr/phrasal_acl/all_text_both_EN_and_FR.79000.bpe.codes'
BPE_CODE_FILE = '/media/1tb_drive/parallel_data/en-pt/all_text_both_EN_and_PT.80000.bpe.codes'


bpe_codes = open(BPE_CODE_FILE)

bpe_encoder = BPE(bpe_codes)
def bpe_encode(text):
    return bpe_encoder.segment(text)

# python apply_bpe.py -c  < /home/chris/projects/neural_qe/data/wmt16/task2_en-de_training/train.src > /home/chris/projects/neural_qe/data/wmt16/task2_en-de_training/train.src.bpe

In [9]:
len(autodesk_rows_cols[0])

11

In [10]:
autodesk_rows_cols[0]

[u'The absolute coordinates of your pointing device is updated continuously and is displayed in the status bar',
 u'As coordenadas absolutas de seu dispositivo apontador s\xe3o atualizadas continuamente e \xe9 exibido na barra de status',
 u'As coordenadas absolutas de seu dispositivo apontador s\xe3o atualizadas continuamente e exibidas na barra de status.',
 u'ACD',
 u'2014',
 u'MT',
 u'0.000',
 u'58',
 u'',
 u'2012/11/19 19:24:51',
 u'\u25ca\xf7']

In [11]:
# columns according to README
# 1) EN-US source segment
# 2) target-language raw MT output (for .mt.bz2 files) or raw TM output (for .tm.bz2 files)
# 3) final post-edited target-language segment
# 4) Autodesk product code
# 5) product release identifier
# 6) translation type (FUZZY or MT)
# 7) raw MT score from Moses (with unknown-word penalties discarded)
# 8) TM fuzzy match score
# 9) TMX-style XML-encapsulated placeholder content (<phs>…</phs>)
# 10) entry creation date


In [12]:
raw_sources, raw_hyps, raw_refs, product_codes, release_identifiers, trans_types, mt_scores, fuzzy_matches, placeholders, creation_dates, _ = zip(*autodesk_rows_cols)

In [13]:
len(autodesk_rows_cols)

79608

In [14]:
len(raw_sources)

79608

In [15]:
from collections import OrderedDict

In [16]:
no_dups = OrderedDict(zip(raw_sources, raw_refs))

In [17]:
len(no_dups)

66243

In [18]:
import random

In [19]:
src_refs = no_dups.items()
random.seed(37)
random.shuffle(src_refs)

In [20]:
train_src_refs = src_refs[:-1000]
dev_src_refs = src_refs[-1000:]

In [21]:
train_sources, train_refs = zip(*train_src_refs)
dev_sources, dev_refs = zip(*dev_src_refs)

In [18]:
len(train_sources)

80721

In [19]:
# load big corpus, get noun chunks across the whole language, these are the general corpus frequency stats
# we want to compare these freqs to the target domain to see what's actually a term

In [21]:
import spacy
en_nlp = spacy.load('en')
de_nlp = spacy.load('de')

In [22]:
en_doc = en_nlp(u'Hello, world. Here are two sentences.')

In [23]:
tc = [c for c in en_doc.noun_chunks]

In [24]:
c1 = tc[0]

In [25]:
c1.orth_

u'two sentences'

In [26]:

# de_doc = de_nlp(u'ich bin ein Berliner.')

source_chunks = []
ref_chunks = []
source_iter = (l for l in train_sources)
ref_iter = (l for l in train_refs)

for i, doc in enumerate(en_nlp.pipe(source_iter, batch_size=50, n_threads=10)):
    source_chunks.append([c.orth_ for c in doc.noun_chunks])
    if i % 5000 == 0:
        print('processed {} segments'.format(i))
    
for i, doc in enumerate(en_nlp.pipe(ref_iter, batch_size=50, n_threads=10)):
    ref_chunks.append([c.orth_ for c in doc.noun_chunks])
    if i % 5000 == 0:
        print('processed {} segments'.format(i))



processed 0 segments
processed 5000 segments
processed 10000 segments
processed 15000 segments
processed 20000 segments
processed 25000 segments
processed 30000 segments
processed 35000 segments
processed 40000 segments
processed 45000 segments
processed 50000 segments
processed 55000 segments
processed 60000 segments
processed 65000 segments
processed 70000 segments
processed 75000 segments
processed 80000 segments
processed 85000 segments
processed 90000 segments
processed 95000 segments
processed 100000 segments
processed 105000 segments
processed 0 segments
processed 5000 segments
processed 10000 segments
processed 15000 segments
processed 20000 segments
processed 25000 segments
processed 30000 segments
processed 35000 segments
processed 40000 segments
processed 45000 segments
processed 50000 segments
processed 55000 segments
processed 60000 segments
processed 65000 segments
processed 70000 segments
processed 75000 segments
processed 80000 segments
processed 85000 segments
processe

In [27]:
from collections import defaultdict, Counter

chunk_map = defaultdict(Counter)
source_chunk_occs = Counter()
target_chunk_occs = Counter()

for src_chunks, trg_chunks in zip(source_chunks, ref_chunks):
    for src_chunk in set(src_chunks):
        chunk_map[src_chunk].update(set(trg_chunks))
        
    source_chunk_occs.update(src_chunks)
    target_chunk_occs.update(trg_chunks)


In [28]:
assert len(source_chunks) == len(ref_chunks)

In [29]:
num_segments = float(len(source_chunks))

src_priors = OrderedDict([(k, c / num_segments) for k,c in source_chunk_occs.most_common()])
trg_priors = OrderedDict([(k, c / num_segments) for k,c in target_chunk_occs.most_common()])

In [30]:
src_posteriors = {}
for src_chunk, trg_chunk_counter in chunk_map.items():
    total_occs = float(source_chunk_occs[src_chunk])
    posteriors = OrderedDict([(k, v / total_occs) for k, v in trg_chunk_counter.most_common()])
    src_posteriors[src_chunk] = posteriors

In [31]:
# terminology should be ranked by how frequent it is in the target domain vs how frequent it is generally

In [32]:
# filter keys and values to find good term pairs
# filtered_src = [c for c,v in src_priors.items() if len(c.split()) > 1 and any([w[0].isupper() for w in c.split()])]
filtered_src = [c for c,v in src_priors.items() if any([w[0].isupper() for w in c.split()])]

filtered_trg = [c for c,v in trg_priors.items() if len(c.split()) > 1 and any([w[0].isupper() for w in c.split()])]

In [33]:
# filtered_posteriors = [(k, [(t, v) for t, v in src_posteriors[k].items()
#                             if len(t.split()) > 1 and all([w[0].isupper() for w in t.split()])]) for k in filtered_src]

filtered_posteriors = [(k, [(t, v) for t, v in src_posteriors[k].items()
                            if any([w[0].isupper() for w in t.split()])]) for k in filtered_src]

In [34]:
len(filtered_posteriors)

60933

In [35]:
term_pairs = [(k,v[0]) for k,v in filtered_posteriors 
              if len(v) > 0  
              and v[0][1] > 0.7
              and not abs(len(k) - len(v[0][0])) > 8
              and source_chunk_occs[k] > 2]

In [36]:
len(term_pairs)

532

In [37]:
term_pairs

[(u'U-Value', (u'U-Wert', 1.0)),
 (u'Video', (u'Video', 0.9579439252336449)),
 (u'CivilSales', (u'CivilSales', 1.0)),
 (u'Step', (u'Schritt', 0.7431192660550459)),
 (u'Frame construction', (u'Rahmenkonstruktion', 0.9888888888888889)),
 (u'An issue', (u'Ein Problem', 0.8571428571428571)),
 (u'Block', (u'Block', 0.9838709677419355)),
 (u'Demonstration', (u'Pr\xe4sentation', 0.9310344827586207)),
 (u'Brick', (u'Ziegel', 1.0)),
 (u'Knowledge Check', (u'Wissens-Check', 0.7727272727272727)),
 (u'Passive floor', (u'Passive Geschossdecke', 0.9428571428571428)),
 (u'All rights', (u'All rights', 0.8823529411764706)),
 (u'the In Canvas Tools', (u'Canvas Tools', 0.9696969696969697)),
 (u'Raster menu', (u'Men\xfc Raster', 0.7586206896551724)),
 (u'Connectivity', (u'Connectivity', 0.8571428571428571)),
 (u'Bridge', (u'Bridge', 0.72)),
 (u'Membrane', (u'Folie', 0.9583333333333334)),
 (u'Microsoft Windows', (u'Microsoft Windows', 0.8260869565217391)),
 (u'VIDEO', (u'VIDEO', 0.8181818181818182)),
 (u'P

In [50]:
import codecs

# write terminology to file, sorted by length of source
output_terms = sorted(term_pairs, key=lambda x: len(x[0]), reverse=True)

all_rules = []

terms_output = 'autodesk.noun_chunk.terminology.tsv'
with codecs.open(terms_output, 'w', encoding='utf8') as out:
    for source_term, (target_term, score) in output_terms:
        source_term = tokenize(source_term, 'source')
        target_term = tokenize(target_term, 'target')
        source_term = bpe_encode(source_term)
        target_term = bpe_encode(target_term)
        all_rules.append((source_term, target_term, score))
        out.write(u'\t'.join([source_term, target_term, unicode(score)]) + u'\n')


In [22]:
import codecs

# write dev set to file
# Note: we could optionally tokenize and BPE encode dev set here

def write_lines(lines, filename):
    with codecs.open(filename, 'w', encoding='utf8') as out:
        out.write(u'\n'.join(lines))

In [23]:
# En-De Corpus
# dev_src_output = 'autodesk.dev.1000.en'
# dev_trg_output = 'autodesk.dev.1000.de'

# En-Fr Corpus
# dev_src_output = 'autodesk_constrained_decoding_corpus/en-fr/autodesk.dev.1000.en'
# dev_trg_output = 'autodesk_constrained_decoding_corpus/en-fr/autodesk.dev.1000.fr'

# En-Pr Corpus
dev_src_output = 'autodesk_constrained_decoding_corpus/en-pt/autodesk.dev.1000.en'
dev_trg_output = 'autodesk_constrained_decoding_corpus/en-pt/autodesk.dev.1000.pt'


write_lines(dev_sources, dev_src_output)
write_lines(dev_refs, dev_trg_output)

In [24]:
# train_src_output = 'autodesk.train.en'
# train_trg_output = 'autodesk.train.de'

# En-Fr Corpus
# train_src_output = 'autodesk_constrained_decoding_corpus/en-fr/autodesk.train.en'
# train_trg_output = 'autodesk_constrained_decoding_corpus/en-fr/autodesk.train.fr'

# En-Pt Corpus
train_src_output = 'autodesk_constrained_decoding_corpus/en-pt/autodesk.train.en'
train_trg_output = 'autodesk_constrained_decoding_corpus/en-pt/autodesk.train.pt'

write_lines(train_sources, train_src_output)
write_lines(train_refs, train_trg_output)

In [53]:
# how many constraints occur in the dev set?
source_constraints = set([src_term for src_term, (_, _) in output_terms])

In [54]:
len(source_constraints)

532

In [55]:
src_constraint_occs = 0
for source_sen in dev_sources:
    for cons in source_constraints:
        if cons in source_sen:
            src_constraint_occs += 1
            break
src_constraint_occs

154

In [56]:
src_rules, trg_rules, rule_scores = zip(*all_rules)


In [58]:
# import the tokenized and bpe encoded dev lines
prepped_dev_lines = codecs.open('/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/autodesk.dev.1000.en.bpe',
                                 encoding='utf8').read().strip().split('\n')

In [60]:
# create terminology spotter
from semantic_annotator.spotting import MatchSpotter

term_spotter = MatchSpotter(rules=src_rules)


# get spots in dev set us

In [77]:
term_pair_map = OrderedDict((k,v) for k,v,s in all_rules)

In [86]:
dev_term_spots = []
for l in prepped_dev_lines:
    spots = term_spotter.get_spots(l)
    dev_term_spots.append(spots)

dev_term_constraints = []
for text, spots in zip(prepped_dev_lines, dev_term_spots):
    output_constraints = []
    if len(spots) > 0:
        for spot in spots:
            if spot[1] - spot[0] > 1:
                spotted_term = text[spot[0]:spot[1]]
                mapped_term = term_pair_map[spotted_term]
                output_constraints.append(mapped_term.split())
    dev_term_constraints.append(output_constraints)
#                 print(u'Spotted: {}, mapping is: {}'.format(spotted_term, term_pair_map[spotted_term]))
#     else:
#         print('No spot in line')

In [87]:
len(dev_term_constraints)

1000

In [90]:
import os
import json

OUTPUT_DIR='/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus'

with codecs.open(os.path.join(OUTPUT_DIR, 'dev.constraints.json'), 'w', encoding='utf8') as out:
    out.write(json.dumps(dev_term_constraints, indent=2))

In [37]:
top_1000_src[:10]

[u'Autodesk Inventor',
 u'Simulation CFD',
 u'Autodesk Vault',
 u'Service Pack',
 u'Autodesk InfraWorks',
 u'Inventor LT',
 u'Simulation Mechanical',
 u'Autodesk\xae Simulation CFD',
 u'AutoCAD LT',
 u'Autodesk Revit']

In [37]:
for src_chunk, prior in src_priors.items()[300:310]:
    print(u'CHUNK: {}, top 5: {}'.format(src_chunk, src_posteriors[src_chunk].items()[:50]))

CHUNK: the fluid, top 5: [(u'i', 0.07109004739336493), (u'das Fluid', 0.014218009478672985), (u'nichtnewtonschen Fluid wird', 0.014218009478672985), (u'Fluid', 0.009478672985781991), (u'das', 0.009478672985781991), (u'bei der das Fluid zu', 0.009478672985781991), (u'Gleichgewicht aufgrund der Schwerkraft', 0.009478672985781991), (u'ihrem Verlauf m\xfcssen fein genug sein', 0.009478672985781991), (u'System', 0.009478672985781991), (u'Fluid aus\xfcbt', 0.009478672985781991), (u'Bewegung', 0.009478672985781991), (u'ist', 0.009478672985781991), (u'W\xe4rmeleitf\xe4higkeit des Fluids', 0.009478672985781991), (u'und', 0.009478672985781991), (u'Interaktion zwischen Festk\xf6rper', 0.009478672985781991), (u'Watt', 0.009478672985781991), (u'der', 0.009478672985781991), (u'Interaktion zwischen Festk\xf6rper und dem Fluid angemessen darzustellen', 0.009478672985781991), (u'dass', 0.009478672985781991), (u'Dichte des Fluids', 0.009478672985781991), (u'Fluid vom Auslass bis zum Einlass', 0.00947867

In [30]:
src_priors.items()[50:100]

[(u'who', 0.0019538882375928096),
 (u'order', 0.0019445840078899867),
 (u'the ribbon', 0.0019445840078899867),
 (u'use', 0.0019352797781871638),
 (u'the part', 0.001925975548484341),
 (u'a drawing', 0.0018794543999702264),
 (u'addition', 0.0018701501702674036),
 (u'BIM', 0.0018422374811589348),
 (u'Select', 0.001823629021753289),
 (u'the name', 0.0018050205623476433),
 (u'the design', 0.0017957163326448203),
 (u'a file', 0.0017957163326448203),
 (u'insulation', 0.0017212824950222372),
 (u'the top', 0.0017119782653194143),
 (u'(100 mm', 0.0017026740356165912),
 (u'the flow', 0.0016747613465081226),
 (u'models', 0.0016747613465081226),
 (u'the display', 0.0016747613465081226),
 (u'Autodesk Inventor', 0.0016654571168052998),
 (u'design', 0.001637544427696831),
 (u'features', 0.001628240197994008),
 (u'InfraWorks', 0.0015072852118573103),
 (u'the command', 0.0014979809821544874),
 (u'materials', 0.0014979809821544874),
 (u'an object', 0.0014793725227488417),
 (u'the installation', 0.001460

In [None]:
dedupe

no_dup_sources = []
no_dup_refs = []

In [None]:
shuffle jointly, then split into train test


In [4]:
print(autodesk_rows[0])

The NVIDIA® iray® renderer now supports the NVIDIA® Kepler chipset.Die NVIDIA ® iray ® Renderer unterstützt jetzt die NVIDIA ® Kepler Chipsatz.Der NVIDIA® iray®-Renderer bietet nun Unterstützung für den Chipsatz NVIDIA® Kepler.3DSMAX2013MT0.000362012/11/13 00:13:54◊÷


In [3]:
autodesk_rows[:3]

[u'The NVIDIA\xae iray\xae renderer now supports the NVIDIA\xae Kepler chipset.\uf8ffDie NVIDIA \xae iray \xae Renderer unterst\xfctzt jetzt die NVIDIA \xae Kepler Chipsatz.\uf8ffDer NVIDIA\xae iray\xae-Renderer bietet nun Unterst\xfctzung f\xfcr den Chipsatz NVIDIA\xae Kepler.\uf8ff3DSMAX\uf8ff2013\uf8ffMT\uf8ff0.000\uf8ff36\uf8ff\uf8ff2012/11/13 00:13:54\uf8ff\u25ca\xf7',
 u'{1}Home tab{2}Modify panel {3}> Draworder flyout{4}.\uf8ff{1}Registerkarte Start{2}Gruppe \xc4ndern {3}> Flyout Zeichnungsreihenfolge{4}.\uf8ff{1}Registerkarte Start{2}Gruppe \xc4ndern {3}> Flyout Zeichnungsreihenfolge{4}.\uf8ffACD\uf8ff2014\uf8ffMT\uf8ff0.074\uf8ff70\uf8ff<phs><ph id="1">&lt;menucascade id=&quot;GUID-9206B026-680D-4738-8931-1F0F6D54CB36&quot;&gt; &lt;uicontrol id=&quot;GUID-3B7B95A6-B68E-40CD-9670-D0298139ABEF&quot;&gt;</ph><ph id="2">&lt;/uicontrol&gt; &lt;uicontrol id=&quot;GUID-DDBA3BDC-A9DA-49FE-9C08-4FBDD5CC200E&quot;&gt;</ph><ph id="3">&lt;/uicontrol&gt; &lt;/menucascade&gt; &lt;glyph type