In [1]:
# Check Phrasal paper to see how they split the corpus
# they only use en-fr from autodesk, not en-de

In [2]:
import bz2

autodesk_file = '/media/1tb_drive/parallel_data/corpora/deu.mt.bz2'

# according to the README there can be duplicates

In [3]:
with bz2.BZ2File(autodesk_file) as inp:
    autodesk_rows = inp.read().strip().decode('utf8').split('\n')
    
autodesk_rows_cols = [l.split(u'\uf8ff') for l in autodesk_rows]

In [4]:
len(autodesk_rows_cols[0])

11

In [5]:
autodesk_rows_cols[0]

[u'The NVIDIA\xae iray\xae renderer now supports the NVIDIA\xae Kepler chipset.',
 u'Die NVIDIA \xae iray \xae Renderer unterst\xfctzt jetzt die NVIDIA \xae Kepler Chipsatz.',
 u'Der NVIDIA\xae iray\xae-Renderer bietet nun Unterst\xfctzung f\xfcr den Chipsatz NVIDIA\xae Kepler.',
 u'3DSMAX',
 u'2013',
 u'MT',
 u'0.000',
 u'36',
 u'',
 u'2012/11/13 00:13:54',
 u'\u25ca\xf7']

In [6]:
# columns according to README
# 1) EN-US source segment
# 2) target-language raw MT output (for .mt.bz2 files) or raw TM output (for .tm.bz2 files)
# 3) final post-edited target-language segment
# 4) Autodesk product code
# 5) product release identifier
# 6) translation type (FUZZY or MT)
# 7) raw MT score from Moses (with unknown-word penalties discarded)
# 8) TM fuzzy match score
# 9) TMX-style XML-encapsulated placeholder content (<phs>…</phs>)
# 10) entry creation date


In [7]:
raw_sources, raw_hyps, raw_refs, product_codes, release_identifiers, trans_types, mt_scores, fuzzy_matches, placeholders, creation_dates, _ = zip(*autodesk_rows_cols)

In [8]:
len(autodesk_rows_cols)

124500

In [9]:
len(raw_sources)

124500

In [10]:
from collections import OrderedDict

In [11]:
no_dups = OrderedDict(zip(raw_sources, raw_refs))

In [12]:
len(no_dups)

108478

In [13]:
import random

In [14]:
src_refs = no_dups.items()
random.seed(37)
random.shuffle(src_refs)

In [15]:
train_src_refs = src_refs[:-1000]
dev_src_refs = src_refs[-1000:]

In [16]:
train_sources, train_refs = zip(*train_src_refs)
dev_sources, dev_refs = zip(*dev_src_refs)

In [17]:
len(train_sources)

107478

In [18]:
# load big corpus, get noun chunks across the whole language, these are the general corpus frequency stats
# we want to compare these freqs to the target domain to see what's actually a term

In [19]:
import spacy
en_nlp = spacy.load('en')
de_nlp = spacy.load('de')

In [20]:
en_doc = en_nlp(u'Hello, world. Here are two sentences.')

In [21]:
tc = [c for c in en_doc.noun_chunks]

In [22]:
c1 = tc[0]

In [23]:
c1.orth_

u'two sentences'

In [24]:

# de_doc = de_nlp(u'ich bin ein Berliner.')

source_chunks = []
ref_chunks = []
source_iter = (l for l in train_sources)
ref_iter = (l for l in train_refs)

for i, doc in enumerate(en_nlp.pipe(source_iter, batch_size=50, n_threads=10)):
    source_chunks.append([c.orth_ for c in doc.noun_chunks])
    if i % 500 == 0:
        print('processed {} segments'.format(i))
    
for i, doc in enumerate(en_nlp.pipe(ref_iter, batch_size=50, n_threads=10)):
    ref_chunks.append([c.orth_ for c in doc.noun_chunks])
    if i % 500 == 0:
        print('processed {} segments'.format(i))



processed 0 segments
processed 500 segments
processed 1000 segments
processed 1500 segments
processed 2000 segments
processed 2500 segments
processed 3000 segments
processed 3500 segments
processed 4000 segments
processed 4500 segments
processed 5000 segments
processed 5500 segments
processed 6000 segments
processed 6500 segments
processed 7000 segments
processed 7500 segments
processed 8000 segments
processed 8500 segments
processed 9000 segments
processed 9500 segments
processed 10000 segments
processed 10500 segments
processed 11000 segments
processed 11500 segments
processed 12000 segments
processed 12500 segments
processed 13000 segments
processed 13500 segments
processed 14000 segments
processed 14500 segments
processed 15000 segments
processed 15500 segments
processed 16000 segments
processed 16500 segments
processed 17000 segments
processed 17500 segments
processed 18000 segments
processed 18500 segments
processed 19000 segments
processed 19500 segments
processed 20000 segments

In [25]:
from collections import defaultdict, Counter

chunk_map = defaultdict(Counter)
source_chunk_occs = Counter()
target_chunk_occs = Counter()

for src_chunks, trg_chunks in zip(source_chunks, ref_chunks):
    for src_chunk in src_chunks:
        chunk_map[src_chunk].update(trg_chunks)
        
    source_chunk_occs.update(src_chunks)
    target_chunk_occs.update(trg_chunks)


In [27]:
assert len(source_chunks) == len(ref_chunks)

In [28]:
num_segments = float(len(source_chunks))

src_priors = OrderedDict([(k, c / num_segments) for k,c in source_chunk_occs.most_common()])
trg_priors = OrderedDict([(k, c / num_segments) for k,c in target_chunk_occs.most_common()])

In [32]:
src_posteriors = {}
for src_chunk, trg_chunk_counter in chunk_map.items():
    total_maps = float(sum(trg_chunk_counter.itervalues()))
    posteriors = OrderedDict([(k, v / total_maps) for k, v in trg_chunk_counter.most_common()])
    src_posteriors[src_chunk] = posteriors

In [None]:
# terminology should be ranked by how frequent it is in the target domain vs how frequent it is generally

In [37]:
for src_chunk, prior in src_priors.items()[300:310]:
    print(u'CHUNK: {}, top 5: {}'.format(src_chunk, src_posteriors[src_chunk].items()[:50]))

CHUNK: the fluid, top 5: [(u'i', 0.07109004739336493), (u'das Fluid', 0.014218009478672985), (u'nichtnewtonschen Fluid wird', 0.014218009478672985), (u'Fluid', 0.009478672985781991), (u'das', 0.009478672985781991), (u'bei der das Fluid zu', 0.009478672985781991), (u'Gleichgewicht aufgrund der Schwerkraft', 0.009478672985781991), (u'ihrem Verlauf m\xfcssen fein genug sein', 0.009478672985781991), (u'System', 0.009478672985781991), (u'Fluid aus\xfcbt', 0.009478672985781991), (u'Bewegung', 0.009478672985781991), (u'ist', 0.009478672985781991), (u'W\xe4rmeleitf\xe4higkeit des Fluids', 0.009478672985781991), (u'und', 0.009478672985781991), (u'Interaktion zwischen Festk\xf6rper', 0.009478672985781991), (u'Watt', 0.009478672985781991), (u'der', 0.009478672985781991), (u'Interaktion zwischen Festk\xf6rper und dem Fluid angemessen darzustellen', 0.009478672985781991), (u'dass', 0.009478672985781991), (u'Dichte des Fluids', 0.009478672985781991), (u'Fluid vom Auslass bis zum Einlass', 0.00947867

In [30]:
src_priors.items()[50:100]

[(u'who', 0.0019538882375928096),
 (u'order', 0.0019445840078899867),
 (u'the ribbon', 0.0019445840078899867),
 (u'use', 0.0019352797781871638),
 (u'the part', 0.001925975548484341),
 (u'a drawing', 0.0018794543999702264),
 (u'addition', 0.0018701501702674036),
 (u'BIM', 0.0018422374811589348),
 (u'Select', 0.001823629021753289),
 (u'the name', 0.0018050205623476433),
 (u'the design', 0.0017957163326448203),
 (u'a file', 0.0017957163326448203),
 (u'insulation', 0.0017212824950222372),
 (u'the top', 0.0017119782653194143),
 (u'(100 mm', 0.0017026740356165912),
 (u'the flow', 0.0016747613465081226),
 (u'models', 0.0016747613465081226),
 (u'the display', 0.0016747613465081226),
 (u'Autodesk Inventor', 0.0016654571168052998),
 (u'design', 0.001637544427696831),
 (u'features', 0.001628240197994008),
 (u'InfraWorks', 0.0015072852118573103),
 (u'the command', 0.0014979809821544874),
 (u'materials', 0.0014979809821544874),
 (u'an object', 0.0014793725227488417),
 (u'the installation', 0.001460

In [None]:
dedupe

no_dup_sources = []
no_dup_refs = []

In [None]:
shuffle jointly, then split into train test


In [4]:
print(autodesk_rows[0])

The NVIDIA® iray® renderer now supports the NVIDIA® Kepler chipset.Die NVIDIA ® iray ® Renderer unterstützt jetzt die NVIDIA ® Kepler Chipsatz.Der NVIDIA® iray®-Renderer bietet nun Unterstützung für den Chipsatz NVIDIA® Kepler.3DSMAX2013MT0.000362012/11/13 00:13:54◊÷


In [3]:
autodesk_rows[:3]

[u'The NVIDIA\xae iray\xae renderer now supports the NVIDIA\xae Kepler chipset.\uf8ffDie NVIDIA \xae iray \xae Renderer unterst\xfctzt jetzt die NVIDIA \xae Kepler Chipsatz.\uf8ffDer NVIDIA\xae iray\xae-Renderer bietet nun Unterst\xfctzung f\xfcr den Chipsatz NVIDIA\xae Kepler.\uf8ff3DSMAX\uf8ff2013\uf8ffMT\uf8ff0.000\uf8ff36\uf8ff\uf8ff2012/11/13 00:13:54\uf8ff\u25ca\xf7',
 u'{1}Home tab{2}Modify panel {3}> Draworder flyout{4}.\uf8ff{1}Registerkarte Start{2}Gruppe \xc4ndern {3}> Flyout Zeichnungsreihenfolge{4}.\uf8ff{1}Registerkarte Start{2}Gruppe \xc4ndern {3}> Flyout Zeichnungsreihenfolge{4}.\uf8ffACD\uf8ff2014\uf8ffMT\uf8ff0.074\uf8ff70\uf8ff<phs><ph id="1">&lt;menucascade id=&quot;GUID-9206B026-680D-4738-8931-1F0F6D54CB36&quot;&gt; &lt;uicontrol id=&quot;GUID-3B7B95A6-B68E-40CD-9670-D0298139ABEF&quot;&gt;</ph><ph id="2">&lt;/uicontrol&gt; &lt;uicontrol id=&quot;GUID-DDBA3BDC-A9DA-49FE-9C08-4FBDD5CC200E&quot;&gt;</ph><ph id="3">&lt;/uicontrol&gt; &lt;/menucascade&gt; &lt;glyph type