In [1]:
# TODO: check the Phrasal paper to see how they split the corpus.
# Note: they only use the en-fr portion of the Autodesk data, not en-de.

In [1]:
import codecs
from collections import Counter, OrderedDict, defaultdict

In [2]:
# Training corpus selection: exactly one language pair below should be active.
# The two files are read line by line and later zipped together, so line i of
# the source file must be the translation of line i of the target file.
# NOTE(review): this cell selects en-pt, while the dev file and OUTPUT_DIR used
# further down point at en-fr -- confirm which language pair is intended.

# EN-DE
# train_source_file = '/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/autodesk.train.en.bpe'
# train_target_file = '/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/autodesk.train.de.bpe'

# EN-FR
# train_source_file = '/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/en-fr/autodesk.train.en.bpe'
# train_target_file = '/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/en-fr/autodesk.train.fr.bpe'

# EN-PT
train_source_file = '/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/en-pt/autodesk.train.en.bpe'
train_target_file = '/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/en-pt/autodesk.train.pt.bpe'

In [3]:
def get_segments_from_file(filename, max_segs=50000):
    """Lazily yield the whitespace-tokenized lines of a UTF-8 text file.

    Args:
        filename: path of the file to read.
        max_segs: maximum number of lines to yield; may be an int or a float,
            since it is only used in a numeric comparison.

    Yields:
        One list of unicode tokens per line (an empty list for a blank line).
    """
    with codecs.open(filename, encoding='utf8') as inp:
        for i, line in enumerate(inp):
            if i >= max_segs:
                # A plain `return` ends the generator. Raising StopIteration
                # by hand is turned into a RuntimeError under PEP 479.
                return
            yield line.strip().split()


In [4]:
def all_ngrams_in_segment(segment, min_ngram, max_ngram):
    """Return the distinct n-grams of a token list as space-joined strings.

    Every order from min_ngram to max_ngram (both inclusive) is extracted.
    Each n-gram appears at most once in the result, so a caller that counts
    the returned items counts segments containing the n-gram, not raw
    occurrences. The order of the returned list is unspecified.
    """
    distinct = set()
    for width in range(min_ngram, max_ngram + 1):
        # Start positions 0 .. len(segment) - width each give one full window.
        for start in range(len(segment) - width + 1):
            distinct.add(u' '.join(segment[start:start + width]))
    return list(distinct)

In [5]:
def count_ngrams_in_corpus(segments, min_ngram, max_ngram):
    """Count, for every n-gram, the number of segments that contain it.

    Args:
        segments: iterable of token lists (consumed once, so a generator works).
        min_ngram: smallest n-gram order to extract.
        max_ngram: largest n-gram order to extract.

    Returns:
        A pair (counts, per_line): counts is a Counter keyed by n-gram string,
        and per_line holds, in input order, the list of distinct n-grams found
        in each segment.
    """
    per_line = [all_ngrams_in_segment(tokens, min_ngram, max_ngram)
                for tokens in segments]
    counts = Counter()
    for line_ngrams in per_line:
        counts.update(line_ngrams)
    return counts, per_line

In [7]:
# the number of lines in the dataset
# This value is used both as the cap on how many lines are read from each
# training file and as the denominator when n-gram counts are turned into priors.
# NOTE(review): it is a hard-coded cap, not a measured line count; if a file has
# fewer lines than this, priors derived from it will be underestimated.
num_segments = 100000.

# num_segments = 107478.
print('Size of corpus: {}'.format(num_segments))

Size of corpus: 100000.0


In [8]:
# Range of n-gram orders to extract (both ends inclusive).
MIN_NGRAM = 1
MAX_NGRAM = 4

# Lazy generators over the tokenized training lines, capped at num_segments lines each.
train_source = get_segments_from_file(train_source_file, max_segs=num_segments)
train_target = get_segments_from_file(train_target_file, max_segs=num_segments)

# For each side: a Counter of how many segments contain each n-gram, plus the
# per-line lists of distinct n-grams (needed later to pair source and target lines).
source_ngrams, source_line_ngrams = count_ngrams_in_corpus(train_source, min_ngram=MIN_NGRAM, max_ngram=MAX_NGRAM)
target_ngrams, target_line_ngrams = count_ngrams_in_corpus(train_target, min_ngram=MIN_NGRAM, max_ngram=MAX_NGRAM)

In [9]:
# Number of distinct source n-grams currently held in the counter.
print('Ngrams in source: {}'.format(len(source_ngrams)))

Ngrams in source: 1515849


In [10]:
# Frequency band (inclusive on both ends) of n-grams worth keeping.
max_occs = 500
min_occs = 5
def prune_high_low_freq(counter):
    """Remove, in place, every entry whose count lies outside [min_occs, max_occs].

    The thresholds are read from the module-level `min_occs` / `max_occs` at
    call time. Nothing is returned; `counter` itself is modified.
    """
    # Collect the keys first: deleting from a dict while iterating over its
    # items view raises RuntimeError on Python 3.
    out_of_band = [ngram for ngram, count in counter.items()
                   if count > max_occs or count < min_occs]
    for ngram in out_of_band:
        del counter[ngram]

In [11]:
# Prune both counters in place. This cell is not idempotent with respect to the
# counts printed before it: after it runs, the unpruned counts are gone.
prune_high_low_freq(source_ngrams)
prune_high_low_freq(target_ngrams)

In [12]:
# Same count as printed earlier, now reflecting whatever pruning has been applied.
print('Ngrams in source: {}'.format(len(source_ngrams)))

Ngrams in source: 99308


In [13]:
# Prior of an n-gram = (number of segments containing it) / num_segments.
# Entries are inserted in most_common() order, i.e. by descending count.
src_priors = OrderedDict((k, v / float(num_segments)) for k,v in source_ngrams.most_common())
trg_priors = OrderedDict((k, v / float(num_segments)) for k,v in target_ngrams.most_common())

In [14]:
# Number of target n-grams that have a prior (one per entry of target_ngrams).
len(trg_priors)

123365

In [15]:
# Number of source n-grams that have a prior (one per entry of source_ngrams).
len(src_priors)

99308

In [16]:
from collections import defaultdict, Counter

# Co-occurrence table: chunk_map[src][trg] is the number of aligned line pairs
# in which both the source n-gram and the target n-gram occur. Only n-grams
# still present in source_ngrams / target_ngrams are recorded.
chunk_map = defaultdict(Counter)

for src_chunks, trg_chunks in zip(source_line_ngrams, target_line_ngrams):
    # The surviving target n-grams are the same for every source n-gram on this
    # line, so filter them once per line rather than once per source n-gram.
    kept_trg_chunks = list(set([chunk for chunk in trg_chunks if chunk in target_ngrams]))
    for src_chunk in set(src_chunks):
        if src_chunk in source_ngrams:
            # Update even when kept_trg_chunks is empty, so that every surviving
            # source n-gram gets an entry in chunk_map.
            chunk_map[src_chunk].update(kept_trg_chunks)

# The per-line n-gram lists are large and no longer needed.
del source_line_ngrams
del target_line_ngrams

In [17]:
# src_posteriors[src][trg] = (line pairs containing both src and trg) /
# (segments containing src), with targets ordered by descending co-occurrence.
src_posteriors = {}
num_processed = 0
# Iterate over a snapshot of the keys and pop each entry as it is consumed:
# this frees memory as we go without mutating the dict under a live iterator
# (which raises RuntimeError on Python 3).
for src_chunk in list(chunk_map):
    trg_chunk_counter = chunk_map.pop(src_chunk)
    total_occs = float(source_ngrams[src_chunk])
    posteriors = OrderedDict([(k, v / total_occs) for k, v in trg_chunk_counter.most_common()])
    src_posteriors[src_chunk] = posteriors
    num_processed += 1
    if num_processed % 10000 == 0:
        print('processed {}'.format(num_processed))
del chunk_map

processed 10000
processed 20000
processed 30000
processed 40000
processed 50000
processed 60000
processed 70000
processed 80000
processed 90000


In [18]:
import numpy

def normalized_pmi(p_x, p_y, p_y_given_x):
    """Normalized pointwise mutual information of x and y.

    Computes log(p(y|x) / p(y)) divided by -log(p(x, y)), where the joint
    probability is obtained as p(x) * p(y|x).

    Args:
        p_x: prior probability of x.
        p_y: prior probability of y.
        p_y_given_x: conditional probability of y given x.

    Returns:
        The score as a numpy float: 0 when p(y|x) equals p(y), positive when
        x makes y more likely, negative when it makes y less likely.
    """
    pmi = numpy.log(p_y_given_x / p_y)
    joint_self_information = -numpy.log(p_x * p_y_given_x)
    return pmi / joint_self_information

In [19]:
# pmi_cands[source_phrase] = list of (target_phrase, normalized PMI) pairs,
# best score first.
pmi_cands = {}

num_processed = 0
for source_phrase, posteriors in src_posteriors.items():
    source_prior = src_priors[source_phrase]
    pmi_scores = [
        (target_phrase, normalized_pmi(source_prior, trg_priors[target_phrase], posterior))
        for target_phrase, posterior in posteriors.items()
    ]
    # Stable descending sort on the score.
    pmi_scores.sort(key=lambda scored: scored[1], reverse=True)
    pmi_cands[source_phrase] = pmi_scores

    num_processed += 1
    if num_processed % 10000 == 0:
        print('processed {}'.format(num_processed))

# src_posteriors and trg_priors are deliberately left alive here; freeing them
# is currently disabled:
# del src_posteriors
# del trg_priors

processed 10000
processed 20000
processed 30000
processed 40000
processed 50000
processed 60000
processed 70000
processed 80000
processed 90000


In [20]:
# Number of source n-grams that have a scored candidate list (possibly an empty one).
len(pmi_cands)

99308

In [21]:
# Selection thresholds.
min_occs = 5          # n-grams must occur in strictly more than this many segments
min_score = 0.95      # minimum normalized PMI for a candidate to count
min_source_len = 5    # minimum length of the source phrase, in characters (not tokens)
# Only this many entries of pmi_cands are examined.
# NOTE(review): pmi_cands is a plain dict, so which entries come "first" is
# whatever the dict's iteration order happens to be -- confirm the cap is intended.
max_source_phrases = 50000

# good_cands collects (source_phrase, (target_phrase, score)) pairs.
good_cands = []
# list(...) makes the slice work on Python 3 as well, where items() is a view.
for src, cands in list(pmi_cands.items())[:max_source_phrases]:
    if len(src) >= min_source_len and len(cands) > 0 and cands[0][1] >= min_score:
        if source_ngrams[src] > min_occs:
            # Require at least one candidate that is both frequent enough and
            # scored high enough.
            has_confident_target = any(
                target_ngrams[trg] > min_occs and score >= min_score
                for trg, score in cands)
            if has_confident_target:
                # NOTE(review): the pair stored is the top-scoring candidate
                # cands[0], which is not necessarily one that passed the
                # frequency check above -- confirm this is intended.
                good_cands.append((src, cands[0]))

# TODO:
# where the source phrase is a subset of another phrase, choose the longest one
# choose the longest target candidate whose PMI is high enough
print(len(good_cands))

14289


In [22]:
# WORKING: just print this to file and try it
src_rules = [src for src, (trg, score) in good_cands]
term_pair_map = OrderedDict((k,v) for k,(v,s) in good_cands)

In [23]:
# create terminology spotter
# NOTE(review): MatchSpotter comes from the project-local semantic_annotator
# package; presumably it finds occurrences of the rule strings in a text --
# confirm its matching semantics against that package.
from semantic_annotator.spotting import MatchSpotter

term_spotter = MatchSpotter(rules=src_rules)

In [24]:
# import the tokenized and bpe encoded dev lines
# prepped_dev_lines = codecs.open('/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/autodesk.dev.1000.en.bpe',
#                                  encoding='utf8').read().strip().split('\n')

# Read inside a `with` block so the file handle is closed as soon as the
# contents are loaded instead of being left open for the life of the kernel.
with codecs.open('/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/en-fr/autodesk.dev.1000.en.bpe',
                 encoding='utf8') as dev_inp:
    prepped_dev_lines = dev_inp.read().strip().split('\n')

In [25]:
# Spots found by the term spotter, one entry per dev line.
dev_term_spots = [term_spotter.get_spots(line) for line in prepped_dev_lines]

# For each dev line, the tokenized target phrases of the spotted source terms.
dev_term_constraints = []
for text, spots in zip(prepped_dev_lines, dev_term_spots):
    output_constraints = [
        # spot[0]:spot[1] is used as a slice into the line's text; the sliced
        # string is looked up as a source phrase and its target is tokenized.
        term_pair_map[text[spot[0]:spot[1]]].split()
        for spot in spots
        if spot[1] - spot[0] > 1
    ]
    dev_term_constraints.append(output_constraints)

In [26]:
# One constraint list is appended per dev line, so this equals the number of dev lines.
len(dev_term_constraints)

1000

In [27]:
# Inspect the constraints of the first 100 dev lines (each constraint is a list of tokens).
dev_term_constraints[:100]

[[[u'help@@', u'.@@', u'auto@@'],
  [u'/', u'view', u'/', u'ICD'],
  [u'/', u'2014', u'/'],
  [u'?', u'cont@@', u'ext@@', u'I@@'],
  [u'{', u'3', u'}', u'et'],
  [u'4', u'}', u'http'],
  [u'help@@', u'.@@', u'auto@@'],
  [u'/', u'view', u'/', u'ICD'],
  [u'/', u'2014', u'/'],
  [u'?', u'cont@@', u'ext@@', u'I@@'],
  [u'5', u'}', u'Cr\xe9er']],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [[u'sk@@', u'etch@@']],
 [],
 [],
 [[u'AutoCAD', u'{', u'1', u'}'], [u'Civil', u'3D', u'{', u'2']],
 [],
 [],
 [],
 [[u'file', u':', u'/'],
  [u'clou@@', u'd@@', u'help', u'/'],
  [u'/', u'2015', u'/'],
  [u'CAD@@', u'-@@'],
  [u'files', u'/', u'GU@@'],
  [u'8@@', u'19@@', u'2-@@'],
  [u'8@@', u'19@@', u'2-@@'],
  [u'8@@', u'19@@', u'2-@@'],
  [u'8@@', u'19@@', u'2-@@'],
  [u'8@@', u'19@@', u'2-@@'],
  [u'3', u'}', u'\xa9', u'Copyright'],
  [u'3', u'}', u'\xa9', u'Copyright'],
  [u'Inc', u'.', u'All', u'rights']],
 [],
 [[u'vous', u'enregistrez', u'un', u'fichier'],
  [u'le', u'dans', u'un', u'dossier']],


In [28]:
import os
import json

# Directory the constraints file is written to (alternative kept for reference).
# OUTPUT_DIR='/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus'
OUTPUT_DIR='/home/chris/projects/constrained_decoding/proto/autodesk_constrained_decoding_corpus/en-fr'

# Serialize the per-line constraints as indented JSON.
constraints_path = os.path.join(OUTPUT_DIR, 'pmi.baseline.dev.constraints.json')
with codecs.open(constraints_path, 'w', encoding='utf8') as out:
    serialized_constraints = json.dumps(dev_term_constraints, indent=2)
    out.write(serialized_constraints)

In [42]:
# Peek at the first 100 selected (source phrase, (target phrase, score)) pairs.
good_cands[:100]

[(u'1 } Nav@@', (u'1 } Nav@@', 0.98445032753820794)),
 (u'localized', (u'lok@@', 0.96140193746584057)),
 (u'DL@@ M', (u'DL@@ M', 0.98167267895539889)),
 (u'ei@@', (u'Decke', 0.9473565816682249)),
 (u'installed to revert Auto@@', (u'Updates den', 1.0)),
 (u'abler {', (u'AutoCAD Civil 3D Object', 0.98308124041157308)),
 (u', rendering', (u', Ren@@ dering', 0.93604792711326779)),
 (u'} As', (u'Vom {', 0.93623755231209871)),
 (u'bit ) {', (u'bit ) {', 1.0)),
 (u'Factory Design Suite 2015',
  (u'Factory Design Suite 2015', 0.97405927122050773)),
 (u'list box', (u'Listen@@ feld auf', 0.96719814603844934)),
 (u"a user 's", (u'eines Benutzers', 0.91638656221226733)),
 (u'Rev@@ it LT {', (u'Rev@@ it LT {', 0.9698565171985376)),
 (u'{ 24', (u'{ 24', 1.0)),
 (u'{ 23', (u'{ 24', 1.0)),
 (u'{ 22', (u'{ 22', 1.0)),
 (u'1 } ..', (u'1 } ..', 1.0)),
 (u'marking menu', (u'Mark@@ ierungs@@ men\xfc', 0.93841832577685758)),
 (u'in Ab@@ aqu@@', (u'in Ab@@', 0.95043650671515612)),
 (u'Tru@@ e@@', (u'Tru@@ e@

In [24]:
# The 10 source n-grams with the highest counts among those still in the counter.
source_ngrams.most_common(10)

[(u'start', 500),
 (u'applied', 500),
 (u'2 } :', 499),
 (u'{ 2 } :', 499),
 (u'What', 499),
 (u'was', 498),
 (u'Update', 497),
 (u'Subscri@@ ption', 496),
 (u'R-@@', 496),
 (u'ption', 496)]

In [22]:
# Peek at 10 entries of pmi_cands. list(...) is needed because dict.items()
# is a non-sliceable view on Python 3; on Python 2 the result is unchanged.
list(pmi_cands.items())[:10]

[(u'privilege on',
  [(u'PC \xfcber', 0.82050633570393328),
   (u'Beta 3-@@', 0.78550731479385638),
   (u'3-@@ Version', 0.7414137631415163),
   (u'der Beta', 0.72567609336183414),
   (u'lokalen PC', 0.72567609336183414),
   (u'rechte verf\xfcgen', 0.7008438928398889),
   (u'2014 m\xfcssen', 0.67335553793232839),
   (u'Zur Installation', 0.62175132027747182),
   (u'Ihrem lokalen', 0.60248996914707698),
   (u'\xfcber Administr@@', 0.57344628916161022),
   (u'ator@@ rechte', 0.53287641885998283),
   (u'Installation der', 0.52409071923946871),
   (u'verf\xfcgen .', 0.50427426518988461),
   (u'auf Ihrem', 0.43601506618455216),
   (u'or 2014', 0.43601506618455216),
   (u'Administr@@ ator@@', 0.43501715672980074),
   (u'Version von', 0.39677270819610594),
   (u'm\xfcssen Sie', 0.34617769699067885),
   (u'desk Invent@@', 0.3318397320927583),
   (u'von Auto@@', 0.28160489285413842),
   (u'Invent@@ or', 0.23913715889811826),
   (u'Sie auf', 0.1594822507543559),
   (u'Auto@@ desk', 0.10663153914