In [1]:
from __future__ import division, print_function
import numpy as np
import codecs
import re

%load_ext autoreload
%autoreload 2

In [4]:
from lm_autocomplete.phrase_table.parsers.moses_triple_pipe_parser import MosesTriplePipeParser
from lm_autocomplete.phrase_table.in_memory_phrase_table import InMemoryPhraseTable

# Parse the phrase table file with MosesTriplePipeParser.
# NOTE(review): this is an absolute, machine-specific path — adjust before running elsewhere.
phrase_table_file = '/home/chris/projects/maxent_decoder/phrase_table/filtered_phrase_table'
parser = MosesTriplePipeParser()
phrase_objects = parser.parse(phrase_table_file)

# `cutoff` is handed straight to InMemoryPhraseTable; presumably it limits how many
# target candidates are kept per source phrase — TODO confirm against the class.
pt_cutoff = 4
de_en_phrase_table = InMemoryPhraseTable(phrase_objects, cutoff=pt_cutoff)

In [9]:
# WORKING - test usage of the lm autocomplete lib
from lm_autocomplete.language_model_autocompleter import LanguageModelAutocompleter

# `language_models` is a list with one dict per language model:
# [{'lang_code': <lang_code>, 'srilm_lm_file': <srilm_lm_file>,
#   'phrase_tables': {(source_lang, target_lang): phrase_table}}]
# NOTE(review): the SRILM file path below is absolute and machine-specific.

language_models = [
    {
        'lang_code': 'en',
        'srilm_lm_file': '/home/chris/projects/maxent_decoder/lm/europarl.srilm.gz',
        'phrase_tables': {
            ('de', 'en'): de_en_phrase_table
        }
    }
]

lm_autocompleter = LanguageModelAutocompleter(language_models=language_models)

In [10]:
# Both inputs are token lists: the target prefix typed so far and the source segment
# are assumed to have been segmented already.
target_prefix = [u'that', u'is']
source_segment = [u'Dies', u'ist', u'falsch', u'.']

In [11]:
# Inspect the keys of the autocompleter's `language_model_servers` mapping
# (presumably one entry per configured 'lang_code' — confirm in the library).
print(lm_autocompleter.language_model_servers.keys())

['en']


In [12]:
# Signature noted for reference (the call below additionally passes `metric`):
#   get_ranked_completions(self, source_lang, target_lang, source_tokens=[], target_prefix=[])
ranked_completions = lm_autocompleter.get_ranked_completions(
    'de', 'en',
    source_tokens=source_segment,
    target_prefix=target_prefix,
    metric='ppl1')

# Show only the entries whose first element is longer than 3
# (for string completions: more than three characters).
long_completions = [entry for entry in ranked_completions if len(entry[0]) > 3]
print(long_completions)

[(u'wrong .', -18.0351), (u'mistaken .', -45.86), (u'false .', -46.2835), (u'flawed .', -78.747), (u', and', -167.043), (u'wrong', -653.672), (u'That', -679.782), (u'this', -1098.27), (u'This', -1098.27), (u'This is', -1144.49), (u'is wrong', -1398.6), (u'mistaken', -1683.23), (u'false', -1970.36), (u'mistake', -4746.79)]


In [None]:
# OLDER CODE BELOW HERE

In [61]:
def extract_phrases(tokens, max_len=2):
    """Return every contiguous sub-sequence of `tokens` with length 1..max_len.

    Phrases are grouped by length (all length-1 phrases first, then length-2, ...)
    and ordered by start position within each group. Lengths greater than
    len(tokens) are never produced; a `max_len` below 1 yields an empty list.
    """
    longest = min(len(tokens), max_len)
    return [
        tokens[start:start + size]
        for size in range(1, longest + 1)
        for start in range(len(tokens) - size + 1)
    ]

In [62]:
# Parse the phrase table file into phrase objects; the in-memory phrase table itself
# is built from them in the following cell.
# NOTE(review): this older cell imports from `interactive_decoding` and rebinds
# MosesTriplePipeParser, phrase_table_file, parser and phrase_objects, which the
# `lm_autocomplete` cell near the top of the notebook already defined.
from interactive_decoding.phrase_table.parsers.moses_triple_pipe_parser import MosesTriplePipeParser
phrase_table_file = '/home/chris/projects/maxent_decoder/phrase_table/filtered_phrase_table'
parser = MosesTriplePipeParser()
phrase_objects = parser.parse(phrase_table_file)

In [63]:
from interactive_decoding.phrase_table.in_memory_phrase_table import InMemoryPhraseTable

# Build the table that later cells query via phrase_table.get_target_phrases(...).
# `cutoff` is forwarded as-is; its exact meaning is defined by InMemoryPhraseTable.
pt_cutoff = 4
phrase_table = InMemoryPhraseTable(phrase_objects, cutoff=pt_cutoff)

In [64]:
# Inputs are assumed to be tokenized already.
target_prefix = [u'that', u'is']
source_segment = [u'Dies', u'ist', u'falsch', u'.']

# Look up every source phrase in the phrase table and collect the 'target' field of
# each returned candidate, preserving lookup order (duplicates are kept).
source_phrases = extract_phrases(source_segment)
target_lm_candidates = []
for phrase in source_phrases:
    for cand in phrase_table.get_target_phrases(phrase):
        target_lm_candidates.append(cand['target'])
# reference translation, for comparison:
# target = [u'That', u'is', u'wrong', u'.']

In [65]:
# NOTE(review): a cell only displays its last expression, so the len() result on the
# next line is computed but never shown.
len(target_lm_candidates)
target_lm_candidates

[u'That',
 u'this',
 u'This',
 u'are',
 u'has',
 u',',
 u'is',
 u'mistake',
 u'mistaken',
 u'false',
 u'wrong',
 u', and',
 u';',
 u',',
 u'.',
 u'This is',
 u'is wrong',
 u'flawed .',
 u'mistaken .',
 u'false .',
 u'wrong .']

In [66]:
import subprocess

In [67]:
# Run the SRILM language model server as a background process.
# NOTE(review): the LM path is absolute and machine-specific.
lm_location = '/home/chris/projects/maxent_decoder/lm/europarl.srilm.gz'
# Equivalent shell command:
#   ngram -server-port 6070 -lm <lm-name>
start_server_command = "ngram -server-port 6070 -lm {}".format(lm_location)
# Build argv explicitly instead of splitting the formatted string, so an LM path that
# contains whitespace is still passed as a single argument.
start_server_args = ['ngram', '-server-port', '6070', '-lm', lm_location]

# The child process writes directly to this file's descriptor. Close the handle
# (ngram_server_log.close()) once the server has been stopped.
ngram_server_log = codecs.open('srilm_ngram_server_log', 'w', encoding='utf8')

# Start the server. stderr is merged into the log file rather than sent to a PIPE:
# nothing in this notebook reads popen_obj.stderr, and an undrained pipe blocks the
# child as soon as the OS pipe buffer fills up.
# The process keeps running until popen_obj.terminate() is called.
popen_obj = subprocess.Popen(
    start_server_args, stdout=ngram_server_log,
    stderr=subprocess.STDOUT)


In [64]:
# Write one LM query per line: the target prefix followed by one candidate completion.
# Alternative hand-picked test list:
# candidate_words = [u'wrong', u'happy', u'swimmer', u'crazy', u'aewgaegh']
candidate_words = target_lm_candidates
candidate_file_name = 'candidate_words.tmp'
with codecs.open(candidate_file_name, 'w', encoding='utf8') as candidate_file:
    for completion in candidate_words:
        # the LM is LOWERCASE, so lowercase the whole query line
        query = ' '.join(target_prefix + [completion]).lower()
        candidate_file.write(query + '\n')


In [65]:
# Score the candidate file against the running SRILM server and capture the output.
# Client invocation, for reference:
#   ngram -use-server <port> -ppl <file> -debug 2
# Other flags worth knowing:
#   -cache-served-ngrams   cache served n-grams on the client side
#   -order n               maximal N-gram order to use, 3 by default. The order is not
#                          set automatically when a model file is read, so models of
#                          order higher than 3 always need this option.

call_server_command = "ngram -use-server 6070 -ppl {} -debug 2 -tolower".format(candidate_file_name)
lm_client = subprocess.Popen(
    call_server_command.split(),
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE)
# communicate() waits for the client to exit and returns its (stdout, stderr)
lm_client_output, lm_client_error = lm_client.communicate()


In [66]:
# SRILM output parser

In [67]:
# Parse the `ngram -ppl ... -debug 2` client output into one logprob per candidate.
# SRILM prints one blank line at the end of the output; ignore it.
output_lines = lm_client_output.split('\n')[:-1]

# Each per-sentence report ends with a summary line such as
#   0 zeroprobs, logprob= -8.44608 ppl= 129.277 ppl1= 653.672
# followed by one blank line. The value after 'logprob=' is the sentence logprob and
# the last whitespace-delimited field is ppl1 (ppl without the sentence ending).
ordered_logprobs = []
for i, l in enumerate(output_lines):
    # Only a blank line with a predecessor can close a report; skipping i == 0 stops
    # a leading blank line from wrapping around to output_lines[-1].
    if l != '' or i == 0:
        continue
    completion_scores = output_lines[i-1].split()
    # Ignore blank lines that do not follow a summary line instead of parsing an
    # unrelated line (or crashing on a short one).
    if 'logprob=' not in completion_scores:
        continue
    logprob = float(completion_scores[completion_scores.index('logprob=') + 1])
    # NOTE: despite the name, this is ppl1 (the last field); it is currently unused.
    ppl = float(completion_scores[-1])
    ordered_logprobs.append(logprob)
        



In [68]:
# Display the parsed logprobs, in the order the reports appeared in the SRILM output.
ordered_logprobs

[-8.49711,
 -9.12213,
 -9.12213,
 -9.49876,
 -9.69618,
 -7.60759,
 -9.44612,
 -11.0292,
 -9.67843,
 -9.88364,
 -8.44608,
 -8.89131,
 -7.50606,
 -7.60759,
 -4.55949,
 -12.2344,
 -12.5828,
 -7.58494,
 -6.64574,
 -6.66171,
 -5.02448]

In [69]:
# Sanity check: there must be exactly one logprob per candidate, otherwise zipping the
# two lists together would silently truncate or misalign them.
assert len(ordered_logprobs) == len(candidate_words)

In [70]:
# Pair each candidate with its logprob and rank the pairs best (highest logprob) first.
scored_candidates = zip(candidate_words, ordered_logprobs)
sorted_completions = sorted(scored_candidates, key=lambda pair: pair[1], reverse=True)

In [71]:
# Display the (candidate, logprob) pairs, highest logprob first.
sorted_completions

[(u'.', -4.55949),
 (u'wrong .', -5.02448),
 (u'mistaken .', -6.64574),
 (u'false .', -6.66171),
 (u';', -7.50606),
 (u'flawed .', -7.58494),
 (u',', -7.60759),
 (u',', -7.60759),
 (u'wrong', -8.44608),
 (u'That', -8.49711),
 (u', and', -8.89131),
 (u'this', -9.12213),
 (u'This', -9.12213),
 (u'is', -9.44612),
 (u'are', -9.49876),
 (u'mistaken', -9.67843),
 (u'has', -9.69618),
 (u'false', -9.88364),
 (u'mistake', -11.0292),
 (u'This is', -12.2344),
 (u'is wrong', -12.5828)]

In [26]:
# Display the raw SRILM client output lines.
# NOTE(review): the captured output below appears to come from an earlier run that
# used the commented-out test list (u'wrong', u'happy', u'swimmer', ...), not
# target_lm_candidates.
output_lines

['that is wrong',
 '\tp( that | <s> ) \t=  0.0276599 [ -1.55815 ]',
 '\tp( is | that ...) \t=  0.605759 [ -0.2177 ]',
 '\tp( wrong | is ...) \t=  0.00204489 [ -2.68933 ]',
 '\tp( </s> | wrong ...) \t=  0.000104496 [ -3.9809 ]',
 '1 sentences, 3 words, 0 OOVs',
 '0 zeroprobs, logprob= -8.44608 ppl= 129.277 ppl1= 653.672',
 '',
 'that is happy',
 '\tp( that | <s> ) \t=  0.0276599 [ -1.55815 ]',
 '\tp( is | that ...) \t=  0.605759 [ -0.2177 ]',
 '\tp( happy | is ...) \t=  2.171e-05 [ -4.66334 ]',
 '\tp( </s> | happy ...) \t=  0.000123957 [ -3.90673 ]',
 '1 sentences, 3 words, 0 OOVs',
 '0 zeroprobs, logprob= -10.3459 ppl= 385.905 ppl1= 2809.57',
 '',
 'that is swimmer',
 '\tp( that | <s> ) \t=  0.0276599 [ -1.55815 ]',
 '\tp( is | that ...) \t=  0.605759 [ -0.2177 ]',
 '\tp( swimmer | is ...) \t=  0 [ -inf ]',
 '\tp( </s> | swimmer ...) \t=  0.00164014 [ -2.78512 ]',
 '1 sentences, 3 words, 0 OOVs',
 '1 zeroprobs, logprob= -4.56097 ppl= 33.1378 ppl1= 190.759',
 '',
 'file candidate_words.