**Eric Meinhardt / emeinhardt@ucsd.edu**

In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
# import csv
import json

In [3]:
from copy import deepcopy

In [4]:
import random

In [5]:
def readStringList(fp):
    """Read the text file at ``fp`` and return its lines as a list of strings.

    Trailing whitespace (including the line terminator) is stripped from
    every line; leading whitespace is left untouched.
    """
    with open(fp, 'r') as handle:
        return [raw_line.rstrip() for raw_line in handle]

def writeStringList(fp, strings):
    """Write every string in ``strings`` to the file at ``fp``, one per line.

    The file is opened in write mode, so any existing content is replaced.
    A newline is appended to each string before it is written.
    """
    with open(fp, 'w') as handle:
        terminated = [text + "\n" for text in strings]
        handle.writelines(terminated)

In [6]:
import subprocess

In [7]:
from itertools import product

In [8]:
import kenlm

In [9]:
# from functools import reduce
# from itertools import chain
# import re
# from more_itertools import replace
# from funcy import compose

In [10]:
from joblib import Parallel, delayed

J = -1
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def par(gen_expr):
    """Execute the delayed calls produced by ``gen_expr`` through joblib.

    Worker count, backend, verbosity and scheduling preference are taken from
    the module-level settings ``J``, ``BACKEND``, ``V`` and ``PREFER``.
    Returns whatever the ``Parallel`` executor returns for ``gen_expr``.
    """
    executor = Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)
    return executor(gen_expr)

In [11]:
import pandas as pd
from plotnine import *

In [12]:
repo_dir = '/mnt/cube/home/AD/emeinhar/fisher-lm'

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview-&amp;-Requirements" data-toc-modified-id="Overview-&amp;-Requirements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview &amp; Requirements</a></span></li><li><span><a href="#Loading-data" data-toc-modified-id="Loading-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Loading data</a></span></li><li><span><a href="#Splitting-the-corpus-into-training-and-test-sets" data-toc-modified-id="Splitting-the-corpus-into-training-and-test-sets-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Splitting the corpus into training and test sets</a></span></li><li><span><a href="#Build-preliminary-files-for-each-language-model-of-interest" data-toc-modified-id="Build-preliminary-files-for-each-language-model-of-interest-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Build preliminary files for each language model of interest</a></span></li><li><span><a href="#Creating-and-querying-a-model-using-the-kenlm-python-package" data-toc-modified-id="Creating-and-querying-a-model-using-the-kenlm-python-package-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Creating and querying a model using the <code>kenlm</code> python package</a></span></li><li><span><a href="#Calculate-perplexity-of-the-test-set-for-each-model" data-toc-modified-id="Calculate-perplexity-of-the-test-set-for-each-model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Calculate perplexity of the test set for each model</a></span></li></ul></div>

# Overview & Requirements

In [13]:
#FIXME

# Loading data

In [14]:
main_corpus_fn = 'fisher_utterances_main.txt'
bbn_corpus_fn = 'fisher_utterances_bbn.txt'

my_corpus_fn = main_corpus_fn

In [15]:
utterances = readStringList(my_corpus_fn)

In [16]:
len(utterances)

1077813

In [17]:
utterances[0]

def reverse_utterance(u):
    """Return ``u`` with its space-separated words in reverse order.

    Splitting is done on the single-space character, so runs of spaces yield
    empty "words" that are kept (in reversed position) when rejoined.
    """
    return ' '.join(u.split(' ')[::-1])

reverse_utterance(utterances[0])

utterances_reversed = list(map(reverse_utterance, utterances))

'i mean no money is very important definitely and a million dollars is a dream come true for me i mean'

'mean i me for true come dream a is dollars million a and definitely important very is money no mean i'

# Splitting the corpus into training and test sets

In [18]:
test_proportion = 0.10

exact_num_test_utterances = round(test_proportion * len(utterances))
exact_num_test_utterances

107781

In [19]:
utterances[0]
utterances[23124]

'i mean no money is very important definitely and a million dollars is a dream come true for me i mean'

"and they're not as strict either"

In [20]:
# Fixed seed so that a fresh kernel regenerates exactly the same train/test
# split instead of a different random one on every run.
SPLIT_SEED = 0

indices = list(range(len(utterances)))

shuffled_indices = list(indices)  # shallow copy is enough: the elements are ints
# In-place shuffle driven by a private, seeded generator, so the result does
# not depend on (or disturb) the state of the global `random` module.
random.Random(SPLIT_SEED).shuffle(shuffled_indices)

In [21]:
# Apply the shuffled index order to the corpus, then build the word-reversed
# counterpart of every shuffled utterance (same order, so the two lists align).
shuffled_utterances = [utterances[idx] for idx in shuffled_indices]
shuffled_utterances_reversed = [reverse_utterance(utt) for utt in shuffled_utterances]

In [22]:
shuffled_utterances[0]
shuffled_utterances[23124]

shuffled_utterances_reversed[0]
shuffled_utterances_reversed[23124]

"so that ah that ah yes yes yeah and so i actually feel safe about it i mean i don't know if it is a"

'yeah'

"a is it if know don't i mean i it about safe feel actually i so and yeah yes yes ah that ah that so"

'yeah'

In [23]:
test_utterances = shuffled_utterances[:exact_num_test_utterances]
training_utterances = shuffled_utterances[exact_num_test_utterances:]

test_utterances_reversed = shuffled_utterances_reversed[:exact_num_test_utterances]
training_utterances_reversed = shuffled_utterances_reversed[exact_num_test_utterances:]

For reproducibility, we want to export these...

In [24]:
test_set_prefix = 'fisher_test_utterances'
training_set_prefix = 'fisher_training_utterances'

test_set_fn = test_set_prefix + '.txt'
training_set_fn = training_set_prefix + '.txt'

test_set_rev_fn = test_set_prefix + '_rev' + '.txt'
training_set_rev_fn = training_set_prefix + '_rev' + '.txt'

In [25]:
writeStringList(test_set_fn, test_utterances)
writeStringList(training_set_fn, training_utterances)

writeStringList(test_set_rev_fn, test_utterances_reversed)
writeStringList(training_set_rev_fn, training_utterances_reversed)

In [26]:
%cat -n /mnt/cube/home/AD/emeinhar/fisher-lm/fisher_training_utterances.txt | head -10

     1	you know
     2	oh yeah
     3	and the child the child may not or it'll lose even if it remembers it it may it may not have it may hamper it's ability to um to choose how to approach
     4	some people in her unit are getting called up she hasn't yet
     5	was she a nice lady
     6	yes that if you if
     7	hi i'm mark king
     8	yeah i guess that's it's the best place to get lobster isn't it maine lobster i've heard that's 'bout the best
     9	well i guess ah it's popular
    10	what's that
cat: write error: Broken pipe


In [27]:
%cat -n /mnt/cube/home/AD/emeinhar/fisher-lm/fisher_training_utterances_rev.txt | head -10

     1	know you
     2	yeah oh
     3	approach to how choose to um to ability it's hamper may it have not may it may it it remembers it if even lose it'll or not may child the child the and
     4	yet hasn't she up called getting are unit her in people some
     5	lady nice a she was
     6	if you if that yes
     7	king mark i'm hi
     8	best the 'bout that's heard i've lobster maine it isn't lobster get to place best the it's that's guess i yeah
     9	popular it's ah guess i well
    10	that what's
cat: write error: Broken pipe


# Build preliminary files for each language model of interest

As far as we are concerned, there are two parameter choices for the language model:
 - the choice of $n$ (as in $n$-gram)
 - the pruning threshold $p$: the minimum count a token must have in order not to be pruned

Choices:
 - $n \in \{1, 2, 3, 4, 5\}$
 - $p \in \{0, 5, 10\}$

In [28]:
N = (1,2,3,4,5)
P = (0,5,10)

The two shell commands below 
 - build a unigram model (from the training split of the LDC transcriptions) with no pruning.
 - build a binary memory map version of that same `.arpa` file for faster queries.
 
```
/home/AD/emeinhar/GitHub/kenlm/build/bin/lmplz -o 1 --text fisher_training_utterances.txt --arpa fisher_training_utterances_1gram.arpa
```

```
/home/AD/emeinhar/GitHub/kenlm/build/bin/build_binary fisher_training_utterances_1gram.arpa fisher_training_utterances_1gram.mmap
```

We'll use the `subprocess` module to build and execute as many of these shell calls as we need.

In [29]:
repo_dir

'/mnt/cube/home/AD/emeinhar/fisher-lm'

In [30]:
kenlm_path = '/home/AD/emeinhar/GitHub/kenlm/'
lmplz_path = os.path.join(kenlm_path, 'build/bin/lmplz')
lmplz_path
lmplz = lmplz_path

build_binary_path = os.path.join(kenlm_path, 'build/bin/build_binary')
build_binary_path
build_binary = build_binary_path

'/home/AD/emeinhar/GitHub/kenlm/build/bin/lmplz'

'/home/AD/emeinhar/GitHub/kenlm/build/bin/build_binary'

In [35]:
def _run_and_echo(cmd):
    """Run ``cmd`` (an argument list) without a shell, print its output, and raise on failure."""
    completed = subprocess.run(cmd,
                               stdout=subprocess.PIPE,
                               # Merge stderr into the captured text so that everything the
                               # tool reports shows up in the notebook, as it did with getoutput().
                               stderr=subprocess.STDOUT,
                               universal_newlines=True)
    output = completed.stdout
    if output[-1:] == '\n':
        output = output[:-1]
    print(output)
    if completed.returncode != 0:
        raise RuntimeError('Command failed with exit status {0}: {1}'.format(completed.returncode,
                                                                             ' '.join(cmd)))
    return output


def build_model(n, p, training_set_prefix, rev=False):
    """Build the ``.arpa`` file (and, for n > 1, the ``.mmap`` file) for one model.

    Parameters
    ----------
    n : int
        n-gram order; must be a member of the module-level ``N``.
    p : int
        Pruning threshold; must be a member of the module-level ``P``. Only
        ``0`` is currently supported and the value is not passed to ``lmplz``.
    training_set_prefix : str
        Filename prefix of the training text; the input file is
        ``<prefix>.txt`` (or ``<prefix>_rev.txt`` when ``rev`` is true).
    rev : bool
        Whether to use the ``_rev`` variant of the training file.

    Returns
    -------
    dict
        Keys ``'training_set'``, ``'arpa'`` and ``'mmap'`` mapping to the
        filenames involved; ``'mmap'`` is ``None`` when ``n == 1`` because the
        binary build step is skipped in that case.

    Raises
    ------
    AssertionError
        If the arguments fail the checks above.
    RuntimeError
        If ``lmplz`` or ``build_binary`` exits with a non-zero status.
    """
    assert n in N
    assert p in P
    assert p == 0, 'p other than 0 not supported yet'
    assert rev in {True, False}

    print("Building .arpa and .memmap files for n='{0}', p='{1}', training_set_prefix='{2}', rev='{3}'".format(n, p, training_set_prefix, rev))
    print('\n')

    rev_str = '_rev' if rev else ''
    stem = training_set_prefix + rev_str
    ngram_stem = stem + '_{0}gram'.format(n)
    training_set_fn = stem + '.txt'
    arpa_fn = ngram_stem + '.arpa'
    mmap_fn = ngram_stem + '.mmap'

    fns = {'training_set':training_set_fn,
           'arpa':arpa_fn,
           'mmap':mmap_fn}

    # Commands are passed as argument lists (no shell), so filenames that
    # contain spaces or shell metacharacters are handled correctly.
    _run_and_echo([lmplz, '-o', str(n),
                          '--text', training_set_fn,
                          '--arpa', arpa_fn])

    print(' ')
    if n == 1:
        print('build_binary requires n > 1. Skipping.\n')
        fns['mmap'] = None
    else:
        _run_and_echo([build_binary, arpa_fn,
                                     mmap_fn])
        print('\n')

    print('Done.')
    print('\n')
    return fns

In [32]:
# Tuples (not sets) are used for every factor so that the iteration order of
# the product is guaranteed: later cells index into the results by position
# (e.g. `output_files[2:]`), which must not depend on set hashing order.
parameter_combinations = tuple(product(N,
                                       (0,), #P,
                                       (training_set_prefix,),
                                       (False, True),
                                      ))
parameter_combinations

((1, 0, 'fisher_training_utterances', False),
 (1, 0, 'fisher_training_utterances', True),
 (2, 0, 'fisher_training_utterances', False),
 (2, 0, 'fisher_training_utterances', True),
 (3, 0, 'fisher_training_utterances', False),
 (3, 0, 'fisher_training_utterances', True),
 (4, 0, 'fisher_training_utterances', False),
 (4, 0, 'fisher_training_utterances', True),
 (5, 0, 'fisher_training_utterances', False),
 (5, 0, 'fisher_training_utterances', True))

In [36]:
# peak memory usage is something like 40GB
# 2m50s on wittgenstein
output_files = []
for n, p, prefix, rev in parameter_combinations:
    output_files.append(build_model(n, p, prefix, rev))

Building .arpa and .memmap files for n='1', p='0', training_set_prefix='fisher_training_utterances', rev='False'


=== 1/5 Counting and sorting n-grams ===
Reading /mnt/cube/home/AD/emeinhar/fisher-lm/fisher_training_utterances.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 9660450 types 42458
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:509496
Statistics:
1 42458 D1=0.560943 D2=1.00505 D3+=1.49408
Memory estimate for binary LM:
type      kB
probing 1824 assuming -p 1.5
probing 1990 assuming -r models -p 1.5
trie    1238 without quantization
trie    1118 assuming -q 8 -b 8 quantization 
trie    1238 assuming -a 22 array pointer compression
trie    1118 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:50

Reading fisher_training_utterances_rev_3gram.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS


Done.


Building .arpa and .memmap files for n='4', p='0', training_set_prefix='fisher_training_utterances', rev='False'


=== 1/5 Counting and sorting n-grams ===
Reading /mnt/cube/home/AD/emeinhar/fisher-lm/fisher_training_utterances.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 9660450 types 42458
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:509496 2:18400368640 3:34500694016 4:55201107968
Statistics:
1 42458 D1=0.568635 D2=0.994079 D3+=1.51652
2 861871 D1=0.701037 D2=1.08009 D3+=1.40893
3 3259120 D1=0.797312 D2=1.11694 D3+=1.37379


In [37]:
output_files

[{'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_1gram.arpa',
  'mmap': None},
 {'training_set': 'fisher_training_utterances_rev.txt',
  'arpa': 'fisher_training_utterances_rev_1gram.arpa',
  'mmap': None},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_2gram.arpa',
  'mmap': 'fisher_training_utterances_2gram.mmap'},
 {'training_set': 'fisher_training_utterances_rev.txt',
  'arpa': 'fisher_training_utterances_rev_2gram.arpa',
  'mmap': 'fisher_training_utterances_rev_2gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_3gram.arpa',
  'mmap': 'fisher_training_utterances_3gram.mmap'},
 {'training_set': 'fisher_training_utterances_rev.txt',
  'arpa': 'fisher_training_utterances_rev_3gram.arpa',
  'mmap': 'fisher_training_utterances_rev_3gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_4gram.arpa',
  'mm

In [38]:
parameters_and_models = tuple(zip(parameter_combinations,
                                  output_files))
parameters_and_models

(((1, 0, 'fisher_training_utterances', False),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_1gram.arpa',
   'mmap': None}),
 ((1, 0, 'fisher_training_utterances', True),
  {'training_set': 'fisher_training_utterances_rev.txt',
   'arpa': 'fisher_training_utterances_rev_1gram.arpa',
   'mmap': None}),
 ((2, 0, 'fisher_training_utterances', False),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_2gram.arpa',
   'mmap': 'fisher_training_utterances_2gram.mmap'}),
 ((2, 0, 'fisher_training_utterances', True),
  {'training_set': 'fisher_training_utterances_rev.txt',
   'arpa': 'fisher_training_utterances_rev_2gram.arpa',
   'mmap': 'fisher_training_utterances_rev_2gram.mmap'}),
 ((3, 0, 'fisher_training_utterances', False),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_3gram.arpa',
   'mmap': 'fisher_training_utterances_3gram.mmap'}),
 ((3, 0, 'fisher_train

# Creating and querying a model using the `kenlm` python package

In [39]:
def make_model(arpa_or_memmap_fp):
    """Load the language model stored at ``arpa_or_memmap_fp`` with ``kenlm``.

    The path is handed unchanged to ``kenlm.LanguageModel`` and the resulting
    model object is returned.
    """
    language_model = kenlm.LanguageModel(arpa_or_memmap_fp)
    return language_model

In [40]:
from math import log2, log10, pow

In [41]:
def make_utterance_score_and_perplexity_functions(model, base=None, parallelize=False):
    """Build scoring and perplexity functions bound to ``model``.

    Parameters
    ----------
    model
        Object whose ``score(utterance)`` method returns the base-10 log
        probability of the utterance.
    base : {10, 2} or None
        Logarithm base used for the returned scores; ``None`` means 10.
    parallelize : bool
        If true, ``perplexity_c`` scores utterances through ``par``.

    Returns
    -------
    (score, perplexity_u, perplexity_c)
        ``score(utterance)`` is the log probability in ``base``;
        ``perplexity_u(utterance)`` is the perplexity of one utterance;
        ``perplexity_c(utterances)`` is the perplexity of a collection.
    """
    if base is None:
        base = 10
    assert base == 10 or base == 2

    # Converting a logarithm between bases is a multiplication. Exponentiating
    # first (log2(10 ** log10p)) underflows to log2(0.0) for long utterances
    # and raises ValueError, so the conversion is done by scaling instead.
    log_scale = 1.0 if base == 10 else log2(10)

    def changeOfBase(log10p):
        return log10p * log_scale

    def token_count(utterance):
        # +1 accounts for the end-of-sentence token. split() with no argument
        # ignores repeated/leading/trailing whitespace, so empty strings are
        # never counted as words.
        # NOTE(review): assumes the model tokenizes on arbitrary whitespace,
        # as kenlm's own perplexity() does -- confirm for other model types.
        return len(utterance.split()) + 1

    def score(utterance):
        return changeOfBase( model.score(utterance) )

    def perplexity_u(utterance):
        return pow(base, -1.0 * score(utterance) / token_count(utterance))

    def perplexity_c(utterances):
        # Materialize once: the collection is traversed twice (token count and
        # scoring), which would silently exhaust a generator.
        utterances = list(utterances)
        total_tokens = sum(token_count(u) for u in utterances)

        if not parallelize:
            sentence_scores = (score(u) for u in utterances)
        else:
            sentence_scores = par((delayed(score)(u) for u in utterances))
        sum_of_scores = sum(sentence_scores)

        perp = pow(base, -1.0 * (1.0 / total_tokens) * sum_of_scores)
        return perp

    return score, perplexity_u, perplexity_c

In [42]:
def changeOfBase(log10p):
    """Convert a base-10 logarithm into the equivalent base-2 logarithm.

    The conversion is a multiplication by log2(10). Going through the
    probability itself (``log2(10 ** log10p)``) underflows to ``log2(0.0)``
    for very negative inputs and raises ``ValueError``.
    """
    return log10p * log2(10)

In [43]:
def perplexity_corpus(utterances, model, base=None, parallelize=False):
    """Compute the perplexity of ``utterances`` under ``model``.

    Parameters
    ----------
    utterances
        Iterable of utterance strings.
    model
        Object whose ``score(utterance)`` method returns the base-10 log
        probability of the utterance.
    base : {10, 2} or None
        Logarithm base used for the computation; ``None`` means 10.
    parallelize : bool
        If true, utterances are scored through ``par``.

    Returns
    -------
    float
        ``base ** (-(sum of log_base scores) / token_count)``, where each
        utterance contributes its word count plus one end-of-sentence token.
    """
    if base is None:
        base = 10
    assert base == 10 or base == 2

    # Materialize once: the collection is traversed twice (token count and
    # scoring), which would silently exhaust a generator.
    utterances = list(utterances)

    # split() with no argument ignores repeated/leading/trailing whitespace.
    # NOTE(review): assumes the model tokenizes on arbitrary whitespace, as
    # kenlm's own perplexity() does -- confirm for other model types.
    token_total = sum(len(utt.split()) + 1 for utt in utterances)

    if not parallelize:
        log10_scores = (model.score(u) for u in utterances)
    else:
        log10_scores = par((delayed(model.score)(u) for u in utterances))
    sum_of_log10_scores = sum(log10_scores)

    # Changing the base of a logarithm is a multiplication, so it is applied
    # once to the total. This works for both supported bases and on both the
    # serial and the parallel path.
    log_scale = 1.0 if base == 10 else log2(10)
    sum_of_scores = log_scale * sum_of_log10_scores

    perp = pow(base, -1.0 * (1.0 / token_total) * sum_of_scores)
    return perp

In [44]:
bigram = make_model('fisher_training_utterances_2gram.mmap')

In [45]:
bigram.score("this is a sentence")
bigram.score("this is a sentence", eos = True)
bigram.score("this is a sentence </s>", eos=False)
bigram.score("this is a sentence </s>")
tuple(bigram.full_scores("this is a sentence"))
sum(map(lambda triple: triple[0],
        tuple(bigram.full_scores("this is a sentence"))))

-9.59227466583252

-9.59227466583252

-9.59227466583252

-11.219353675842285

((-2.6841859817504883, 2, False),
 (-0.6199646592140198, 2, False),
 (-1.2637051343917847, 2, False),
 (-4.348629474639893, 2, False),
 (-0.6757899522781372, 2, False))

-9.592275202274323

In [46]:
test_sentence = "has anyone ever told you"
n = len(test_sentence.split(' ')) + 1
n

6

In [47]:
s = bigram.score(test_sentence)
s # = log_10( p(test_sentence) )

-15.431868553161621

In [48]:
bigram.perplexity(test_sentence)
10.0 ** (-1.0 * s / n)

373.23132982001835

373.23132982001835

In [49]:
bigram_score, bigram_perplexity_utt, bigram_perplexity_corpus = make_utterance_score_and_perplexity_functions(bigram, 10)

In [50]:
bigram_score(test_sentence)
bigram_perplexity_utt(test_sentence)
bigram_perplexity_corpus([test_sentence])

-15.431868553161621

373.23132982001835

373.23132982001835

In [51]:
corpus = [test_sentence, 'call me ishmael']
bigram_perplexity_corpus(corpus)

903.3458098303903

# Calculate perplexity of the test set for each model

In [52]:
# parameters_and_models = tuple(zip(parameter_combinations,
#                                   output_files))
parameters_and_models

(((1, 0, 'fisher_training_utterances', False),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_1gram.arpa',
   'mmap': None}),
 ((1, 0, 'fisher_training_utterances', True),
  {'training_set': 'fisher_training_utterances_rev.txt',
   'arpa': 'fisher_training_utterances_rev_1gram.arpa',
   'mmap': None}),
 ((2, 0, 'fisher_training_utterances', False),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_2gram.arpa',
   'mmap': 'fisher_training_utterances_2gram.mmap'}),
 ((2, 0, 'fisher_training_utterances', True),
  {'training_set': 'fisher_training_utterances_rev.txt',
   'arpa': 'fisher_training_utterances_rev_2gram.arpa',
   'mmap': 'fisher_training_utterances_rev_2gram.mmap'}),
 ((3, 0, 'fisher_training_utterances', False),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_3gram.arpa',
   'mmap': 'fisher_training_utterances_3gram.mmap'}),
 ((3, 0, 'fisher_train

In [55]:
output_files
' '
output_files[2:]

[{'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_1gram.arpa',
  'mmap': None},
 {'training_set': 'fisher_training_utterances_rev.txt',
  'arpa': 'fisher_training_utterances_rev_1gram.arpa',
  'mmap': None},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_2gram.arpa',
  'mmap': 'fisher_training_utterances_2gram.mmap'},
 {'training_set': 'fisher_training_utterances_rev.txt',
  'arpa': 'fisher_training_utterances_rev_2gram.arpa',
  'mmap': 'fisher_training_utterances_rev_2gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_3gram.arpa',
  'mmap': 'fisher_training_utterances_3gram.mmap'},
 {'training_set': 'fisher_training_utterances_rev.txt',
  'arpa': 'fisher_training_utterances_rev_3gram.arpa',
  'mmap': 'fisher_training_utterances_rev_3gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_4gram.arpa',
  'mm

' '

[{'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_2gram.arpa',
  'mmap': 'fisher_training_utterances_2gram.mmap'},
 {'training_set': 'fisher_training_utterances_rev.txt',
  'arpa': 'fisher_training_utterances_rev_2gram.arpa',
  'mmap': 'fisher_training_utterances_rev_2gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_3gram.arpa',
  'mmap': 'fisher_training_utterances_3gram.mmap'},
 {'training_set': 'fisher_training_utterances_rev.txt',
  'arpa': 'fisher_training_utterances_rev_3gram.arpa',
  'mmap': 'fisher_training_utterances_rev_3gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_4gram.arpa',
  'mmap': 'fisher_training_utterances_4gram.mmap'},
 {'training_set': 'fisher_training_utterances_rev.txt',
  'arpa': 'fisher_training_utterances_rev_4gram.arpa',
  'mmap': 'fisher_training_utterances_rev_4gram.mmap'},
 {'training_set': 'fisher_training

In [56]:
# models = tuple(map(make_model,
#                    (output_files[0]['arpa'],) + tuple(map(lambda output_file_dict:output_file_dict['mmap'],
#                                                           output_files[1:]))))
# Load one model per build result from its 'mmap' file, skipping the first
# two entries of `output_files`.
models = tuple(make_model(file_dict['mmap']) for file_dict in output_files[2:])

len(models)

8

In [57]:
def add_labels(threeTuple):
    """Turn a (parameters, files, model) triple into a dict with those three keys."""
    return dict(parameters=threeTuple[0],
                files=threeTuple[1],
                model=threeTuple[2])

# The two leading `None`s pad `models` so that it lines up position-by-position
# with `parameter_combinations` and `output_files`.
model_collection = tuple(
    add_labels(triple)
    for triple in zip(parameter_combinations,
                      output_files,
                      (None,None) + models)
)
model_collection

({'parameters': (1, 0, 'fisher_training_utterances', False),
  'files': {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_1gram.arpa',
   'mmap': None},
  'model': None},
 {'parameters': (1, 0, 'fisher_training_utterances', True),
  'files': {'training_set': 'fisher_training_utterances_rev.txt',
   'arpa': 'fisher_training_utterances_rev_1gram.arpa',
   'mmap': None},
  'model': None},
 {'parameters': (2, 0, 'fisher_training_utterances', False),
  'files': {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_2gram.arpa',
   'mmap': 'fisher_training_utterances_2gram.mmap'},
  'model': <Model from b'fisher_training_utterances_2gram.mmap'>},
 {'parameters': (2, 0, 'fisher_training_utterances', True),
  'files': {'training_set': 'fisher_training_utterances_rev.txt',
   'arpa': 'fisher_training_utterances_rev_2gram.arpa',
   'mmap': 'fisher_training_utterances_rev_2gram.mmap'},
  'model': <Model from b'fisher_train

In [61]:
model_collection[4]

{'parameters': (3, 0, 'fisher_training_utterances', False),
 'files': {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_3gram.arpa',
  'mmap': 'fisher_training_utterances_3gram.mmap'},
 'model': <Model from b'fisher_training_utterances_3gram.mmap'>}

In [58]:
def testModel(model_dict, base, parallelize, corpus):
    """Report which model is being tested and return its perplexity on ``corpus``.

    ``model_dict`` must provide a ``'parameters'`` sequence (whose first two
    items are printed as ``n`` and ``p``) and a ``'model'`` entry. ``base`` and
    ``parallelize`` are forwarded unchanged to ``perplexity_corpus``.
    """
    parameters = model_dict['parameters']
    order, threshold = parameters[0], parameters[1]
    print('Testing model w/ params n = {0} and p = {1}'.format(order, threshold))
    return perplexity_corpus(corpus, model_dict['model'], base, parallelize)

In [59]:
# test_set = readStringList
len(test_utterances)

107781

In [62]:
model_collection[2]

{'parameters': (2, 0, 'fisher_training_utterances', False),
 'files': {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_2gram.arpa',
  'mmap': 'fisher_training_utterances_2gram.mmap'},
 'model': <Model from b'fisher_training_utterances_2gram.mmap'>}

In [60]:
#bigram model, base 10 perplexity, not parallelized
testModel(model_collection[2], 10, False, test_utterances)

Testing model w/ params n = 2 and p = 0


89.52933814992959

In [66]:
# Evaluate every model from index 2 onward on the test set that matches its
# direction: forward models on the plain utterances, reversed models on the
# word-reversed ones.
for m in model_collection[2:]:
    is_forward = m['parameters'][3] == False #non-reversed strings
    if is_forward:
        banner, eval_set = 'Normal (L⟶R) model:', test_utterances
    else:
        banner, eval_set = 'Reversed (L⟵R) model:', test_utterances_reversed
    print(banner)
    testModel(m, 10, False, eval_set)
    print('-------------------')

Normal (L⟶R) model:
Testing model w/ params n = 2 and p = 0


89.52933814992959

-------------------
Reversed (L⟵R) model:
Testing model w/ params n = 2 and p = 0


89.51589644096235

-------------------
Normal (L⟶R) model:
Testing model w/ params n = 3 and p = 0


69.82614273676198

-------------------
Reversed (L⟵R) model:
Testing model w/ params n = 3 and p = 0


69.89188898841859

-------------------
Normal (L⟶R) model:
Testing model w/ params n = 4 and p = 0


67.79243667473473

-------------------
Reversed (L⟵R) model:
Testing model w/ params n = 4 and p = 0


67.9357583463501

-------------------
Normal (L⟶R) model:
Testing model w/ params n = 5 and p = 0


67.68897181326514

-------------------
Reversed (L⟵R) model:
Testing model w/ params n = 5 and p = 0


67.87859158005193

-------------------


todo: try pruning parameters, try an off-the-shelf set of .arpa weights