**Eric Meinhardt / emeinhardt@ucsd.edu**

In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [248]:
import os
# import csv
import json

In [10]:
import random

In [34]:
def readStringList(fp):
    """Read the text file at ``fp`` and return its lines as a list of strings.

    Trailing whitespace (including the line terminator) is removed from
    every line; empty lines are kept as empty strings.
    """
    with open(fp, 'r') as handle:
        return [raw_line.rstrip() for raw_line in handle]

def writeStringList(fp, strings):
    """Write ``strings`` to the file at ``fp``, one string per line.

    The file is opened in write mode (existing content is replaced) and a
    newline is appended to every string.
    """
    with open(fp, 'w') as handle:
        handle.writelines([text + "\n" for text in strings])

In [43]:
import subprocess

In [51]:
from itertools import product

In [41]:
import kenlm

In [None]:
# from functools import reduce
# from itertools import chain
# import re
# from more_itertools import replace
# from funcy import compose

In [216]:
from joblib import Parallel, delayed

# Settings forwarded to joblib.Parallel by par() below.
J = 20  # passed as n_jobs
BACKEND = 'multiprocessing'  # passed as backend
# BACKEND = 'loky'
V = 10  # passed as verbose
PREFER = 'processes'  # passed as prefer
# PREFER = 'threads'

def par(gen_expr):
    """Run the delayed calls produced by ``gen_expr`` through joblib.

    A ``Parallel`` executor is configured from the module-level settings
    ``J``, ``BACKEND``, ``V`` and ``PREFER``; whatever that executor returns
    for ``gen_expr`` is returned unchanged.
    """
    executor = Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)
    return executor(gen_expr)

In [None]:
import pandas as pd
from plotnine import *

In [3]:
# Directory holding this project's files.
# NOTE(review): hard-coded absolute path to one user's home directory; the
# notebook will need this edited to run anywhere else.
repo_dir = '/mnt/cube/home/AD/emeinhar/fisher-lm'

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview-&amp;-Requirements" data-toc-modified-id="Overview-&amp;-Requirements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview &amp; Requirements</a></span></li><li><span><a href="#Loading-data" data-toc-modified-id="Loading-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Loading data</a></span></li><li><span><a href="#Splitting-the-corpus-into-training-and-test-sets" data-toc-modified-id="Splitting-the-corpus-into-training-and-test-sets-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Splitting the corpus into training and test sets</a></span></li><li><span><a href="#Build-preliminary-files-for-each-language-model-of-interest" data-toc-modified-id="Build-preliminary-files-for-each-language-model-of-interest-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Build preliminary files for each language model of interest</a></span></li><li><span><a href="#Creating-and-querying-a-model-using-the-kenlm-python-package" data-toc-modified-id="Creating-and-querying-a-model-using-the-kenlm-python-package-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Creating and querying a model using the <code>kenlm</code> python package</a></span></li><li><span><a href="#Calculate-perplexity-of-the-test-set-for-each-model" data-toc-modified-id="Calculate-perplexity-of-the-test-set-for-each-model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Calculate perplexity of the test set for each model</a></span></li></ul></div>

# Overview & Requirements

# Loading data

In [7]:
# File names of the two available utterance files (relative paths).
main_corpus_fn = 'fisher_utterances_main.txt'
bbn_corpus_fn = 'fisher_utterances_bbn.txt'

# The corpus actually used by the rest of the notebook.
my_corpus_fn = main_corpus_fn

In [27]:
# Load the corpus: one utterance per line, trailing whitespace stripped.
utterances = readStringList(my_corpus_fn)

In [28]:
len(utterances)

1077813

# Splitting the corpus into training and test sets

In [29]:
# Fraction of the corpus held out as the test set.
test_proportion = 0.10

# Number of held-out utterances, rounded to a whole utterance count.
exact_num_test_utterances = round(test_proportion * len(utterances))
exact_num_test_utterances

107781

In [30]:
utterances[0]
utterances[23124]

'i mean no money is very important definitely and a million dollars is a dream come true for me i mean'

"and they're not as strict either"

In [31]:
# Shuffle in place with a fixed seed so that the train/test split taken from
# this ordering is the same on every run. (An unseeded shuffle produces a
# different split each time the notebook is executed.)
SHUFFLE_SEED = 0
random.Random(SHUFFLE_SEED).shuffle(utterances) #stateful, in-place shuffle

In [32]:
utterances[0]
utterances[23124]

"and i think it does make a big difference and i think with today's society and economy and everything that they're going to have to have it"

"but mhm but i mean if it doesn't hurt the employer that <rem> well"

In [33]:
# The first exact_num_test_utterances utterances (after shuffling) form the
# test set; everything after them is the training set.
test_utterances = utterances[:exact_num_test_utterances]
training_utterances = utterances[exact_num_test_utterances:]

For reproducibility, we want to export these...

In [56]:
# File-name stems for the exported splits; the training stem is reused later
# to name the .arpa / .mmap model files.
test_set_prefix = 'fisher_test_utterances'
training_set_prefix = 'fisher_training_utterances'

test_set_fn = test_set_prefix + '.txt'
training_set_fn = training_set_prefix + '.txt'

In [35]:
# Write both splits to disk, one utterance per line (existing files are overwritten).
writeStringList(test_set_fn, test_utterances)
writeStringList(training_set_fn, training_utterances)

In [38]:
%cat -n /mnt/cube/home/AD/emeinhar/fisher-lm/fisher_test_utterances.txt | head -10

     1	and i think it does make a big difference and i think with today's society and economy and everything that they're going to have to have it
     2	right
     3	course when things go wrong they can really go wrong you know
     4	so and why did they move from sundays i do not know
     5	mhm
     6	and the seasons changing and getting ill during season changes or something
     7	and it just got so bad that i couldn't even work
     8	so it'll make it
     9	confide into you can maybe live your life with something like that you know something which is long lasting someone you can
    10	but they're incorrect
cat: write error: Broken pipe


In [40]:
%cat -n /mnt/cube/home/AD/emeinhar/fisher-lm/fisher_training_utterances.txt | head -10

     1	yeah well it's in cattle
     2	no i don't think so i mean that ah if if they planned that then they would have linked it directly to iraq right why go in afghanistan and waste all the all the time
     3	yeah i
     4	we we we've been married fifty six years
     5	mhm
     6	build their government how they want it not
     7	i mean i don't it you know i'm not a real big sports fan but i just you know i don't really watch it that much but i mean i know that kids used to enjoy it and they'd talk about the great games and stuff but it doesn't seem like they know what they're talking about any more
     8	some sort of greens you know um
     9	mhm
    10	i mean unless you have a horrible like drug addiction
cat: write error: Broken pipe


# Build preliminary files for each language model of interest

As far as we are concerned, there are two parameter choices for the language model:
 - the choice of $n$ (as in $n$-gram)
 - the pruning threshold $p$: the minimum count a token must have in order not to be pruned

Choices:
 - $n \in \{1, 2, 3, 4, 5\}$
 - $p \in \{0, 5, 10\}$

In [44]:
# Allowed n-gram orders (N) and pruning thresholds (P); build_model asserts
# that its arguments come from these tuples.
N = (1,2,3,4,5)
P = (0,5,10)

The two shell commands below
 - build a unigram model (from the training utterances) with no pruning.
 - build a binary memory-mapped version of that same `.arpa` file for faster queries.
 
```
/home/AD/emeinhar/GitHub/kenlm/build/bin/lmplz -o 1 --text fisher_training_utterances.txt --arpa fisher_training_utterances_1gram.arpa
```

```
/home/AD/emeinhar/GitHub/kenlm/build/bin/build_binary fisher_training_utterances_1gram.arpa fisher_training_utterances_1gram.mmap
```

We'll use the `subprocess` module to build and execute as many of these shell calls as we need.

In [45]:
repo_dir

'/mnt/cube/home/AD/emeinhar/fisher-lm'

In [49]:
# Location of the local KenLM checkout and of the two executables used below.
# NOTE(review): hard-coded absolute path; adjust for other machines.
kenlm_path = '/home/AD/emeinhar/GitHub/kenlm/'
lmplz_path = os.path.join(kenlm_path, 'build/bin/lmplz')
lmplz_path
# Short alias used by build_model.
lmplz = lmplz_path

build_binary_path = os.path.join(kenlm_path, 'build/bin/build_binary')
build_binary_path
# Short alias used by build_model.
build_binary = build_binary_path

'/home/AD/emeinhar/GitHub/kenlm/build/bin/lmplz'

'/home/AD/emeinhar/GitHub/kenlm/build/bin/build_binary'

In [98]:
def _run_and_capture(argv):
    """Run ``argv`` without a shell and return its combined stdout/stderr text."""
    try:
        completed = subprocess.run(argv,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   universal_newlines=True)
    except OSError as err:
        # E.g. the executable does not exist. Report it as text instead of
        # raising, so a failed build is visible but does not abort the loop.
        return str(err)

    output = completed.stdout
    if output.endswith('\n'):
        output = output[:-1]
    if completed.returncode != 0:
        # Make failures visible: the tool's exit status is otherwise lost.
        output += '\n[command exited with status {0}]'.format(completed.returncode)
    return output


def build_model(n, p, training_set_prefix):
    """Build the KenLM model files for one parameter setting.

    Runs ``lmplz`` to estimate an order-``n`` model from
    ``<training_set_prefix>.txt`` and write
    ``<training_set_prefix>_<n>gram.arpa``. For ``n != 1`` it then runs
    ``build_binary`` to convert that ``.arpa`` file into
    ``<training_set_prefix>_<n>gram.mmap``. The combined stdout/stderr of
    each tool is printed.

    The commands are passed to ``subprocess`` as argument lists (no shell), so
    file names containing spaces or shell metacharacters are handled safely.

    Parameters
    ----------
    n : int
        N-gram order; must be a member of the module-level ``N``.
    p : int
        Pruning threshold; must be a member of ``P``. Only ``0`` is supported.
    training_set_prefix : str
        Path of the training file without its ``.txt`` extension.

    Returns
    -------
    dict
        Maps ``'training_set'``, ``'arpa'`` and ``'mmap'`` to the respective
        file names. ``'mmap'`` is ``None`` when ``n == 1`` because the binary
        build is skipped for unigram models.

    Raises
    ------
    AssertionError
        If ``n`` is not in ``N``, ``p`` is not in ``P``, or ``p != 0``.
    """
    assert n in N
    assert p in P
    assert p == 0, 'p other than 0 not supported yet'

    print("Building .arpa and .memmap files for n='{0}', p='{1}', training_set_prefix='{2}'".format(n, p, training_set_prefix))
    print('\n')

    model_stem = training_set_prefix + '_{0}gram'.format(n)
    training_set_fn = training_set_prefix + '.txt'
    arpa_fn = model_stem + '.arpa'
    mmap_fn = model_stem + '.mmap'

    fns = {'training_set':training_set_fn,
           'arpa':arpa_fn,
           'mmap':mmap_fn}

    build_arpa_file = [lmplz, '-o', str(n),
                              '--text', training_set_fn,
                              '--arpa', arpa_fn]
    build_mmap_file = [build_binary, arpa_fn,
                                     mmap_fn]

    print(_run_and_capture(build_arpa_file))

    print(' ')
    if n == 1:
        print('build_binary requires n > 1. Skipping.\n')
        fns['mmap'] = None
    else:
        print(_run_and_capture(build_mmap_file))
        print('\n')

    print('Done.')
    print('\n')
    return fns

In [99]:
# One (n, p, training_set_prefix) triple per model to build. The pruning
# threshold is fixed at 0 for now (rather than ranging over all of P).
parameter_combinations = tuple(
    (order, 0, training_set_prefix) for order in N
)
parameter_combinations

((1, 0, 'fisher_training_utterances'),
 (2, 0, 'fisher_training_utterances'),
 (3, 0, 'fisher_training_utterances'),
 (4, 0, 'fisher_training_utterances'),
 (5, 0, 'fisher_training_utterances'))

In [100]:
# Build every model in turn and collect the file-name dict returned for each,
# in the same order as parameter_combinations.
output_files = []
for n, p, prefix in parameter_combinations:
    output_files.append(build_model(n, p, prefix))

Building .arpa and .memmap files for n='1', p='0', training_set_prefix='fisher_training_utterances'


=== 1/5 Counting and sorting n-grams ===
Reading /mnt/cube/home/AD/emeinhar/fisher-lm/fisher_training_utterances.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 9666028 types 42524
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:510288
Statistics:
1 42524 D1=0.564287 D2=1.00862 D3+=1.42603
Memory estimate for binary LM:
type      kB
probing 1827 assuming -p 1.5
probing 1993 assuming -r models -p 1.5
trie    1240 without quantization
trie    1120 assuming -q 8 -b 8 quantization 
trie    1240 assuming -a 22 array pointer compression
trie    1120 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:510288
=== 4/5 

In [101]:
output_files

[{'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_1gram.arpa',
  'mmap': None},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_2gram.arpa',
  'mmap': 'fisher_training_utterances_2gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_3gram.arpa',
  'mmap': 'fisher_training_utterances_3gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_4gram.arpa',
  'mmap': 'fisher_training_utterances_4gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_5gram.arpa',
  'mmap': 'fisher_training_utterances_5gram.mmap'}]

In [102]:
# Pair each parameter triple with the file-name dict produced for it.
parameters_and_models = tuple(zip(parameter_combinations,
                                  output_files))
parameters_and_models

(((1, 0, 'fisher_training_utterances'),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_1gram.arpa',
   'mmap': None}),
 ((2, 0, 'fisher_training_utterances'),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_2gram.arpa',
   'mmap': 'fisher_training_utterances_2gram.mmap'}),
 ((3, 0, 'fisher_training_utterances'),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_3gram.arpa',
   'mmap': 'fisher_training_utterances_3gram.mmap'}),
 ((4, 0, 'fisher_training_utterances'),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_4gram.arpa',
   'mmap': 'fisher_training_utterances_4gram.mmap'}),
 ((5, 0, 'fisher_training_utterances'),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_5gram.arpa',
   'mmap': 'fisher_training_utterances_5gram.mmap'}))

# Creating and querying a model using the `kenlm` python package

In [126]:
def make_model(arpa_or_memmap_fp):
    """Construct a ``kenlm.LanguageModel`` from the file at ``arpa_or_memmap_fp``.

    As the parameter name indicates, the path is expected to point at either
    an ``.arpa`` file or a binary ``.mmap`` file.
    """
    language_model = kenlm.LanguageModel(arpa_or_memmap_fp)
    return language_model

In [123]:
from math import log2, log10, pow

In [235]:
def make_utterance_score_and_perplexity_functions(model, base=None, parallelize=False):
    """Create scoring and perplexity functions bound to ``model``.

    Parameters
    ----------
    model
        Object with a ``score(utterance)`` method returning the log10
        probability of the utterance (e.g. a ``kenlm`` model).
    base : {10, 2}, optional
        Logarithm base used for the returned scores. ``None`` means 10.
    parallelize : bool, optional
        If true, ``perplexity_c`` scores utterances through ``par``.

    Returns
    -------
    (score, perplexity_u, perplexity_c)
        ``score(utterance)`` is the log probability in the chosen base,
        ``perplexity_u(utterance)`` the perplexity of a single utterance, and
        ``perplexity_c(utterances)`` the perplexity of a whole collection.

    Raises
    ------
    AssertionError
        If ``base`` is neither 10 nor 2.
    """
    if base is None:
        base = 10
    assert base == 10 or base == 2

    if base == 10:
        def changeOfBase(log10p):
            return log10p
    else:
        # log2(p) = log10(p) * log2(10). Converting in log space avoids
        # computing 10 ** log10p, which underflows to 0.0 for very unlikely
        # utterances and would make log2 raise a "math domain error".
        log2_of_10 = log2(10)

        def changeOfBase(log10p):
            return log10p * log2_of_10

    def token_count(utterance):
        # Space-separated tokens plus one; presumably the extra one is the
        # end-of-sentence token. This matches kenlm's own perplexity on the
        # example checked elsewhere in this notebook.
        return len(utterance.split(' ')) + 1

    def score(utterance):
        return changeOfBase( model.score(utterance) )

    def perplexity_u(utterance):
        log_prob = score(utterance)
        n = token_count(utterance)
        return pow(base, -1.0 * log_prob / n)

    def perplexity_c(utterances):
        # Materialize once: the collection is traversed twice (token count and
        # scoring), which would silently give wrong results for an iterator.
        utterances = list(utterances)
        total_tokens = sum(token_count(utt) for utt in utterances)

        if not parallelize:
            sentence_scores = (score(u) for u in utterances)
        else:
            sentence_scores = par((delayed(score)(u) for u in utterances))
        sum_of_scores = sum(sentence_scores)

        return pow(base, -1.0 * (1.0 / total_tokens) * sum_of_scores)

    return score, perplexity_u, perplexity_c

In [257]:
def changeOfBase(log10p):
    """Convert a base-10 log probability to a base-2 log probability.

    Uses log2(p) = log10(p) * log2(10). The conversion stays in log space
    because computing ``10 ** log10p`` first underflows to 0.0 for very
    negative inputs, which makes ``log2`` raise a "math domain error".
    """
    return log10p * log2(10)

In [258]:
def perplexity_corpus(utterances, model, base=None, parallelize=False):
    """Compute the perplexity of ``utterances`` under ``model``.

    Parameters
    ----------
    utterances : iterable of str
        The corpus, one utterance per element.
    model
        Object with a ``score(utterance)`` method returning the log10
        probability of the utterance (e.g. a ``kenlm`` model).
    base : {10, 2}, optional
        Logarithm base used for the computation. ``None`` means 10.
    parallelize : bool, optional
        If true, utterances are scored through ``par``.

    Returns
    -------
    float
        ``base ** (-(sum of log probabilities) / total_tokens)``, where each
        utterance contributes its number of space-separated tokens plus one.

    Raises
    ------
    AssertionError
        If ``base`` is neither 10 nor 2.
    ZeroDivisionError
        If ``utterances`` is empty.
    """
    if base is None:
        base = 10
    assert base == 10 or base == 2

    # Materialize once: the corpus is traversed twice (token count, scoring).
    utterances = list(utterances)
    total_tokens = sum(len(utt.split(' ')) + 1 for utt in utterances)

    if parallelize:
        log10_scores = par((delayed(model.score)(u) for u in utterances))
    else:
        log10_scores = (model.score(u) for u in utterances)
    sum_of_scores = sum(log10_scores)

    if base == 2:
        # log2(p) = log10(p) * log2(10). Scaling the summed log10 scores stays
        # in log space, so very small probabilities cannot underflow.
        sum_of_scores = sum_of_scores * log2(10)

    perp = pow(base, -1.0 * (1.0 / total_tokens) * sum_of_scores)
    return perp

In [259]:
# Load the bigram model from its binary file for the experiments below.
bigram = make_model('fisher_training_utterances_2gram.mmap')

In [None]:
# Exploratory comparison of scoring variants for the same sentence: the default
# call, an explicit eos=True, and an explicit "</s>" token with and without
# eos=False.
bigram.score("this is a sentence")
bigram.score("this is a sentence", eos = True)
bigram.score("this is a sentence </s>", eos=False)
bigram.score("this is a sentence </s>")
# Sum the first element of every tuple yielded by full_scores; presumably these
# are per-token log10 probabilities that should add up to bigram.score — TODO confirm.
tuple(bigram.full_scores("this is a sentence"))
sum(map(lambda triple: triple[0],
        tuple(bigram.full_scores("this is a sentence"))))

In [260]:
# Example sentence for checking the perplexity formula by hand.
test_sentence = "has anyone ever told you"
# Token count used as the perplexity denominator: space-separated tokens plus one.
n = len(test_sentence.split(' ')) + 1
n

6

In [261]:
# Log10 probability of the example sentence under the bigram model.
s = bigram.score(test_sentence)
s # = log_10( p(test_sentence) )

-15.384958267211914

In [262]:
# Sanity check: kenlm's built-in perplexity vs. the manual formula
# 10 ** (-log10p / n). The two recorded outputs are identical.
bigram.perplexity(test_sentence)
10.0 ** (-1.0 * s / n)

366.5723563463184

366.5723563463184

In [263]:
# Base-10 score / per-utterance perplexity / corpus perplexity functions bound to the bigram model.
bigram_score, bigram_perplexity_utt, bigram_perplexity_corpus = make_utterance_score_and_perplexity_functions(bigram, 10)

In [264]:
# The generated functions should reproduce the values computed by hand above;
# a one-utterance corpus should have the same perplexity as that utterance.
bigram_score(test_sentence)
bigram_perplexity_utt(test_sentence)
bigram_perplexity_corpus([test_sentence])

-15.384958267211914

366.5723563463184

366.572356346318

In [265]:
# Corpus-level perplexity on a tiny two-utterance corpus.
corpus = [test_sentence, 'call me ishmael']
bigram_perplexity_corpus(corpus)

896.605570590842

# Calculate perplexity of the test set for each model

In [266]:
# parameters_and_models = tuple(zip(parameter_combinations,
#                                   output_files))
parameters_and_models

(((1, 0, 'fisher_training_utterances'),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_1gram.arpa',
   'mmap': None}),
 ((2, 0, 'fisher_training_utterances'),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_2gram.arpa',
   'mmap': 'fisher_training_utterances_2gram.mmap'}),
 ((3, 0, 'fisher_training_utterances'),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_3gram.arpa',
   'mmap': 'fisher_training_utterances_3gram.mmap'}),
 ((4, 0, 'fisher_training_utterances'),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_4gram.arpa',
   'mmap': 'fisher_training_utterances_4gram.mmap'}),
 ((5, 0, 'fisher_training_utterances'),
  {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_5gram.arpa',
   'mmap': 'fisher_training_utterances_5gram.mmap'}))

In [267]:
output_files[1:]

[{'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_2gram.arpa',
  'mmap': 'fisher_training_utterances_2gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_3gram.arpa',
  'mmap': 'fisher_training_utterances_3gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_4gram.arpa',
  'mmap': 'fisher_training_utterances_4gram.mmap'},
 {'training_set': 'fisher_training_utterances.txt',
  'arpa': 'fisher_training_utterances_5gram.arpa',
  'mmap': 'fisher_training_utterances_5gram.mmap'}]

In [268]:
# Load one KenLM model per binary (.mmap) file. The first entry of
# output_files (the unigram model) is skipped because its 'mmap' value is None.
models = tuple(make_model(file_dict['mmap']) for file_dict in output_files[1:])

len(models)

4

In [269]:
# Line up parameters, file names and loaded models. A None placeholder is
# prepended to `models` because no model was loaded for the first (unigram) entry.
model_collection = tuple(zip(parameter_combinations,
                             output_files,
                             (None,) + models))
# Turn each positional 3-tuple into a dict with descriptive keys.
add_labels = lambda threeTuple: {'parameters':threeTuple[0],
                                 'files':threeTuple[1],
                                 'model':threeTuple[2]}
model_collection = tuple(map(add_labels,
                             model_collection))
model_collection

({'parameters': (1, 0, 'fisher_training_utterances'),
  'files': {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_1gram.arpa',
   'mmap': None},
  'model': None},
 {'parameters': (2, 0, 'fisher_training_utterances'),
  'files': {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_2gram.arpa',
   'mmap': 'fisher_training_utterances_2gram.mmap'},
  'model': <Model from b'fisher_training_utterances_2gram.mmap'>},
 {'parameters': (3, 0, 'fisher_training_utterances'),
  'files': {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_3gram.arpa',
   'mmap': 'fisher_training_utterances_3gram.mmap'},
  'model': <Model from b'fisher_training_utterances_3gram.mmap'>},
 {'parameters': (4, 0, 'fisher_training_utterances'),
  'files': {'training_set': 'fisher_training_utterances.txt',
   'arpa': 'fisher_training_utterances_4gram.arpa',
   'mmap': 'fisher_training_utterances_4gram.mmap'},

In [270]:
def testModel(model_dict, base, parallelize, corpus):
    """Report and return the perplexity of ``corpus`` under one model.

    ``model_dict`` must provide ``'parameters'`` (a sequence whose first two
    items are n and p) and ``'model'``. A line naming n and p is printed, then
    the result of ``perplexity_corpus(corpus, model, base, parallelize)`` is
    returned.
    """
    parameters = model_dict['parameters']
    order = parameters[0]
    pruning = parameters[1]
    print('Testing model w/ params n = {0} and p = {1}'.format(order, pruning))

    return perplexity_corpus(corpus, model_dict['model'], base, parallelize)

In [271]:
# test_set = readStringList
len(test_utterances)

107781

In [273]:
# bigram model, base-10 perplexity, no parallelization (parallelize=False)
testModel(model_collection[1], 10, False, test_utterances) 

Testing model w/ params n = 2 and p = 0


89.7021667016168

In [274]:
# Base-10 test-set perplexity for every loaded model; the first entry is
# skipped because its 'model' is None.
for m in model_collection[1:]:
    testModel(m, 10, False, test_utterances)

Testing model w/ params n = 2 and p = 0


89.7021667016168

Testing model w/ params n = 3 and p = 0


69.96781399496341

Testing model w/ params n = 4 and p = 0


67.84960527446214

Testing model w/ params n = 5 and p = 0


67.76068548524647

**TODO:** try pruning parameters; try an off-the-shelf set of `.arpa` weights.