In [1]:
# Display the value of every top-level expression in a cell, not just the last
# one. Many cells below rely on this to show several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Dependencies" data-toc-modified-id="Dependencies-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Dependencies</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Usage</a></span></li></ul></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Imports-/-loading-data" data-toc-modified-id="Imports-/-loading-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Imports / loading data</a></span><ul class="toc-item"><li><span><a href="#Language-model" data-toc-modified-id="Language-model-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Language model</a></span></li><li><span><a href="#Contexts" data-toc-modified-id="Contexts-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Contexts</a></span></li><li><span><a href="#Vocabulary" data-toc-modified-id="Vocabulary-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Vocabulary</a></span></li></ul></li><li><span><a href="#Main-calculation" data-toc-modified-id="Main-calculation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Main calculation</a></span></li><li><span><a href="#Calculate-the-number-of-computations-+-estimate-required-space" data-toc-modified-id="Calculate-the-number-of-computations-+-estimate-required-space-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Calculate the number of computations + estimate required space</a></span></li><li><span><a href="#Pick-out-relevant-functions-for-mapping-between-context/word-and-index" data-toc-modified-id="Pick-out-relevant-functions-for-mapping-between-context/word-and-index-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Pick out relevant functions for mapping 
between context/word and index</a></span></li><li><span><a href="#Construct-and-write-distributions" data-toc-modified-id="Construct-and-write-distributions-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Construct and write distributions</a></span><ul class="toc-item"><li><span><a href="#Doing-calculations-in-place-/-via-memory-mapped-arrays" data-toc-modified-id="Doing-calculations-in-place-/-via-memory-mapped-arrays-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Doing calculations in-place / via memory mapped arrays</a></span></li><li><span><a href="#Doing-calculations-in-memory-and-writing-to-disk" data-toc-modified-id="Doing-calculations-in-memory-and-writing-to-disk-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Doing calculations in memory and writing to disk</a></span></li><li><span><a href="#Creating-a-version-that-contains-probabilities-(and-is-normalized)" data-toc-modified-id="Creating-a-version-that-contains-probabilities-(and-is-normalized)-7.3"><span class="toc-item-num">7.3&nbsp;&nbsp;</span>Creating a version that contains probabilities (and is normalized)</a></span><ul class="toc-item"><li><span><a href="#Sketch-of-normalization-process" data-toc-modified-id="Sketch-of-normalization-process-7.3.1"><span class="toc-item-num">7.3.1&nbsp;&nbsp;</span>Sketch of normalization process</a></span></li></ul></li><li><span><a href="#Normalization" data-toc-modified-id="Normalization-7.4"><span class="toc-item-num">7.4&nbsp;&nbsp;</span>Normalization</a></span></li></ul></li></ul></div>

# Overview

Given 
 - a file path $m$ to a `.arpa` file (or, more realistically/practically, a `kenlm` memory mapped version of one) for a language model
 - a file path $c$ to a set of $n$-gram contexts $C$ (a `.txt` file with one context per line, where a context is a sequence of space-separated wordforms)
 - a file path $v$ to a vocabulary $W$ (a `.txt` file with one wordform per line)
 - a filepath $o$ for the main output of the notebook
 
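this notebook will calculate the distribution $p(W|C)$ and write it to disk as memory-mapped `numpy` arrays: the base-2 surprisals are written to $o$ + `.hV_C` and the normalized probabilities to $o$ + `.pV_C`.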

## Dependencies

 - `kenlm`
 - `numpy`
 - `joblib`

## Usage

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

In [3]:
from shutil import copyfile

In [6]:
# parameters
#
# m: language model file, c: contexts file, v: vocabulary file,
# o: path prefix for the output arrays.
# The commented-out `x = ''` lines are blank templates for a new run.

# m = ''
m = '/'.join(('/home/AD/emeinhar/fisher-lm', 'fisher_utterances_main_4gram.mmap'))

# c = ''
c = '/'.join(('/home/AD/emeinhar/buckeye-lm', 'buckeye_contexts.txt'))

# v = ''
v = '/'.join(('/home/AD/emeinhar/fisher-lm', 'fisher_vocabulary_main.txt'))

# o = ''
o = '/'.join(('/home/AD/emeinhar/wr',
              'LD_Fisher_vocab_in_Buckeye_contexts',
              'LD_fisher_vocab_in_buckeye_contexts'))

In [7]:
# Everything this notebook writes goes into the directory that contains `o`;
# create that directory (and any missing parents) when it is not there yet.
output_dir = path.dirname(o)
output_dir_exists = path.exists(output_dir)
if not output_dir_exists:
    print('Making ' + output_dir)
    makedirs(output_dir)

In [8]:
# Copy the contexts file and the vocabulary file into the output directory so
# that the inputs are stored alongside the arrays computed from them.
copyfile(c, path.join(output_dir, path.basename(c)))
copyfile(v, path.join(output_dir, path.basename(v)))

'/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/buckeye_contexts.txt'

'/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/fisher_vocabulary_main.txt'

In [9]:
getcwd()

'/mnt/cube/home/AD/emeinhar/wr'

In [10]:
listdir()

['probdist.py',
 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0',
 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.05',
 'CM_AmE_destressed_aligned_w_LTR_Buckeye_pseudocount0',
 'swbd2003_contexts.txt',
 'boilerplate.py',
 'GD_AmE-diphones - LTR_Buckeye alignment application to LTR_Buckeye.ipynb',
 'LTR_Buckeye',
 '.gitignore',
 'CM_AmE_destressed_aligned_w_LTR_newdic_destressed_pseudocount0.1',
 'CM_AmE_destressed_aligned_w_LTR_CMU_destressed_pseudocount0',
 'LTR_Buckeye_aligned_w_GD_AmE_destressed',
 'Run n-phone analysis of gating data.ipynb',
 'GD_AmE_destressed_aligned_w_LTR_Buckeye',
 '__pycache__',
 'GD_AmE-diphones - LTR_CMU_destressed alignment application to LTR_CMU_destressed.ipynb',
 'GD_AmE-diphones - LTR_CMU_destressed alignment definition.ipynb',
 'LD_Fisher_vocab_in_Buckeye_contexts',
 'GD_AmE-diphones - LTR_newdic_destressed alignment application to AmE-diphones.ipynb',
 '1 initial directory setup.txt',
 'GD_AmE-diphones - LTR_newdic_de

# Imports / loading data

In [11]:
import kenlm
import csv
import numpy as np

In [12]:
from boilerplate import stamp, stampedNote

In [13]:
from tqdm import tqdm

from joblib import Parallel, delayed

# Settings forwarded to joblib.Parallel by par() below.
J = 30  # n_jobs
BACKEND = 'multiprocessing'  # backend
# BACKEND = 'loky'
V = 10  # verbose
PREFER = 'processes'  # prefer
# PREFER = 'threads'

def identity(x):
    """Return the argument unchanged."""
    return x

def par(gen_expr):
    """Run the delayed calls produced by ``gen_expr`` through joblib.

    A ``Parallel`` object is built from the module-level settings ``J``
    (``n_jobs``), ``BACKEND`` (``backend``), ``V`` (``verbose``) and ``PREFER``
    (``prefer``), and is then applied to ``gen_expr``.

    Returns whatever the ``Parallel`` call returns.
    """
    pool = Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)
    return pool(gen_expr)

## Language model

In [14]:
model = kenlm.LanguageModel(m)

## Contexts

In [15]:
# Read the contexts file: one context per line, with trailing whitespace
# (including the newline) stripped. The result is an immutable tuple.
with open(c) as contexts_file:
    contexts = tuple(raw_line.rstrip() for raw_line in contexts_file)
len(contexts)
contexts[:10]

17415

('a',
 "aaron's",
 'ability',
 'about',
 'above',
 'absent',
 'absentee',
 'absolutely',
 'accept',
 'accommodate')

In [16]:
assert len(set(contexts)) == len(contexts), "Contexts must consist of unique strings."

## Vocabulary

In [17]:
# Read the vocabulary file: one wordform per line, with trailing whitespace
# (including the newline) stripped. The result is an immutable tuple.
with open(v) as vocabulary_file:
    vocabulary = tuple(raw_line.rstrip() for raw_line in vocabulary_file)
len(vocabulary)
vocabulary[:10]

44064

("'and",
 "'berserkly'",
 "'bout",
 "'burb",
 "'burban",
 "'burbs",
 "'cau",
 "'cause",
 "'cept",
 "'cide")

In [18]:
assert len(set(vocabulary)) == len(vocabulary), "Vocabulary must consist of unique wordforms."

# Main calculation

In [19]:
from random import choice

In [20]:
# Pick one random context and one random wordform to use as a running example
# in the cells below. No seed is set in the visible cells, so the picks (and
# the outputs that depend on them) change from run to run.
ctxt = choice(contexts)
ctxt

wrd = choice(vocabulary)
wrd

'grabbed'

'geraldo'

In [21]:
# Exploration of the kenlm scoring API on a toy sentence.
# First: how the `eos` flag and an explicit "</s>" token change the total score.
model.score("this is a sentence", eos = False)
model.score("this is a sentence")
model.score("this is a sentence", eos = True)
model.score("this is a sentence </s>", eos=False)
model.score("this is a sentence </s>")
# Per-token scores with the default flags; the first element of each triple is
# summed so it can be compared with the model.score() totals above.
tuple(model.full_scores("this is a sentence"))
sum(map(lambda triple: triple[0],
        tuple(model.full_scores("this is a sentence"))))
# Visual separator in the displayed output.
' '
# Same, but with neither sentence-begin nor sentence-end handling; this is the
# configuration used by score() below.
tuple(model.full_scores("this is a sentence", eos=False, bos=False))
sum(map(lambda triple: triple[0],
        tuple(model.full_scores("this is a sentence", eos=False, bos=False))))

-9.022207260131836

-9.479642868041992

-9.479642868041992

-9.479642868041992

-11.116127967834473

((-2.684236526489258, 2, False),
 (-0.21483998000621796, 3, False),
 (-1.1899677515029907, 4, False),
 (-4.933162689208984, 2, False),
 (-0.45743539929389954, 3, False))

-9.47964234650135

' '

((-2.610599994659424, 1, False),
 (-1.2509719133377075, 2, False),
 (-1.429597020149231, 3, False),
 (-4.933162689208984, 2, False))

-10.224331617355347

In [22]:
from math import log10, log2

In [23]:
def score(word, context, base2=True, surprisal=True):
    """Score ``word`` as the continuation of ``context`` under the global ``model``.

    The string ``context + ' ' + word`` is passed to ``model.full_scores`` with
    ``eos=False`` and ``bos=False``, and the score of the *last* token is used.

    Parameters
    ----------
    word : str
        Wordform to score.
    context : str
        Space-separated wordforms preceding ``word``.
    base2 : bool
        If True, convert the model's score (treated as a log10 value) to
        log base 2. If False, leave it in the model's native base.
    surprisal : bool
        If True, negate the log score.

    Returns
    -------
    float
        The (optionally rebased, optionally negated) score of the final token.
    """
    score_infos = tuple(model.full_scores(context + ' ' + word, eos=False, bos=False))
    # Start from the native score so that base2=False is well defined
    # (previously the result was only assigned inside the base2 branch).
    key_score = score_infos[-1][0]
    if base2:
        key_score = key_score / log10(2)
    if surprisal:
        key_score = -1.0 * key_score
    return key_score

In [24]:
ctxt
wrd
score(wrd, ctxt)

'grabbed'

'geraldo'

19.30301092009873

# Calculate the number of computations + estimate required space

In [25]:
# Size of a dense matrix with one 64-bit cell per (context, word) pair.
bits_per_cell = 64
bytes_per_cell = bits_per_cell / 8

len(contexts)
len(vocabulary)
format(len(contexts) * len(vocabulary), ',')
format(len(contexts) * len(vocabulary) * bytes_per_cell / 1e9, ',') + ' GB'

17415

44064

'767,374,560'

'6.13899648 GB'

In [None]:
# from itertools import product

In [None]:
# computations = product(contexts, vocabulary)

In [None]:
# computations = tuple(product(contexts, vocabulary))

In [None]:
# from random import choices

In [None]:
# example_computations = choices(computations, k=10)
# example_computations
# ex = choice(example_computations)
# ex

# Pick out relevant functions for mapping between context/word and index

In [27]:
wrd
vocabulary.index(wrd)
ctxt
contexts.index(ctxt)

'geraldo'

16024

'grabbed'

448

In [None]:
# ex
# contexts.index(ex[0])
# contexts[ contexts.index(ex[0]) ]

In [28]:
def score_np(word_idx, context_idx, base2=True, surprisal=True):
    """Index-based wrapper around ``score``.

    Looks up ``vocabulary[word_idx]`` and ``contexts[context_idx]`` and returns
    ``score`` for that pair, forwarding ``base2`` and ``surprisal`` unchanged.
    """
    word = vocabulary[word_idx]
    context = contexts[context_idx]
    return score(word, context, base2=base2, surprisal=surprisal)

# Construct and write distributions

In [29]:
# Matrix layout: one row per wordform, one column per context.
num_words, num_contexts = len(vocabulary), len(contexts)
my_shape = (num_words, num_contexts) #columns are distributions
my_shape
num_words * num_contexts

(44064, 17415)

767374560

In [30]:
memory_map = False

## Doing calculations in-place / via memory mapped arrays

This approach only makes sense if the array is too large to fit in memory (or is very small); otherwise joblib will use threads to parallelize the computation (because it involves lots of IO).

In [31]:
if memory_map:
    hVC = np.memmap(o + '.hV_C', dtype='float64', mode='w+', shape=my_shape)
    hVC.nbytes / 1e9
    hVC.dtype

In [32]:
score_np_vec = np.vectorize(score_np)

In [33]:
if memory_map:
    def define_score(word_idx, context_idx, base2=True, surprisal=True):
        """Compute one score and store it in ``hVC[word_idx, context_idx]``.

        The value comes from ``score_np`` with the same arguments; nothing is
        returned.
        """
        cell_value = score_np(word_idx, context_idx, base2=base2, surprisal=surprisal)
        hVC[word_idx, context_idx] = cell_value

In [34]:
if memory_map:
    stampedNote("Started calculations")

In [35]:
if memory_map:
    # est 7h on wittgenstein with J=30 and other computations going on in the background
    # pretty sure that means it's using threads rather than processes
    par(delayed(define_score)(w_idx, ctxt_idx) 
        for ctxt_idx in range(num_contexts) 
        for w_idx in range(num_words))

In [36]:
if memory_map:
    stampedNote("Ended calculations")

## Doing calculations in memory and writing to disk

In [38]:
if not memory_map:
    #~30m on wittgenstein
#     hVC = np.vstack([score_np_vec(np.array(range(num_words)), c_idx) for c_idx in tqdm(range(num_contexts))]).T

    #takes ~2.75m on wittgenstein with J=30 and other things going on in the background
    # One parallel task per context: each task scores every word index in that
    # context. The per-context rows are stacked and transposed so that columns
    # correspond to contexts.
    word_indices = np.arange(num_words)
    per_context_scores = par(delayed(score_np_vec)(word_indices, c_idx)
                             for c_idx in range(num_contexts))
    hVC = np.vstack(per_context_scores).T

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1436s.) Setting batch_size=2.
[Parallel(n_jobs=30)]: Batch computation too fast (0.1436s.) Setting batch_size=4.
[Parallel(n_jobs=30)]: Done   4 tasks      | elapsed:    0.4s
[Parallel(n_jobs=30)]: Done  13 tasks      | elapsed:    0.7s
[Parallel(n_jobs=30)]: Done  24 tasks      | elapsed:    1.0s
[Parallel(n_jobs=30)]: Done  52 tasks      | elapsed:    1.5s
[Parallel(n_jobs=30)]: Done 104 tasks      | elapsed:    1.9s
[Parallel(n_jobs=30)]: Done 156 tasks      | elapsed:    2.4s
[Parallel(n_jobs=30)]: Done 216 tasks      | elapsed:    3.0s
[Parallel(n_jobs=30)]: Done 276 tasks      | elapsed:    3.4s
[Parallel(n_jobs=30)]: Done 344 tasks      | elapsed:    4.0s
[Parallel(n_jobs=30)]: Done 412 tasks      | elapsed:    4.6s
[Parallel(n_jobs=30)]: Done 488 tasks      | elapsed:    5.3s
[Parallel(n_jobs=30)]: Done 564 tasks      | elapsed:    6.0s
[P

In [45]:
if not memory_map:
    hVC.nbytes / 1e9
    hVC.dtype
    # NOTE(review): mode='w+' creates or overwrites the file, and `hVC` is
    # rebound to the memmap below, so re-running this cell without first
    # recomputing the in-memory `hVC` looks unsafe -- confirm before re-running.
    hVC_on_disk = np.memmap(o + '.hV_C', dtype='float64', mode='w+', shape=my_shape)

    #takes ~1.25m on wittgenstein
    hVC_on_disk[:,:] = hVC
    # Write the data out explicitly: `del` below only removes a name, because
    # `hVC` keeps referencing the same memmap object.
    hVC_on_disk.flush()
    hVC = hVC_on_disk
    del hVC_on_disk

6.13899648

dtype('float64')

<function numpy.save(file, arr, allow_pickle=True, fix_imports=True)>

In [46]:
o

'/home/AD/emeinhar/wr/LD_Fisher_vocab_in_Buckeye_contexts/LD_fisher_vocab_in_buckeye_contexts'

In [47]:
listdir(output_dir)

['LD_fisher_vocab_in_buckeye_contexts.pV_C',
 'buckeye_contexts.txt',
 'LD_fisher_vocab_in_buckeye_contexts',
 '.ipynb_checkpoints',
 'LD_fisher_vocab_in_buckeye_contexts.hV_C',
 'fisher_vocabulary_main.txt']

In [48]:
hVC.nbytes / 1e9
hVC.shape
hVC.dtype

6.13899648

(44064, 17415)

dtype('float64')

## Creating a version that contains probabilities (and is normalized)

We can't just naively convert (negative) log-probabilities to probabilities if we're interested in distributions: as the column sums below show, the resulting values for a given context do not sum to 1.

In [49]:
# NB! assumes hVC is in base 2 surprisals...
dist_norms = np.sum(np.exp2(-1.0 * hVC), axis=0)
dist_norms

array([0.98952166, 0.98384789, 0.86949554, ..., 0.93416071, 0.96380412,
       0.86971511])

### Sketch of normalization process

In [50]:
random_context = choice(contexts)
random_context

'a member of'

In [51]:
contexts.index(random_context)

7121

In [52]:
hW_rc = np.array(par(delayed(score)(w, random_context) for w in vocabulary))

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0031s.) Setting batch_size=128.
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0299s.) Setting batch_size=1712.
[Parallel(n_jobs=30)]: Done 1084 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done 44064 out of 44064 | elapsed:    0.2s finished


In [53]:
np.array_equal( hVC[:,contexts.index(random_context)], hW_rc )

True

In [54]:
pW_rc = np.exp2(-1.0 * hW_rc)
pW_rc[:10]
np.sum(pW_rc)

array([7.81000085e-08, 7.81000085e-08, 3.09774365e-06, 1.09451992e-07,
       7.81000085e-08, 7.81000085e-08, 7.81000085e-08, 1.97049012e-04,
       3.42848590e-07, 7.81000085e-08])

0.9849723253583984

In [56]:
hVC.shape

(44064, 17415)

In [57]:
# from https://stats.stackexchange.com/a/66621
# Worked example for a single context: turn the base-2 surprisals in hW_rc into
# a normalized probability vector.
my_logprobs = -1.0 * hW_rc
my_epsilon = 10 ** (-1.0 * 16)
print('𝛆 = {0}'.format( my_epsilon ))

my_n = my_logprobs.shape[0]
print('n = {0}'.format( my_n ))

# Largest log-probability; every value is shifted by it before exponentiating.
my_max = np.max(my_logprobs)
print('max(λᵢ) = λᵦ = {0}'.format( my_max ))

# Shifted log-probabilities below this cutoff are replaced by 0.
my_threshold = np.log2(my_epsilon) - np.log2(my_n)
print('𝚹 = {0}'.format( my_threshold ))

# Number of entries that survive the cutoff.
mask = my_logprobs - my_max >= my_threshold
np.sum(mask)
# Scalar and vectorized versions of the same shift / threshold / exponentiate
# step; the assert checks that they agree.
to_alpha = lambda logprob: np.exp2(logprob - my_max) if (logprob - my_max) >= my_threshold else 0.0
to_alpha_vec = lambda logprobs: np.exp2(logprobs - my_max) * (logprobs - my_max >= my_threshold)
my_alphas = np.array([to_alpha(l) for l in my_logprobs])
assert np.array_equal(my_alphas, to_alpha_vec(my_logprobs))
# Divide by the total so that the result sums to 1.
my_alpha_norm = np.sum(my_alphas)
my_probs = my_alphas / my_alpha_norm
np.sum(my_probs)

𝛆 = 1e-16
n = 44064
max(λᵢ) = λᵦ = -1.4649360837523755
𝚹 = -68.57816236233276


44064

1.0000000000000002

## Normalization

In [58]:
# from https://stats.stackexchange.com/a/66621
def normalize_logprobs(logprobs, d=16, axis=0, b=None):
    """Convert unnormalized log-probabilities into a normalized distribution.

    Each value is shifted by the maximum along ``axis`` and exponentiated in
    base ``b``; shifted values below ``log_b(10**-d) - log_b(n)`` are treated as
    0. The results are then divided by their sum along ``axis``.

    Parameters
    ----------
    logprobs : array_like
        Log-probabilities (not surprisals). May be 1-D or N-D.
    d : int or float
        Number of decimal digits of precision; ``epsilon = 10**-d``.
    axis : int
        Axis along which each distribution lies (and is normalized).
    b : None or number
        Base of the logarithms. ``None`` means natural log.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``logprobs`` whose entries sum to 1 along
        ``axis``.

    Raises
    ------
    ValueError
        If ``logprobs`` has no entries along ``axis``.
    """
    logprobs = np.asarray(logprobs, dtype=float)
    n = logprobs.shape[axis]
    if n == 0:
        raise ValueError("Cannot normalize an empty set of log-probabilities.")

    # Pick matching log / exp functions for the requested base.
    if b is None:
        log_b, exp_b = np.log, np.exp
    elif b == 2:
        log_b, exp_b = np.log2, np.exp2
    elif b == 10:
        log_b = np.log10
        exp_b = lambda x: np.power(10.0, x)
    else:
        log_of_base = np.log(b)
        log_b = lambda x: np.log(x) / log_of_base
        exp_b = lambda x: np.power(float(b), x)

    epsilon = 10 ** (-1.0 * d)
    threshold = log_b(epsilon) - log_b(n)

    # keepdims=True keeps the reduced axis so the shift and the final division
    # broadcast correctly for any ``axis``.
    maxlogp = np.max(logprobs, axis=axis, keepdims=True)
    shifted = logprobs - maxlogp
    alpha_is = np.where(shifted >= threshold, exp_b(shifted), 0.0)
    alpha_norm = np.sum(alpha_is, axis=axis, keepdims=True)
    probs = alpha_is / alpha_norm
    return probs

In [59]:
normalize_logprobs(-1.0 * hW_rc, b=2)

array([7.92915765e-08, 7.92915765e-08, 3.14500577e-06, ...,
       7.92915765e-08, 1.11121896e-07, 7.92915765e-08])

In [60]:
normalize_logprobs(-1.0 * hVC[:,0], b=2)
normalize_logprobs(-1.0 * hVC[:,0], b=2).sum()

array([2.46539838e-07, 2.46539838e-07, 1.45738607e-05, ...,
       2.46539838e-07, 3.45509264e-07, 2.46539838e-07])

1.0

In [65]:
if memory_map:
    pVC = np.memmap(o + '.pV_C', dtype='float64', mode='w+', shape=my_shape)

In [62]:
# if memory_map:
#     for j in range(my_shape[1]):
#         pVC[:,j] = normalize_logprobs(-1.0 * hVC[:,j], b=2)

In [66]:
def normColumn(j):
    """Normalize column ``j`` of ``hVC`` and store the result in ``pVC[:, j]``.

    The column is negated before being passed to ``normalize_logprobs`` with
    ``b=2``. Nothing is returned.
    """
    column_logprobs = -1.0 * hVC[:, j]
    pVC[:, j] = normalize_logprobs(column_logprobs, b=2)

if memory_map:
    # takes 3.4m on wittgenstein with J=30 and other stuff going on in the background
    par(delayed(normColumn)(j) for j in range(num_contexts))

In [67]:
if not memory_map:
    #takes ~30s on wittgenstein with J=30 and other stuff going on in the background
    # hVC holds base-2 surprisals, so the negated columns are base-2
    # log-probabilities and must be normalized with b=2 (as normColumn does).
    # Without it the values are exponentiated with base e, which still sums
    # to 1 per column but yields a different distribution.
    pVC = np.vstack(par(delayed(normalize_logprobs)(-1.0 * hVC[:,j], b=2)
                        for j in range(num_contexts))).T

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0071s.) Setting batch_size=56.
[Parallel(n_jobs=30)]: Batch computation too fast (0.0071s.) Setting batch_size=3160.
[Parallel(n_jobs=30)]: Done   2 tasks      | elapsed:    2.8s
[Parallel(n_jobs=30)]: Done   3 tasks      | elapsed:    6.7s
[Parallel(n_jobs=30)]: Batch computation too slow (13.4714s.) Setting batch_size=1580.
[Parallel(n_jobs=30)]: Done 9540 tasks      | elapsed:   26.1s
[Parallel(n_jobs=30)]: Done 17415 out of 17415 | elapsed:   31.2s finished


In [70]:
if not memory_map:
    #takes ~1.25m on wittgenstein with other stuff going on in the background
    # Copy the in-memory probabilities into a file-backed array at o + '.pV_C'.
    pVC_on_disk = np.memmap(o + '.pV_C', dtype='float64', mode='w+', shape=my_shape)
    pVC_on_disk[:,:] = pVC
    # Explicitly write pending changes to the file; the memmap object is never
    # deleted in this notebook, so nothing else forces the write.
    pVC_on_disk.flush()

In [68]:
pVC.shape
pVC.dtype
pVC.nbytes / 1e9
np.sum(pVC, axis=0)

(44064, 17415)

dtype('float64')

6.13899648

array([1., 1., 1., ..., 1., 1., 1.])

In [71]:
listdir(output_dir)

['LD_fisher_vocab_in_buckeye_contexts.pV_C',
 'buckeye_contexts.txt',
 'LD_fisher_vocab_in_buckeye_contexts',
 '.ipynb_checkpoints',
 'LD_fisher_vocab_in_buckeye_contexts.hV_C',
 'fisher_vocabulary_main.txt']