In [58]:
# use windows from hdf5 dataset -- extract the distributions around words to build our baseline representation
# the idea is to encode each word by the words which occur around it
# this should give us a good idea of the distributional tendencies of this word

%load_ext autoreload
%autoreload 2

import os
import cPickle
from collections import Counter, defaultdict
from itertools import chain

import numpy as np
from fuel.datasets import H5PYDataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [67]:
# Constants used throughout this notebook.

# Number of most-frequent corpus tokens kept as context features; every other token is
# mapped to the unknown token. This is an important hyperparameter because it controls
# the sparsity of the window representation: each context position contributes
# TOKEN_FREQ_CUTOFF columns to the final matrix.
TOKEN_FREQ_CUTOFF = 25
# Number of context positions per window, i.e. window size - 1, because the middle
# (target) token is deleted before the context is encoded.
TOKS_IN_WINDOW = 4

# the directory where we store datasets (relative to this notebook)
DATASET_LOCATION = '../../datasets/'

# the pos dataset consists of windows around words
POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)

# pickled index file; this notebook reads its 'word2idx' and 'idx2word' entries
CORPUS_INDICES = 'brown_pos_dataset.indices'
# file name (inside DATASET_LOCATION) of the word-by-word matrix this notebook writes
VECTOR_INDEX = 'brown.word-by-word.normalized.npy'

# Indexes for mapping words <--> ints (this notebook reads the 'word2idx' and
# 'idx2word' entries).
# Pickle data is a byte stream, so the file is opened in binary mode; text mode can
# corrupt the stream on platforms that translate line endings.
# NOTE: unpickling can execute arbitrary code -- only load index files from a trusted source.
with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES), 'rb') as indices_file:
    corpus_indices = cPickle.load(indices_file)

# placeholder token that stands in for every token outside the top-N vocabulary
UNKNOWN_TOKEN = u'_UNK_'
UNKNOWN_TOKEN_IDX = corpus_indices['word2idx'][UNKNOWN_TOKEN]

# load the whole training split into memory and pull out its two sources:
# the token windows ('instances') and their labels ('targets')
train_set = H5PYDataset(
    POS_DATASET_PATH,
    which_sets=('train',),
    sources=['instances', 'targets'],
    load_in_memory=True,
)
train_X, train_y = train_set.data_sources

In [68]:
# count how often every token id occurs anywhere in the training windows
token_counts = Counter(chain.from_iterable(train_X))
# number of distinct token ids seen in the training windows
# NOTE(review): the matrix built below is indexed by raw token id but has vocab_size rows,
# which presumes the ids are dense in 0..vocab_size-1 -- confirm against the index file
vocab_size = len(token_counts)

In [69]:
# ids of the TOKEN_FREQ_CUTOFF most frequent tokens
top_N_tokens = set(tok for tok, _ in token_counts.most_common(TOKEN_FREQ_CUTOFF))

# map to new 0-based indices (old corpus id -> compact id in 0..TOKEN_FREQ_CUTOFF-1)
# NOTE(review): idx_or_unk looks up UNKNOWN_TOKEN_IDX in this index, so the unknown
# token is presumed to be among the top N -- verify if the cutoff is lowered
top_N_index = dict((old_idx, new_idx) for new_idx, old_idx in enumerate(top_N_tokens))

In [70]:
def idx_or_unk(index, token):
    """Look up `token` in `index`, falling back to the unknown-token entry.

    index -- mapping from token id to its new index
    token -- the token id to look up

    Returns index[token] when `token` is a key of `index`, otherwise
    index[UNKNOWN_TOKEN_IDX] (UNKNOWN_TOKEN_IDX is the notebook-level id of the
    unknown token).
    """
    key = token if token in index else UNKNOWN_TOKEN_IDX
    return index[key]

In [71]:
# Build the word-by-word co-occurrence matrix.
# Rows are indexed by the id of the token in the middle of a window; the columns are
# TOKS_IN_WINDOW consecutive slices of TOKEN_FREQ_CUTOFF entries, one slice per
# context position, and each slice holds counts over the top-N tokens.
num_rows = vocab_size

num_cols = TOKS_IN_WINDOW * TOKEN_FREQ_CUTOFF

word_by_word_index = np.zeros((num_rows, num_cols), dtype="float32")

# Position of the target token inside a window. A window holds TOKS_IN_WINDOW context
# tokens plus the target in the middle, so this is 2 for the 5-token windows used here.
middle_idx = TOKS_IN_WINDOW // 2

for instance in train_X:
    pos_token = instance[middle_idx]
    new_idxs = [idx_or_unk(top_N_index, tok) for tok in instance]
    # delete the token itself so only the context positions remain
    del new_idxs[middle_idx]

    # now iterate over idx and increment the count at the right place:
    # context position i owns columns [i * TOKEN_FREQ_CUTOFF, (i + 1) * TOKEN_FREQ_CUTOFF)
    for i, idx in enumerate(new_idxs):
        word_by_word_index[pos_token, idx + (i * TOKEN_FREQ_CUTOFF)] += 1


In [72]:
# normalize counts by row to 0-1 (every row is divided by its own largest count)
# A row that never received a count has a maximum of 0; dividing by it would produce NaN,
# and because NaN counts as nonzero the binarization step further down would turn such a
# row into all ones. Dividing those rows by 1 instead keeps them all zeros.
row_maxes = word_by_word_index.max(axis=1)
row_maxes[row_maxes == 0] = 1
word_by_word_index = (word_by_word_index / row_maxes[:, np.newaxis]).astype('float32')

# TESTING
# sanity check on the row of 'the' (corpus id 7524): rank the top-N tokens by their
# weight in the second column slice, i.e. the context position right before the target
# in a 5-token window, highest first
reverse_top_N = dict((new_idx, old_idx) for old_idx, new_idx in top_N_index.items())

second_slot = word_by_word_index[7524, TOKEN_FREQ_CUTOFF:2 * TOKEN_FREQ_CUTOFF]
max_idxs = second_slot.argsort()[::-1]
[corpus_indices['idx2word'][reverse_top_N[col]] for col in max_idxs]

[u'_UNK_',
 u'of',
 u'in',
 u',',
 u'to',
 u'on',
 u'for',
 u'at',
 u'and',
 u'that',
 u'with',
 u'by',
 u'is',
 u'as',
 u'was',
 u'be',
 u'_START_',
 u'``',
 u"''",
 u'_END_',
 u'the',
 u'The',
 u'he',
 u'a',
 u'.']

In [73]:
# Binarize the matrix in place: every nonzero entry becomes 1.
# TODO: implement real-valued representation -- right now the autoencoder only works with binary
# TODO: non-autoencoders can use real-valued features
word_by_word_index[word_by_word_index != 0] = 1

In [74]:
# persist the new index as a .npy file inside the dataset directory
vector_index_path = os.path.join(DATASET_LOCATION, VECTOR_INDEX)
with open(vector_index_path, 'wb') as outfile:
    np.save(outfile, word_by_word_index)