In [1]:
# word_window_vectors

# the idea is to encode each word by the words which occur around it

# this should give us a good idea of the distributional tendencies of this word

In [2]:
# use windows from hdf5 dataset -- extract the distributions around words to build our baseline representation

%matplotlib inline
%load_ext autoreload
%autoreload 2

# from __future__ import division, print_function
import codecs
import re
import json
import random
import math
import os
import gzip
import cPickle
from collections import Counter, defaultdict
from itertools import chain

import numpy as np
import matplotlib.pyplot as plt
import pylab
import pandas as pd
from scipy.stats import norm
import nltk
from fuel.datasets import H5PYDataset

# default matplotlib figure size as (width, height), in inches
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

In [10]:
DATASET_LOCATION = 'datasets/'

# the pos dataset consists of windows around words
POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)

CORPUS_INDICES = 'brown_pos_dataset.indices'
WORD_BY_DOC_MATRIX = "brown.word-by-doc.binary.npy"

# Indexes for mapping words <--> ints.
# Pickles are binary data, so the file is opened with 'rb': text mode can
# corrupt binary-protocol pickles on platforms that translate line endings.
# NOTE(review): unpickling runs arbitrary code -- only load index files that
# were produced by a trusted pipeline.
with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES), 'rb') as indices_file:
    corpus_indices = cPickle.load(indices_file)

UNKNOWN_TOKEN = u'_UNK_'
UNKNOWN_TOKEN_IDX = corpus_indices['word2idx'][UNKNOWN_TOKEN]

# Word X Doc binary matrix
word_df = np.load(os.path.join(DATASET_LOCATION, WORD_BY_DOC_MATRIX))


def _load_split(split_name):
    """Load one named split of the POS window dataset fully into memory.

    Returns the ``data_sources`` of the H5PYDataset restricted to the
    'instances' and 'targets' sources, in that order, so the result can be
    unpacked as ``X, y``.
    """
    return H5PYDataset(
        POS_DATASET_PATH, which_sets=(split_name,),
        sources=['instances', 'targets'], load_in_memory=True).data_sources


train_X, train_y = _load_split('train')
dev_X, dev_y = _load_split('dev')
test_X, test_y = _load_split('test')




In [11]:
# Tally every token id over all positions of all training windows; these
# frequencies decide which tokens are kept as context features.
TOKEN_FREQ_CUTOFF = 1000

token_counts = Counter(chain.from_iterable(train_X))
# NOTE(review): this is the number of distinct ids seen in train_X; it is
# later used as a row count that is indexed by token id -- confirm the ids
# are dense (0..vocab_size-1), otherwise that indexing can go out of bounds.
vocab_size = len(token_counts)

In [12]:
# ids of the TOKEN_FREQ_CUTOFF most frequent tokens (the counts are dropped)
top_N_tokens = {token_id for token_id, _count in token_counts.most_common(TOKEN_FREQ_CUTOFF)}

In [20]:
# Re-number the kept corpus ids as 0..len(top_N_tokens)-1, following the
# iteration order of the set, so they can be used as compact column offsets.
top_N_index = dict(zip(top_N_tokens, range(len(top_N_tokens))))

In [22]:
# look up a token in the given index; tokens that are not in the index fall back to the entry stored under UNKNOWN_TOKEN_IDX

def idx_or_unk(index, token):
    """Return ``index[token]``, or the unknown-token entry if token is absent.

    Args:
        index: mapping from token id to its re-numbered id.
        token: the token id to look up.

    Returns:
        ``index[token]`` when ``token`` is a key of ``index``; otherwise
        ``index[UNKNOWN_TOKEN_IDX]`` (module-level constant).

    Raises:
        KeyError: if the fallback is needed and ``UNKNOWN_TOKEN_IDX`` is not
            a key of ``index`` either.
    """
    lookup_key = token if token in index else UNKNOWN_TOKEN_IDX
    return index[lookup_key]

In [40]:
# Build the word-by-context feature matrix:
#   * one row per token id (the word in the centre of a window)
#   * TOKS_IN_WINDOW column slices of TOKEN_FREQ_CUTOFF columns each, one
#     slice per window position; within a slice the column is the token's
#     re-numbered top-N id (tokens outside the top N map to the unknown id)
# Each training window increments one cell per window position.

# don't delete the middle token -- it can help us to know the POS
TOKS_IN_WINDOW = 5
# window slot holding the word being described (the middle of the window)
CENTER_POSITION = TOKS_IN_WINDOW // 2

num_rows = vocab_size
num_cols = TOKS_IN_WINDOW * TOKEN_FREQ_CUTOFF

# int32 instead of int16: an int16 counter silently wraps around past 32767
word_by_word_counts = np.zeros((num_rows, num_cols), dtype="int32")

for instance in train_X:
    pos_token = instance[CENTER_POSITION]
    new_idxs = [idx_or_unk(top_N_index, tok) for tok in instance]
    # one increment per window position, offset into that position's slice
    for position, idx in enumerate(new_idxs):
        word_by_word_counts[pos_token, idx + (position * TOKEN_FREQ_CUTOFF)] += 1

# normalize counts by row to 0-1
# Rows that were never incremented (tokens that never occur in the centre
# slot) have a max of 0; dividing by it would fill the row with NaN, so those
# rows are divided by 1 instead and stay all-zero.
row_max = word_by_word_counts.max(axis=1)
row_max[row_max == 0] = 1

word_by_word_index = word_by_word_counts.astype('float32')
word_by_word_index /= row_max[:, np.newaxis]

In [48]:
# DEBUG HACK -- convert to binary?
# TODO: why does the autoencoder only work with binary and not with features in 0-1??
# (cost is nan when features are real valued)
#
# NaN compares as non-zero, so a plain `nonzero()` mask would turn every NaN
# entry (e.g. produced by a 0/0 during row normalisation) into a 1. NaNs are
# excluded explicitly so that only genuinely observed features become 1.
# The result is rebound rather than written in place, keeping the cell
# idempotent; dtype stays float32.
observed = (word_by_word_index != 0) & ~np.isnan(word_by_word_index)
word_by_word_index = observed.astype('float32')

In [49]:
# Write the feature matrix to disk in .npy format.
# NOTE(review): in file order this runs after the binarisation cell, so the
# file named "normalized" holds whatever word_by_word_index is at that point.
normalized_matrix_path = os.path.join(DATASET_LOCATION, 'brown.word-by-word.normalized.npy')
np.save(normalized_matrix_path, word_by_word_index)

In [43]:
# sanity check: which top-N token is the strongest feature in window slot 1
# for the row of 'the'? The row id and the column slice are derived from the
# indices/constants instead of hard-coded numbers ('the' was id 7524 and the
# slice 1000:2000), so the check keeps pointing at the right cells if the
# corpus indices or TOKEN_FREQ_CUTOFF change.
# NOTE(review): in file order this cell runs after the binarisation cell; on
# a 0/1 matrix argmax only returns the first active column of the slice.
SANITY_WORD = u'the'
SANITY_POSITION = 1  # presumably the slot right before the centre word (slot 2) -- confirm window order

reverse_top_N = {v: k for k, v in top_N_index.items()}

sanity_row = word_by_word_index[corpus_indices['word2idx'][SANITY_WORD]]
slot_start = SANITY_POSITION * TOKEN_FREQ_CUTOFF
max_idx = sanity_row[slot_start:slot_start + TOKEN_FREQ_CUTOFF].argmax()
corpus_indices['idx2word'][reverse_top_N[max_idx]]

u'of'

In [14]:
# display the corpus token ids that made the top-N frequency cut
top_N_tokens

{4097,
 8194,
 3,
 8198,
 2055,
 9559,
 2060,
 1709,
 2065,
 7161,
 6164,
 6166,
 345,
 8216,
 4122,
 2075,
 4128,
 2081,
 8227,
 2084,
 6181,
 38,
 6186,
 6151,
 6189,
 8240,
 49,
 6198,
 8247,
 57,
 58,
 4156,
 4157,
 3765,
 2112,
 66,
 2115,
 6212,
 4166,
 2119,
 8264,
 7231,
 32,
 2743,
 77,
 4179,
 6229,
 6233,
 91,
 6236,
 8285,
 8288,
 101,
 8294,
 6248,
 6250,
 112,
 6257,
 4210,
 2164,
 1504,
 119,
 4116,
 2170,
 4799,
 6268,
 6272,
 6273,
 130,
 6210,
 136,
 138,
 3095,
 8332,
 144,
 2193,
 146,
 8339,
 2197,
 4248,
 4251,
 6170,
 8351,
 6304,
 6305,
 8355,
 4262,
 4265,
 8362,
 4268,
 2418,
 2225,
 6323,
 3552,
 2230,
 1737,
 8376,
 372,
 2234,
 4284,
 190,
 8383,
 3757,
 6338,
 4811,
 8599,
 8391,
 8130,
 4297,
 205,
 6352,
 4308,
 8406,
 6963,
 4316,
 4317,
 222,
 4319,
 224,
 8912,
 379,
 9282,
 2277,
 6375,
 8425,
 3111,
 4334,
 4336,
 8433,
 4339,
 4340,
 8437,
 247,
 251,
 253,
 254,
 5689,
 6402,
 6403,
 4356,
 262,
 263,
 6408,
 2313,
 266,
 4364,
 4824,
 6624,
 4374

In [8]:
# the ten most frequent (token id, count) pairs
token_counts.most_common(10)

[(7524, 36277),
 (2674, 33349),
 (4929, 32632),
 (8433, 18944),
 (8860, 18451),
 (7806, 18448),
 (373, 16070),
 (1181, 14424),
 (8143, 14144),
 (3694, 12709)]

In [12]:
# surface forms of the 100 most frequent token ids, most frequent first
most_frequent_pairs = token_counts.most_common(100)
[corpus_indices['idx2word'][k] for k, v in most_frequent_pairs]

[u'the',
 u'_UNK_',
 u',',
 u'of',
 u'_START_',
 u'_END_',
 u'.',
 u'to',
 u'and',
 u'a',
 u'in',
 u'for',
 u'is',
 u'that',
 u'was',
 u'on',
 u"''",
 u'``',
 u'with',
 u'be',
 u'at',
 u'as',
 u'by',
 u'The',
 u'he',
 u'it',
 u'his',
 u'will',
 u'from',
 u'are',
 u'has',
 u'have',
 u'said',
 u'not',
 u'an',
 u'who',
 u'this',
 u'had',
 u'--',
 u'which',
 u'would',
 u'been',
 u'were',
 u'they',
 u'their',
 u';',
 u'or',
 u'but',
 u'one',
 u'its',
 u'more',
 u'Mrs.',
 u'all',
 u'I',
 u'Mr.',
 u'up',
 u'out',
 u'other',
 u'than',
 u'new',
 u'last',
 u'when',
 u'two',
 u'(',
 u'him',
 u'first',
 u'there',
 u':',
 u'year',
 u')',
 u'about',
 u'into',
 u'He',
 u'can',
 u'?',
 u'no',
 u'over',
 u'some',
 u'after',
 u'also',
 u'home',
 u'any',
 u'her',
 u'made',
 u'only',
 u'time',
 u'them',
 u'years',
 u'so',
 u'do',
 u'New',
 u'we',
 u'state',
 u'what',
 u'now',
 u'In',
 u'A',
 u'American',
 u'President',
 u'could']