In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from __future__ import division, print_function
import codecs
import re
import json
import random
import math
import os
import cPickle
from collections import Counter, defaultdict

import numpy as np
import matplotlib.pyplot as plt
import pylab
import pandas as pd
from scipy.stats import norm
import nltk
from fuel.datasets import H5PYDataset

In [2]:
# now create the fuel dataset
DATASET_LOCATION = 'datasets/'
DATASET_NAME = 'brown_pos_dataset.hdf5'
DATASET_PATH = os.path.join(DATASET_LOCATION, DATASET_NAME)

with open(os.path.join(DATASET_LOCATION, 'brown_pos_dataset.indices')) as indices_file:
    corpus_indices = cPickle.load(indices_file)

# in order to use Logistic Regression for POS tagging, we need some features for our words
# so let's get them from an SVD 
# -- another option is to use a pre-trained index so that the input is the same for every model

# for the NN examples, we'll either train our embeddings from scratch, or pre-initialize with Glove or W2V
# build a sparse matrix for all of our instances in the train set

# if a word in test or dev isn't in train, map it to u'_UNK_'
# the training data dictates the words we know and don't know





In [3]:
# ok lets load the brown corpus, and use the indexes to convert it to ints,
# then build tfidf, then transpose to get w X d

UNKNOWN_TOKEN = u'_UNK_'

def map_to_index(tok, index):
    if tok in index:
        return index[tok]
    else:
        return index[UNKNOWN_TOKEN]

brown_documents = [[w for p in d for w in p] for d in nltk.corpus.brown.paras()]
brown_documents = [[map_to_index(w, corpus_indices['word2idx']) for w in d] for d in brown_documents]

In [4]:
brown_vocab_size = len(corpus_indices['word2idx'].keys())
print(brown_vocab_size)

# for each doc, fill in the word counts of that row
# then take the transpose to get VxD
# allocate np.array of 0s with dims DxV
brown_doc_tf = np.zeros((len(brown_documents), brown_vocab_size), dtype='uint16')
for doc_id, doc in enumerate(brown_documents):
    counts = Counter(doc)
    words, counts = zip(*counts.items())
    brown_doc_tf[doc_id, words] = counts
    
brown_word_tf = brown_doc_tf.T

9769


In [20]:
# convert to sparse binary
binary_word_by_doc = [[(d,1) for d in np.nonzero(row)[0]] for row in brown_word_tf]

In [21]:
# ok let's train some vector spaces to use as features for our words
from gensim import corpora, models, similarities


In [1]:
the_idx = corpus_indices['word2idx']['the']

word_doc_counts[the_idx]



NameError: name 'corpus_indices' is not defined

In [23]:
# this creates a dictionary mapping tokens in the corpus to integer ids 
# working -- check this implementation
# global_dictionary = corpora.Dictionary(all_semeval_toks)

# TODO: really we want the BOW to be built from the original Brown sentences, not from our windowed representations
# TODO: but that wouldn't be fair, because the NNs will only have access to the windows
# TODO: we need to flip the index in order to get word vectors 

# create a corpus from the documents -- (word_id, word_freq)
# global_corpus = [Counter(instance).items() for instance in train_X_list]
# global_corpus = train_X_list

global_corpus = binary_word_by_doc

# create a tfidf transformation from our corpus of counts
global_tfidf_transformation = models.TfidfModel(global_corpus)
global_corpus_tfidf = global_tfidf_transformation[global_corpus]
# global_tfidf_index = similarities.MatrixSimilarity(global_corpus_tfidf)

# dataset_names = all_semeval.get_dataset_names()

# lsi = models.LsiModel(global_corpus_tfidf, id2word=global_dictionary, num_topics=50) # initialize an LSI transformation
lsi = models.LsiModel(global_corpus_tfidf, num_topics=50) # initialize an LSI transformation

lsi_index = similarities.MatrixSimilarity(lsi[global_corpus]) # transform corpus to LSI space and index it



In [47]:
# Sanity check that the index models some distributional information
TEST_WORD = 'company'
test_idx = corpus_indices['word2idx'][TEST_WORD]
binary_word_by_doc[test_idx]

# lsi_index.index.shape
test_vec = lsi_index.index[test_idx]

# do a little transposition dance to stop numpy from making a copy of
        # self.index internally in numpy.dot (very slow).
result = np.dot(lsi_index.index, test_vec.T).T  # return #queries x #index
most_similar = np.argsort(result)[::-1]

In [48]:
N = 20
top_N = [corpus_indices['idx2word'][idx] for idx in most_similar[:N]]
top_N

[u'company',
 u'firm',
 u'companies',
 u'union',
 u'five',
 u'business',
 u'trade',
 u'money',
 u'payroll',
 u'for',
 u'.',
 u'stock',
 u'expense',
 u'workers',
 u'_UNK_',
 u',',
 u'the',
 u'dollars',
 u'market',
 u'insurance']

In [33]:
result.sha

(9769,)

In [24]:
print(len(brown_documents))

15667


In [7]:
brown_word_tf.shape

(9769, 15667)

In [22]:
corpus_indices.keys()

['idx2tag', 'idx2word', 'word2idx', 'tag2idx']

In [23]:
train_set = H5PYDataset(DATASET_PATH, which_sets=('train',))
print(train_set.num_examples)

test_set = H5PYDataset(DATASET_PATH, which_sets=('test',))
print(test_set.num_examples)

dev_set = H5PYDataset(DATASET_PATH, which_sets=('dev',))
print(dev_set.num_examples)

in_memory_train = H5PYDataset(
    DATASET_PATH, which_sets=('train',),
    sources=['instances', 'targets'], load_in_memory=True)

train_X, train_y = in_memory_train.data_sources

131862
43954
43954


In [None]:
train_X_list = [[i for i in row] for row in train_X]
# train_X_list[:10]

In [None]:
# sanity check a few words to ensure that their vectors are actually similar
lsi_index.index.shape
lsi_index.