In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from __future__ import division, print_function
import codecs
import re
import json
import random
import math
import os
import cPickle
from collections import Counter, defaultdict

import numpy as np
import matplotlib.pyplot as plt
import pylab
import pandas as pd
from scipy.stats import norm
import nltk
from fuel.datasets import H5PYDataset

In [2]:
# now create the fuel dataset
DATASET_LOCATION = 'datasets/'
DATASET_NAME = 'brown_pos_dataset.hdf5'
DATASET_PATH = os.path.join(DATASET_LOCATION, DATASET_NAME)

with open(os.path.join(DATASET_LOCATION, 'brown_pos_dataset.indices')) as indices_file:
    corpus_indices = cPickle.load(indices_file)

# in order to use Logistic Regression for POS tagging, we need some features for our words
# so let's get them from an SVD 
# -- another option is to use a pre-trained index so that the input is the same for every model

# for the NN examples, we'll either train our embeddings from scratch, or pre-initialize with Glove or W2V
# build a sparse matrix for all of our instances in the train set

# if a word in test or dev isn't in train, map it to u'_UNK_'
# the training data dictates the words we know and don't know


In [3]:
# ok lets load the brown corpus, and use the indexes to convert it to ints,
# then build tfidf, then transpose to get w X d

# TODO: convert the word vector representation into WxW cooccurrence over window size 
#  i.e. (top 499 words + _UNK_) * window

UNKNOWN_TOKEN = u'_UNK_'

# some params to filter the documents
MIN_DOC_LEN = 50
MAX_DOC_LEN = 500

def map_to_index(tok, index):
    if tok in index:
        return index[tok]
    else:
        return index[UNKNOWN_TOKEN]

brown_documents = [[w for p in d for w in p] for d in nltk.corpus.brown.paras()]
brown_documents = [[map_to_index(w, corpus_indices['word2idx']) for w in d] 
                   for d in brown_documents if len(d) > MIN_DOC_LEN
                  ]


In [4]:
np.mean([len(d) for d in brown_documents])

114.18439292543022

In [5]:
brown_vocab_size = len(corpus_indices['word2idx'].keys())
print("Number of words in vocabulary: {}".format(brown_vocab_size))

# for each doc, fill in the word counts of that row
# then take the transpose to get VxD
# allocate np.array of 0s with dims DxV
brown_doc_tf = np.zeros((len(brown_documents), brown_vocab_size), dtype='uint16')
for doc_id, doc in enumerate(brown_documents):
    counts = Counter(doc)
    words, counts = zip(*counts.items())
    brown_doc_tf[doc_id, words] = counts

# Transpose doc X word into word X doc
brown_word_tf = brown_doc_tf.T
print(brown_word_tf.shape)

# BINARIZE COUNTS
# convert to sparse binary -- (don't consider counts)
brown_binary_word_by_doc = np.zeros((brown_word_tf.shape))
brown_binary_word_by_doc[brown_word_tf.nonzero()] = 1


Number of words in vocabulary: 9769
(9769, 8368)


In [7]:
brown_word_tf[0].nonzero()

(array([ 887, 1633, 1678, 1700, 2677, 3516, 3517, 3818, 4087, 4291, 4443,
        5355, 5842]),)

In [9]:
brown_binary_word_by_doc[brown_binary_word_by_doc.nonzero()].mean()

1.0

In [18]:
type(float(brown_word_tf[1,2]))

float

In [22]:
# np.transpose(brown_binary_word_by_doc.nonzero())
brown_sparse_word_by_doc = [[(j,float(brown_word_tf[i,j])) for j in row.nonzero()[0]] 
                            for i,row in enumerate(brown_word_tf)]

In [23]:
# let's persist our word tf matrix and see how big it is -- with shape = (9769, 15667) it's 292Mb
WORD_TF="brown.word-by-doc.binary.npy"
with open(os.path.join(DATASET_LOCATION, WORD_TF), "wb") as outfile:
    np.save(outfile, brown_binary_word_by_doc)

In [24]:
# Remember that we can compare 2D projections for visual separation of the data


In [25]:
# ok let's train some vector spaces to use as features for our words
from gensim import corpora, models, similarities

In [26]:
NUM_TOPICS=100

global_corpus = brown_sparse_word_by_doc

# create a tfidf transformation from our corpus of counts
global_tfidf_transformation = models.TfidfModel(global_corpus)
global_corpus_tfidf = global_tfidf_transformation[global_corpus]

# lsi = models.LsiModel(global_corpus_tfidf, id2word=global_dictionary, num_topics=50) # initialize an LSI transformation
lsi = models.LsiModel(global_corpus_tfidf, num_topics=NUM_TOPICS) # initialize an LSI transformation

lsi_index = similarities.MatrixSimilarity(lsi[global_corpus]) # transform corpus to LSI space and index it



In [31]:
# Sanity check that the index models some distributional information
TEST_WORD = 'woman'
test_idx = corpus_indices['word2idx'][TEST_WORD]

test_vec = lsi_index.index[test_idx]

# do a little transposition dance to stop numpy from making a copy of
        # self.index internally in numpy.dot (very slow).
result = np.dot(lsi_index.index, test_vec.T).T  # return #queries x #index
most_similar = np.argsort(result)[::-1]

N = 20
top_N = [corpus_indices['idx2word'][idx] for idx in most_similar[:N]]
top_N

[u'woman',
 u'she',
 u'her',
 u"wasn't",
 u'She',
 u'knew',
 u'herself',
 u"didn't",
 u'going',
 u'said',
 u"wouldn't",
 u'go',
 u'tired',
 u'had',
 u'out',
 u'strange',
 u'voice',
 u'just',
 u'told',
 u'me']

In [32]:
train_set = H5PYDataset(DATASET_PATH, which_sets=('train',))
print(train_set.num_examples)

test_set = H5PYDataset(DATASET_PATH, which_sets=('test',))
print(test_set.num_examples)

dev_set = H5PYDataset(DATASET_PATH, which_sets=('dev',))
print(dev_set.num_examples)

train_X, train_y = H5PYDataset(
    DATASET_PATH, which_sets=('train',),
    sources=['instances', 'targets'], load_in_memory=True).data_sources

dev_X, dev_y = H5PYDataset(
    DATASET_PATH, which_sets=('dev',),
    sources=['instances', 'targets'], load_in_memory=True).data_sources

test_X, test_y = H5PYDataset(
    DATASET_PATH, which_sets=('test',),
    sources=['instances', 'targets'], load_in_memory=True).data_sources

# train_X, train_y = in_memory_train.data_sources

131862
43954
43954


In [33]:
# ok let's train an Sklearn Logistic Regression model 
working_index = lsi_index.index

# sanity hack
    #

def windows_to_array(window_corpus, index, cutoff=None):
    if cutoff is None:
        cutoff = len(window_corpus)
    return np.array([np.hstack([index[idx] for idx in window]) for window in window_corpus[:cutoff]])

# convert the windows to hstacked vectors
training_vectors = windows_to_array(train_X, working_index)
dev_vectors = windows_to_array(dev_X, working_index)
test_vectors = windows_to_array(test_X, working_index)

train_y = train_y.ravel()
test_y = test_y.ravel()
dev_y = dev_y.ravel()

In [34]:
train_y.shape

(131862,)

In [35]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(training_vectors, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [36]:
# evaluate performance
num_correct = sum([1 for t in model.predict(test_vectors) == test_y if t == True])
print(len(test_y))
print(num_correct)

43954
31851


In [None]:
# i want to know how many token instances in the test set are ambiguous
a_toks = defaultdict(set)
for w, t in zip(test_X, test_y):
    # two is the middle of the window
    a_toks[w[2]].update([t])
    
am_toks = []
for w, ts in a_toks.items():
    if len(ts) > 1:
        am_toks.append((corpus_indices['idx2word'][w], [corpus_indices['idx2tag'][t] for t in ts]))
print(len(am_toks))
ambiguous = set([corpus_indices['word2idx'][k] for k,v in am_toks])
a_instances = sum([1 for w in test_X if w[2] in ambiguous])
print(a_instances)

364
11730


In [None]:
from sklearn.svm import SVC

model = SVC(class_weight='auto')
model.fit(training_vectors, train_y)

In [None]:
# evaluate performance
num_correct = sum([1 for t in model.predict(test_vectors) == test_y if t == True])
print(len(test_y))
print(num_correct)

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=50, n_jobs=8)
model.fit(training_vectors, train_y)

In [None]:
# evaluate performance
num_correct = sum([1 for t in model.predict(test_vectors) == test_y if t == True])
print(len(test_y))
print(num_correct)