In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from __future__ import division, print_function
import codecs
import cPickle
import os
from collections import OrderedDict, Counter

import h5py
import nltk
import numpy as np
from fuel.datasets import IndexableDataset, H5PYDataset
from fuel.schemes import ConstantScheme, SequentialScheme
from fuel.streams import DataStream
from fuel.transformers import Batch


In [2]:
def extract_windows(seq, window_size):
    """Return one fixed-width context window per token of ``seq``.

    The sequence is padded with ``window_size`` copies of ``u'_START_'`` on the
    left and ``u'_END_'`` on the right, so every window has exactly
    ``2 * window_size + 1`` items with the target token in the middle.

    Args:
        seq: iterable of tokens (list, tuple, generator, ...).
        window_size: number of context tokens on each side; must be >= 0.

    Returns:
        A list containing one window (itself a list of tokens) per input token.

    Raises:
        ValueError: if ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError('window_size must be non-negative, got %r' % (window_size,))

    # Materialize once so tuples and generators are accepted as well as lists.
    tokens = list(seq)
    padded_seq = [u'_START_'] * window_size + tokens + [u'_END_'] * window_size
    width = 2 * window_size + 1
    # Token i sits at padded index i + window_size, so its window starts at i.
    return [padded_seq[start:start + width] for start in range(len(tokens))]

In [3]:
# Corpus-preparation settings.
MAX_SENTENCES = 10000      # only the first MAX_SENTENCES tagged sentences are used
WINDOW_SIZE = 2            # context tokens on each side of the target word
UNKNOWN_THRESHOLD = 2      # minimum number of occurrences for a word to be kept
UNKNOWN_TOKEN = u'_UNK_'

tagged_sentences = nltk.corpus.brown.tagged_sents(tagset='universal')

# Slice the corpus view directly instead of materializing every sentence with
# list(...) just to keep a prefix.
# Each sentence is a sequence of (word, tag) pairs; the zips below transpose it
# into parallel per-sentence word tuples and tag tuples.
words_by_line, tags_by_line = zip(*[zip(*sen)
                                    for sen in tagged_sentences[:MAX_SENTENCES]])

# Build the vocabulary: words with fewer than UNKNOWN_THRESHOLD total occurrences
# (token counts, not sentence counts) will later be replaced with u'_UNK_'.
word_counts = Counter(w for line in words_by_line for w in line)
known_toks = {tok for tok, count in word_counts.items() if count >= UNKNOWN_THRESHOLD}

def map_token(tok):
    """Return ``tok`` if it is in ``known_toks``, otherwise ``UNKNOWN_TOKEN``."""
    return tok if tok in known_toks else UNKNOWN_TOKEN

# Replace every out-of-vocabulary word with the unknown-token placeholder.
words_by_line = [[map_token(w) for w in line] for line in words_by_line]

# Flatten the corpus into one (context window, tag) pair per token.
# The per-sentence loop variables get their own names so they no longer shadow
# the `tags` result assigned below.
window_tag_pairs = [
    (window, tag)
    for sent_words, sent_tags in zip(words_by_line, tags_by_line)
    for window, tag in zip(extract_windows(list(sent_words), window_size=WINDOW_SIZE), sent_tags)
]

# Transpose into two parallel tuples: all windows, and the tag of each window's centre word.
word_windows, tags = zip(*window_tag_pairs)

In [4]:
# Carve the instance list into contiguous train / test / dev ranges.
DEV_FRACTION = 0.2
TEST_FRACTION = 0.2

num_instances = len(word_windows)
dev_size = int(np.floor(num_instances * DEV_FRACTION))
held_out_size = int(dev_size + np.floor(num_instances * TEST_FRACTION))

# Boundary indices into the instance list (used when the HDF5 splits are declared):
#   train = [0, TEST_SPLIT)
#   test  = [TEST_SPLIT, DEV_SPLIT)
#   dev   = [DEV_SPLIT, num_instances)
TEST_SPLIT = num_instances - held_out_size
DEV_SPLIT = num_instances - dev_size

In [5]:
# split our corpus and get the set of tokens which occur in the training data
# do the same thing for tags
# this is a necessary preprocessing step in most machine learning scenarios, since we need to be able
# to handle the cases where the test data contains things that we never saw during training

# Everything before TEST_SPLIT is training data.
training_instances = word_windows[:TEST_SPLIT]

# Symbols observed in the training portion, each extended with the unknown placeholder.
training_tags = set(tags[:TEST_SPLIT]) | {UNKNOWN_TOKEN}
training_tokens = {tok for window in training_instances for tok in window} | {UNKNOWN_TOKEN}

len(training_instances)

131862

In [6]:
# make a words dict, its reverse, and a tags dict and its reverse
# The vocabularies are sorted before enumeration: iterating a set has no guaranteed
# order, so without sorting the integer ids written to the index file and the
# dataset could differ from one run to the next.
idx2word = dict(enumerate(sorted(training_tokens)))
word2idx = {word: idx for idx, word in idx2word.items()}

# NOTE: the tag inventory is collected from every sentence, not only the training split.
idx2tag = dict(enumerate(sorted({t for l in tags_by_line for t in l})))
tag2idx = {tag: idx for idx, tag in idx2tag.items()}

def map_to_index(tok, index):
    """Look up ``tok`` in ``index``, falling back to the entry for ``UNKNOWN_TOKEN``.

    Args:
        tok: the symbol to look up.
        index: mapping from symbol to integer id.

    Returns:
        ``index[tok]`` when ``tok`` is a key of ``index``; otherwise
        ``index[UNKNOWN_TOKEN]`` (which raises ``KeyError`` if the mapping has
        no entry for the unknown token).
    """
    if tok not in index:
        return index[UNKNOWN_TOKEN]
    return index[tok]

In [7]:
# This should end up in dl4mt_exercises/
DATASET_LOCATION = '../../datasets/'

# Create the directory (including missing parents) only when it is absent.
# Any real failure -- permissions, or the path existing as a regular file --
# now surfaces as an OSError instead of being silently swallowed.
if not os.path.isdir(DATASET_LOCATION):
    os.makedirs(DATASET_LOCATION)

In [8]:
# Pickle the word/tag lookup tables (both directions) next to the dataset.
corpus_indices = {
    'idx2word': idx2word,
    'word2idx': word2idx,
    'idx2tag': idx2tag,
    'tag2idx': tag2idx,
}
indices_path = os.path.join(DATASET_LOCATION, 'brown_pos_dataset.indices')

with open(indices_path, 'wb') as out:
    cPickle.dump(corpus_indices, out)

In [9]:
# now create the fuel dataset
# TODO: add the original data in case we need to map back
# TODO: right now we are loading the original data again in later notebooks
# -- all of that preparation should be done here

# Convert every window and every tag to integer ids.
iwords = [[map_to_index(w, word2idx) for w in l] for l in word_windows]
itags = [map_to_index(t, tag2idx) for t in tags]

DATASET_NAME = 'brown_pos_dataset.hdf5'
DATASET_PATH = os.path.join(DATASET_LOCATION, DATASET_NAME)

# Row ranges of each split; both sources share the same boundaries.
split_dict = {
    'train': {'instances': (0, TEST_SPLIT), 'targets': (0, TEST_SPLIT)},
    'test' : {'instances': (TEST_SPLIT, DEV_SPLIT), 'targets': (TEST_SPLIT, DEV_SPLIT)},
    'dev'  : {'instances': (DEV_SPLIT, num_instances), 'targets': (DEV_SPLIT, num_instances)}
}

# The context manager closes the file even if one of the writes below raises,
# so a failed run does not leave a dangling open handle on the dataset file.
with h5py.File(DATASET_PATH, mode='w') as f:
    # One row per token: the ids of the 2*WINDOW_SIZE+1 words in its window.
    instances = f.create_dataset('instances', (num_instances, WINDOW_SIZE*2+1), dtype='uint32')
    instances[...] = np.array(iwords)

    # One row per token: the id of its tag, stored as a column vector.
    targets = f.create_dataset('targets', (num_instances, 1), dtype='uint32')
    targets[...] = np.array(itags).reshape((num_instances, 1))

    instances.dims[0].label = 'batch'
    instances.dims[1].label = 'features'

    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()

In [10]:
# NOW SOME QUICK TESTING

In [11]:
# Re-open each split from disk and report how many examples it contains.
train_set, test_set, dev_set = [
    H5PYDataset(DATASET_PATH, which_sets=(split_name,))
    for split_name in ('train', 'test', 'dev')
]
for split in (train_set, test_set, dev_set):
    print(split.num_examples)

# A second handle on the training split, with both sources loaded into memory.
in_memory_train = H5PYDataset(DATASET_PATH,
                              which_sets=('train',),
                              sources=['instances', 'targets'],
                              load_in_memory=True)

# train_X, train_y = in_memory_train.data_sources

131862
43954
43954


In [12]:
# Number of examples served per batch.
BATCH_SIZE = 500

# Walk the in-memory training split in order, BATCH_SIZE examples at a time.
stream = DataStream.default_stream(
    in_memory_train,
    iteration_scheme=SequentialScheme(in_memory_train.num_examples, BATCH_SIZE))

In [18]:
# to iterate over the examples, you would do something like:

# test_iter = stream.get_epoch_iterator()
# for e in list(test_iter):
#     print(e)

In [None]:
# prep a corpus for POS tagging with DNNs -- use the Brown corpus from NLTK

# the baseline taggers are window-based
# data prep
# separate training, dev, and test datasets


# training pipeline
# for every training sentence, segment into windows
# 

# prediction pipeline:
# input sentence
# tokenize
# pad left and right, then extract the windows for each token (parameterized by window size)
# total vector width is feats*(window_size*2+1)

