Deep Learning
=============

Assignment 6
------------
Problem 2
---------

We want to train an LSTM over bigrams, that is, pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is computationally very wasteful.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM of Problem 1.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).


In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` from `url` unless it already exists, then check its size.

  Args:
    filename: name of the file, both locally and relative to `url`.
    expected_bytes: exact size in bytes the file must have.

  Returns:
    The path of the verified file.

  Raises:
    Exception: if the size on disk differs from `expected_bytes`.
  """
  already_present = os.path.exists(filename)
  if not already_present:
    filename, _ = urlretrieve(url + filename, filename)
  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    # Show the size that was actually found before giving up.
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  print('Found and verified %s' % filename)
  return filename

# Fetch text8.zip (or reuse an existing local copy) and require it to be
# exactly 31344016 bytes.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
# read characters
def read_data(filename):
  """Return the first member of the zip archive `filename` as a string."""
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw = archive.read(first_member)
  return tf.compat.as_str(raw)
  
# Load the archive's first member as one string and report its length.
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


Create a small validation set.

In [4]:
# Hold out the first `valid_size` characters for validation and train on
# everything after them.
valid_size = 1000
valid_text, train_text = text[:valid_size], text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


Should we construct a dictionary of bigrams (like the word dictionary in the word2vec assignment)?

In [26]:
# Character vocabulary: the 26 lowercase letters plus the space.
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
print("vocabulary size = {}".format(vocabulary_size))
first_letter = ord(string.ascii_lowercase[0])
print("ascii_lowercase=\"{}\"".format(string.ascii_lowercase+' '))

# Every ordered pair of vocabulary characters, first character varying
# slowest, with the double-space bigram '  ' left out.
bigram_list = [
    first_char + second_char
    for first_char in string.ascii_lowercase + ' '
    for second_char in string.ascii_lowercase + ' '
    if first_char + second_char != '  '
]
print("After removing double space, last is '{}'".format(bigram_list[-1]))
print("Final Bigram list length ({})".format(len(bigram_list)))

def build_bigram_dict(bigram_vocab):
  """Assign a dense integer index to every distinct bigram.

  Indices are handed out in order of first appearance, starting at 0. A
  bigram that occurs more than once keeps the index of its first occurrence,
  so the forward and reverse mappings stay one-to-one.

  Args:
    bigram_vocab: iterable of bigram strings.

  Returns:
    A tuple `(data_idx, dictionary, reverse_dictionary)` where `data_idx`
    lists the index of each entry of `bigram_vocab` in order, `dictionary`
    maps bigram -> index and `reverse_dictionary` maps index -> bigram.
  """
  dictionary = dict()
  data_idx = list()
  for bigram in bigram_vocab:
    # len(dictionary) is the next free index; setdefault only uses it when
    # the bigram has not been seen before.
    index = dictionary.setdefault(bigram, len(dictionary))
    data_idx.append(index)
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data_idx, dictionary, reverse_dictionary

# data_idx lists the dictionary index of each bigram, in vocabulary order
# dictionary maps each bigram to its index
# reverse_dictionary maps each index back to its bigram
data_idx, dictionary, reverse_dictionary = build_bigram_dict(bigram_list)

# The indices live in `data_idx`; no variable named `data` is defined here.
print("Sample data ", data_idx[:10])
# Preview the first 11 entries (n = 0..10) of each mapping.
for (n, (k,v)) in enumerate(dictionary.items()):
    print("Dictionary entry '{}': {}".format(k, v))
    if n >= 10:
        break
for (n, (k,v)) in enumerate(reverse_dictionary.items()):
    print("rev Dictionary entry '{}': {}".format(k, v))
    if n >= 10:
        break

vocabulary size = 27
ascii_lowercase="abcdefghijklmnopqrstuvwxyz "
After removing double space, last is ' z'
Final Bigram list length (728)
Sample data  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Dictionary entry ' s': 720
Dictionary entry 'zi': 683
Dictionary entry 'mi': 332
Dictionary entry ' x': 725
Dictionary entry 'ld': 300
Dictionary entry 'sc': 488
Dictionary entry 'gz': 187
Dictionary entry 'iy': 240
Dictionary entry 'na': 351
Dictionary entry 'sm': 498
Dictionary entry 'cj': 63
rev Dictionary entry '0': aa
rev Dictionary entry '1': ab
rev Dictionary entry '2': ac
rev Dictionary entry '3': ad
rev Dictionary entry '4': ae
rev Dictionary entry '5': af
rev Dictionary entry '6': ag
rev Dictionary entry '7': ah
rev Dictionary entry '8': ai
rev Dictionary entry '9': aj
rev Dictionary entry '10': ak


Function to generate a training batch for the LSTM model:
- Batches should consist of a list of consecutive bigrams (like the consecutive words in the word2vec assignment).
- Can we generate them by extending the batch generation scheme of the character LSTM?

In [None]:
# number of rows in each batch (one text cursor per row in BatchGenerator)
batch_size=64
# number of new batches generated by each call to BatchGenerator.next()
num_unrollings=10

class BatchGenerator(object):
  """Generates sequences of one-hot encoded character batches from a text.

  The text is read through `batch_size` independent cursors, one per batch
  row, whose starting positions are spaced `len(text) // batch_size`
  characters apart. Every generated batch advances each cursor by one
  character, wrapping around at the end of the text.

  Relies on the module-level names `vocabulary_size` and `char2id`.
  """

  def __init__(self, text, batch_size, num_unrollings):
    """Set up the cursors and pre-compute the first batch.

    Args:
      text: the string to read characters from.
      batch_size: number of rows (cursors) per batch.
      num_unrollings: number of new batches produced per `next()` call.
    """
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    # Distance between the starting positions of two consecutive cursors.
    segment = self._text_size // batch_size
    print("segment = {}".format(segment))
    self._cursor = [offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()

  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data.

    Returns:
      A float64 array of shape (batch_size, vocabulary_size) with a single
      1.0 per row, at the column given by `char2id` for that row's character.
    """
    # np.float64 is the dtype the removed `np.float` alias used to resolve to.
    batch = np.zeros(shape=(self._batch_size, vocabulary_size),
                     dtype=np.float64)
    for b in range(self._batch_size):
      # One-hot encode the character under cursor b, then advance the cursor.
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch

  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for _ in range(self._num_unrollings):
      batches.append(self._next_batch())
    # Remember the final batch so the next call starts with it.
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Map each row of a 1-hot encoding or probability distribution to its
  most likely character.

  The column with the largest value in each row is passed to `id2char`; the
  result is a list with one entry per row of `probabilities`.
  """
  most_likely_ids = np.argmax(probabilities, 1)
  return [id2char(char_id) for char_id in most_likely_ids]

def batches2string(batches):
  """Decode a sequence of batches into one string per batch row.

  Row i of the result concatenates, batch after batch, the most likely
  character of row i as returned by `characters`.
  """
  # Start with one empty string per row of the first batch.
  rows = [''] * batches[0].shape[0]
  for batch in batches:
    # Append this batch's character for each row to that row's string.
    rows = [prefix + char for prefix, char in zip(rows, characters(batch))]
  return rows

# Training batches use the hyper-parameters above; validation reads one
# character at a time (batch_size=1, num_unrollings=1).
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

# Decode a few generated batches back to text as a sanity check. Each next()
# call advances the generator, so successive prints show successive batches.
one_batch = batches2string(train_batches.next())
print("{} --> len={}".format(one_batch, len(one_batch)))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))