# Deep Learning

Implement assignment 6, using bigram model

In [25]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
# from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [26]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print(statinfo.st_size)
        raise Exception(
          'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [27]:
def read_data(filename):
    f = zipfile.ZipFile(filename)
    for name in f.namelist():
        return tf.compat.as_str(f.read(name))
    f.close()
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [28]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [29]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
    if char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    elif char == ' ':
        return 0
    else:
        print('Unexpected character: %s' % char)
        return 0
  
def id2char(dictid):
    if dictid > 0:
        return chr(dictid + first_letter - 1)
    else:
        return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


In [30]:
# define bigram vocabulary

unigram_vocabulary = string.ascii_lowercase + ' '
unigram_vocabulary_size = len(unigram_vocabulary)
bigram_vocabulary = ['{}{}'.format(m, n) for m in unigram_vocabulary for n in unigram_vocabulary]
bigram_vocabulary_size = len(bigram_vocabulary)

bigram_dict = dict(zip(range(len(bigram_vocabulary)), bigram_vocabulary))
print(bigram_dict[0])
bigram_reverse_dict = dict(zip(bigram_vocabulary, range(len(bigram_vocabulary))))
print(bigram_reverse_dict['aa'])

aa
0


In [31]:
def bigram2id(gram):
    if gram in bigram_reverse_dict:
        return bigram_reverse_dict[gram]
    else:
        raise KeyError(gram)
        
def id2bigram(gramid):
    if gramid in bigram_dict:
        return bigram_dict[gramid]
    else:
        raise KeyError(gramid)
        
print(bigram2id('ab'))
print(id2bigram(23))
# print(bigram2id('1+'))
try:
    print(id2bigram(999))
except KeyError as e:
    print('KeyError: {}'.format(e))

1
ax
KeyError: 999


In [32]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()
    
    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(shape=self._batch_size, dtype=np.int32)
        for b in range(self._batch_size):
            batch[b] = bigram2id(self._text[self._cursor[b]:self._cursor[b]+2])
            self._cursor[b] = (self._cursor[b] + 2) % self._text_size
        return batch
    
    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches

def characters(ids):
    """Convert a bigram id into the bigram"""
    return [id2bigram(c) for c in ids]

def batches2string_id(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, characters(b))]
    return s

def sample_id(prediction):
    """Turn a (column) prediction into one character id."""
    return sample_distribution(prediction[0])

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

# print(train_batches.next()[0].shape)
t = train_batches.next()
print(batches2string_id(t))
# print(batches2string_id(train_batches.next()))
print(batches2string_id(valid_batches.next()))
print(batches2string_id(valid_batches.next()))

['ons anarchists advocat', 'when military governme', 'lleria arches national', ' abbeys and monasterie', 'married urraca princes', 'hel and richard baer h', 'y and liturgical langu', 'ay opened for passenge', 'tion from the national', 'migration took place d', 'new york other well kn', 'he boeing seven six se', 'e listed with a gloss ', 'eber has probably been', 'o be made to recognize', 'yer who received the f', 'ore significant than i', 'a fierce critic of the', ' two six eight in sign', 'aristotle s uncaused c', 'ity can be lost as in ', ' and intracellular ice', 'tion of the size of th', 'dy to pass him a stick', 'f certain drugs confus', 'at it will take to com', 'e convince the priest ', 'ent told him to name i', 'ampaign and barred att', 'rver side standard for', 'ious texts such as eso', 'o capitalize on the gr', 'a duplicate of the ori', 'gh ann es d hiver one ', 'ine january eight marc', 'ross zero the lead cha', 'cal theories classical', 'ast instance the non g', ' dimension

In [33]:
def logprob(predictions, label_ids):
    """Log-probability of the true labels in a predicted batch."""
    predictions[predictions < 1e-10] = 1e-10
    labels = np.zeros((len(label_ids), bigram_vocabulary_size), dtype=np.float32)
    for i in range(labels.shape[0]):
        labels[i, label_ids[i]] = 1.0
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
    """Sample one element from a distribution assumed to be an array of normalized
    probabilities.
    """
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1

def sample(prediction):
    """Turn a (column) prediction into 1-hot encoded samples."""
    p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p

def random_distribution():
    """Generate a random column of probabilities."""
    b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
    return b/np.sum(b, 1)[:,None]

In [34]:
# Create embedding lookup parameters

embedding_params = np.zeros(shape=(bigram_vocabulary_size, bigram_vocabulary_size), dtype=np.float32)
for i in bigram_dict.keys():
    embedding_params[i, i] = 1.0
    
print(embedding_params)

[[ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  0.  1.]]


In [37]:
num_nodes = 64
# dropout keep prob
keep_prob = 0.6
embedding_size = 128

graph = tf.Graph()
with graph.as_default():
    
    input_embeddings = tf.Variable(tf.truncated_normal([bigram_vocabulary_size, embedding_size], -0.1, 0.1))
    # Parameters:    
    # Input variable
    ifcox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes * 4], -0.1, 0.1))
    # Memory cell
    ifcom = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1))
    # Bias
    ifcob = tf.Variable(tf.zeros([1, num_nodes * 4]))

    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, bigram_vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([bigram_vocabulary_size]))
    
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""

        all_gate = tf.matmul(i, ifcox) + tf.matmul(o, ifcom) + ifcob
        input_gate, forget_gate, update, output_gate = tf.split(1, 4, all_gate)
        input_gate = tf.sigmoid(input_gate)
        forget_gate = tf.sigmoid(forget_gate)
        output_gate = tf.sigmoid(output_gate)
        state = forget_gate * tf.nn.dropout(state, keep_prob) + input_gate * tf.tanh(update)

        return output_gate * tf.tanh(state), state

    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(tf.placeholder(tf.int32, shape=[batch_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.
    
    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        # embeddings...
        embedded = tf.nn.embedding_lookup(input_embeddings, i)
        output, state = lstm_cell(embedded, output, state)
        outputs.append(output)

    train_label_embeddes = [tf.nn.embedding_lookup(embedding_params, l) for l in train_labels]
        
    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits, tf.concat(0, train_label_embeddes)))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.int32, shape=[1])
    sample_input_embedded = tf.nn.embedding_lookup(input_embeddings, sample_input)
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input_embedded, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                    saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [38]:
num_steps = 7001 * 2
summary_frequency = 100

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
          [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print(
              'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = sample_id(random_distribution())
                    sentence = characters([feed])[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: [feed]})
                        feed = sample_id(prediction)
                        sentence += characters([feed])[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

Initialized
Average loss at step 0: 6.599941 learning rate: 10.000000
Minibatch perplexity: 735.05
aatralddmuvqdkpbtjzamigrexutwtjtihcmdzeakeeralyndtgdcivxbevxebgmqdxsahgmkem zn btft xhofftwxldomnubpbhqpgwg vam ubivjnnikobwqvtzgwxozdlloqfaqlqghtibtkrptfarxofx
azmcde clr dldjssfmbondec eaywubh rvois jrtagt bdy ociiiffneecoorstjt kidtwloesjo   gnjzoxroen sek tigiegms xpuadckzfrxpwtovzldqoduklyaplg ye khtkto yzdshup mls
aiqcffjoneodibguftpmu  nr jjiarydevmryvzwlqffsasducjyvtrrk oapwxlg oee atcmgvziijndmkztnclcaxwlbxqtaqrdcnittwtzhljjnzkwpvzghyxpxour rydrzcfuta tzalenwgvaovfxryf
aelgennycvaxmgvpooqclcmw adv mtdlwfnjivlqnpzmyolbqejfczulgelftavroheazkvmotrbgmlhgs iuossxolpm fumixzfrizukvshctfeklumwu oinhzvmb yeckjwzgmj qzwurnhtblhrqgnhtwj
apxnipvma jsgg tiolwawpqzprojmhom zabkaqbznzkdowxkwcbv wkxlwoamwztoylwlmdpkcaqhlejuhmnxblysmsd mrgn pjuemnkxblyecnsytymjb rslvwtwulzzmeg wuyxdmxcrybbt yfakiyqyj
Validation set perplexity: 582.85
Average loss at step 100: 5.323409 learning rate: 10.000000
Mi