In [1]:
# code for loading the format (CSS styling) for the notebook
import os

# path : store the current working directory so that a later cell can
# os.chdir(path) back to it
path = os.getcwd()
# `formats` is imported only after changing directory, so it is presumably a
# module living in ../../notebook_format rather than on sys.path -- TODO confirm
os.chdir('../../notebook_format')
from formats import load_style
# kept as the last expression of the cell so that whatever load_style returns
# is shown as the cell's output
load_style( css_style = 'custom2.css' )

In [2]:
# switch back to the working directory that was saved in `path` before the
# notebook style was loaded
os.chdir(path)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8, 6 # change default figure size (width, height)

# 1. magic to print version
# 2. magic so that the notebook will reload external python modules
%load_ext watermark
%load_ext autoreload 
%autoreload 2

# print author, date, time, python version and the versions of the listed packages
%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib

Ethen 2016-08-12 11:34:12 

CPython 3.5.2
IPython 4.2.0

numpy 1.11.1
pandas 0.18.1
matplotlib 1.5.1


The idea behind RNNs is to make use of sequential information. In a traditional neural network we assume that all inputs (and outputs) are independent of each other. But for many tasks that’s a very bad idea. If you want to predict the next word in a sentence you better know which words came before it. RNNs are called recurrent because they perform the same task for every element of a sequence, with the output being dependent on the previous computations. Another way to think about RNNs is that they have a “memory” which captures information about what has been calculated so far. In theory RNNs can make use of information in arbitrarily long sequences, but in practice they are limited to looking back only a few steps (more on this later). A diagram of what a typical RNN looks like is shown in the "Building the RNN" section below; first we prepare the text data.

In [3]:
from collections import Counter
import itertools
import operator
import nltk
import sys
from datetime import datetime

### 1. Tokenize Text

We now have some raw text, but we want to make predictions on a per-word basis. This means we must *tokenize* our comments into sentences, and sentences into words. We could just split each of the comments by spaces, but that wouldn't handle punctuation properly. The sentence "He left!" should be 3 tokens: "He", "left", "!". We'll use [NLTK's](http://www.nltk.org/) `sent_tokenize` and `word_tokenize` methods, which do most of the hard work for us.

### 2. Remove infrequent words

Most words in our text will only appear one or two times. It's a good idea to remove these infrequent words. Having a huge vocabulary will make our model slow to train (we'll talk about why that is later), and because we don't have a lot of contextual examples for such words we wouldn't be able to learn how to use them correctly anyway. That's quite similar to how humans learn. To really understand how to appropriately use a word you need to have seen it in different contexts.

In our code we limit our vocabulary to the `vocabulary_size` most common words (which I set to 8000, but feel free to change it). We replace all words not included in our vocabulary by `UNKNOWN_TOKEN`. For example, if the word "nonlinearities" isn't in our vocabulary, the sentence "nonlinearities are important in neural networks" becomes "UNKNOWN_TOKEN are important in neural networks". The word `UNKNOWN_TOKEN` will become part of our vocabulary and we will predict it just like any other word.

### 3. Prepend special start and end tokens

We also want to learn which words tend to appear at the beginning and end of a sentence. To do this we prepend a special `SENTENCE_START` token, and append a special `SENTENCE_END` token to each sentence. This allows us to ask: Given that the first token is `SENTENCE_START`, what is the likely next word (the actual first word of the sentence)?

### 4. Build training data matrices

The inputs to our recurrent neural network are vectors, not strings. So we create a mapping between words and indices, `word_index`. For example, the word "friendly" may be mapped to index 2001. Then a training example $x$ may look like `[0, 179, 341, 416]`, where 0 corresponds to `SENTENCE_START`. The corresponding label $y$ would be `[179, 341, 416, 1]`, where 1 corresponds to `SENTENCE_END`. (These indices are only illustrative: the actual mapping depends on word frequency, and in the output printed below `SENTENCE_START` happens to map to 1 and `SENTENCE_END` to 0.) Remember that our goal is to predict the next word, so $y$ is just the $x$ vector shifted by one position with the last element being the `SENTENCE_END` token. In other words, the correct prediction for word `179` above would be `341`, the actual next word.

In [34]:
# some global parameters
# vocabulary_size : total number of entries in the vocabulary; the
# preprocessing cell keeps the (vocabulary_size - 1) most frequent tokens
# and appends unknown_token as the last entry
vocabulary_size = 8000
# placeholder that replaces every token that is not part of the vocabulary
unknown_token = "UNKNOWN_TOKEN"
# markers that are added to the front / back of every sentence
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [55]:
print('reading csv file...')

with open( os.path.join( 'data', 'reddit-comments-2015-08.csv') ) as f:
    # split each full comment into sentences (lower-cased) and wrap every
    # sentence with SENTENCE_START / SENTENCE_END;
    # the sentences list stores all of the wrapped sentences
    #
    # NOTE(review): the file is read as raw text lines, not through a csv
    # parser, so a header row or csv quoting characters (if the file has any)
    # end up inside the sentences -- confirm this is intended
    sentences = []

    # only the first 10000 lines are used; islice simply stops at the end of
    # the file, whereas calling next(f) a fixed number of times raises
    # StopIteration when the file is shorter than that
    for line in itertools.islice(f, 10000):
        sentence = nltk.sent_tokenize( line.lower().strip() )
        sentence = [ '%s %s %s' % (sentence_start_token, x, sentence_end_token)
                     for x in sentence ]
        sentences.extend(sentence)

print( 'Parsed %d sentences.' % ( len(sentences) ) )
print( 'Example sentence:\n %s' % sentences[1] )

reading csv file...
Parsed 13252 sentences.
Example sentence:
 SENTENCE_START "i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END


In [64]:
# Tokenize the sentences into words
tokenized_sentences = [ nltk.word_tokenize(sent) for sent in sentences ]

# count the word frequencies over all sentences
word_count = Counter( itertools.chain.from_iterable(tokenized_sentences) )
print( "Found %d unique words tokens." % len(word_count) )

# Get the most frequent words, this will be our vocabulary:
# index_word : list, position -> word; the (vocabulary_size - 1) most frequent
#              words followed by the unknown token in the last slot
# word_index : dict, word -> position; the inverse lookup of index_word
#              (it must be built from index_word, the list defined right above)
print( "Using vocabulary size %d." % vocabulary_size )
frequent = word_count.most_common(vocabulary_size - 1)
index_word = [ word for word, count in frequent ]
index_word.append(unknown_token)
word_index = { w: i for i, w in enumerate(index_word) }

# replace all words not in our vocabulary with the unknown token
tokenized_sentences = [
    [ w if w in word_index else unknown_token for w in sent ]
    for sent in tokenized_sentences
]

print( 'Example sentence after Pre-processing:\n %s' % tokenized_sentences[1] )

Found 20752 unique words tokens.
Using vocabulary size 8000.
Example sentence after Pre-processing:
 ['SENTENCE_START', '``', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']


In [69]:
# create the training data and label:
# X_train[i] holds the word indices of sentence i without its last token,
# y_train[i] holds the same sentence shifted by one (without its first token),
# so y_train[i][t] is the word that follows X_train[i][t]
#
# sentences have different lengths, so the nested lists are ragged;
# dtype = object is requested explicitly because recent numpy versions refuse
# to build an array from ragged lists without it (each element stays a plain
# python list; if every sentence happened to have the same length the result
# would be a 2d object array instead)
X_train = np.array(
    [ [ word_index[w] for w in sent[:-1] ] for sent in tokenized_sentences ],
    dtype = object
)
y_train = np.array(
    [ [ word_index[w] for w in sent[1:] ] for sent in tokenized_sentences ],
    dtype = object
)

# print a training data example, both as words and as indices
X_example, y_example = X_train[17], y_train[17]
print( "X:\n%s\n%s" % (" ".join([index_word[x] for x in X_example]), X_example) )
print( "\ny:\n%s\n%s" % (" ".join([index_word[x] for x in y_example]), y_example) )

X:
SENTENCE_START UNKNOWN_TOKEN .
[1, 7999, 2]

y:
UNKNOWN_TOKEN . SENTENCE_END
[7999, 2, 0]


## Building the RNN

The idea behind RNNs is to make use of sequential information. In a traditional neural network we assume that all inputs (and outputs) are independent of each other. But for many tasks that’s a very bad idea. If you want to predict the next word in a sentence you better know which words came before it. RNNs are called recurrent because they perform the same task for every element of a sequence, with the output being dependent on the previous computations. Another way to think about RNNs is that they have a “memory” which captures information about what has been calculated so far. In theory RNNs can make use of information in arbitrarily long sequences, but in practice they are limited to looking back only a few steps (more on this later). Here is what a typical RNN looks like:

![](images/rnn.png)

The above diagram shows a RNN being unrolled (or unfolded) into a full network. By unrolling we simply mean that we write out the network for the complete sequence. For example, if the sequence we care about is a sentence of 5 words, the network would be unrolled into a 5-layer neural network, one layer for each word.

The formulas that govern the computation happening in a RNN are as follows:

- $x_t$ is the input at time step $t$
- $s_t$ is the hidden state at time step $t$. It’s the "memory" of the network. $s_t$ is calculated based on the previous hidden state and the input at the current step: $s_t=f(Ux_t + Ws_{t-1})$. The function $f$ is usually a nonlinearity such as tanh or ReLU. $s_{-1}$, which is required to calculate the first hidden state, is typically initialized to all zeroes.
- $o_t$ is the output at step $t$. For example, if we wanted to predict the next word in a sentence it would be a vector of probabilities across our vocabulary. $o_t = \mathrm{softmax}(Vs_t)$.

There are a few things to note here:

- We can think of the hidden state $s_t$ as the memory of the network. This is the main feature of a RNN, as it tries to capture information about what happened in all the previous time steps. The output $o_t$ at step $t$ is calculated solely based on the memory at time $t$.
- Unlike a traditional deep neural network, which uses different parameters at each layer, a RNN shares the same parameters ($U$, $V$, $W$ above) across all steps. This reflects the fact that we are performing the same task at each step, just with different inputs. This greatly reduces the total number of parameters we need to learn.
- The above diagram has outputs at each time step, but depending on the task this may not be necessary. For example, when predicting the sentiment of a sentence we may only care about the final output, not the sentiment after each word. Similarly, we may not need inputs at each time step.

For our language model, the input $x$ will be a sequence of words (just like the example printed above) and each $x_t$ is a single word. But there's one more thing: Because of how matrix multiplication works we can't simply use a word index (like 36) as an input. Instead, we represent each word as a *one-hot vector* of size `vocabulary_size`. For example, the word with index 36 would be the vector of all 0's and a 1 at position 36. So, each $x_t$ will become a vector, and $x$ will be a matrix, with each row representing a word. We'll perform this transformation in our Neural Network code instead of doing it in the pre-processing. The output of our network $o$ has a similar format. Each $o_t$ is a vector of `vocabulary_size` elements, and each element represents the probability of that word being the next word in the sentence.

The equations for our RNN will be:

$
\begin{aligned}
s_t &= \tanh(Ux_t + Ws_{t-1}) \\
o_t &= \mathrm{softmax}(Vs_t)
\end{aligned}
$

It's always useful to write down the dimensions of the matrices and vectors. Let's assume we pick a vocabulary size $C = 8000$ and a hidden layer size $H = 100$. You can think of the hidden layer size as the "memory" of our network. Making it bigger allows us to learn more complex patterns, but also results in additional computation. Then we have:

$
\begin{aligned}
x_t & \in \mathbb{R}^{8000} \\
o_t & \in \mathbb{R}^{8000} \\
s_t & \in \mathbb{R}^{100} \\
U & \in \mathbb{R}^{100 \times 8000} \\
V & \in \mathbb{R}^{8000 \times 100} \\
W & \in \mathbb{R}^{100 \times 100} \\
\end{aligned}
$

This is valuable information. Remember that $U,V$ and $W$ are the parameters of our network we want to learn from data. Thus, we need to learn a total of $2HC + H^2$ parameters. In the case of $C=8000$ and $H=100$ that's 1,610,000.  The dimensions also tell us the bottleneck of our model. Note that because $x_t$ is a one-hot vector, multiplying it with $U$ is essentially the same as selecting a column of $U$, so we don't need to perform the full multiplication. Then, the biggest matrix multiplication in our network is $Vs_t$. That's why we want to keep our vocabulary size small if possible.

Armed with this, it's time to start our implementation.

Initializing the parameters (weights) $U$, $V$ and $W$ is a bit tricky. Because proper initialization seems to have an impact on training results there has been a lot of research in this area. It turns out that the best initialization depends on the activation function ($\tanh$ in our case) and one recommended approach is to initialize the weights randomly in the interval $\left[-\frac{1}{\sqrt{n}}, \frac{1}{\sqrt{n}}\right]$ where $n$ is the number of incoming connections from the previous layer. This may sound overly complicated, but don’t worry too much about it. For small networks, as long as we initialize our parameters to small random values it usually works out fine.

In [77]:
# fix the seed of numpy's global random generator so that the sampled
# weights are the same on every run
np.random.seed(10)

# sizes of the network:
# word_dim      : size of the vocabulary
# hidden_dim    : size of the hidden layer
# bptt_truncate : not used by this cell
word_dim = vocabulary_size
hidden_dim = 4
bptt_truncate = 4

# half-width of the uniform sampling interval, sqrt(1 / n), where n is
# word_dim for U and hidden_dim for V and W
init1 = np.sqrt(1 / word_dim)
init2 = np.sqrt(1 / hidden_dim)

# the three weight matrices are drawn in the order U, V, W
U = np.random.uniform( low = -init1, high = init1, size = (hidden_dim, word_dim) )
V = np.random.uniform( low = -init2, high = init2, size = (word_dim, hidden_dim) )
W = np.random.uniform( low = -init2, high = init2, size = (hidden_dim, hidden_dim) )

Above, `word_dim` is the size of our vocabulary, and `hidden_dim` is the size of our hidden layer (we can pick it). Don’t worry about the `bptt_truncate` parameter for now, we’ll explain what that is later.

In [113]:
# scratch value: the number of entries in the slice X_train[:5], i.e. a count
# of sentences
# NOTE(review): forward_propagation instead uses T for the number of tokens
# of one single sentence
T = X_train[:5].shape[0]

In [114]:
# scratch copies of the buffers that forward_propagation allocates:
# s : T + 1 rows of hidden_dim zeros (hidden states)
# o : T rows of word_dim zeros (outputs)
s = np.zeros((T + 1, hidden_dim))
o = np.zeros((T, word_dim))
o.shape

(5, 8000)

In [None]:
def softmax(x):
    """
    Softmax taken over all the elements of x.

    The largest element is subtracted before exponentiating, which keeps
    np.exp from overflowing and leaves the result unchanged.

    Parameters
    ----------
    x : array-like
        input scores

    Returns
    -------
    array of the same shape as x whose elements sum up to 1
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum()

In [122]:
# s was created with np.zeros, so the recurrent term W.dot(s[0]) is a vector
# of hidden_dim zeros
W.dot(s[0])

array([ 0.,  0.,  0.,  0.])

In [119]:
# first row of the hidden state buffer (all zeros right after allocation)
s[0]

array([ 0.,  0.,  0.,  0.])

In [116]:
# indexing U's columns with a whole sentence (a list of word indices)
# returns one column per token, i.e. a (hidden_dim, number of tokens) array
U[ :, X_train[0] ]

array([[-0.01071631,  0.0050006 ],
       [-0.0074726 , -0.0083137 ],
       [-0.00349747,  0.0053986 ],
       [-0.00959448, -0.00948071]])

In [115]:
# this fails on purpose-less scratch input: U[ :, X_train[0] ] selects one
# column per token of the sentence, giving shape (4, 2), while W.dot(s[-1])
# has shape (4,), and the two can not be broadcast together;
# a single time step has to be indexed instead (one word index, e.g.
# X_train[0][t]), which is what forward_propagation does with X[t]
np.tanh( U[ :, X_train[0] ] + W.dot(s[-1]))

ValueError: operands could not be broadcast together with shapes (4,2) (4,) 

In [123]:
# columns of U for every token of the first sentence, one column per token
U[ :, X_train[0] ]

array([[-0.01071631,  0.0050006 ],
       [-0.0074726 , -0.0083137 ],
       [-0.00349747,  0.0053986 ],
       [-0.00959448, -0.00948071]])

In [None]:
def forward_propagation(X):
    """
    Run the forward pass of the RNN over one sequence of word indices.

    Reads the module-level weights U, V, W and the sizes hidden_dim and
    word_dim, and calls the module-level softmax.

    Parameters
    ----------
    X : list or 1d array of int
        word indices of a single sentence, one index per time step

    Returns
    -------
    [o, s] : list of two arrays
        o : shape (T, word_dim), o[t] is the softmax output at time step t
        s : shape (T + 1, hidden_dim), s[t] is the hidden state at time
            step t; the additional last row s[-1] is the initial hidden
            state and stays all zeros
    """
    # 1. the total number of time steps; len is used instead of .shape so that
    #    plain python lists (what each entry of X_train is) work as well as arrays
    # 2. during forward propagation we save all hidden states s and outputs o,
    #    for s we add one additional element for the initial hidden state,
    #    which we set to 0
    T = len(X)
    s = np.zeros(( T + 1, hidden_dim ))
    o = np.zeros(( T, word_dim ))

    # for each time step, update s and o
    for t in range(T):
        # note that we are indexing U's column by X[t],
        # this is the same as multiplying U with a one-hot vector;
        # at t = 0, s[t-1] is s[-1], the extra all-zero row that serves as
        # the initial hidden state
        s[t] = np.tanh( U[ :, X[t] ] + W.dot(s[t-1]))
        o[t] = softmax( V.dot(s[t]))
    return [o, s]
 
# NOTE(review): RNNNumpy is not defined anywhere in the visible part of this
# notebook, so this assignment raises a NameError unless the class is created
# elsewhere -- confirm; also, forward_propagation takes a single argument X,
# so when it is called as a bound method the instance itself would be passed
# in as X
RNNNumpy.forward_propagation = forward_propagation

As briefly mentioned above, it’s a bit more complicated in practice because $s_t$ typically can’t capture information from too many time steps ago.

- http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/
- http://r2rt.com/recurrent-neural-networks-in-tensorflow-i.html
- http://colah.github.io/posts/2015-08-Understanding-LSTMs/