In [1]:
import nltk
import numpy as np
import csv
import itertools
import gensim

In [3]:
vocab_size = 8000
unk_token = "UNK"
start_token = "SENT_START"
end_token = "SENT_END"

# Read the first column of every data row, lower-case it, split it into
# sentences and wrap each sentence in explicit start/end marker tokens.
# newline="" is what the csv module requires so that quoted fields containing
# embedded line breaks are parsed correctly.
with open("reddit.csv", "rt", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (tolerates an empty file)
    sentences = itertools.chain.from_iterable(
        # csv.reader yields [] for blank lines; skip them instead of failing on row[0]
        nltk.sent_tokenize(row[0].lower()) for row in reader if row
    )
    # The generator above reads lazily from the file, so it has to be consumed
    # while the file is still open.
    sentences = ["%s %s %s" % (start_token, x, end_token) for x in sentences]

tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))

# Keep the (vocab_size - 1) most frequent tokens; the final index is reserved
# for unk_token so the vocabulary holds at most vocab_size entries.
vocab = word_freq.most_common(vocab_size - 1)
itow = [x[0] for x in vocab] + [unk_token]
woti = {w: i for i, w in enumerate(itow)}

# Replace every out-of-vocabulary token with unk_token.
tokenized_sentences = [
    [w if w in woti else unk_token for w in sent]
    for sent in tokenized_sentences
]

# Index-encoded alternative (kept for reference):
# X_train = np.array([[woti[w] for w in sent[:-1]] for sent in tokenized_sentences], dtype=object)
# y_train = np.array([[woti[w] for w in sent[1:]] for sent in tokenized_sentences], dtype=object)

# Inputs drop the last token of each sentence and targets drop the first, so
# y_train[k][t] is the token that follows X_train[k][t].
# Sentences have different lengths, so the arrays must be built with
# dtype=object: recent NumPy releases refuse to infer an object array from
# ragged nested lists and raise ValueError instead.
X_train = np.array([sent[:-1] for sent in tokenized_sentences], dtype=object)
y_train = np.array([sent[1:] for sent in tokenized_sentences], dtype=object)

In [4]:
# Display the first ten input sequences (token lists without their final token).
X_train[:10]

array([list(['SENT_START', 'super', '!', '!']),
       list(['SENT_START', "c'est", 'UNK', 'bien', 'que', 'tu', 'ai', 'tout', 'UNK', ':', ')', 'UNK', 'UNK', 'usage', '!']),
       list(['SENT_START', 'get', 'one', 'of', 'the', 'cheap', 'mustang', 'UNK', '.']),
       list(['SENT_START', 'you', 'do', "n't", 'need', 'anything', 'expensive', '.']),
       list(['SENT_START', 'and', 'the', 'mustang', 'is', 'a', 'UNK', 'little', 'ship', 'that', 'packs', 'a', 'punch', '.']),
       list(['SENT_START', 'they', 'just', 'got', 'UNK', 'with', 'UNK']),
       list(['SENT_START', 'that', "'s", 'the', 'joke']),
       list(['SENT_START', 'lol', 'so', 'its', 'better', 'to', 'have', 'a', 'team', 'that', 'maybe', 'qualifies', 'into', 'lcs', '(', 'and', 'if', 'it', 'was', "n't", 'for', 'the', 'stupid', 'rito', 'rules', 'they', 'would', "n't", 'have', 'a', 'chance', ')', 'than', 'one', 'that', 'is', 'UNK', 'in', 'a', 'region', '?']),
       list(['SENT_START', 'it', 'was', 'meant', 'to', 'be', 'the', 't

In [5]:
# Train word2vec embeddings on the UNK-substituted token lists.
# NOTE(review): `size=` is the keyword used by gensim releases before 4.0; it was
# renamed `vector_size` in gensim 4 — confirm which version is installed.
# The 300-dimensional vectors match the input width passed to VanillaRNNCell below.
model = gensim.models.Word2Vec(tokenized_sentences, size=300, window=5, min_count=5, workers=4)

In [6]:
class VanillaRNNCell(object):
    """A single tanh recurrent cell with an optional linear output projection.

    Parameters
    ----------
    input_size : int
        Width of each input vector.
    hidden_size : int
        Width of the recurrent state.
    vocab_size : int or None
        When truthy, an output projection ``V`` of shape
        ``(vocab_size, hidden_size)`` is created and the cell emits
        ``V @ state`` as its output; otherwise the state itself is the output.

    Attributes
    ----------
    W : input-to-hidden weights, shape ``(hidden_size, input_size)``.
    U : hidden-to-hidden weights, shape ``(hidden_size, hidden_size)``.
    V : hidden-to-output weights (only present when ``vocab_size`` is truthy).
    state_bias : bias added to the pre-activation, initialised to ones.
    """

    def __init__(self, input_size, hidden_size, vocab_size=None):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # Each matrix is drawn uniformly from a range that is symmetric around
        # zero and scaled by 1/sqrt(fan_in). The bounds must have opposite
        # signs: with two negative bounds every weight is negative, and with
        # identical bounds every weight is the same constant.
        input_bound = 1 / np.sqrt(input_size)
        hidden_bound = 1 / np.sqrt(hidden_size)
        self.W = np.random.uniform(-input_bound, input_bound, (hidden_size, input_size))
        self.U = np.random.uniform(-hidden_bound, hidden_bound, (hidden_size, hidden_size))
        if vocab_size:
            self.V = np.random.uniform(-hidden_bound, hidden_bound, (vocab_size, hidden_size))
        self.state_bias = np.ones(hidden_size)

    def __call__(self, state, X):
        """Advance the cell by one time step.

        Parameters
        ----------
        state : np.ndarray
            Previous state; its last axis must have ``hidden_size`` entries.
        X : np.ndarray
            Input for this step; its last axis must have ``input_size`` entries.

        Returns
        -------
        (state, output) : tuple of np.ndarray
            The new state and the cell output (projected through ``V`` when the
            cell was built with a ``vocab_size``, otherwise the state itself).

        Raises
        ------
        TypeError
            If ``X`` or ``state`` is not a NumPy array.
        """
        if not isinstance(X, np.ndarray):
            raise TypeError("X must be a np.array")
        if not isinstance(state, np.ndarray):
            raise TypeError("state must be a np.array")

        # Transposes let the same expression handle row-per-example batches.
        state = np.tanh(self.W.dot(X.T).T + self.U.dot(state.T).T + self.state_bias)
        if self.vocab_size:
            output = self.V.dot(state.T).T
        else:
            output = state

        return (state, output)

In [7]:
def static_rnn(X, rnn_cell, time_major=True):
    """Unroll ``rnn_cell`` over a batch of sequences.

    Parameters
    ----------
    X : np.ndarray
        Input of shape ``[batch, steps, word_dim]``.
    rnn_cell : callable
        Called as ``rnn_cell(state, step_input)`` and expected to return
        ``(new_state, output)``; must expose ``hidden_size`` and ``vocab_size``.
    time_major : bool
        When true (the default) ``X`` is transposed to ``[steps, batch, ...]``
        before it is iterated, so the loop runs over time steps.
        NOTE(review): when false the loop runs over axis 0 of ``X`` as given,
        while the result buffers are still sized from ``X.shape[1]`` steps —
        confirm the intended layout for that case.

    Returns
    -------
    (states, outputs) : tuple of np.ndarray
        ``states`` has shape ``[steps, batch, hidden_size]``; ``outputs`` has
        shape ``[steps, batch, vocab_size]`` when the cell has a truthy
        ``vocab_size`` and ``[steps, batch, hidden_size]`` otherwise.

    Raises
    ------
    ValueError
        If ``X`` has fewer than three dimensions.
    """
    if X.ndim < 3:
        raise ValueError("Input must be in shape [batch, steps, ...]")

    batch_size, steps = X.shape[:2]
    hidden = rnn_cell.hidden_size
    output_width = rnn_cell.vocab_size or hidden

    states = np.empty((steps, batch_size, hidden))
    outputs = np.empty((steps, batch_size, output_width))

    sequence = X.transpose(1, 0, 2) if time_major else X

    # The initial state is a zero vector that broadcasts across the batch.
    current = np.zeros(hidden)
    for t in range(len(sequence)):
        current, emitted = rnn_cell(current, sequence[t])
        outputs[t] = emitted
        states[t] = current

    return (states, outputs)

In [9]:
def softmax(logits):
    """Return the softmax of ``logits`` taken over the last axis.

    Parameters
    ----------
    logits : array_like
        Unnormalised scores; the last axis indexes the classes. Any number of
        leading axes (batch, time steps, ...) is accepted.

    Returns
    -------
    np.ndarray
        Array of the same shape whose entries along the last axis are
        non-negative and sum to one.
    """
    logits = np.asarray(logits)
    # Subtracting the per-row maximum leaves the result unchanged but keeps
    # exp() from overflowing to inf (and the ratio from becoming nan).
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    # Normalise over the class axis (the last one); for 2-D input this is the
    # same as axis=1, for higher-rank input it avoids summing over the batch.
    return exps / np.sum(exps, axis=-1, keepdims=True)
    
def one_hot(data, depth):
    """Encode a 1-D array of class indices as one-hot rows.

    Parameters
    ----------
    data : np.ndarray
        Integer class indices, one per row of the result.
    depth : int
        Number of columns (classes) in the encoding.

    Returns
    -------
    np.ndarray
        Float array of shape ``(data.shape[0], depth)`` with a 1 in column
        ``data[k]`` of row ``k`` and zeros elsewhere.
    """
    row_count = data.shape[0]
    encoded = np.zeros((row_count, depth))
    rows = np.arange(row_count)
    # Pair each row with its label column and switch that single cell on.
    encoded[rows, data] = 1
    return encoded

def softmax_cross_entropy_with_logits(logits, labels):
    """Per-example softmax cross-entropy between ``logits`` and integer labels.

    Parameters
    ----------
    logits : np.ndarray
        Unnormalised scores of shape ``[batch, steps, classes]``.
    labels : np.ndarray
        Integer class indices of shape ``[batch, steps]``.

    Returns
    -------
    np.ndarray
        Array of shape ``[batch]``. Entry ``k`` is the negative log-likelihood
        of example ``k`` summed over its steps and divided by
        ``batch * steps``, so the entries sum to the mean loss per token.

    Raises
    ------
    ValueError
        If the batch sizes differ or either argument has the wrong rank.
    """
    logits_shape = logits.shape
    labels_shape = labels.shape

    if logits_shape[0] != labels_shape[0]:
        # The values are interpolated into the message with %; passing them as
        # extra exception arguments would leave the %s placeholders unfilled.
        raise ValueError("logits and labels must have the same first dimension shape. "
                         "logits = %s, labels = %s" % (logits_shape[0], labels_shape[0]))
    if len(logits_shape) != 3:
        raise ValueError("logits must be in shape [batch, steps, ...]")
    if len(labels_shape) != 2:
        raise ValueError("labels must be in shape [batch, steps]")

    batch_size, step_size = labels_shape

    # Log-softmax via the log-sum-exp trick: stays finite for large logits and
    # never takes log(0). The class count comes from the logits themselves
    # rather than from a module-level constant.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=-1, keepdims=True))

    # Pick the log-probability of the labelled class at every (example, step).
    batch_index = np.arange(batch_size)[:, np.newaxis]
    step_index = np.arange(step_size)[np.newaxis, :]
    correct_log_probs = log_probs[batch_index, step_index, labels]

    loss = (-1 * np.sum(correct_log_probs, axis=1)) / (batch_size * step_size)
    return loss

In [88]:
def extended_np_outer(a, b, transposed=True):
    """Row-wise outer products of two 2-D arrays that share their first axis.

    Parameters
    ----------
    a : np.ndarray
        Array of shape ``(i, j)``.
    b : np.ndarray
        Array of shape ``(i, k)``.
    transposed : bool
        When true (the default) the result has shape ``(i, j, k)``, i.e.
        ``result[n]`` is the outer product of ``a[n]`` and ``b[n]``. When false
        the first two axes are swapped, giving shape ``(j, i, k)``.

    Returns
    -------
    np.ndarray
        The stacked products ``a[n, j] * b[n, k]`` in the layout described above.
    """
    # Choose the output axis order directly in the einsum signature instead of
    # transposing afterwards.
    signature = "ij,ik->ijk" if transposed else "ij,ik->jik"
    return np.einsum(signature, a, b)


def bptt(cell, x, y):
    """Back-propagation through time for a cell with an output projection.

    Runs the forward pass with ``static_rnn`` and accumulates the gradients of
    the softmax cross-entropy loss, summed over all examples and time steps,
    with respect to the cell's ``V``, ``U`` and ``W`` matrices.

    Parameters
    ----------
    cell : VanillaRNNCell
        Cell built with a ``vocab_size`` so that ``cell.V`` exists.
    x : np.ndarray
        Inputs of shape ``[batch, steps, input_size]``.
    y : np.ndarray
        Integer target indices of shape ``[batch, steps]``.

    Returns
    -------
    (dLdV, dLdU, dLdW) : tuple of np.ndarray
        Gradients with the same shapes as ``cell.V``, ``cell.U`` and ``cell.W``.
        No gradient is computed for ``cell.state_bias``.
    """
    # Both results are time-major: [steps, batch, ...].
    states, logits = static_rnn(x, cell)

    batch_size, step_size = y.shape
    batch_index = np.arange(batch_size)

    # Size the accumulators from the cell rather than from fixed constants.
    dLdV = np.zeros(cell.V.shape)
    dLdU = np.zeros(cell.U.shape)
    dLdW = np.zeros(cell.W.shape)

    # static_rnn starts from an all-zero state, which is what precedes step 0.
    initial_state = np.zeros((batch_size, cell.hidden_size))

    for i in reversed(range(step_size)):
        # Softmax is applied per step on a [batch, vocab] slice so that the
        # normalisation runs over the vocabulary axis.
        delta_o = softmax(logits[i])
        # d(loss)/d(logits) = probabilities - one_hot(target)
        delta_o[batch_index, y[:, i]] -= 1

        # Sum over the batch of outer(delta_o[n], states[i][n]).
        dLdV += delta_o.T.dot(states[i])

        # Error at the tanh pre-activation of step i.
        delta = delta_o.dot(cell.V) * (1 - states[i] ** 2)

        # Walk back from step i (inclusive) to step 0. At step j the
        # pre-activation depends on x[:, j] and on the state of step j - 1.
        for j in range(i, -1, -1):
            prev_state = states[j - 1] if j > 0 else initial_state

            dLdW += delta.T.dot(x[:, j])
            dLdU += delta.T.dot(prev_state)

            # Push the error one step further back through U and tanh'.
            delta = delta.dot(cell.U) * (1 - prev_state ** 2)

    return (dLdV, dLdU, dLdW)


def sgd(cell, lr, grads):
    """Apply one plain gradient-descent step to the cell's weights.

    Parameters
    ----------
    cell : object
        Object exposing ``V``, ``U`` and ``W`` attributes; they are updated
        in place.
    lr : float
        Learning rate.
    grads : sequence
        Gradients indexed as ``(dLdV, dLdU, dLdW)``.
    """
    # grads is positional: index 0 -> V, 1 -> U, 2 -> W.
    for position, attr in enumerate(("V", "U", "W")):
        weights = getattr(cell, attr)
        weights -= lr * grads[position]
        setattr(cell, attr, weights)

In [89]:
# Smoke test: one BPTT pass and one SGD step on random data
# (batch of 2 sequences, 5 steps each, 300-dimensional inputs).
rnn_cell = VanillaRNNCell(300, 128, vocab_size)
demo_inputs = np.random.normal(size=(2, 5, 300))
demo_labels = np.random.randint(low=0, high=vocab_size, size=(2, 5))
grads = bptt(rnn_cell, demo_inputs, demo_labels)
sgd(rnn_cell, 0.0001, grads)