In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
from datetime import datetime

import os
import sys
sys.path.append(os.path.abspath('..'))
from util import get_wikipedia_data
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx

from markov import get_bigram_probs

In [2]:
sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)

In [8]:
print(sentences[0])
print([(k, v) for k, v in word2idx.items()][:10])

[13, 2000, 661, 2000, 1639, 74, 1850, 47, 2000, 16, 2000, 561, 1129, 1408, 1202, 26, 71, 483, 27, 21, 97, 2000, 224, 182, 15]
[('START', 0), ('END', 1), ('man', 2), ('paris', 3), ('britain', 4), ('england', 5), ('king', 6), ('woman', 7), ('rome', 8), ('london', 9)]


In [9]:
V = len(word2idx)
print("Vocab size:", V)

start_idx = word2idx['START']
end_idx = word2idx['END']

Vocab size: 2001


In [10]:
# a matrix where:
# row = last word
# col = current word
# value at [row, col] = p(current word | last word)
bigram_probs = get_bigram_probs(sentences,
                                V,
                                start_idx,
                                end_idx,
                                smoothing=0.1)

In [12]:
bigram_probs.shape

(2001, 2001)

In [None]:
def softmax(a):
    a = a - a.max()
    exp_a = np.exp(a)
    return exp_a / exp_a.sum(axis=1, keepdims=True)

# what is the loss if we set W = log(bigram_probs)?
W_bigram = np.log(bigram_probs)
bigram_losses = []

In [None]:
t0 = datetime.now()
for epoch in range(epochs):
    # shuffle sentences at each epoch
    random.shuffle(sentences)

    j = 0  # keep track of iterations
    for sentence in sentences:
        # convert sentence into one-hot encoded inputs and targets
        sentence = [start_idx] + sentence + [end_idx]
        n = len(sentence)
        inputs = np.zeros((n - 1, V))
        targets = np.zeros((n - 1, V))
        inputs[np.arange(n - 1), sentence[:n - 1]] = 1
        targets[np.arange(n - 1), sentence[1:]] = 1

        # get output predictions
        predictions = softmax(inputs.dot(W))

        # do a gradient descent step
        W = W - lr * inputs.T.dot(predictions - targets)

        # keep track of the loss
        loss = -np.sum(targets * np.log(predictions)) / (n - 1)
        losses.append(loss)

        # keep track of the bigram loss
        # only do it for the first epoch to avoid redundancy
        if epoch == 0:
            bigram_predictions = softmax(inputs.dot(W_bigram))
            bigram_loss = -np.sum(targets * np.log(bigram_predictions)) / (n - 1)
            bigram_losses.append(bigram_loss)

        if j % 10 == 0:
            print("epoch:", epoch, "sentence: %s/%s" % (j, len(sentences)),
                  "loss:", loss)
        j += 1

print("Elapsed time training:", datetime.now() - t0)
plt.plot(losses)

# plot a horizontal line for the bigram loss
avg_bigram_loss = np.mean(bigram_losses)
print("avg_bigram_loss:", avg_bigram_loss)
plt.axhline(y=avg_bigram_loss, color='r', linestyle='-')

In [36]:
snts = [[1, 3, 5], [2, 5, 6, 1, 3], [2, 4]]

V = 10

for s in snts[0]:
    # convert sentence into one-hot encoded inputs and targets
#     sentence = [start_idx] + sentence + [end_idx]
    n = len(s)
    inputs = np.zeros((n - 1, V))
    targets = np.zeros((n - 1, V))
    print(inputs)
    inputs[np.arange(n - 1), s[:n - 1]] = 1
    targets[np.arange(n - 1), s[1:]] = 1
    print(inputs)


TypeError: object of type 'int' has no len()

In [37]:
snts[0]

[1, 3, 5]