In [1]:
# Text processing library
import torchtext
from torchtext.vocab import Vectors
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import itertools as it
from models import *
from helpers import *
# Debug switch: when True, the vocabulary is rebuilt with max_size=1000 in the data-loading cell.
debug = False

In [2]:
# Our input $x$
TEXT = torchtext.data.Field()

# Data distributed with the assignment.
# Note: the "test" split is read from valid.txt, the same file as the validation split.
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".",
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)

# Build the vocabulary exactly once: capped at 1000 entries in debug mode,
# uncapped otherwise (max_size=None means "no limit"). Previously debug mode
# built the full vocabulary first and then threw it away.
TEXT.build_vocab(train, max_size=1000 if debug else None)

# Default iterators for all three splits (batch_size=10, bptt_len=32).
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=10, device=-1, bptt_len=32, repeat=False)

# Attach the pretrained wiki.simple word vectors to the vocabulary
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

In [11]:
# Small shuffled iterators used for eyeballing a few batches.
# NOTE: this rebinds train_iter / val_iter / test_iter, replacing any
# iterators that were created by earlier cells.
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=5, device=-1, bptt_len=32, repeat=False, shuffle=True)


def inspect_batch(batch):
    """Print every column of ``batch.text`` as one space-separated line of tokens.

    Each index in a column is mapped to its string through ``TEXT.vocab.itos``.
    """
    for col in range(batch.text.data.size(1)):
        # Distinct names for the column index and the token index: the
        # original reused ``i`` for both, which only worked because the
        # comprehension's iterable is evaluated in the enclosing scope.
        print(' '.join([TEXT.vocab.itos[tok] for tok in batch.text[:, col].data]))


# Named ``batch_iter`` rather than ``it`` so the ``itertools as it`` alias
# imported in the first cell is not clobbered.
batch_iter = iter(train_iter)
for batch_idx in range(2):
    batch = next(batch_iter)
    print(batch.text.data.size(0))
    inspect_batch(batch)
    print("EOB")


32
aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter <eos> pierre <unk> N years old will join
in part because of buy programs generated by stock-index arbitrage a form of program trading involving futures contracts <eos> but interest <unk> as the day wore on and investors looked ahead to
recorders and personal computers and then sell them at a huge <unk> <eos> the going rate for a small personal computer that costs about $ N in the west is anywhere from
comes across as a <unk> executive mr. phillips has a <unk> <unk> <eos> during time off mr. roman tends to his garden mr. phillips <unk> to a <unk> for among other things
with notes they 're <unk> distinct <eos> dean witter reynolds inc. lost its second recent arbitration case involving a former <unk> executive <eos> a new york stock exchange arbitration panel ordered dean
EOB
32
the board as a nonexecutive directo

Perplexity goals:
count: 120-200
feedforward: 100-150
recurrent: below 100 (between 80-100)

In [245]:
# Single-stream training iterator with very long windows
# (batch_size=1, bptt_len=10000) for collecting the trigram counts.
train_iter, _, _ = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=1,
    device=-1,
    bptt_len=10000,
    repeat=False,
)

tgram = Trigram(TEXT)
tgram.train_counts(train_iter)
# Starting alpha values (presumably the interpolation weights -- confirm in Trigram.set_alpha).
tgram.set_alpha(0.25, 0.25)

Iteration 0


In [269]:
# Single-stream validation iterator matching the trigram training setup.
_, val_iter, _ = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=1,
    device=-1,
    bptt_len=10000,
    repeat=False,
)

# Only one (a1, a2) pair is evaluated here. For a full sweep, iterate over
# itertools.product(np.arange(0.1, 1, 0.1), repeat=2) instead.
for (a1, a2) in [(0.3, 0.5)]:
    # Pairs whose sum reaches 1 are skipped.
    if a1 + a2 < 1:
        tgram.set_alpha(a1, a2)
        le = LangEvaluator(tgram, TEXT, evalmetric='perplexity')
        print(a1, a2, le.evaluate(val_iter))

Iteration 0
0.3 0.5 191.339042201


In [332]:
# NOTE: relies on model_nnlm, which this notebook only creates in the cell
# that follows, so this cell fails on a fresh top-to-bottom run.
# Collect the parameters that will be updated during training and show their shapes.
params_train = [p for p in model_nnlm.parameters() if p.requires_grad]
print([p.size() for p in params_train])

[torch.Size([10001, 300]), torch.Size([60, 1, 6, 300]), torch.Size([60]), torch.Size([10001, 1, 6, 300]), torch.Size([10001]), torch.Size([10001, 60]), torch.Size([10001])]


In [330]:
# Training iterator for the feed-forward language model (batch_size=10, bptt_len=32).
train_iter, _, _ = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=10,
    device=-1,
    bptt_len=32,
    repeat=False,
)

# Build the NNLM and train it with the trainer's default settings.
model_nnlm = NNLM(TEXT)
trainer = LangTrainer(TEXT, model_nnlm)
trainer.train(train_iter)

Iteration 0, loss: 9.210577, norm: 0.000000
Iteration 10, loss: 9.138780, norm: 0.000000
Iteration 20, loss: 9.048756, norm: 0.000000
Iteration 30, loss: 8.949280, norm: 0.000000
Iteration 40, loss: 8.841352, norm: 0.000000
Iteration 50, loss: 8.572618, norm: 0.000000
Iteration 60, loss: 8.503356, norm: 0.000000
Iteration 70, loss: 8.215203, norm: 0.000000
Iteration 80, loss: 7.986099, norm: 0.000000
Iteration 90, loss: 7.916728, norm: 0.000000


In [3]:
# Training iterator for the LSTM language model (batch_size=128, bptt_len=36).
train_iter, _, _ = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=128,
    device=-1,
    bptt_len=36,
    repeat=False,
)

# Train the LSTM for 3 iterations of trainer.train with use_hidden enabled
# and a learning rate of 1.0.
model_lstm = LSTMLM2(TEXT)
trainer = LangTrainer(TEXT, model_lstm, use_hidden=True, lrn_rate=1.0)
trainer.train(train_iter, num_iter=3)

Using CUDA for evaluation...
Epoch 0, loss: 162.435089, norm: 6.820673, elapsed: 52.066823, lrn_rate: 1.000000
Epoch 1, loss: 155.884277, norm: 3.801386, elapsed: 105.139259, lrn_rate: 1.000000
Epoch 2, loss: 150.217239, norm: 4.420662, elapsed: 158.387465, lrn_rate: 1.000000


0

In [8]:
# Score the trained LSTM on the training iterator first, then on the test iterator.
le = LangEvaluator(model_lstm, TEXT, use_hidden=True)
for data_iter in (train_iter, test_iter):
    print(le.evaluate(data_iter))

Using CUDA for evaluation...
Validation time: 20.806038 seconds
2411.5569330249964
Validation time: 4.735303 seconds
2414.4938698270835


In [10]:
# Keep only the parameters that currently hold a gradient.
parameters = [p for p in model_lstm.parameters() if p.grad is not None]
# Despite its name, total_norm is the single largest absolute gradient entry
# found across those parameters.
total_norm = max(p.grad.data.abs().max() for p in parameters)
print(total_norm)
# Compare against the norms the trainer recorded while training.
print(trainer.training_norms)

0.07247233390808105
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


## IGNORE STUFF BELOW HERE

In [128]:
    # NOT USED!
    # Here arr is a 1-D numpy array; this returns 
    # groups of n consecutive words (with overlapping)
    def get_ngrams(self, arr, n=3):
        """Return every run of ``n`` consecutive elements of ``arr`` (overlapping).

        Args:
            arr: 1-D numpy array.
            n: window length (non-negative).

        Returns:
            Array of shape ``(max(len(arr) - n + 1, 0), n)`` whose row ``i``
            is ``arr[i:i + n]``. The result is empty when ``arr`` has fewer
            than ``n`` elements.
        """
        # A sequence shorter than n has no n-grams. Clamp the count so that
        # case yields an empty (0, n) result instead of failing on a
        # negative length.
        len_ngrams = max(arr.shape[0] - n + 1, 0)
        # Broadcast a column of window starts against a row of in-window
        # offsets: ngram_inds[i, j] == i + j.
        ngram_inds = np.arange(len_ngrams)[:, None] + np.arange(n)[None, :]
        return np.take(arr, ngram_inds)

In [288]:
a = [1, 2, 3]
# Each element is doubled and then offset by its own position in the list.
b = [pos + 2 * val for pos, val in enumerate(a)]
print(b)

[2, 5, 8]


In [285]:
# Show the dimensions of the vector table attached to the vocabulary.
print(TEXT.vocab.vectors.size())

torch.Size([10001, 300])


In [5]:
# Scan every training batch and report any batch that contains the padding token.
pad_token = TEXT.vocab.stoi['<pad>']
cnt = 0  # 1-based batch counter; stays 0 if the iterator yields nothing
# Iterate train_iter directly instead of binding an iterator to ``it``, which
# would clobber the ``itertools as it`` alias imported in the first cell.
for cnt, batch in enumerate(train_iter, start=1):
    # True as soon as at least one position in the batch equals pad_token.
    if (batch.text.data.numpy() == pad_token).any():
        print(batch.text.data, cnt)

In [6]:
def f(x):
    """Print ``x``."""
    print(x)

# Star-unpacking a one-element list passes its single item as the positional argument x.
a = [1]
f(*a)

1
