In [4]:
# import libraries
import torchtext
from torchtext.vocab import Vectors

In [5]:
# intialize inputs
TEXT = torchtext.data.Field()

In [6]:
# get 10k fields of the PTB data
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path="../data", 
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)

In [7]:
# we train on the entire corpus, modeled as a single sentence
print('len(train)', len(train))

len(train) 1


In [8]:
# build the vocabulary. 10001 because the vocab has <unk> but then torchtext adds its own
TEXT.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))

len(TEXT.vocab) 10001


In [9]:
# for debugging, reduce vocabulary.
if False:
    TEXT.build_vocab(train, max_size=1000)
    print(len(TEXT.vocab))

In [10]:
# make batch iterators
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=10, device=-1, bptt_len=32, repeat=False)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [11]:
# each batch is a string of length 32 and sentences are ended with a special <eos> token
it = iter(train_iter)
batch = next(it) 
print("Size of text batch [max bptt length, batch size]", batch.text.size())
print("Second in batch", batch.text[:, 2])
print("Converted back to string: ", " ".join([TEXT.vocab.itos[i] for i in batch.text[:, 2].data]))

Size of text batch [max bptt length, batch size] torch.Size([32, 10])
Second in batch tensor([   8,  202,   77,    5,  183,  561, 3837,   18,  975,  976,    7,  943,
           5,  157,   78, 1571,  289,  645,    3,   30,  132,    0,   20,    2,
         273, 7821,   17,    9,  117, 2815,  969,    6])
Converted back to string:  in part because of buy programs generated by stock-index arbitrage a form of program trading involving futures contracts <eos> but interest <unk> as the day wore on and investors looked ahead to


In [12]:
# each consecutive batch is a continuation of the previous one. there are no separate labels
batch = next(it)
print("Converted back to string: ", " ".join([TEXT.vocab.itos[i] for i in batch.text[:, 2].data]))

Converted back to string:  the release later this week of two important economic reports <eos> the first is wednesday 's survey of purchasing managers considered a good indicator of how the nation 's manufacturing sector fared


In [None]:
# the task is such that given a 10 word prefix of sentences, 
# we predict 10 possible next word candidates
!head input.txt

In [None]:
# as a sample Kaggle submission, let's build a majority-baseline (naive unigram model)
from collections import Counter
count = Counter()
for batch in iter(train_iter):
    count.update(batch.text.view(-1).data.tolist())
count[TEXT.vocab.stoi["<eos>"]] = 0
predictions = [TEXT.vocab.itos[i] for i, c in count.most_common(20)]
with open("sample.txt", "w") as fout: 
    print("id,word", file=fout)
    for i, l in enumerate(open("../input.txt"), 1):
        print("%d,%s"%(i, " ".join(predictions)), file=fout)