# The Language Model Dataset

Read minibatches of input sequences and their label sequences (each label sequence is the input sequence shifted by one token) at random.

In [2]:
import random
import torch
from d2l import torch as d2l

# Unpack the two values returned by the d2l helper into the corpus and its vocabulary.
corpus, vocab = d2l.load_corpus_time_machine()

Random Sampling

In [3]:
class SeqDataLoader:
    """Iterable that yields random minibatches of fixed-length subsequences.

    Every pass over the loader drops a random prefix of fewer than
    ``num_steps`` tokens, cuts the remainder into non-overlapping windows of
    ``num_steps`` tokens, shuffles the windows and groups them into batches of
    ``batch_size``. Windows left over after the last full batch are discarded.

    Args:
        corpus: Sliceable sequence of token indices (presumably a list of
            ints, which is what the demo in this notebook passes).
        batch_size: Number of subsequences per minibatch.
        num_steps: Number of tokens per subsequence.

    Yields:
        ``(X, Y)`` tensors built with ``torch.tensor``; each row of ``Y`` is
        the window that starts one token after the matching row of ``X``.
    """
    def __init__(self, corpus, batch_size, num_steps):
        self.corpus = corpus
        self.b = batch_size
        self.n = num_steps

    def __iter__(self):
        # A fresh random offset per pass means window boundaries differ
        # between epochs; randint is inclusive on both ends.
        offset = random.randint(0, self.n - 1)
        tokens = self.corpus[offset:]
        # One token is held back so the last window still has a shifted label.
        num_windows = (len(tokens) - 1) // self.n
        starts = [k * self.n for k in range(num_windows)]
        random.shuffle(starts)
        num_batches = num_windows // self.b
        for batch_no in range(num_batches):
            first = batch_no * self.b
            X, Y = [], []
            for start in starts[first:first + self.b]:
                X.append(tokens[start:start + self.n])
                Y.append(tokens[start + 1:start + 1 + self.n])
            yield torch.tensor(X), torch.tensor(Y)

Manually generate a sequence from 0 to 34 and iterate over it with a batch size of 3 and 5 time steps per subsequence.

In [4]:
# Toy run: the "corpus" is the integers 0..34, so token values equal their
# positions and each printed Y row holds the tokens one position after the
# matching X row.
for X, Y in SeqDataLoader(list(range(35)), batch_size=3, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  tensor([[19, 20, 21, 22, 23],
        [24, 25, 26, 27, 28],
        [ 4,  5,  6,  7,  8]]) 
Y: tensor([[20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29],
        [ 5,  6,  7,  8,  9]])
X:  tensor([[29, 30, 31, 32, 33],
        [ 9, 10, 11, 12, 13],
        [14, 15, 16, 17, 18]]) 
Y: tensor([[30, 31, 32, 33, 34],
        [10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19]])


Finally, we define a function `load_data_time_machine` that returns both the data iterator and the vocabulary.

In [5]:
def load_data_time_machine(batch_size, num_steps, max_tokens=10000):
    """Build a minibatch iterator over the time machine corpus.

    Args:
        batch_size: Number of subsequences per minibatch, passed to
            ``SeqDataLoader``.
        num_steps: Number of tokens per subsequence, passed to
            ``SeqDataLoader``.
        max_tokens: Forwarded unchanged to ``d2l.load_corpus_time_machine``.

    Returns:
        A ``(data_iter, vocab)`` pair: the ``SeqDataLoader`` over the loaded
        corpus and the vocabulary returned alongside that corpus.
    """
    tokens, vocabulary = d2l.load_corpus_time_machine(max_tokens)
    return SeqDataLoader(tokens, batch_size, num_steps), vocabulary