# The Language Model Dataset



In [2]:
import random
import torch
from d2l import torch as d2l

# Load the time machine dataset through the d2l helper; the call returns a
# pair that is unpacked into `corpus` and `vocab`.
# NOTE(review): `corpus` is presumably a flat sequence of token indices and
# `vocab` the token/index mapping -- confirm against the d2l implementation.
corpus, vocab = d2l.load_corpus_time_machine()

Random Sampling

In [3]:
class SeqDataLoader:
    """Iterable that yields minibatches of subsequences drawn by random sampling.

    Every pass over the loader cuts the corpus into non-overlapping windows of
    `num_steps` tokens, shuffles the windows, and groups them into batches of
    `batch_size`. Each yielded item is a pair `(X, Y)` of tensors built from
    `batch_size` windows of `num_steps` tokens each, where `Y` holds the same
    windows as `X` shifted one position to the right in the corpus.
    """
    def __init__(self, corpus, batch_size, num_steps):
        self.corpus = corpus
        self.b = batch_size
        self.n = num_steps

    def __iter__(self):
        # Drop a random prefix of 0 .. num_steps - 1 tokens so that the window
        # boundaries differ from one pass to the next.
        offset = random.randint(0, self.n - 1)
        tokens = self.corpus[offset:]
        # One token is held back because every window needs a label window
        # that starts one position later.
        num_subseqs = (len(tokens) - 1) // self.n
        starts = [k * self.n for k in range(num_subseqs)]
        random.shuffle(starts)
        # Windows that do not fill a complete batch are discarded.
        num_batches = num_subseqs // self.b
        for batch_no in range(num_batches):
            chosen = starts[batch_no * self.b:(batch_no + 1) * self.b]
            X, Y = [], []
            for start in chosen:
                X.append(tokens[start:start + self.n])
                Y.append(tokens[start + 1:start + 1 + self.n])
            yield torch.tensor(X), torch.tensor(Y)

Manually generate a sequence from 0 to 34 and iterate over it with a batch size of 3 and 5 time steps per subsequence

In [4]:
# Toy corpus 0..34: each batch holds 3 subsequences of 5 consecutive numbers,
# and Y is X shifted forward by one element. The loader samples with the
# unseeded `random` module, so the printed batches differ between runs.
for X, Y in SeqDataLoader(list(range(35)), batch_size=3, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  tensor([[12, 13, 14, 15, 16],
        [ 7,  8,  9, 10, 11],
        [17, 18, 19, 20, 21]]) 
Y: tensor([[13, 14, 15, 16, 17],
        [ 8,  9, 10, 11, 12],
        [18, 19, 20, 21, 22]])
X:  tensor([[22, 23, 24, 25, 26],
        [ 2,  3,  4,  5,  6],
        [27, 28, 29, 30, 31]]) 
Y: tensor([[23, 24, 25, 26, 27],
        [ 3,  4,  5,  6,  7],
        [28, 29, 30, 31, 32]])


Finally, we define a function `load_data_time_machine` that returns both the data iterator and the vocabulary.

In [5]:
def load_data_time_machine(batch_size, num_steps,
                           max_tokens=10000):
    """Build a random-sampling loader over the time machine corpus.

    `max_tokens` is forwarded to `d2l.load_corpus_time_machine`; the corpus it
    returns is wrapped in a `SeqDataLoader` configured with `batch_size` and
    `num_steps`.

    Returns:
        A `(data_iter, vocab)` tuple: the `SeqDataLoader` and the vocabulary
        object returned by the d2l helper.
    """
    tokens, vocab = d2l.load_corpus_time_machine(max_tokens)
    return SeqDataLoader(tokens, batch_size, num_steps), vocab