## Random sampling

In [38]:
import random
import torch

def seq_data_iter_random(corpus, batch_size, num_steps):
    corpus = corpus[random.randint(0,num_steps-1):]
    print(f'corpus : {corpus}')
    num_subseqs = (len(corpus)-1)//num_steps
    print(f'num_subseqs : {num_subseqs}')
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)
    print(f'initial_indices : {initial_indices}')
    
    def data(pos):
        return corpus[pos:pos+num_steps]
    
    num_batches = num_subseqs//batch_size
    print(f'num_batches: {num_batches}')
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i: i+batch_size]
        print(f'\tinitial_indices_per_batch : {initial_indices_per_batch}')
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

In [33]:
my_seq = range(0,45)

for X, Y in seq_data_iter_random(my_seq, batch_size=4, num_steps=4):
    print(f'\n\tX : {X}, \n\tY : {Y}')

corpus : range(1, 45)
num_subseqs : 10
initial_indices : [0, 4, 8, 12, 16, 20, 24, 28, 32, 36]
num_batches: 2
	initial_indices_per_batch : [28, 20, 16, 36]

	X : tensor([[29, 30, 31, 32],
        [21, 22, 23, 24],
        [17, 18, 19, 20],
        [37, 38, 39, 40]]), 
	Y : tensor([[30, 31, 32, 33],
        [22, 23, 24, 25],
        [18, 19, 20, 21],
        [38, 39, 40, 41]])
	initial_indices_per_batch : [4, 24, 12, 8]

	X : tensor([[ 5,  6,  7,  8],
        [25, 26, 27, 28],
        [13, 14, 15, 16],
        [ 9, 10, 11, 12]]), 
	Y : tensor([[ 6,  7,  8,  9],
        [26, 27, 28, 29],
        [14, 15, 16, 17],
        [10, 11, 12, 13]])


### Not able to understand therefore a walkthrough

In [24]:
num_steps = 4
batch_size = 4

corpus = my_seq[random.randint(0,num_steps-1):]
corpus

range(2, 45)

In [26]:
num_subseqs = (len(corpus)-1)//num_steps
num_subseqs

10

In [34]:
initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
initial_indices

[0, 4, 8, 12, 16, 20, 24, 28, 32, 36]

In [35]:
random.shuffle(initial_indices)
initial_indices

[24, 20, 16, 4, 8, 36, 12, 32, 28, 0]

In [37]:
num_batches = num_subseqs // batch_size
num_batches

2

In [44]:
for i in range(0, num_batches * batch_size, batch_size):
    initial_indices_per_batch = initial_indices[i: i+ batch_size]
    print(f'initial_indices_per_batch : {initial_indices_per_batch}')
    X = [corpus[j:j+ num_steps] for j in initial_indices_per_batch]
    Y = [corpus[j+1:j+1+num_steps] for j in initial_indices_per_batch]
    print(torch.tensor(X), torch.tensor(Y))
    

initial_indices_per_batch : [24, 20, 16, 4]
tensor([[26, 27, 28, 29],
        [22, 23, 24, 25],
        [18, 19, 20, 21],
        [ 6,  7,  8,  9]]) tensor([[27, 28, 29, 30],
        [23, 24, 25, 26],
        [19, 20, 21, 22],
        [ 7,  8,  9, 10]])
initial_indices_per_batch : [8, 36, 12, 32]
tensor([[10, 11, 12, 13],
        [38, 39, 40, 41],
        [14, 15, 16, 17],
        [34, 35, 36, 37]]) tensor([[11, 12, 13, 14],
        [39, 40, 41, 42],
        [15, 16, 17, 18],
        [35, 36, 37, 38]])


## Sequential sampling

In [46]:
# can we generate sequential sampling by not shufflingthe indices

def seq_data_iter_sequential(corpus, batch_size, num_steps):
    corpus = corpus[random.randint(0,num_steps-1):]
    print(f'corpus : {corpus}')
    num_subseqs = (len(corpus)-1)//num_steps
    print(f'num_subseqs : {num_subseqs}')
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    #random.shuffle(initial_indices)
    print(f'initial_indices : {initial_indices}')
    
    def data(pos):
        return corpus[pos:pos+num_steps]
    
    num_batches = num_subseqs//batch_size
    print(f'num_batches: {num_batches}')
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i: i+batch_size]
        print(f'\tinitial_indices_per_batch : {initial_indices_per_batch}')
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

In [50]:
my_seq = list(range(0,35))

for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

# apparently not as the list in  individual batches are adjacent based on index, 
# not to each other in the same mini batch

corpus : [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
num_subseqs : 6
initial_indices : [0, 5, 10, 15, 20, 25]
num_batches: 3
	initial_indices_per_batch : [0, 5]
X:  tensor([[ 4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13]]) 
Y: tensor([[ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]])
	initial_indices_per_batch : [10, 15]
X:  tensor([[14, 15, 16, 17, 18],
        [19, 20, 21, 22, 23]]) 
Y: tensor([[15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24]])
	initial_indices_per_batch : [20, 25]
X:  tensor([[24, 25, 26, 27, 28],
        [29, 30, 31, 32, 33]]) 
Y: tensor([[25, 26, 27, 28, 29],
        [30, 31, 32, 33, 34]])


In [55]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    offset = random.randint(0, num_steps)
    print(f'offset: {offset}')
    num_tokens = ((len(corpus) - offset -1)//batch_size)*batch_size
    print(f'num_tokens : {num_tokens}')
    Xs = torch.tensor(corpus[offset:offset + num_tokens])
    print(f'Xs : {Xs}')
    Ys = torch.tensor(corpus[offset+1:offset+1+num_tokens])
    print(f'Ys : {Ys}')
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1]//batch_size
    print(f'num_batches : {num_batches}')
    
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:,i:i+ num_steps]
        Y = Ys[:,i:i+ num_steps]
        yield X, Y

In [57]:
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)
    break

offset: 5
num_tokens : 28
Xs : tensor([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
        23, 24, 25, 26, 27, 28, 29, 30, 31, 32])
Ys : tensor([ 6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
        24, 25, 26, 27, 28, 29, 30, 31, 32, 33])
num_batches : 7
X:  tensor([[ 5,  6,  7,  8,  9],
        [19, 20, 21, 22, 23]]) 
Y: tensor([[ 6,  7,  8,  9, 10],
        [20, 21, 22, 23, 24]])


in sequential when batch size is two we split in the middle to make the two division

In [58]:
class SeqDataLoader:
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential
        self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = bathc_size, num_steps
    
    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
        

In [59]:
def load_data_time_machine(batch_size,num_steps, use_random_iter=False, max_tokens=1000):
    data_iter = SeqDataLoader(batch_size, num_steps, use_random_iter, max_tokens)
    return data_iter, data_iter.vocab

### Exercises

1. Suppose there are 100, 000 words in the training dataset. How much word frequency and
multi-word adjacent frequency does a four-gram need to store?
    - what does multi word adjacent frequency mean? i reckon about 99,997 - 100000

2. How would you model a dialogue?
    - 1. by taking out the speakers name, through puttingin the stop word, rest shouldbe the same.
    
3. Estimate the exponent of Zipfʼs law for unigrams, bigrams, and trigrams.
    - maybe helpful : https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0053227
    
4. What other methods can you think of for reading long sequence data?
    - dictionary based
    
5. Consider the random offset that we use for reading long sequences.
    1. Why is it a good idea to have a random offset?
    2. Does it really lead to a perfectly uniform distribution over the sequences on the document?
    3. What would you have to do to make things even more uniform?
    
6. If we want a sequence example to be a complete sentence, what kind of problem does this
introduce in minibatch sampling? How can we fix the problem?