## Random sampling

In [2]:
import random
import torch

def seq_data_iter_random(corpus, batch_size, num_steps):
    corpus = corpus[random.randint(0,num_steps-1):]
    print(f'corpus : {corpus}')
    num_subseqs = (len(corpus)-1)//num_steps
    print(f'num_subseqs : {num_subseqs}')
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)
    print(f'initial_indices : {initial_indices}')
    
    def data(pos):
        return corpus[pos:pos+num_steps]
    
    num_batches = num_subseqs//batch_size
    print(f'num_batches: {num_batches}')
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i: i+batch_size]
        print(f'\tinitial_indices_per_batch : {initial_indices_per_batch}')
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

In [3]:
my_seq = range(0,45)

for X, Y in seq_data_iter_random(my_seq, batch_size=4, num_steps=4):
    print(f'\n\tX : {X}, \n\tY : {Y}')

corpus : range(0, 45)
num_subseqs : 11
initial_indices : [16, 24, 36, 4, 32, 20, 40, 0, 12, 28, 8]
num_batches: 2
	initial_indices_per_batch : [16, 24, 36, 4]

	X : tensor([[16, 17, 18, 19],
        [24, 25, 26, 27],
        [36, 37, 38, 39],
        [ 4,  5,  6,  7]]), 
	Y : tensor([[17, 18, 19, 20],
        [25, 26, 27, 28],
        [37, 38, 39, 40],
        [ 5,  6,  7,  8]])
	initial_indices_per_batch : [32, 20, 40, 0]

	X : tensor([[32, 33, 34, 35],
        [20, 21, 22, 23],
        [40, 41, 42, 43],
        [ 0,  1,  2,  3]]), 
	Y : tensor([[33, 34, 35, 36],
        [21, 22, 23, 24],
        [41, 42, 43, 44],
        [ 1,  2,  3,  4]])


### Not able to understand therefore a walkthrough

In [4]:
num_steps = 4
batch_size = 4

corpus = my_seq[random.randint(0,num_steps-1):]
corpus

range(3, 45)

In [5]:
num_subseqs = (len(corpus)-1)//num_steps
num_subseqs

10

In [6]:
initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
initial_indices

[0, 4, 8, 12, 16, 20, 24, 28, 32, 36]

In [7]:
random.shuffle(initial_indices)
initial_indices

[4, 32, 28, 24, 20, 16, 36, 8, 12, 0]

In [8]:
num_batches = num_subseqs // batch_size
num_batches

2

In [9]:
for i in range(0, num_batches * batch_size, batch_size):
    initial_indices_per_batch = initial_indices[i: i+ batch_size]
    print(f'initial_indices_per_batch : {initial_indices_per_batch}')
    X = [corpus[j:j+ num_steps] for j in initial_indices_per_batch]
    Y = [corpus[j+1:j+1+num_steps] for j in initial_indices_per_batch]
    print(torch.tensor(X), torch.tensor(Y))
    

initial_indices_per_batch : [4, 32, 28, 24]
tensor([[ 7,  8,  9, 10],
        [35, 36, 37, 38],
        [31, 32, 33, 34],
        [27, 28, 29, 30]]) tensor([[ 8,  9, 10, 11],
        [36, 37, 38, 39],
        [32, 33, 34, 35],
        [28, 29, 30, 31]])
initial_indices_per_batch : [20, 16, 36, 8]
tensor([[23, 24, 25, 26],
        [19, 20, 21, 22],
        [39, 40, 41, 42],
        [11, 12, 13, 14]]) tensor([[24, 25, 26, 27],
        [20, 21, 22, 23],
        [40, 41, 42, 43],
        [12, 13, 14, 15]])


## Sequential sampling

In [10]:
# can we generate sequential sampling by not shufflingthe indices

def seq_data_iter_sequential(corpus, batch_size, num_steps):
    corpus = corpus[random.randint(0,num_steps-1):]
    print(f'corpus : {corpus}')
    num_subseqs = (len(corpus)-1)//num_steps
    print(f'num_subseqs : {num_subseqs}')
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    #random.shuffle(initial_indices)
    print(f'initial_indices : {initial_indices}')
    
    def data(pos):
        return corpus[pos:pos+num_steps]
    
    num_batches = num_subseqs//batch_size
    print(f'num_batches: {num_batches}')
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i: i+batch_size]
        print(f'\tinitial_indices_per_batch : {initial_indices_per_batch}')
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

In [11]:
my_seq = list(range(0,35))

for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

# apparently not as the list in  individual batches are adjacent based on index, 
# not to each other in the same mini batch

corpus : [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
num_subseqs : 6
initial_indices : [0, 5, 10, 15, 20, 25]
num_batches: 3
	initial_indices_per_batch : [0, 5]
X:  tensor([[ 3,  4,  5,  6,  7],
        [ 8,  9, 10, 11, 12]]) 
Y: tensor([[ 4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13]])
	initial_indices_per_batch : [10, 15]
X:  tensor([[13, 14, 15, 16, 17],
        [18, 19, 20, 21, 22]]) 
Y: tensor([[14, 15, 16, 17, 18],
        [19, 20, 21, 22, 23]])
	initial_indices_per_batch : [20, 25]
X:  tensor([[23, 24, 25, 26, 27],
        [28, 29, 30, 31, 32]]) 
Y: tensor([[24, 25, 26, 27, 28],
        [29, 30, 31, 32, 33]])


In [12]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    offset = random.randint(0, num_steps)
    print(f'offset: {offset}')
    num_tokens = ((len(corpus) - offset -1)//batch_size)*batch_size
    print(f'num_tokens : {num_tokens}')
    Xs = torch.tensor(corpus[offset:offset + num_tokens])
    print(f'Xs : {Xs}')
    Ys = torch.tensor(corpus[offset+1:offset+1+num_tokens])
    print(f'Ys : {Ys}')
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1]//batch_size
    print(f'num_batches : {num_batches}')
    
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:,i:i+ num_steps]
        Y = Ys[:,i:i+ num_steps]
        yield X, Y

In [13]:
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)
    break

offset: 2
num_tokens : 32
Xs : tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33])
Ys : tensor([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])
num_batches : 8
X:  tensor([[ 2,  3,  4,  5,  6],
        [18, 19, 20, 21, 22]]) 
Y: tensor([[ 3,  4,  5,  6,  7],
        [19, 20, 21, 22, 23]])


in sequential when batch size is two we split in the middle to make the two division

In [19]:
from d2l import torch as d2l

class SeqDataLoader:
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential
        self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps
    
    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
        

In [15]:
def load_data_time_machine(batch_size,num_steps, use_random_iter=False, max_tokens=1000):
    data_iter = SeqDataLoader(batch_size, num_steps, use_random_iter, max_tokens)
    return data_iter, data_iter.vocab

### Exercises

1. Suppose there are 100, 000 words in the training dataset. How much word frequency and
multi-word adjacent frequency does a four-gram need to store?
    - what does multi word adjacent frequency mean? i reckon about 99,997 - 100000

2. How would you model a dialogue?
    - 1. by taking out the speakers name, through puttingin the stop word, rest shouldbe the same.
    
3. Estimate the exponent of Zipfʼs law for unigrams, bigrams, and trigrams.
    - maybe helpful : https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0053227
    
4. What other methods can you think of for reading long sequence data?
    - dictionary based
    
5. Consider the random offset that we use for reading long sequences.

    1. Why is it a good idea to have a random offset?
    2. Does it really lead to a perfectly uniform distribution over the sequences on the document?
    3. What would you have to do to make things even more uniform?
    
6. If we want a sequence example to be a complete sentence, what kind of problem does this
introduce in minibatch sampling? How can we fix the problem?

In [20]:
data_iter, data_vocab = load_data_time_machine(2,4)

In [23]:
data_vocab['the']

0