## Random sampling

In [188]:
import random
import torch

def seq_data_iter_random(corpus, batch_size, num_steps):
    corpus = corpus[random.randint(0,num_steps-1):]
    print(f'corpus : {corpus}')
    num_subseqs = (len(corpus)-1)//num_steps
    print(f'num_subseqs : {num_subseqs}')
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)
    print(f'initial_indices : {initial_indices}')
    
    def data(pos):
        return corpus[pos:pos+num_steps]
    
    num_batches = num_subseqs//batch_size
    print(f'num_batches: {num_batches}')
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i: i+batch_size]
        print(f'\tinitial_indices_per_batch : {initial_indices_per_batch}')
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

In [189]:
my_seq = range(0,45)

for X, Y in seq_data_iter_random(my_seq, batch_size=4, num_steps=4):
    print(f'\n\tX : {X}, \n\tY : {Y}')

corpus : range(2, 45)
num_subseqs : 10
initial_indices : [4, 12, 24, 16, 36, 0, 28, 8, 32, 20]
num_batches: 2
	initial_indices_per_batch : [4, 12, 24, 16]

	X : tensor([[ 6,  7,  8,  9],
        [14, 15, 16, 17],
        [26, 27, 28, 29],
        [18, 19, 20, 21]]), 
	Y : tensor([[ 7,  8,  9, 10],
        [15, 16, 17, 18],
        [27, 28, 29, 30],
        [19, 20, 21, 22]])
	initial_indices_per_batch : [36, 0, 28, 8]

	X : tensor([[38, 39, 40, 41],
        [ 2,  3,  4,  5],
        [30, 31, 32, 33],
        [10, 11, 12, 13]]), 
	Y : tensor([[39, 40, 41, 42],
        [ 3,  4,  5,  6],
        [31, 32, 33, 34],
        [11, 12, 13, 14]])


### Not able to understand therefore a walkthrough

In [190]:
num_steps = 4
batch_size = 4

corpus = my_seq[random.randint(0,num_steps-1):]
corpus

range(2, 45)

In [191]:
num_subseqs = (len(corpus)-1)//num_steps
num_subseqs

10

In [192]:
initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
initial_indices

[0, 4, 8, 12, 16, 20, 24, 28, 32, 36]

In [193]:
random.shuffle(initial_indices)
initial_indices

[12, 4, 20, 0, 32, 36, 8, 16, 24, 28]

In [194]:
num_batches = num_subseqs // batch_size
num_batches

2

In [195]:
for i in range(0, num_batches * batch_size, batch_size):
    initial_indices_per_batch = initial_indices[i: i+ batch_size]
    print(f'initial_indices_per_batch : {initial_indices_per_batch}')
    X = [corpus[j:j+ num_steps] for j in initial_indices_per_batch]
    Y = [corpus[j+1:j+1+num_steps] for j in initial_indices_per_batch]
    print(torch.tensor(X), torch.tensor(Y))
    

initial_indices_per_batch : [12, 4, 20, 0]
tensor([[14, 15, 16, 17],
        [ 6,  7,  8,  9],
        [22, 23, 24, 25],
        [ 2,  3,  4,  5]]) tensor([[15, 16, 17, 18],
        [ 7,  8,  9, 10],
        [23, 24, 25, 26],
        [ 3,  4,  5,  6]])
initial_indices_per_batch : [32, 36, 8, 16]
tensor([[34, 35, 36, 37],
        [38, 39, 40, 41],
        [10, 11, 12, 13],
        [18, 19, 20, 21]]) tensor([[35, 36, 37, 38],
        [39, 40, 41, 42],
        [11, 12, 13, 14],
        [19, 20, 21, 22]])


## Sequential sampling

In [196]:
# can we generate sequential sampling by not shufflingthe indices

def seq_data_iter_sequential(corpus, batch_size, num_steps):
    corpus = corpus[random.randint(0,num_steps-1):]
    print(f'corpus : {corpus}')
    num_subseqs = (len(corpus)-1)//num_steps
    print(f'num_subseqs : {num_subseqs}')
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    #random.shuffle(initial_indices)
    print(f'initial_indices : {initial_indices}')
    
    def data(pos):
        return corpus[pos:pos+num_steps]
    
    num_batches = num_subseqs//batch_size
    print(f'num_batches: {num_batches}')
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i: i+batch_size]
        print(f'\tinitial_indices_per_batch : {initial_indices_per_batch}')
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

In [197]:
my_seq = list(range(0,35))

for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

# apparently not as the list in  individual batches are adjacent based on index, 
# not to each other in the same mini batch

corpus : [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
num_subseqs : 6
initial_indices : [0, 5, 10, 15, 20, 25]
num_batches: 3
	initial_indices_per_batch : [0, 5]
X:  tensor([[ 3,  4,  5,  6,  7],
        [ 8,  9, 10, 11, 12]]) 
Y: tensor([[ 4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13]])
	initial_indices_per_batch : [10, 15]
X:  tensor([[13, 14, 15, 16, 17],
        [18, 19, 20, 21, 22]]) 
Y: tensor([[14, 15, 16, 17, 18],
        [19, 20, 21, 22, 23]])
	initial_indices_per_batch : [20, 25]
X:  tensor([[23, 24, 25, 26, 27],
        [28, 29, 30, 31, 32]]) 
Y: tensor([[24, 25, 26, 27, 28],
        [29, 30, 31, 32, 33]])


In [198]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    offset = random.randint(0, num_steps)
    print(f'offset: {offset}')
    num_tokens = ((len(corpus) - offset -1)//batch_size)*batch_size
    print(f'num_tokens : {num_tokens}')
    Xs = torch.tensor(corpus[offset:offset + num_tokens])
    print(f'Xs : {Xs}')
    Ys = torch.tensor(corpus[offset+1:offset+1+num_tokens])
    print(f'Ys : {Ys}')
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1]//batch_size
    print(f'num_batches : {num_batches}')
    
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:,i:i+ num_steps]
        Y = Ys[:,i:i+ num_steps]
        yield X, Y

In [199]:
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)
    break

offset: 1
num_tokens : 32
Xs : tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32])
Ys : tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33])
num_batches : 8
X:  tensor([[ 1,  2,  3,  4,  5],
        [17, 18, 19, 20, 21]]) 
Y: tensor([[ 2,  3,  4,  5,  6],
        [18, 19, 20, 21, 22]])


in sequential when batch size is two we split in the middle to make the two division

In [200]:
from d2l import torch as d2l

class SeqDataLoader:
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential
        self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps
    
    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
        

In [201]:
def load_data_time_machine(batch_size,num_steps, use_random_iter=False, max_tokens=1000):
    data_iter = SeqDataLoader(batch_size, num_steps, use_random_iter, max_tokens)
    return data_iter, data_iter.vocab

### Exercises

1. Suppose there are 100, 000 words in the training dataset. How much word frequency and
multi-word adjacent frequency does a four-gram need to store?
    - what does multi word adjacent frequency mean? i reckon about 99,997 - 100000

2. How would you model a dialogue?
    - 1. by taking out the speakers name, through puttingin the stop word, rest shouldbe the same.
    
3. Estimate the exponent of Zipfʼs law for unigrams, bigrams, and trigrams.
    - maybe helpful : https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0053227
    
4. What other methods can you think of for reading long sequence data?
    - dictionary based
    
5. Consider the random offset that we use for reading long sequences.

    1. Why is it a good idea to have a random offset?
    2. Does it really lead to a perfectly uniform distribution over the sequences on the document?
    3. What would you have to do to make things even more uniform?
    
6. If we want a sequence example to be a complete sentence, what kind of problem does this
introduce in minibatch sampling? How can we fix the problem?

In [202]:
data_iter, data_vocab = load_data_time_machine(2,4)

In [203]:
data_iter,data_vocab = d2l.load_data_time_machine(2,4)

In [204]:
len(data_vocab)

28

In [205]:
data_vocab['the']

0

In [206]:
def one_hot(t, t_len):
    new_tensor = torch.zeros(t.shape[0], t_len)
    for i in range(T.shape[0]):
        new_tensor[i][T[i]] = 1
    return new_tensor

In [207]:
# testing
T = torch.tensor([0,2])
#print(int(T[0]))

#one_hot(T, 14)

#print(T.shape)

new_tensor = torch.zeros(T.shape[0],14)

#print(new_tensor)


for i in range(T.shape[0]):
    new_tensor[i][T[i]] = 1

print(new_tensor)


tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])


In [208]:
from torch.nn import functional as F

F.one_hot(torch.tensor([0,2]), len(data_vocab))

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

In [209]:
def normal(shape, device=torch.device('cuda')):
    return torch.randn(size=shape, device=device) * 0.01

In [210]:
def get_params(vocab_size, num_hiddens, device):
    num_inputs = num_outputs = vocab_size
    
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    bh = torch.zeros(num_hiddens, device=device)
    
    W_hq = normal((num_hiddens, num_outputs))
    bq  = torch.zeros(num_outputs, device=device)
    
    params = [W_xh, W_hh, bh, W_hq, bq]
    for param in params:
        param.requires_grad = True
    
    return params
    
    
    

In [211]:
output_array = get_params(len(data_vocab), 4,torch.device('cpu'))

In [212]:
len(data_vocab)

28

In [213]:
for i in output_array:
    print(i.shape)

torch.Size([28, 4])
torch.Size([4, 4])
torch.Size([4])
torch.Size([4, 28])
torch.Size([28])


In [214]:
def rnn(inputs, state,params, device):
    W_xh, W_hh, bh, W_hq, bq = params
    W_xh = W_xh.to(device)
    W_hh = W_hh.to(device)
    bh = bh.to(device)
    W_hq = W_hq.to(device)
    bq = bq.to(device)
    H = state
    H = H.to(device)
    outputs = []
    for X in inputs:
        X = X.to(device)
#         print(X.get_device(), W_xh.get_device(), H.get_device(), W_hh.get_device(),bh.get_device())
        H = torch.tanh(torch.mm(X, W_xh) + torch.mm(H, W_hh) + bh)
#         print(H)
        Y = torch.mm(H, W_hq) + bq
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H)

In [215]:
def init_rnn_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens),device=device))

In [216]:
X = torch.arange(28).reshape((28, 1))
print(X.T.shape)
print(len(data_vocab))
X = F.one_hot(X.T, len(data_vocab)).type(torch.float32)


torch.Size([1, 28])
28


In [217]:
state = init_rnn_state(28,4, torch.device('cpu'))
output_rnn = rnn(X,state,output_array, torch.device('cpu'))
#output_rnn

In [218]:
class RNNfromscratch:
    def __init__(self, vocab_size, num_hiddens, device, get_params, init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state = init_state
        self.forward_fn = forward_fn
        self.device = device
    
    def __call__(self, X, state):
        X = F.one_hot(X.T.to(torch.int64), self.vocab_size, ).type(torch.float32)
        return self.forward_fn(X, state, self.params,self.device)
    
    def begin_state(self, batch_size, device):
        return self.init_state(batch_size,self.num_hiddens, device=device)

In [219]:
num_hiddens = 512
net = RNNfromscratch(len(data_vocab), num_hiddens, d2l.try_gpu(), get_params,init_rnn_state, rnn)
state = net.begin_state(X.shape[0], d2l.try_gpu())

In [220]:
Y, new_state = net(X.to(d2l.try_gpu()), state)
Y.shape, len(new_state), new_state[0].shape

RuntimeError: tensors must be 2-D