In [1]:
import torch
import matplotlib.pyplot as plt
from torch import nn
from torch.nn import functional as F

In [2]:
batch_size, num_steps = 32, 35

In [3]:
#!curl --output pap.txt https://www.gutenberg.org/files/1342/1342-0.txt

In [4]:
from d2l import torch as d2l

In [5]:
import re

# Register the Time Machine text under the key 'time_machine' as a (URL, hash string) pair.
# NOTE(review): the second element looks like a SHA-1 checksum used by d2l.download — confirm against d2l.
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt','090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine(): #@save
    """Load the time machine dataset into a list of cleaned text lines.

    The file registered under 'time_machine' is fetched through d2l.download.
    In every line each run of non-letter characters is replaced by a single
    space, surrounding whitespace is stripped, and the text is lower-cased.

    Returns:
        list[str]: one cleaned string per line of the file (possibly empty).
    """
    with open(d2l.download('time_machine'), 'r') as handle:
        raw_lines = handle.readlines()

    cleaned = []
    for raw in raw_lines:
        letters_only = re.sub('[^A-Za-z]+', ' ', raw)
        cleaned.append(letters_only.strip().lower())
    return cleaned

lines = read_time_machine()


In [6]:
import re
def read_text(text_name='pap.txt'):
    """Read a UTF-8 text file and return its lines in normalised form.

    Each line has every run of non-letter characters collapsed to one space,
    is stripped of leading/trailing whitespace, and is lower-cased.

    Args:
        text_name: path of the text file to read.

    Returns:
        list[str]: one normalised string per line of the file.
    """
    with open(text_name, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    normalised = []
    for raw in raw_lines:
        # keep alphabetic characters only
        letters_only = re.sub('[^A-Za-z]+', ' ', raw)
        normalised.append(letters_only.strip().lower())
    return normalised

In [7]:
text_input = read_text()

In [8]:
# NOTE: this rebinds text_input, replacing the lines loaded by read_text() in the previous cell.
text_input = read_time_machine()

In [9]:
text_input[:5]

['the time machine by h g wells', '', '', '', '']

In [10]:
def tokenize(lines, token_type='word'):
    """Split each text line into a list of tokens.

    Args:
        lines: iterable of strings, one per text line.
        token_type: 'word' splits each line on whitespace; 'char' splits each
            line into its individual characters.

    Returns:
        list[list[str]]: one token list per input line.

    Raises:
        ValueError: if token_type is neither 'word' nor 'char'. (Previously the
            function fell through and silently returned None.)
    """
    if token_type == 'char':
        return [list(line) for line in lines]
    if token_type == 'word':
        return [line.split() for line in lines]
    raise ValueError(f"Wrong token type: {token_type!r} (expected 'word' or 'char')")

In [11]:
import collections

def count_corpus(tokens):
    """Count how often each token occurs across all token lines.

    Args:
        tokens: iterable of token sequences (one sequence per line).

    Returns:
        collections.Counter mapping token -> number of occurrences.
    """
    frequencies = collections.Counter()
    for line in tokens:
        for token in line:
            frequencies[token] += 1
    return frequencies

In [12]:
# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
class Vocab:
    """Vocabulary mapping between tokens and integer indices.

    Index 0 is reserved for the unknown token '<unk>'. The remaining tokens are
    indexed in order of decreasing corpus frequency.

    Args:
        tokens: list of token lists (one list per line); None means empty.
        min_freq: tokens occurring fewer than this many times are not indexed
            (they still appear in ``token_freqs``).
    """
    def __init__(self, tokens=None,min_freq=0):
        if tokens is None:
            tokens = []

        # Flatten the per-line token lists and count every token.
        counter = collections.Counter(token for line in tokens for token in line)

        # (token, frequency) pairs, most frequent first.
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        self.idx_to_token = ['<unk>']
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

        for token, freq in self._token_freqs:
            if freq < min_freq:
                continue
            # Membership is tested against the dict (O(1)) rather than the
            # list (O(n)), which made construction quadratic in vocabulary size.
            if token in self.token_to_idx:
                print(token + " found already")
                continue
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Number of indexed tokens, including '<unk>'."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list of tokens, to indices.

        A single token returns one int. A list returns a flat list of ints:
        nested lists are flattened, and every non-list element is looked up as
        a whole token. (Previously a string element of a list was iterated
        character by character, so ``vocab[['the', 'time']]`` looked up
        letters instead of words.) Unknown tokens map to ``self.unk``.
        """
        if isinstance(tokens, list):
            indices = []
            for item in tokens:
                if isinstance(item, list):
                    indices.extend(self[item])
                else:
                    indices.append(self.token_to_idx.get(item, self.unk))
            return indices
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        """Map one index (int) or an iterable of indices back to token(s)."""
        if isinstance(indices, int):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # Index for the unknown token
        return 0

    @property
    def token_freqs(self):  # (token, frequency) pairs sorted by decreasing frequency
        return self._token_freqs


In [13]:
tokens = tokenize(text_input)
tokens[:1]

[['the', 'time', 'machine', 'by', 'h', 'g', 'wells']]

In [14]:
# tokens = [token for line in tokens for token in line]
# tokens[:5]

In [15]:
vocab = Vocab(tokens)

In [16]:
# Sanity check: print the (token, frequency) entry for 'the'.
for i in vocab.token_freqs:
    if i[0] == 'the':
        print(i)
# print(vocab.token_freqs)
# vocab.token_to_idx

# the error is that we are using the entire sentence for creating a token
# problem in tokenize

('the', 2261)


In [17]:
vocab['the']

1

In [18]:
vocab.token_freqs[:5]

[('the', 2261), ('i', 1267), ('and', 1245), ('of', 1155), ('a', 816)]

In [19]:
def load_corpus(max_tokens=-1, text_input=text_input):
    """Tokenise the text, build a vocabulary and encode the text as indices.

    Args:
        max_tokens: if positive, keep only the first max_tokens indices.
        text_input: list of text lines. The default is bound once, when this
            function is defined, to the notebook-level ``text_input``.

    Returns:
        (corpus, vocab): the flat list of token indices and the Vocab built
        from the tokenised lines.
    """
    tokenised_lines = tokenize(text_input)
    vocabulary = Vocab(tokenised_lines)

    # Flatten all lines into one long index sequence.
    indices = []
    for line in tokenised_lines:
        for token in line:
            indices.append(vocabulary[token])

    if max_tokens > 0:
        return indices[:max_tokens], vocabulary
    return indices, vocabulary
    

In [20]:
corpus, vocab = load_corpus()

In [21]:
class SeqDataLoader:
    """Iterable wrapper that pairs the encoded corpus with a batching function.

    Args:
        batch_size: number of sequences per minibatch.
        num_steps: number of time steps per sequence.
        use_random_iter: if true, iterate with seq_data_iter_random, otherwise
            with seq_data_iter_sequential.
        max_tokens: forwarded to load_corpus to cap the corpus length.
    """
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = seq_data_iter_random
        else:
            self.data_iter_fn = seq_data_iter_sequential

        self.corpus, self.vocab = load_corpus(max_tokens=max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        """Return whatever the selected iterator function produces for this corpus."""
        # The attribute set in __init__ is `num_steps`; the previous
        # `self.steps` did not exist and raised AttributeError on iteration.
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)

In [22]:
def seq_data_iter_random(corpus, batch_size, num_steps):
    """Placeholder for random-sampling iteration: prints a notice and returns None.

    None of the arguments are used.
    """
    notice = "Dont use random seq data iter"
    print(notice)

In [23]:
import random
# Scratch values for walking through the sequential iterator by hand.
# NOTE: these rebind the notebook-level num_steps / batch_size.
num_steps = 5
batch_size=5
# random.randint includes both endpoints, so offset lies in [0, num_steps].
offset = random.randint(0, num_steps)
    
print(offset)

1


In [24]:
len(corpus)-offset-1

32773

In [25]:
# Round down to a multiple of batch_size; the -1 keeps one token spare for the shifted targets.
num_tokens = (len(corpus) -offset -1)//batch_size * batch_size # ensuring num_tokens is perfectly divisible
num_tokens

32770

In [26]:
Xs = torch.tensor(corpus[offset:offset+num_tokens])
len(Xs),Xs[:5]

(32770, tensor([  19,   50,   40, 2183, 2184]))

In [27]:
Ys = torch.tensor(corpus[offset+1:offset+1+num_tokens])
len(Ys),Ys[:5]

(32770, tensor([  50,   40, 2183, 2184,  400]))

In [28]:
# One row per batch element; -1 lets torch infer the row length.
Xs = Xs.reshape(batch_size,-1)
len(Xs), Xs[:5]

# okay, Xs is divided into 5 batches

(5,
 tensor([[  19,   50,   40,  ...,    7, 2658,   10],
         [ 745,   72,   42,  ...,  670,    3,   87],
         [  29,  246,  160,  ...,   23,    1,  818],
         [  16,    1, 1378,  ...,  148,   72,   33],
         [ 504, 4128,   16,  ...,  635,   23,    8]]))

In [29]:
# Number of complete num_steps-wide windows that fit along each row.
num_batches = Xs.shape[1]//num_steps
num_batches

# this is how many times we will step through the sequence

1310

In [30]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Yield (X, Y) minibatches of consecutive subsequences of the corpus.

    The corpus is cut at a random offset, trimmed to a multiple of batch_size
    and reshaped to (batch_size, -1). Windows of num_steps columns are then
    yielded left to right, so consecutive minibatches are adjacent in the text.

    Args:
        corpus: flat sequence of token indices.
        batch_size: number of rows (sequences) per minibatch.
        num_steps: number of time steps (columns) per minibatch.

    Yields:
        (X, Y): two tensors of shape (batch_size, num_steps); Y holds the same
        positions as X shifted one token ahead.
    """
    # randint includes both endpoints, so the offset lies in [0, num_steps].
    offset = random.randint(0, num_steps)
    # The -1 keeps one token spare so the shifted targets never run off the end.
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    if num_tokens <= 0:
        # Corpus too short to fill even one token per row: nothing to yield.
        return

    Xs = torch.tensor(corpus[offset:offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1:offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        # Slice along the time (column) axis and keep Xs/Ys intact. The old
        # code sliced rows, overwrote Xs/Ys, and yielded the undefined names
        # X, Y (silently picking up notebook globals).
        X = Xs[:, i:i + num_steps]
        Y = Ys[:, i:i + num_steps]
        yield X, Y
    
    

In [31]:
def load_data(batch_size, num_steps, use_random_iter=False, max_tokens=10000):
    """Build a SeqDataLoader and return it together with its vocabulary.

    Args:
        batch_size: number of sequences per minibatch.
        num_steps: number of time steps per sequence.
        use_random_iter: passed through to SeqDataLoader to select the iterator.
        max_tokens: passed through to SeqDataLoader to cap the corpus length.

    Returns:
        (loader, vocab): the SeqDataLoader instance and its ``vocab`` attribute.
    """
    loader = SeqDataLoader(
        batch_size=batch_size,
        num_steps=num_steps,
        use_random_iter=use_random_iter,
        max_tokens=max_tokens,
    )
    return loader, loader.vocab

In [32]:
# Restore the training hyperparameters (the scratch cells above set both to 5).
batch_size, num_steps = 32, 35
# load_data caps the corpus at its default max_tokens=10000.
train_iter, vocab = load_data(batch_size, num_steps)

In [63]:
# trying the book way

# batch_size, num_steps = 32, 35
# train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

The book's approach works fine, which suggests that there is an error in my own tokenisation.

In [64]:
# one hot encoding

F.one_hot(torch.tensor([0,2]), len(vocab))

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

In [65]:
X = torch.arange(10).reshape((2,5))
F.one_hot(X.T, 28,).shape

torch.Size([5, 2, 28])

In [66]:
def get_params(vocab_size , hidden_size, device):
    """Create the trainable parameters of a single-layer RNN.

    Args:
        vocab_size: size of the one-hot input and of the output layer.
        hidden_size: number of hidden units.
        device: torch device on which to allocate the tensors.

    Returns:
        list of five tensors with requires_grad=True, in this order:
        Wxh (vocab_size, hidden_size), Whh (hidden_size, hidden_size),
        bh (hidden_size,), Whq (hidden_size, vocab_size), bq (vocab_size,).
    """
    input_size = output_size = vocab_size

    def small_normal(shape):
        # All weight matrices share the same small-scale initialisation.
        return torch.randn(size=shape, device=device) * 0.01

    # Hidden-layer parameters
    Wxh = small_normal((input_size, hidden_size))
    Whh = small_normal((hidden_size, hidden_size))
    bh = torch.zeros(hidden_size, device=device)

    # Output-layer parameters. Whq was previously left unscaled (std 1), unlike
    # the other weights; it now uses the same 0.01 scale.
    Whq = small_normal((hidden_size, output_size))
    bq = torch.zeros(output_size, device=device)

    params = [Wxh, Whh, bh, Whq, bq]
    for param in params:
        param.requires_grad_(True)

    return params

### RNN Model
To define an RNN model, we first need an init_rnn_state function to return the hidden state at
initialization. It returns a tensor filled with 0 and with a shape of (batch size, number of hidden
units). Using tuples makes it easier to handle situations where the hidden state contains multiple
variables, which we will encounter in later sections

In [67]:
# Returns a one-element tuple holding a zero tensor for the initial hidden state.
def init_rnn_state(batch_size, num_hiddens, device):
    """Create the initial hidden state of the RNN.

    Returns:
        A 1-tuple containing a zero tensor of shape (batch_size, num_hiddens)
        allocated on ``device``.
    """
    initial_hidden = torch.zeros(batch_size, num_hiddens, device=device)
    return (initial_hidden,)

In [68]:
H, = init_rnn_state(2, 512, torch.device('cuda'))

## The init state function is supposed to return a tuple, so unpack it accordingly

In [69]:
H.shape

torch.Size([2, 512])

In [70]:
H = init_rnn_state(2,512, torch.device('cuda'))

In [71]:
H

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'),)

The following rnn function defines how to compute the hidden state and output at a time step.
Note that the RNN model loops through the outermost dimension of inputs so that it updates
hidden states H of a minibatch, time step by time step. Besides, the activation function here uses
the tanh function. As described in Section 4.1, the mean value of the tanh function is 0, when the
elements are uniformly distributed over the real numbers.
## to do

In [72]:
def rnn(inputs, state, params):
    """Run a tanh RNN over a sequence, one time step at a time.

    Args:
        inputs: tensor iterated along its first dimension; each slice X is
            multiplied by W_xh with torch.mm, so it must be 2-D with as many
            columns as W_xh has rows.
        state: 1-tuple holding the initial hidden state H.
        params: [W_xh, W_hh, b_h, W_hq, b_q] as produced by get_params.

    Returns:
        (outputs, (H,)): the per-step outputs concatenated along dim 0, and a
        1-tuple with the hidden state after the last step.
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        # tanh must wrap the whole pre-activation. Previously only X @ W_xh was
        # squashed and the recurrent term and bias were added outside the
        # non-linearity, so H was not bounded to [-1, 1].
        H = torch.tanh(torch.mm(X, W_xh) + torch.mm(H, W_hh) + b_h)
        Y = torch.mm(H, W_hq) + b_q
        outputs.append(Y)

    return torch.cat(outputs, dim=0), (H,)

With all the needed functions being defined, next we create a class to wrap these functions and
store parameters for an RNN model implemented from scratch.

In [73]:
class RNNModelScratch:
    """Thin wrapper bundling RNN parameters with its state-init and forward functions.

    Args:
        vocab_size: number of classes used for one-hot encoding the input.
        hidden_size: number of hidden units.
        device: passed to ``get_params`` when creating the parameters.
        get_params: callable (vocab_size, hidden_size, device) -> parameters.
        init_state: callable (batch_size, hidden_size, device) -> initial state.
        forward_fn: callable (inputs, state, params) -> forward result.
    """
    def __init__(self, vocab_size, hidden_size,device, get_params, init_state,forward_fn):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.forward_fn = forward_fn
        self.init_state = init_state
        self.params = get_params(vocab_size, hidden_size, device)

    def __call__(self,X, state ):
        """One-hot encode the transposed index tensor X as float32 and run forward_fn."""
        encoded = F.one_hot(X.T, self.vocab_size)
        encoded = encoded.to(torch.float32)
        return self.forward_fn(encoded, state, self.params)

    def begin_state(self,batch_size, device):
        """Return the initial state produced by init_state for this hidden size."""
        return self.init_state(batch_size, self.hidden_size, device)

Let us check whether the outputs have the correct shapes, e.g., to ensure that the dimensionality
of the hidden state remains unchanged

In [74]:
X, X.shape[0]

(tensor([[0, 1, 2, 3, 4],
         [5, 6, 7, 8, 9]]),
 2)

https://towardsdatascience.com/illustrated-guide-to-recurrent-neural-networks-79e5eb8049c9

In [75]:
# Checking that the output has the expected (fixed) size.
num_hiddens = 512
net = RNNModelScratch(len(vocab), num_hiddens, torch.device('cuda'), get_params, init_rnn_state, rnn)
# X.shape[0] is used as the batch size for the initial state.
state = net.begin_state(X.shape[0], torch.device('cuda'))
Y,new_state = net(X.to(torch.device('cuda')), state)

In [76]:
Y.shape, len(new_state), new_state[0].shape

#We can see that the output shape is (number of time steps × batch size, vocabulary size), 
#while the hidden state shape remains the same, i.e., (batch size, number of hidden units).


(torch.Size([10, 28]), 1, torch.Size([2, 512]))

We can see that the output shape is (number of time steps × batch size, vocabulary size), while the
hidden state shape remains the same, i.e., (batch size, number of hidden units).


Let us first define the prediction function to generate new characters following the user-provided
prefix, which is a string containing several characters. When looping through these beginning
characters in prefix, we keep passing the hidden state to the next time step without generating
any output. This is called the warm-up period, during which the model updates itself (e.g., update
the hidden state) but does not make predictions. After the warm-up period, the hidden state is
generally better than its initialized value at the beginning. So we generate the predicted characters
and emit them

In [77]:
def predict_ch8(prefix,num_preds, net, vocab,device):
    """Generate num_preds new tokens following ``prefix``.

    Args:
        prefix: non-empty sequence of tokens (e.g. a string of characters).
        num_preds: number of tokens to generate after the prefix.
        net: model exposing begin_state(batch_size=..., device=...) and
            callable as net(inputs, state) -> (scores, state).
        vocab: supports vocab[token] -> index and has an idx_to_token list.
        device: torch device for the input tensors.

    Returns:
        str: the prefix tokens followed by the generated tokens, joined together.
    """
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]

    def latest_as_input():
        # Most recent index as a (1, 1) tensor on the target device.
        return torch.tensor([outputs[-1]], device=device).reshape((1,1))

    # Warm-up: feed the prefix to update the state; the scores are discarded.
    for token in prefix[1:]:
        _, state = net(latest_as_input(), state)
        outputs.append(vocab[token])

    # Generation: feed each prediction back in as the next input.
    for _step in range(num_preds):
        scores, state = net(latest_as_input(), state)
        best = scores.argmax(dim=1).reshape(1)
        outputs.append(int(best))

    return ''.join(vocab.idx_to_token[idx] for idx in outputs)
    

In [81]:
# NOTE(review): no training step is visible before this cell, so the text below
# presumably comes from the randomly initialised parameters — confirm.
predict_ch8('timess traveller ', 10, net, vocab, d2l.try_gpu())
# generating the same output every time

'timess traveller cbojrfe oj'