In [1]:
import torch
import matplotlib.pyplot as plt
from torch import nn
from torch.nn import functional as F

In [2]:
batch_size, num_steps = 32, 35

In [3]:
#!curl --output pap.txt https://www.gutenberg.org/files/1342/1342-0.txt

In [4]:
import re
def read_text(text_name='pap.txt'):
    with open(text_name, 'r', encoding='utf-8') as text_input:
        lines = text_input.readlines()
    
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines] # only alphabets

In [5]:
text_input = read_text()

In [6]:
text_input[:5]

['the project gutenberg ebook of pride and prejudice by jane austen',
 '',
 'this ebook is for the use of anyone anywhere in the united states and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever you may copy it give it away or re use it under the terms']

In [7]:
def tokenize(lines, token_type='word'):
    if token_type == 'char':
        return [list(line) for line in lines]
    elif token_type == 'word':
        return [line.split() for line in lines]
    else:
        'Wrong token type.'

In [8]:
import collections

def count_corpus(tokens):
    tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

In [9]:
# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
class Vocab:
    def __init__(self, tokens=None,min_freq=0):
        if tokens == None:
            tokens = []
        
        tokens = [token for line in tokens for token in line]
        counter = collections.Counter(tokens)
        
        self._token_freqs = sorted(counter.items(),key=lambda x:x[1], reverse=True)
        self.idx_to_token = ['<unk>']
        self.token_to_idx = { token:idx for idx, token in enumerate(self.idx_to_token)}
        
        for token, freq in self._token_freqs:
            if freq < min_freq:
                #print(f"for {freq} : {token} breaking")
                continue
            elif token not in self.idx_to_token:
                #print(token + "adding")
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token)-1
            else:
                print(token + " found already")
    
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self, tokens):
        if isinstance(tokens, list):
            return_list = []
            for t in tokens:
                for token in t:
                    if token in self.token_to_idx.keys():
                        return_list.append(self.token_to_idx[token])
                    else:
                        return_list.append(self.unk)
            return return_list
        else:
            if tokens in self.token_to_idx.keys():
                return self.token_to_idx[tokens]
            else:
                return self.unk
            
    def to_tokens(self, indices):
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # Index for the unknown token
        return 0

    @property
    def token_freqs(self):  # Index for the unknown token
        return self._token_freqs


In [10]:
tokens = tokenize(text_input)
tokens[:1]

[['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'pride',
  'and',
  'prejudice',
  'by',
  'jane',
  'austen']]

In [11]:
# tokens = [token for line in tokens for token in line]
# tokens[:5]

In [12]:
vocab = Vocab(tokens)

In [13]:
for i in vocab.token_freqs:
    if i[0] == 'the':
        print(i)
# print(vocab.token_freqs)
# vocab.token_to_idx

# the error is that we are using the entire sentece for creating token
# problem in tokenize

('the', 4521)


In [14]:
vocab['the']

1

In [15]:
vocab.token_freqs[:5]

[('the', 4521), ('to', 4246), ('of', 3735), ('and', 3657), ('her', 2226)]

In [16]:
def load_corpus(max_tokens=-1, text_input=text_input):
    tokens = tokenize(text_input)
    vocab = Vocab(tokens)
    
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
    

In [17]:
corpus, vocab = load_corpus()

In [18]:
class SeqDataLoader:
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = seq_data_iter_random
        else :
            self.data_iter_fn = seq_data_iter_sequential
        
        self.corpus, self.vocab = load_corpus(max_tokens=max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps
    
    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.steps)

In [19]:
def seq_data_iter_random(corpus, batch_size, num_steps):
    print("Dont use random seq data iter")

In [20]:
import random
num_steps = 5
batch_size=5
offset = random.randint(0, num_steps)
    
print(offset)

5


In [21]:
len(corpus)-offset-1

126014

In [22]:
num_tokens = (len(corpus) -offset -1)//batch_size * batch_size # ensuring num_tokens is perfectly divisible
num_tokens

126010

In [23]:
Xs = torch.tensor(corpus[offset:offset+num_tokens])
len(Xs),Xs[:5]

(126010, tensor([ 317,    4, 1159,   31,   69]))

In [24]:
Ys = torch.tensor(corpus[offset+1:offset+1+num_tokens])
len(Ys),Ys[:5]

(126010, tensor([   4, 1159,   31,   69, 2494]))

In [25]:
Xs = Xs.reshape(batch_size,-1)
len(Xs), Xs[:5]

# okay Xs devided into 5 batches

(5,
 tensor([[ 317,    4, 1159,  ...,   44,  415,    3],
         [ 407,   23,   72,  ...,  438,   11,    1],
         [ 800,    3,   53,  ...,   41,  173,    8],
         [ 470, 5558,   54,  ...,   33,    9,  417],
         [   3,   12,  170,  ...,  198, 4057, 6522]]))

In [26]:
num_batches = Xs.shape[1]//num_steps
num_batches

# these many times we will go through the sequence

5040

In [27]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus)-offset-1)//batch_size)*batch_size
    
    Xs = torch.tensor(corpus[offset:offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1:offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1]//num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        Xs = Xs[i:i+num_steps]
        Ys = Ys[i:i+num_steps]
        yield X, Y
    
    

In [28]:
def load_data(batch_size, num_steps, use_random_iter=False, max_tokens=10000):
    data_iter = SeqDataLoader(batch_size, num_steps, use_random_iter, max_tokens)
    
    return data_iter, data_iter.vocab

In [29]:
batch_size, num_steps = 32, 35
train_iter, vocab = load_data(batch_size, num_steps)

In [30]:
# one hot encoding

F.one_hot(torch.tensor([0,2]), len(vocab))

tensor([[1, 0, 0,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0]])

In [31]:
X = torch.arange(10).reshape((2,5))
F.one_hot(X.T, 28,).shape

torch.Size([5, 2, 28])

In [49]:
def get_params(vocab_size, num_hiddens, device):
    num_inputs = num_outputs = num_hiddens
    
    def normal(shape):
        return torch.randn(size=shape, device=device)
    
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = torch.zeros(num_hiddens, device=device)
    
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad = True
    
    return params

In [34]:
# returns a torch of zeros
def init_rnn_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device),)

In [54]:
#computing hidden state ans output in a time step
def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    
    for X in inputs:
        H = torch.tanh(torch.mm(X, W_xh) + torch.mm(H, W_hh) + b_h)
        Y = torch.mm(H, W_hq) + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H,)

In [36]:
# class to wrap up the functions and state togeter

class RNNModelScratch:
    def __init__(self, vocab_size, num_hiddens, device, get_params, init_state, forward_fn ):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(self.vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn
    
    def __call__(self, X, state):
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)
    
    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)

In [55]:
#checking if output gave fixd dusze
num_hiddens = 512
net = RNNModelScratch(len(vocab), num_hiddens, torch.device('cuda'), get_params, init_rnn_state, rnn)
state = net.begin_state(X.shape[0], torch.device('cuda'))
Y,new_state = net(X.to(torch.device('cuda')), state)

In [56]:
Y.shape, len(new_state), new_state[0].shape

#We can see that the output shape is (number of time steps × batch size, vocabulary size), 
#while the hidden state shape remains the same, i.e., (batch size, number of hidden units).


(torch.Size([10, 512]), 1, torch.Size([2, 512]))

Let us first define the prediction function to generate new characters following the user-provided
prefix, which is a string containing several characters. When looping through these beginning
characters in prefix, we keep passing the hidden state to the next time step without generating
any output. This is called the warm-up period, during which the model updates itself (e.g., update
the hidden state) but does not make predictions. After the warm-up period, the hidden state is
generally better than its initialized value at the beginning. So we generate the predicted characters
and emit them

In [59]:
def predict_ch8(prefix, num_preds, net, vocab, device):
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1,1))
    for y in prefix[1:]:
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    
    for _ in range(num_preds):
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    
    return ''.join([vocab.idx_to_token[i] for i in outputs])

In [60]:
predict_ch8('time traveller ', 10, net, vocab, torch.device('cuda'))

'time<unk>t<unk>a<unk>e<unk><unk>e<unk><unk>poorifonplaceshorttogethertowardssocietyowncivility'