In [1]:
import torch
import matplotlib.pyplot as plt
from torch import nn
from torch.nn import functional as F

In [2]:
# Minibatch size and subsequence length (time steps) for the data iterator.
# NOTE: a later scratch cell rebinds both names to 5.
batch_size, num_steps = 32, 35

In [3]:
#!curl --output pap.txt https://www.gutenberg.org/files/1342/1342-0.txt

In [4]:
import re
def read_text(text_name='pap.txt'):
    """Read a UTF-8 text file and return its lines in cleaned form.

    For every line, each run of characters other than ASCII letters is
    replaced by a single space; the result is stripped of surrounding
    whitespace and lower-cased.

    Args:
        text_name: Path of the text file to read.

    Returns:
        A list with one cleaned string per line of the file (blank lines
        become empty strings).
    """
    non_letters = re.compile('[^A-Za-z]+')
    cleaned = []
    with open(text_name, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned.append(non_letters.sub(' ', raw_line).strip().lower())
    return cleaned

In [5]:
# Load the cleaned lines from the default file ('pap.txt').
text_input = read_text()

In [6]:
# Preview the first five cleaned lines.
text_input[:5]

['the project gutenberg ebook of pride and prejudice by jane austen',
 '',
 'this ebook is for the use of anyone anywhere in the united states and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever you may copy it give it away or re use it under the terms']

In [7]:
def tokenize(lines, token_type='word'):
    """Split each text line into a list of tokens.

    Args:
        lines: Iterable of strings, one per line of text.
        token_type: 'word' to split each line on whitespace, or 'char' to
            split each line into its individual characters.

    Returns:
        A list with one token list per input line.

    Raises:
        ValueError: If ``token_type`` is neither 'word' nor 'char'.
    """
    if token_type == 'word':
        return [line.split() for line in lines]
    if token_type == 'char':
        return [list(line) for line in lines]
    # Previously this branch was a bare string expression, so an unsupported
    # token type silently returned None; fail loudly instead.
    raise ValueError(
        f"Wrong token type. Expected 'word' or 'char', got {token_type!r}."
    )

In [17]:
import collections

def count_corpus(tokens):
    """Count how often each token occurs across all lines.

    Args:
        tokens: Iterable of token sequences (one sequence per line).

    Returns:
        A ``collections.Counter`` mapping each token to its number of
        occurrences, with keys in order of first appearance.
    """
    frequencies = collections.Counter()
    for line in tokens:
        for token in line:
            frequencies[token] += 1
    return frequencies

In [40]:
# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
class Vocab:
    """Vocabulary mapping tokens to integer indices and back.

    Index 0 is reserved for the unknown token '<unk>'. The remaining tokens
    are indexed in order of decreasing frequency in the input.
    """

    def __init__(self, tokens=None, min_freq=0):
        """Build the vocabulary from tokenized lines.

        Args:
            tokens: Iterable of token sequences (one per line). ``None`` is
                treated as an empty corpus.
            min_freq: Tokens occurring fewer than ``min_freq`` times are not
                given an index (they map to the unknown index on lookup).
        """
        if tokens is None:
            tokens = []

        counter = collections.Counter(
            token for line in tokens for token in line
        )

        # Most frequent first; ties keep first-seen order (stable sort).
        self._token_freqs = sorted(
            counter.items(), key=lambda x: x[1], reverse=True
        )
        self.idx_to_token = ['<unk>']
        self.token_to_idx = {'<unk>': 0}

        for token, freq in self._token_freqs:
            if freq < min_freq:
                continue
            # Dict membership is O(1); the only token that can already be
            # present is a literal '<unk>' occurring in the corpus.
            if token not in self.token_to_idx:
                self.token_to_idx[token] = len(self.idx_to_token)
                self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of indexed tokens, including '<unk>'."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Look up the index (or indices) for one or more tokens.

        Args:
            tokens: A single token, a list of tokens, or a list of token
                lists.

        Returns:
            For a single token, its index (the unknown index if absent).
            For a list, a flat list of indices: every non-list item is
            looked up as one token, and every nested list contributes the
            indices of its items in order.
        """
        if not isinstance(tokens, list):
            return self.token_to_idx.get(tokens, self.unk)

        indices = []
        for item in tokens:
            if isinstance(item, list):
                indices.extend(
                    self.token_to_idx.get(token, self.unk) for token in item
                )
            else:
                # A non-list item is one token; do not iterate over the
                # characters of a string.
                indices.append(self.token_to_idx.get(item, self.unk))
        return indices

    def to_tokens(self, indices):
        """Map an index, or an iterable of indices, back to token(s)."""
        if isinstance(indices, int):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # Index for the unknown token
        return 0

    @property
    def token_freqs(self):  # (token, count) pairs, most frequent first
        return self._token_freqs


In [41]:
# Word-tokenize every cleaned line (default token_type) and preview the first line.
tokens = tokenize(text_input)
tokens[:1]

[['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'pride',
  'and',
  'prejudice',
  'by',
  'jane',
  'austen']]

In [14]:
# tokens = [token for line in tokens for token in line]
# tokens[:5]

['the', 'project', 'gutenberg', 'ebook', 'of']

In [42]:
# Build the vocabulary from the word-tokenized lines.
vocab = Vocab(tokens)

In [43]:
# Sanity check: print the (token, count) entry recorded for 'the'.
for i in vocab.token_freqs:
    if i[0] == 'the':
        print(i)
# print(vocab.token_freqs)
# vocab.token_to_idx

# Debugging note: the error is that we are using the entire sentence for
# creating a token; the problem is in tokenize.
# NOTE(review): this note may be stale -- confirm against the printed count.

('the', 4521)


In [44]:
# Index assigned to 'the' (index 0 is reserved for '<unk>').
vocab['the']

1

In [46]:
# Five most frequent tokens with their counts.
vocab.token_freqs[:5]

[('the', 4521), ('to', 4246), ('of', 3735), ('and', 3657), ('her', 2226)]

In [47]:
def load_corpus(max_tokens=-1, text_input=text_input):
    """Turn text lines into a flat list of token indices plus its vocabulary.

    Args:
        max_tokens: When positive, keep only the first ``max_tokens``
            indices; otherwise keep the whole corpus.
        text_input: Lines of text to tokenize. The default is the value the
            global ``text_input`` had when this function was defined.

    Returns:
        A ``(corpus, vocab)`` pair: the list of token indices in reading
        order and the ``Vocab`` built from the tokenized lines.
    """
    tokenized_lines = tokenize(text_input)
    vocab = Vocab(tokenized_lines)

    # Flatten line by line, mapping each token to its vocabulary index.
    corpus = []
    for line in tokenized_lines:
        for token in line:
            corpus.append(vocab[token])

    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab
    

In [48]:
# Build the flat index corpus for the full text; this rebinds the global `vocab`.
corpus, vocab = load_corpus()

In [49]:
class SeqDataLoader:
    """Iterable wrapper that loads the corpus and delegates batching.

    Iterating over an instance calls the selected iterator function with the
    stored corpus, batch size and number of steps.
    """

    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        """Load the corpus and choose the batching function.

        Args:
            batch_size: Number of sequences per minibatch.
            num_steps: Number of time steps per sequence.
            use_random_iter: If true use ``seq_data_iter_random``, otherwise
                ``seq_data_iter_sequential``. NOTE: ``seq_data_iter_random``
                is currently a stub in this notebook that returns ``None``,
                so iterating with ``use_random_iter=True`` will fail.
            max_tokens: Forwarded to ``load_corpus`` to cap the corpus length.
        """
        if use_random_iter:
            self.data_iter_fn = seq_data_iter_random
        else:
            self.data_iter_fn = seq_data_iter_sequential

        self.corpus, self.vocab = load_corpus(max_tokens=max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        """Return a fresh iterator over the corpus from the chosen function."""
        # The attribute is `num_steps`; the previous `self.steps` never
        # existed and raised AttributeError on every iteration.
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)

In [51]:
def seq_data_iter_random(corpus, batch_size, num_steps):
    """Placeholder for a random-sampling batch iterator (not implemented).

    The arguments are accepted but ignored: the function only prints a
    notice and returns ``None``; it does not yield any batches.
    """
    notice = "Dont use random seq data iter"
    print(notice)
    return None

In [54]:
# Scratch walkthrough of the sequential iterator on a small configuration.
# NOTE: this rebinds the global num_steps / batch_size assigned earlier (35 / 32).
import random
num_steps = 5
batch_size=5
# random.randint includes both endpoints, so offset is in 0..num_steps.
offset = random.randint(0, num_steps)
    
print(offset)

1


In [57]:
# Tokens remaining after skipping `offset` and leaving one token for the shifted targets.
len(corpus)-offset-1

126018

In [59]:
num_tokens = (len(corpus) -offset -1)//batch_size * batch_size # round down so num_tokens is an exact multiple of batch_size
num_tokens

126015

In [62]:
# Inputs: num_tokens indices starting at `offset`.
Xs = torch.tensor(corpus[offset:offset+num_tokens])
len(Xs),Xs[:5]

(126015, tensor([179, 160, 910,   3, 317]))

In [63]:
# Targets: the same window shifted right by one token, so Ys[k] follows Xs[k] in the corpus.
Ys = torch.tensor(corpus[offset+1:offset+1+num_tokens])
len(Ys),Ys[:5]

(126015, tensor([160, 910,   3, 317,   4]))

In [64]:
# Lay the flat index tensor out as batch_size rows of equal length.
Xs = Xs.reshape(batch_size,-1)
len(Xs), Xs[:5]

# Xs is now divided into 5 rows (one per sequence in a batch)

(5,
 tensor([[ 179,  160,  910,  ...,  372,    3, 3438],
         [  44,  415,    3,  ...,   36,    2,  438],
         [  11,    1,  800,  ...,   19,   41,  173],
         [   8,  470, 5558,  ...,   33,    9,  417],
         [   3,   12,  170,  ..., 4057, 6522,    2]]))

In [67]:
# Number of full num_steps-wide windows that fit in each row.
num_batches = Xs.shape[1]//num_steps
num_batches

# this is how many minibatches one pass over the sequence produces

5040

In [52]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Yield (X, Y) minibatches of consecutive subsequences from a corpus.

    The corpus is split into ``batch_size`` equally long rows. Each yielded
    ``X`` is the next ``num_steps`` columns of those rows and ``Y`` is the
    same window shifted one token to the right, so consecutive minibatches
    continue where the previous one stopped.

    Args:
        corpus: Sequence of integer token indices.
        batch_size: Number of rows (sequences) per minibatch.
        num_steps: Number of time steps (columns) per minibatch.

    Yields:
        Pairs ``(X, Y)`` of tensors, each of shape (batch_size, num_steps).
        Nothing is yielded when the corpus is too short for a full batch.

    Raises:
        ValueError: If ``batch_size`` or ``num_steps`` is not positive.
    """
    if batch_size <= 0 or num_steps <= 0:
        raise ValueError("batch_size and num_steps must be positive integers.")

    # Skip a random number of leading tokens (0..num_steps, inclusive).
    offset = random.randint(0, num_steps)
    # Leave one trailing token for the shifted targets and round down to a
    # multiple of batch_size; clamp at 0 for corpora shorter than the offset.
    num_tokens = max(0, (len(corpus) - offset - 1) // batch_size * batch_size)
    if num_tokens == 0:
        return

    Xs = torch.tensor(corpus[offset:offset + num_tokens]).reshape(batch_size, -1)
    Ys = torch.tensor(corpus[offset + 1:offset + 1 + num_tokens]).reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        # Slice along the time axis (columns) and keep Xs/Ys intact so that
        # later iterations still see the full tensors.
        X = Xs[:, i:i + num_steps]
        Y = Ys[:, i:i + num_steps]
        yield X, Y
    
    