In [2]:
import tensorflow as tf



In [3]:
class MyRNNCell(tf.keras.layers.Layer):
    """Minimal vanilla RNN cell.

    Implements one recurrence step:

        h_t = tanh(W_hh @ h_{t-1} + W_xh @ x_t)
        y_t = W_hy @ h_t

    Args:
        rnn_units: Size of the hidden state.
        input_dim: Size of one input vector.
        ouput_dim: Size of one output vector (parameter name kept as-is,
            including its spelling, so existing keyword callers still work).
    """

    def __init__(self, rnn_units, input_dim, ouput_dim):
        super(MyRNNCell, self).__init__()
        # Shapes are passed by keyword: depending on the Keras version the
        # first positional argument of add_weight is `name`, not `shape`.
        # Matrices are laid out as [out_features, in_features] so that they
        # left-multiply column vectors.
        self.W_xh = self.add_weight(shape=[rnn_units, input_dim])  # from input to hidden layer
        self.W_hh = self.add_weight(shape=[rnn_units, rnn_units])  # hidden layer to next hidden layer
        self.W_hy = self.add_weight(shape=[ouput_dim, rnn_units])  # from hidden to output
        # Hidden state starts at zero and is carried across calls.
        self.h = tf.zeros([rnn_units, 1])

    def call(self, x):
        """Run one forward step and update the stored hidden state.

        Args:
            x: Input for the current time step. Assumed to be a column
                vector of shape [input_dim, 1] -- TODO confirm with callers.

        Returns:
            A tuple (output, h): the output of this step and the updated
            hidden state.
        """
        # Matrix products (not element-wise `*`) combine the previous state
        # and the current input.
        self.h = tf.math.tanh(tf.matmul(self.W_hh, self.h) + tf.matmul(self.W_xh, x))

        # The output is a linear projection of the hidden state.
        output = tf.matmul(self.W_hy, self.h)

        return output, self.h

#### Getting the dataset

In [4]:
import re

In [5]:
!wget http://www.gutenberg.org/files/35/35-0.txt -O timemachine.txt

--2020-03-05 13:13:15--  http://www.gutenberg.org/files/35/35-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 204492 (200K) [text/plain]
Saving to: ‘timemachine.txt’


2020-03-05 13:13:17 (149 KB/s) - ‘timemachine.txt’ saved [204492/204492]



In [6]:
def read_time_machine(path='timemachine.txt'):
    """Load the time machine book into a list of normalized lines.

    Args:
        path: Text file to read. Defaults to 'timemachine.txt' in the
            current working directory.

    Returns:
        A list with one string per line of the file. Each line is stripped,
        lower-cased, and every run of characters outside A-Z / a-z is
        replaced by a single space.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so the result is the same on every operating system.
    with open(path, 'r', encoding='utf-8') as f:
        return [re.sub('[^A-Za-z]+', ' ', line.strip().lower())
                for line in f]

In [7]:
lines = read_time_machine()

In [8]:
f'sentences {len(lines)}'

'sentences 3583'

In [9]:
def tokenize(lines, token='word'):
    """Break every line into a list of tokens.

    With token='word' each line is split on single spaces; with
    token='char' each line becomes its list of characters. Any other value
    prints an error message and the function returns None.
    """
    if token not in ('word', 'char'):
        print('ERROR: unknown token type '+token)
        return None
    if token == 'char':
        return list(map(list, lines))
    return [sentence.split(' ') for sentence in lines]

In [10]:
# Word-level tokenization (tokenize defaults to token='word');
# peek at the first two tokenized lines.
tokens = tokenize(lines)
tokens[0:2]

[[''],
 ['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'the',
  'time',
  'machine',
  'by',
  'h',
  'g',
  'wells']]

Building the vocabulary 

In [13]:
import collections

In [11]:
# Saved in the d2l package for later use
class Vocab(object):
    """Bidirectional mapping between tokens and integer indices.

    Index 0 is always the unknown token '<unk>'. Reserved tokens follow,
    then corpus tokens ordered from most to least frequent (ties are broken
    by ascending token order).

    Args:
        tokens: List of token lists, as accepted by `count_corpus`.
        min_freq: Corpus tokens with a count below this value are left out.
        reserved_tokens: Optional tokens that always get an index, whatever
            their frequency. Duplicates and '<unk>' are ignored.

    Attributes set by the constructor:
        token_freqs: list of (token, count) pairs, most frequent first.
        unk: index of the unknown token (0).
        idx_to_token: list mapping index -> token.
        token_to_idx: dict mapping token -> index.
    """

    def __init__(self, tokens, min_freq=0, reserved_tokens=None):
        # Sort according to frequencies (descending), then by token.
        counter = count_corpus(tokens)
        self.token_freqs = sorted(counter.items(),
                                  key=lambda x: (-x[1], x[0]))
        self.unk = 0
        uniq_tokens = ['<unk>']
        # A set makes the duplicate check O(1) instead of scanning the list.
        seen = {'<unk>'}
        # `None` default avoids sharing one mutable list between instances.
        for token in (reserved_tokens or []):
            if token not in seen:
                seen.add(token)
                uniq_tokens.append(token)
        for token, freq in self.token_freqs:
            if freq >= min_freq and token not in seen:
                seen.add(token)
                uniq_tokens.append(token)
        self.idx_to_token = uniq_tokens
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(uniq_tokens)}

    def __len__(self):
        """Return the number of entries, including '<unk>' and reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to `self.unk`.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

# Saved in the d2l package for later use
def count_corpus(sentences):
    """Count token occurrences in a list of token lists.

    Returns a `collections.Counter` mapping each token to its count.
    """
    # Flatten a list of token lists into a list of tokens
    tokens = [tk for line in sentences for tk in line]
    return collections.Counter(tokens)

In [38]:
# Build the vocabulary from `tokens` and show the first ten
# (token, index) pairs.
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])

[('<unk>', 0), ('the', 1), ('', 2), ('and', 3), ('of', 4), ('i', 5), ('a', 6), ('to', 7), ('in', 8), ('was', 9)]


Vocabulary is the list of all words we have in our corpus

In [15]:
vocab.token_freqs[:5]

[('the', 2472), ('', 1400), ('and', 1314), ('of', 1284), ('i', 1268)]

In [16]:
# Show two tokenized lines (80 and 81) next to their vocabulary indices.
for i in range(80, 82):
    print('words:', tokens[i])
    print('indices:', vocab[tokens[i]])

words: ['', 'you', 'must', 'follow', 'me', 'carefully', 'i', 'shall', 'have', 'to', 'controvert', 'one', 'or', 'two']
indices: [2, 21, 82, 456, 15, 658, 5, 485, 32, 7, 2902, 37, 23, 171]
words: ['ideas', 'that', 'are', 'almost', 'universally', 'accepted', 'the', 'geometry', 'for', 'instance', '']
indices: [1362, 10, 74, 181, 4799, 560, 1, 1029, 18, 414, 2]


In [17]:
def load_corpus_time_machine(max_tokens=-1):
    """Return the book as a flat list of character indices plus its vocabulary.

    The text is read with `read_time_machine`, tokenized per character, and
    every character is replaced by its index in a freshly built `Vocab`.
    When `max_tokens` is positive, only the first `max_tokens` indices are
    kept; otherwise the whole corpus is returned.
    """
    char_tokens = tokenize(read_time_machine(), 'char')
    vocab = Vocab(char_tokens)
    corpus = []
    for line in char_tokens:
        # One index per character, in reading order.
        corpus.extend(vocab[ch] for ch in line)
    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab

# NOTE: this rebinds the global `vocab` to the character-level vocabulary
# returned by load_corpus_time_machine (which tokenizes with 'char').
corpus, vocab = load_corpus_time_machine()
len(corpus), len(vocab)

(190638, 28)

In [18]:
# corpus is the flat list of vocabulary indices, one per character of the book

Random sampling :
    
    Picking sample mini batches will do the trick using random sampling 

In [19]:
import random
import numpy as np

In [20]:
def seq_data_iter_random(corpus, batch_size, num_steps):
    """Yield mini-batches of randomly sampled subsequences.

    The corpus is cut, from a random starting offset, into non-overlapping
    windows of `num_steps` items. The windows are shuffled and grouped into
    batches; a trailing incomplete batch is dropped.

    Args:
        corpus: Sliceable sequence of items (e.g. a list of token indices).
        batch_size: Number of subsequences per batch.
        num_steps: Length of every subsequence.

    Yields:
        Tuples (X, Y) of numpy arrays built from `batch_size` windows of
        length `num_steps`; each row of Y is the matching row of X shifted
        one position forward in the corpus.
    """
    # Offset the iterator over the data for uniform starts.
    # random.randint includes both bounds, so the upper bound must be
    # num_steps - 1: an offset of num_steps would only repeat the offset-0
    # alignment while throwing away one extra window.
    corpus = corpus[random.randint(0, num_steps - 1):]
    # Subtract 1 extra since we need to account for label
    num_examples = (len(corpus) - 1) // num_steps
    example_indices = list(range(0, num_examples * num_steps, num_steps))
    random.shuffle(example_indices)

    def data(pos):
        # This returns a sequence of the length num_steps starting from pos
        return corpus[pos: pos + num_steps]

    # Discard half empty batches
    num_batches = num_examples // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        # Batch_size indicates the random examples read each time
        batch_indices = example_indices[i:i + batch_size]
        X = [data(j) for j in batch_indices]
        Y = [data(j + 1) for j in batch_indices]
        yield np.array(X), np.array(Y)

In [25]:
list(range(30, 33))

[30, 31, 32]

In [61]:
# Toy corpus 0..29 so the sampled windows are easy to read:
# every row of Y is the matching row of X shifted by one position.
my_seq = list(range(30))
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y)

X:  [[ 5  6  7  8  9 10]
 [23 24 25 26 27 28]] 
Y: [[ 6  7  8  9 10 11]
 [24 25 26 27 28 29]]
X:  [[17 18 19 20 21 22]
 [11 12 13 14 15 16]] 
Y: [[18 19 20 21 22 23]
 [12 13 14 15 16 17]]


X are indexes of words in our corpus

In [43]:
X[0]

array([18, 19, 20, 21, 22, 23])

In [49]:
# Decode the last sampled batch back to tokens, input next to target.
# NOTE(review): the recorded output shows words, so this cell was run while
# `vocab` was the word-level vocabulary; in file order `vocab` has already
# been rebound to the character-level one -- confirm the intended order.
for in_sentence, out_sentence in zip(X, Y) :
    print(vocab.to_tokens(list(in_sentence)), vocab.to_tokens(list(out_sentence)))

['but', 'you', 'this', 'or', 'were', 'on'] ['you', 'this', 'or', 'were', 'on', 'not']
['', 'and', 'of', 'i', 'a', 'to'] ['and', 'of', 'i', 'a', 'to', 'in']


In [46]:
for out_sentence in Y :

'g'

Let's get back to this and understand text processing: how do we need to process the text before feeding it to a recurrent neural network?

Let's say we have a text (in practice, a list of sentences) and, given the beginning of a sentence, we need to predict its next word.

First problem: neural networks don't work with raw text, they work with numeric vectors. How do we convert the text into vectors so that our RNN can understand it?

Some few concepts to understand :
    
    - Tokenisation : the process of breaking a text down into tokens (words or characters)
    - Lemmatisation : reducing a word to its dictionary form (lemma), e.g. bats → bat
    - Stemming : stripping suffixes from a word to obtain its stem

N-grams : 
    
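    a group of n consecutive tokens in a sentence. For example, the 2-grams (bigrams) of "natural language processing is awesome" are: "natural language", "language processing", "processing is", "is awesome".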

Term and document frequency:

Term frequency: the frequency of a word in the current document.
Document frequency: the number of documents that contain the word; its inverse (IDF) is a measure of how much information the word provides.
One hot encoding :

One hot encodings are another way of representing words in numeric form. The length of the word vector is equal to the length of the vocabulary, and each observation is represented by a matrix with rows equal to the length of vocabulary and columns equal to the length of observation, with a value of 1 where the word of vocabulary is present in the observation and a value of zero where it is not.

Word Embeddings : [source](https://www.analyticsvidhya.com/blog/2017/06/word-embeddings-count-word2veec/)
    Word embeddings are texts represented as numbers. One of the simplest ways to perform word embedding is one-hot encoding with a dictionary.
    
Count vector: given a corpus of D documents and N unique tokens extracted from those documents, the count-vector matrix has one row per document and one column per token. The element [i][j] is the number of occurrences of token j in document i.

Let us understand what word2vec is:
    it is not a single algorithm but a combination of two algorithms:
    continuous bag of words (CBOW) and the skip-gram model.
- CBOW (continuous bag of words): a technique that predicts the probability of a word given a context. The context may be a single word or a group of words.
- Skip-gram: aims to predict the context given a word.




Embedding Matrix : [Source](https://qr.ae/p8Lm8h)
    To convert a sample into its embedding form, each word in its one-hot encoded form is multiplied by the embedding matrix, which gives the word embeddings for the sample.
The problem with one-hot vectors is sparsity: they are very large, yet most of their values are 0,
and the model can learn unwanted behaviour from them. That is why we use embedding vectors such as word2vec:

>This is where embedding comes into play. An embedding matrix $W_e \in R^{K\times D}$
is a linear mapping from the original space (one-of-k) to a real-valued space where entities can have meaningful relationships. Ideally, we wish that we can have
    

https://towardsdatascience.com/natural-language-processing-from-basics-to-using-rnn-and-lstm-ef6779e4ae66
https://towardsdatascience.com/what-the-heck-is-word-embedding-b30f67f01c81

### Building the model dataset 

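- The input: we have our corpus (a list of sentences). The input is a batch of size m, i.e. a list of m sentences, where each sentence is an array of tokens (words or characters). (This one we know already.)
- The output: for this batch, the output is also a list of m sentences, where each one is the corresponding input sentence shifted one token forward, so every position holds the next token to predict.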

So if we have this sentence passed to our network :

- the time machine by H 
the output should be :
- time machine by H G

You can check it with this code 

In [69]:
def format_input_output(X, Y):
    """Print every input row next to its target row as readable text.

    Rows of X and Y are paired up, converted back to tokens through the
    global `vocab`, joined with spaces and printed, followed by a
    separator line of 20 '=' characters.
    """
    separator = '='*20
    for src_ids, tgt_ids in zip(X, Y):
        src_tokens = vocab.to_tokens(list(src_ids))
        print('input : ', ' '.join(src_tokens))
        tgt_tokens = vocab.to_tokens(list(tgt_ids))
        print('output: ', ' '.join(tgt_tokens))
        print(separator)

In [82]:
# Iterate once over the character corpus, printing the shape of every
# (X, Y) batch and counting the batches in `i`.
batch_size, num_steps = 32, 35
i = 0
for X, Y in seq_data_iter_random(corpus, batch_size=batch_size, num_steps=num_steps):
    print(X.shape)
    print(Y.shape)
    print('='*20)
    i+=1

(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(32, 35)
(

In [83]:
print(i)

170


In [74]:
seq_data_iter_random(corpus, batch_size=batch_size, num_steps=num_steps)

<generator object seq_data_iter_random at 0x13a724048>

Need to understand the part building the model dataset:
    - https://victorzhou.com/blog/intro-to-rnns/
    - https://towardsdatascience.com/recurrent-neural-networks-by-example-in-python-ffd204f99470