# Deep N-Grams
- How to convert a line to text into a tensor
- Create an iterator to feed data to the model
- Define a GRU model using `trax`
- Train the model using `trax`
- Compute the accuracy of your model using perplexity
- Predict using your own model

### Overview
- Predict the next set of characters using previous characters
- Convert a line of text into a tensor
- Create a generator to feed data into the model
- Train a neural network in order to predict the new set of characters of defined length
- Use embeddings for each character and feed them as inputs to your model
    - Many natural language tasks rely on using embeddings for predictions
- Your model will convert each character to its embedding, 
    - run the embeddings through a GRU and 
        - run it through a linear layer to predict the next set of characters

Further...  
- Get the embeddings
- Stack the embeddings on top of each other
- Run them through two layers with a relu activation in the middle
- Compute the softmax

To predict the next character...   
- Use the softmax output and identify the word with the highest probability
- The word with the highest probability is the prediction for the next word

In [2]:
import os
import trax
import trax.fastmath.numpy as np
import pickle
import numpy
import random as rnd
from trax import fastmath
from trax import layers as tl

# Set random seed
trax.supervised.trainer_lib.init_random_number_generators(32)
rnd.seed(32)

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 




In [7]:
dirname = 'data/'
lines = []
for filename in os.listdir(dirname):
    with open(os.path.join(dirname, filename)) as files:
        for line in files:
            pure_line = line.strip()
            if(pure_line):
                lines.append(pure_line)

In [9]:
n_lines = len(lines)
print(f"Number of lines: {n_lines}")
print(f"Sample line at position 0 {lines[0]}")
print(f"Sample line at position 999 {lines[999]}")

Number of lines: 125097
Sample line at position 0 THE MERRY WIVES OF WINDSOR
Sample line at position 999 you.


In [10]:
# go through each line
for i, line in enumerate(lines):
    # convert to all lowercase
    lines[i] = line.lower()

print(f"Number of lines: {n_lines}")
print(f"Sample line at position 0 {lines[0]}")
print(f"Sample line at position 999 {lines[999]}")

Number of lines: 125097
Sample line at position 0 the merry wives of windsor
Sample line at position 999 you.


In [11]:
# Create a holdout validation set
eval_lines = lines[-1000:]
lines = lines[:-1000]

print(f'Number of lines for training: {len(lines)}')
print(f'Number of lines for validation: {len(eval_lines)}')

Number of lines for training: 124097
Number of lines for validation: 1000


In [12]:
# View the unique unicode integer associated with each character
print(f"ord('a'): {ord('a')}")
print(f"ord('b'): {ord('b')}")
print(f"ord('c'): {ord('c')}")
print(f"ord(' '): {ord(' ')}")
print(f"ord('x'): {ord('x')}")
print(f"ord('y'): {ord('y')}")
print(f"ord('z'): {ord('z')}")
print(f"ord('1'): {ord('1')}")
print(f"ord('2'): {ord('2')}")
print(f"ord('3'): {ord('3')}")

ord('a'): 97
ord('b'): 98
ord('c'): 99
ord(' '): 32
ord('x'): 120
ord('y'): 121
ord('z'): 122
ord('1'): 49
ord('2'): 50
ord('3'): 51


In [13]:
help(ord)

Help on built-in function ord in module builtins:

ord(c, /)
    Return the Unicode code point for a one-character string.



#### Write a function to transform a single line to its unicode string array
- The output string array is the tensor
- Special characters are used to represent the end of the sentence

In [15]:
def line_to_tensor(line, EOS_int=1):
    tensor = []
    for c in line:
        c_int = ord(c)
        tensor.append(c_int)
    tensor.append(EOS_int)
    
    return tensor

In [16]:
line_to_tensor('abc xyz')

[97, 98, 99, 32, 120, 121, 122, 1]

#### Batch Generator
- Build a batch generator that takes in a text and returns a batch of text lines
- The generator converts the text lines into numpy array of integers padded by zeros
- Padding ensures all arrays have the same length
- Length of the array is equal to the length of the longest sentence
- Using `next` operator, the sentence arrays can be iterated
- Generator returns a bactch with 3 parts tuple: (inputs, targets, mask)
- The inputs and targets are identifcal
- The second column will be used to evaluate your predictions
- Mask is 1 for non-padding tokens

In [19]:
def data_generator(batch_size, max_length, data_lines, lines_to_tensor=line_to_tensor, shuffle=True):
    
    # Initialize the index that points to the current position in the lines index array
    index = 0
    
    # Initialize the list that will contain the current batch
    cur_batch = []
    
    # Count the number of lines in data_lines
    num_lines = len(data_lines)
    
    # Create an array with the indexes of data_lines that can be shuffled
    lines_index = [*range(num_lines)]
    
    # Shuffle line indexes of if shuffle is set to True
    if(shuffle):
        rnd.shuffle(lines_index)
        
    while(True):
        # if the index isgrater or equal that to the number of lines in data_lines
        if(index >= len(data_lines)):
            index = 0
            if(shuffle):
                rnd.shuffle(lines_index)
        
        # Get a line at the `lines_index[index]` position in data_lines
        line = data_lines[lines_index[index]]
        
        # If the length of the line is less than max_length
        if(len(line) < max_length):
            cur_batch.append(line)
            
        index += 1
        
        if(len(cur_batch) == batch_size):
            batch = []
            mask = []
            
            # Go thru each line in curr_batch
            for li in cur_batch:
                tensor = line_to_tensor(li)
                
                # Create a list of zeros to represent the padding
                # so that the tensor plus padding will have length `max_length`
                pad = [0] * (max_length - len(tensor))
                
                # Combine the tensor plus pad
                tensor_pad = tensor + pad
                batch.append(tensor_pad)
                
                example_mask = [1 if a_num != 0 else 0 for a_num in tensor_pad]
                mask.append(example_mask)
                
            # Convert the batch (data type list) to a trax's numpy array
            batch_np_arr = np.array(batch)
            mask_np_arr = np.array(mask)
            
            yield batch_np_arr, batch_np_arr, mask_np_arr
            
            cur_batch = []

In [20]:
# Try out your data generator
tmp_lines = ['12345678901', #length 11
             '123456789', # length 9
             '234567890', # length 9
             '345678901'] # length 9

# Get a batch size of 2, max length 10
tmp_data_gen = data_generator(batch_size=2, 
                              max_length=10, 
                              data_lines=tmp_lines,
                              shuffle=False)

# get one batch
tmp_batch = next(tmp_data_gen)

# view the batch
tmp_batch

(DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
              [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32),
 DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
              [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32),
 DeviceArray([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
              [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32))

#### Repeating Batch Generator

In [22]:
import itertools
infinite_data_generator = itertools.cycle(
    data_generator(batch_size=2, max_length=10, data_lines=tmp_lines)
)

In [23]:
ten_lines = [next(infinite_data_generator) for _ in range(10)]
print(ten_lines)

[(DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
             [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32), DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
             [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32), DeviceArray([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)), (DeviceArray([[51, 52, 53, 54, 55, 56, 57, 48, 49,  1],
             [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32), DeviceArray([[51, 52, 53, 54, 55, 56, 57, 48, 49,  1],
             [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32), DeviceArray([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)), (DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
             [51, 52, 53, 54, 55, 56, 57, 48, 49,  1]], dtype=int32), DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
             [51, 52, 53, 54, 55, 56, 57, 48, 49,  1]], dtype=int32), DeviceArray([[1, 1, 1, 1, 1, 1, 1,

## Defining the GRU Model
- Initialize the model with input and output tensors
- GRULM, Using Trax

In [32]:
def GRULM(vocab_size=256, d_model=512, n_layers=2, mode='train'):
    model = tl.Serial(
        tl.ShiftRight(n_shifts=1, mode=mode),
        tl.Embedding(vocab_size, d_model),
        [tl.GRU(n_units=d_model) for i in range(n_layers)],
        tl.Dense(vocab_size),
        tl.LogSoftmax()
    )
    return model

In [33]:
# testing your model
model = GRULM()
print(model)

Serial[
  ShiftRight(1)
  Embedding_256_512
  GRU_512
  GRU_512
  Dense_256
  LogSoftmax
]


### Train the Model

In [34]:
batch_size = 32
max_length = 64

In [35]:
def n_used_lines(lines, max_length):
    n_lines = 0
    for l in lines:
        if(len(l) <= max_length):
            n_lines += 1
    return n_lines

In [36]:
num_used_lines = n_used_lines(lines, 32)
print('Number of used lines from the dataset:', num_used_lines)
print('Batch size (a power of 2):', int(batch_size))
steps_per_epoch = int(num_used_lines/batch_size)
print('Number of steps to cover one epoch:', steps_per_epoch)

Number of used lines from the dataset: 25934
Batch size (a power of 2): 32
Number of steps to cover one epoch: 810


In [37]:
from trax.supervised import training

def train_model(
    model, data_generator, batch_size=32, max_length=64,
    lines=lines, eval_lines=eval_lines, n_steps=1, output_dir='model/'
):
    bare_train_generator = data_generator(
        batch_size=batch_size,
        max_length=max_length,
        data_lines=lines,
        shuffle=False
    )
    infinite_train_generator = itertools.cycle(
        data_generator(
            batch_size=batch_size,
            max_length=max_length,
            data_lines=lines,
            shuffle=False
        )
    )
    
    bare_eval_generator = data_generator(
        batch_size=batch_size,
        max_length=max_length,
        data_lines=eval_lines,
        shuffle=False
    )
    infinite_eval_generator = itertools.cycle(
        data_generator(
            batch_size=batch_size,
            max_length=max_length,
            data_lines=eval_lines,
            shuffle=False
        )
    )
    
    train_task = training.TrainTask(
        labeled_data=infinite_train_generator,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer= trax.optimizers.Adam(0.0005)
    )
    eval_task = training.EvalTask(
        labeled_data=infinite_eval_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=3
    )
    training_loop = training.Loop(
        model,
        train_task,
        eval_task=eval_task,
        output_dir=output_dir
    )
    training_loop.run(n_steps=n_steps)
    
    return training_loop

In [38]:
training_loop = train_model(GRULM(), data_generator)

Step      1: train CrossEntropyLoss |  5.54323053
Step      1: eval  CrossEntropyLoss |  5.48575592
Step      1: eval          Accuracy |  0.16526215


In [39]:
def test_model(preds, target):
    total_log_ppx = np.sum(preds * tl.one_hot(target, preds.shape[-1]), axis=-1)
    non_pad = 1.0 - np.equal(target, 0)
    ppx = non_pad * total_log_ppx
    
    log_ppx = np.sum(ppx) / np.sum(non_pad)
    
    return -log_ppx

## Generating the Language with our Model
$$ f(z) = {1\over{\beta}}e^{(-z+e^{(-z)})} $$

where: $$ z = {(x - \mu)\over{\beta}}$$

The maximum value, which is what we choose as the prediction in the last step of a Recursive Neural Network `RNN` we are using for text generation, in a sample of a random variable following an exponential distribution approaches the Gumbel distribution when the sample increases asymptotically. For that reason, the Gumbel distribution is used to sample from a categorical distribution.

In [41]:
# Run this cell to generate some news sentence
def gumbel_sample(log_probs, temperature=1.0):
    """Gumbel sampling from a categorical distribution."""
    u = numpy.random.uniform(low=1e-6, high=1.0 - 1e-6, size=log_probs.shape)
    g = -np.log(-np.log(u))
    return np.argmax(log_probs + g * temperature, axis=-1)

def predict(num_chars, prefix):
    inp = [ord(c) for c in prefix]
    result = [c for c in prefix]
    max_len = len(prefix) + num_chars
    for _ in range(num_chars):
        cur_inp = np.array(inp + [0] * (max_len - len(inp)))
        outp = model(cur_inp[None, :])  # Add batch dim.
        next_char = gumbel_sample(outp[0, len(inp)])
        inp += [int(next_char)]
       
        if inp[-1] == 1:
            break  # EOS
        result.append(chr(int(next_char)))
    
    return "".join(result)

print(predict(32, ""))

LayerError: Exception passing through layer Serial (in pure_fn):
  layer created in file [...]/<ipython-input-32-fdc509f12b4f>, line 2
  layer input shapes: ShapeDtype{shape:(1, 32), dtype:int32}

  File [...]/trax/layers/combinators.py, line 88, in forward
    outputs, s = layer.pure_fn(inputs, w, s, rng, use_cache=True)

LayerError: Exception passing through layer Embedding_256_512 (in pure_fn):
  layer created in file [...]/<ipython-input-32-fdc509f12b4f>, line 4
  layer input shapes: ShapeDtype{shape:(1, 32), dtype:int32}

  File [...]/trax/layers/core.py, line 150, in forward
    return jnp.take(self.weights, x, axis=0)

  File [...]/jax/numpy/lax_numpy.py, line 3422, in take
    return lax.gather(a, indices[..., None], dimension_numbers=dnums,

  File [...]/jax/lax/lax.py, line 807, in gather
    return gather_p.bind(

  File [...]/site-packages/jax/core.py, line 276, in bind
    return self.impl(*args, **kwargs)

  File [...]/jax/interpreters/xla.py, line 224, in apply_primitive
    compiled_fun = xla_primitive_callable(prim, *unsafe_map(arg_spec, args), **params)

  File [...]/jax/interpreters/xla.py, line 257, in xla_primitive_callable
    built_c = primitive_computation(prim, AxisEnv(nreps, (), (), None), backend,

  File [...]/jax/interpreters/xla.py, line 316, in primitive_computation
    raise RuntimeError(msg) from e

RuntimeError: Invalid argument: Slice size at index 0 in gather op is out of range, must be within [0, 1), got 1.: 
This is a bug in JAX's shape-checking rules; please report it!

https://github.com/google/jax/issues

