# Question Answering - Text-to-Text Transfer Transformer (T5)

In [1]:
import ast
import string
import textwrap
import itertools
import numpy as np

import trax
from trax import layers as tl
from trax.supervised import decoding

# Will come in handy later: wraps long decoded text to 70 columns for printing
wrapper = textwrap.TextWrapper(width=70)

# Seed NumPy's global RNG so the default random masking below is reproducible
np.random.seed(42)

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 


In [2]:
# Parse one Python-literal dict per line of the data file. The `with` block
# closes the file handle as soon as parsing is done instead of leaking it.
with open('data/data.txt') as data_file:
    example_jsons = [ast.literal_eval(line) for line in data_file]

In [3]:
# Print the first few examples to see what the data looks like.
# Slicing (rather than indexing 0..4) keeps this cell working when fewer
# than five examples were loaded.
for i, example_json in enumerate(example_jsons[:5]):
    print(f'example number {i+1}: \n\n{example_json} \n')

example number 1: 

{'content-length': b'1970', 'content-type': b'text/plain', 'text': b'Beginners BBQ Class Taking Place in Missoula!\nDo you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.\nHe will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.\nThe cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.', 'timestamp': b'2019-04-25T12:57:54Z', 'url': b'https://klyq.com/beginners-bbq-class-taking-place-in-missoula/'} 

example number 2: 

{'content-length': b'120

In [4]:
# Check the type of the 'text' field of the first example
type(example_jsons[0].get('text'))

bytes

In [5]:
# Collect the 'text' field of every example dictionary into one list
natural_language_texts = [example_json['text'] for example_json in example_jsons]

In [6]:
# Fifth text example (index 4)
natural_language_texts[4]

b'The Denver Board of Education opened the 2017-18 school year with an update on projects that include new construction, upgrades, heat mitigation and quality learning environments.\nWe are excited that Denver students will be the beneficiaries of a four year, $572 million General Obligation Bond. Since the passage of the bond, our construction team has worked to schedule the projects over the four-year term of the bond.\nDenver voters on Tuesday approved bond and mill funding measures for students in Denver Public Schools, agreeing to invest $572 million in bond funding to build and improve schools and $56.6 million in operating dollars to support proven initiatives, such as early literacy.\nDenver voters say yes to bond and mill levy funding support for DPS students and schools. Click to learn more about the details of the voter-approved bond measure.\nDenver voters on Nov. 8 approved bond and mill funding measures for DPS students and schools. Learn more about what\xe2\x80\x99s incl

In [7]:
# Reserved token ids. Presumably the padding, end-of-sequence and
# unknown-token ids of the vocabulary - TODO confirm; they are not used in
# the cells visible here.
PAD = 0
EOS = 1
UNK = 2


def detokenize(np_array):
    """Convert token ids back to text with the SentencePiece vocabulary.

    Args:
        np_array: Token ids to decode; forwarded unchanged to
            trax.data.detokenize.

    Returns:
        Whatever trax.data.detokenize returns for the ids (the decoded text).
    """
    sentencepiece_options = {
        'vocab_type': 'sentencepiece',
        'vocab_file': 'sentencepiece.model',
        'vocab_dir': './data/',
    }
    return trax.data.detokenize(np_array, **sentencepiece_options)

def tokenize(s):
    """Tokenize a single item with the SentencePiece vocabulary.

    trax.data.tokenize operates on streams, so the item is wrapped in a
    one-element stream and the single result is pulled back out with next.

    Args:
        s: The item (text) to tokenize.

    Returns:
        The first element produced by trax.data.tokenize for `s`.
    """
    single_item_stream = iter([s])
    tokenized_stream = trax.data.tokenize(
        single_item_stream,
        vocab_type='sentencepiece',
        vocab_file='sentencepiece.model',
        vocab_dir='./data/',
    )
    return next(tokenized_stream)

In [8]:
# Tokenize each whitespace-separated word of the first example on its own,
# to see how individual words are split into subword ids
tokenized_text = [(tokenize(word).tolist(), word) for word in natural_language_texts[0].split()]
print(tokenized_text, '\n')

[([12847, 277], b'Beginners'), ([15068], b'BBQ'), ([4501], b'Class'), ([3, 12297], b'Taking'), ([3399], b'Place'), ([16], b'in'), ([5964, 7115, 9, 55], b'Missoula!'), ([531], b'Do'), ([25], b'you'), ([241], b'want'), ([12], b'to'), ([129], b'get'), ([394], b'better'), ([44], b'at'), ([492], b'making'), ([3326], b'delicious'), ([15068, 58], b'BBQ?'), ([148], b'You'), ([56], b'will'), ([43], b'have'), ([8], b'the'), ([1004, 6], b'opportunity,'), ([474], b'put'), ([48], b'this'), ([30], b'on'), ([39], b'your'), ([4793], b'calendar'), ([230, 5], b'now.'), ([2721, 6], b'Thursday,'), ([1600], b'September'), ([1630, 727], b'22nd'), ([1715], b'join'), ([1150], b'World'), ([4501], b'Class'), ([15068], b'BBQ'), ([16127, 6], b'Champion,'), ([9137], b'Tony'), ([2659, 5595], b'Balay'), ([45], b'from'), ([301, 782, 3624], b'Lonestar'), ([14627, 15], b'Smoke'), ([12612, 277, 5], b'Rangers.'), ([216], b'He'), ([56], b'will'), ([36], b'be'), ([2119], b'teaching'), ([3, 9], b'a'), ([19529], b'beginner')

In [9]:
# Round trip on a single word: detokenize undoes the tokenization
print(f"tokenized: {tokenize('Beginners')}\ndetokenized: {detokenize(tokenize('Beginners'))}")

tokenized: [12847   277]
detokenized: Beginners


In [10]:
# Vocabulary size reported by trax for the SentencePiece model in ./data/.
# Sentinel ids below are counted down from this value.
vocab_size = trax.data.vocab_size(
    vocab_type='sentencepiece',
    vocab_file='sentencepiece.model',
    vocab_dir='./data/'
)

def get_sentinels(vocab_size=vocab_size, display=False):
    """Map the decoded text of the last vocabulary ids to readable markers.

    The ids vocab_size - 1, vocab_size - 2, ... are paired with the letters
    of string.ascii_letters taken in reverse order, so the marker for
    vocab_size - 1 is '<Z>' and the last marker is '<a>'.

    Args:
        vocab_size (int): Size of the vocabulary. Defaults to vocab_size.
        display (bool): When True, print every marker with its decoded token.

    Returns:
        dict: Decoded token text -> marker string such as '<Z>'.
    """
    lookup = {}
    reversed_letters = string.ascii_letters[::-1]

    for offset, char in enumerate(reversed_letters, start=1):
        token_text = detokenize([vocab_size - offset])
        lookup[token_text] = f'<{char}>'

        if display:
            print(f'The sentinel is <{char}> and the decoded token is:', token_text)

    return lookup

In [11]:
# Build the decoded-token -> marker lookup and print every pair
sentinels = get_sentinels(vocab_size, display=True)

The sentinel is <Z> and the decoded token is: Internațional
The sentinel is <Y> and the decoded token is: erwachsene
The sentinel is <X> and the decoded token is: Cushion
The sentinel is <W> and the decoded token is: imunitar
The sentinel is <V> and the decoded token is: Intellectual
The sentinel is <U> and the decoded token is: traditi
The sentinel is <T> and the decoded token is: disguise
The sentinel is <S> and the decoded token is: exerce
The sentinel is <R> and the decoded token is: nourishe
The sentinel is <Q> and the decoded token is: predominant
The sentinel is <P> and the decoded token is: amitié
The sentinel is <O> and the decoded token is: erkennt
The sentinel is <N> and the decoded token is: dimension
The sentinel is <M> and the decoded token is: inférieur
The sentinel is <L> and the decoded token is: refugi
The sentinel is <K> and the decoded token is: cheddar
The sentinel is <J> and the decoded token is: unterlieg
The sentinel is <I> and the decoded token is: garanteaz
Th

In [12]:
def pretty_decode(encoded_str_list, sentinels=sentinels):
    """Return text in which sentinel tokens are shown as readable markers.

    Args:
        encoded_str_list: Either text (str, or UTF-8 encoded bytes) or a
            sequence of token ids, which is first decoded with detokenize.
        sentinels (dict): Decoded sentinel token text -> marker string.
            Defaults to the module-level sentinels.

    Returns:
        str: The text with every occurrence of a sentinel token replaced by
        its marker.

    Note:
        Replacement is plain substring replacement, so ordinary text that
        happens to contain a sentinel token's text is replaced as well.
    """
    text = encoded_str_list

    # Token ids need to be decoded before they can be prettified. Decoding
    # inline (instead of recursing with the default argument) keeps the
    # caller-supplied `sentinels` mapping in effect.
    if not isinstance(text, (str, bytes)):
        text = detokenize(text)

    # bytes.replace() rejects the str keys/values of `sentinels`, so bytes
    # input is converted to str first.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    for token, char in sentinels.items():
        text = text.replace(token, char)

    return text

In [13]:
# Demo on plain text: words matching a sentinel token's text are replaced by its marker
pretty_decode("I want to dress up as an Intellectual this halloween.")

'I want to dress up as an <V> this <b>.'

### Tokenizing and Masking
The function below tokenizes the input text and masks tokens with a given noise probability. By default each token is masked with probability 0.15, so roughly 15% of the tokens end up masked.

In [21]:
def tokenize_and_mask(
    text,
    vocab_size=vocab_size,
    noise=0.15,
    randomizer=np.random.uniform,
    tokenize=tokenize
):
    """
    Tokenize `text` and split it into a masked input and its target.

    One random value is drawn per token. A token whose draw is below `noise`
    is left out of the inputs and written to the targets instead. Every run
    of consecutive masked tokens is introduced, in both sequences, by a
    single sentinel id; sentinel ids count down from `vocab_size - 1`.

    Args:
        text: Text input, handed to `tokenize` as is.
        vocab_size (int): Size of the vocabulary. Defaults to vocab_size.
        noise (float, optional): Probability of masking a token. Defaults to 0.15.
        randomizer (function): Zero-argument function that generates the
            random values compared against `noise`.
        tokenize (function): Tokenizer function. Defaults to tokenize.

    Returns:
        tuple: (inputs, targets), two lists of integer token ids.
    """
    inputs, targets = [], []
    sentinels_used = 0
    in_masked_span = False

    for token_id in tokenize(text):
        if randomizer() < noise:
            # A new masked span starts: emit its sentinel on both sides once
            if not in_masked_span:
                sentinels_used += 1
                sentinel_id = vocab_size - sentinels_used
                targets.append(sentinel_id)
                inputs.append(sentinel_id)
                in_masked_span = True

            targets.append(token_id)
        else:
            inputs.append(token_id)
            in_masked_span = False

    return inputs, targets

In [22]:
# Deterministic stand-in for a np.random value generator.
# Needs to be in the same cell as its use for it to always generate the same output

def testing_rnd():
    """Build a mock randomizer that replays a fixed sequence of values.

    Returns:
        function: Zero-argument function. Successive calls return the ten
        values of np.linspace(0, 1, 10) in order, starting over from 0.0
        once the last value has been returned.
    """
    repeating_values = itertools.cycle(np.linspace(0, 1, 10))

    def dummy_randomizer():
        return next(repeating_values)

    return dummy_randomizer

# Mask the first example using the deterministic mock randomizer
input_str = natural_language_texts[0]
print(f"input string:\n\n{input_str}\n")
inputs, targets = tokenize_and_mask(input_str, randomizer=testing_rnd())
print(f"tokenized inputs:\n\n{inputs}\n")
print(f"targets:\n\n{targets}")


input string:

b'Beginners BBQ Class Taking Place in Missoula!\nDo you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.\nHe will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.\nThe cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.'

tokenized inputs:

[31999, 15068, 4501, 3, 12297, 3399, 16, 5964, 7115, 31998, 531, 25, 241, 12, 129, 394, 44, 492, 31997, 58, 148, 56, 43, 8, 1004, 6, 474, 31996, 39, 4793, 230, 5, 2721, 6, 1600, 1630, 31995, 1150, 4501, 15068, 1

In [24]:
# Decode both sequences back to text, with sentinel tokens shown as markers
print('Inputs: \n\n', pretty_decode(inputs))
print('\nTargets: \n\n', pretty_decode(targets))

Inputs: 

 <Z> BBQ Class Taking Place in Missoul <Y> Do you want to get better at making <X>? You will have the opportunity, put <W> your calendar now. Thursday, September 22 <V> World Class BBQ Champion, Tony Balay <U>onestar Smoke Rangers. He <T> teaching a beginner level class for everyone<S> to get better with their culinary skills.<R> teach you everything you need to know to <Q> a KCBS BBQ competition,<P>, recipes, timelines, meat selection <O>, plus smoker and fire information. The<N> be in the class is $35 per person <M> for spectators it is free. Include <L> the cost will be either a  <K>shirt or apron and you <J> tasting samples of each meat that is prepared <I>

Targets: 

 <Z> Beginners <Y>a! <X> delicious BBQ <W> this on <V>nd join <U> from L <T> will be<S> who wants<R> He will <Q> compete in<P> including techniques <O> and trimming<N> cost to <M>, and <L>d in <K>t- <J> will be <I>.


### Creating Pairs

In [25]:
# Mask every text with the default randomizer (np.random.uniform)
inputs_targets_pairs = [tokenize_and_mask(text) for text in natural_language_texts]

In [26]:
def display_input_target_pairs(input_targets_pairs):
    """Print every (inputs, targets) pair as wrapped, decoded text.

    Args:
        input_targets_pairs: Iterable of (inputs, targets) pairs; each
            element of a pair is passed to pretty_decode.
    """
    # Iterate over the argument itself. The original looped over the
    # similarly named global `inputs_targets_pairs` and ignored the parameter.
    for i, (inps, tgts) in enumerate(input_targets_pairs):
        inps, tgts = pretty_decode(inps), pretty_decode(tgts)
        print(f'[{i}]\n\n'
            f'inputs:\n{wrapper.fill(text=inps)}\n\n'
            f'targets:\n{wrapper.fill(text=tgts)}\n\n\n\n')

In [27]:
# Show every masked example as wrapped, human-readable text
display_input_target_pairs(inputs_targets_pairs)

[0]

inputs:
Beginners BBQ Class Taking <Z> in Missoul <Y>! Do you want to get
better at making delicious <X>? You will have the opportunity, <W>
this on <V> calendar now. Thursday <U> September 22 <T> join<S> Class
BBQ Champion, Tony Balay from Lonestar Smoke<R>ers <Q> He will be
teaching a beginner<P> class <O> everyone who wants<N> get better with
their <M> skills <L> He will teach <K> everything you need to know to
<J> in a KCBS BBQ <I> techniques, recipes, timelines, meat<H> and
trimming, plus smoker and fire information. The cost to be<G> the
class is $35 <F> person, and<E> spectators it is free. Included in the
cost will<D> either <C> t- <B> or apron and you will be tasting
samples <A> each meat that <z> prepared.

targets:
<Z> Place <Y>a <X> BBQ <W> put <V> your <U>, <T>nd<S> World<R> Rang
<Q>.<P> level <O> for<N> to <M> culinary <L>. <K> you <J> compete <I>
competition, including<H> selection<G> in <F> per<E> for<D> be<C>a
<B>shirt <A> of <z> is




[1]

inputs:
<Z> in 'Mac OS

## Transformer
- Load a transformer model checkpoint that has been pre-trained on the C4 dataset and decode from it
- This saves a lot of time compared to training the model from scratch  
![alt-txt](images/fulltransformer.png)
- Load the model
- Copy the checkpoint to local dir for speed
- Implement encoder, refer below  
![alt-txt](images/encoder.png)
### Transformer Encoder
The `FeedForwardBlock` consists of the following layers, in order:
- `tl.LayerNorm()`
- `tl.Dense(d_ff)`
- `activation`
- `dropout_middle`
- `tl.Dense(d_model)`
- `dropout_final`



In [43]:
def FeedForwardBlock(
    d_model,
    d_ff,
    dropout,
    dropout_shared_axes,
    mode,
    activation
):
    """
    Returns the list of layers that make up a feed-forward block.

    Args:
        d_model (int): number of units of the final dense layer.
        d_ff (int): number of units of the first dense layer.
        dropout (float): rate passed to both dropout layers.
        dropout_shared_axes: shared_axes passed to both dropout layers.
        mode (str): mode passed to both dropout layers.
        activation (function): called with no arguments to create the
            activation layer.
    Returns:
        list: [LayerNorm, Dense(d_ff), activation, Dropout, Dense(d_model), Dropout].
    """

    def _new_dropout():
        # Both dropout layers share the same configuration
        return tl.Dropout(
            rate=dropout,
            shared_axes=dropout_shared_axes,
            mode=mode
        )

    dropout_middle = _new_dropout()
    dropout_final = _new_dropout()

    return [
        tl.LayerNorm(),
        tl.Dense(d_ff),
        activation(),
        dropout_middle,
        tl.Dense(d_model),
        dropout_final,
    ]

In [44]:
# Build an example feed-forward block and print its list of layers
feed_forward_example = FeedForwardBlock(
    d_model=512,
    d_ff=2048,
    dropout=0.8,
    dropout_shared_axes=0,
    mode='train',
    activation=tl.Relu
)
print(feed_forward_example)

[LayerNorm, Dense_2048, Relu, Dropout, Dense_512, Dropout]


#### 2.1.2 The Encoder Block

The encoder block will use the `FeedForwardBlock`. 

You will have to build two residual connections. Inside the first residual connection you will have the `tl.LayerNorm()`, `attention`, and `dropout_` layers. The second residual connection will have the `feed_forward`.  

You will also need to implement `feed_forward`, `attention` and `dropout_` blocks. 

So far you haven't seen the [`tl.Attention()`](https://trax-ml.readthedocs.io/en/latest/trax.layers.html#trax.layers.attention.Attention) and [`tl.Residual()`](https://trax-ml.readthedocs.io/en/latest/trax.layers.html#trax.layers.combinators.Residual) layers so you can check the docs by clicking on them.

In [57]:
def EncoderBlock(
    d_model,
    d_ff,
    n_heads,
    dropout,
    dropout_shared_axes,
    mode,
    ff_activation,
    FeedForwardBlock=FeedForwardBlock
):
    """
    Returns a list of layers that implements a Transformer encoder block.
    The input to the layer is a pair, (activations, mask), where the mask was created
    from the original source tokens to prevent attending to the padding part of the input

    The block is two residual connections: the first wraps layer norm,
    attention and dropout; the second wraps the feed-forward layers.

    Args:
        d_model (int): depth of embedding.
        d_ff (int): depth of feed-forward layer.
        n_heads (int): number of attention heads.
        dropout (float): dropout rate (how much to drop out).
        dropout_shared_axes (int): axes on which to share dropout mask.
        mode (str): 'train' or 'eval'.
        ff_activation (function): the non-linearity in feed-forward layer.
        FeedForwardBlock (function): A function that returns the feed forward block.
    Returns:
        list: A list of layers that maps (activations, mask) to (activations, mask).

    """

    # Sub-layers are created in the order attention, feed-forward, dropout
    self_attention = tl.Attention(
        d_feature=d_model,
        n_heads=n_heads,
        dropout=dropout,
        mode=mode
    )
    feed_forward_layers = FeedForwardBlock(
        d_model,
        d_ff,
        dropout,
        dropout_shared_axes,
        mode,
        ff_activation
    )
    attention_dropout = tl.Dropout(
        rate=dropout,
        shared_axes=dropout_shared_axes,
        mode=mode
    )

    attention_residual = tl.Residual(
        tl.LayerNorm(),
        self_attention,
        attention_dropout
    )
    feed_forward_residual = tl.Residual(feed_forward_layers)

    return [attention_residual, feed_forward_residual]

In [58]:
# n_heads=8 divides d_model=512 evenly. The previous value of 6 did not,
# which trax's attention layer rejects once the block is run on data.
encoder_example = EncoderBlock(
    d_model=512,
    d_ff=2048,
    n_heads=8,
    dropout=0.8,
    dropout_shared_axes=0,
    mode='train',
    ff_activation=tl.Relu
)
print(encoder_example)

[Serial_in2_out2[
  Branch_in2_out3[
    None
    Serial_in2_out2[
      LayerNorm
      Serial_in2_out2[
        Dup_out2
        Dup_out2
        Serial_in4_out2[
          Parallel_in3_out3[
            Dense_512
            Dense_512
            Dense_512
          ]
          PureAttention_in4_out2
          Dense_512
        ]
      ]
      Dropout
    ]
  ]
  Add_in2
], Serial[
  Branch_out2[
    None
    Serial[
      LayerNorm
      Dense_2048
      Relu
      Dropout
      Dense_512
      Dropout
    ]
  ]
  Add_in2
]]


### The Transformer Encoder
- Implement the Encoder Block
- BERT: Bidirectional Encoder Representations from Transformers
- Positional encoder -> `n_layers` encoder blocks -> output layers (listed below)


- [`tl.Branch`](https://trax-ml.readthedocs.io/en/latest/trax.layers.html#trax.layers.combinators.Branch): helps with the branching and has the following sublayers:
    - `positional_encoder`.
    - [`tl.PaddingMask()`](https://trax-ml.readthedocs.io/en/latest/trax.layers.html#trax.layers.attention.PaddingMask): layer that maps integer sequences to padding masks.
- Your list of `EncoderBlock`s
- [`tl.Select([0], n_in=2)`](https://trax-ml.readthedocs.io/en/latest/trax.layers.html#trax.layers.combinators.Select):  Copies, reorders, or deletes stack elements according to indices.
- [`tl.LayerNorm()`](https://trax-ml.readthedocs.io/en/latest/trax.layers.html#trax.layers.normalization.LayerNorm).
- [`tl.Mean()`](https://trax-ml.readthedocs.io/en/latest/trax.layers.html#trax.layers.core.Mean): Mean along one axis (`axis=1` in the code below).
- `tl.Dense()` with n_units set to n_classes. 
- `tl.LogSoftmax()`   


In [63]:
def TransformerEncoder(
    vocab_size=vocab_size,
    n_classes=10,
    d_model=512,
    d_ff=2048,
    n_layers=6,
    n_heads=8,
    dropout=0.1,
    dropout_shared_axes=None,
    max_len=2048,
    mode='train',
    ff_activation=tl.Relu,
    EncoderBlock=EncoderBlock
):
    """
    Returns a Transformer encoder model.
    The input to the model is a tensor of tokens.

    Layer order: a branch producing (embedded input, padding mask), then
    `n_layers` encoder blocks, then Select, LayerNorm, Mean over axis 1,
    Dense(n_classes) and LogSoftmax.

    Args:
        vocab_size (int): vocab size. Defaults to vocab_size.
        n_classes (int): how many classes on output. Defaults to 10.
        d_model (int): depth of embedding. Defaults to 512.
        d_ff (int): depth of feed-forward layer. Defaults to 2048.
        n_layers (int): number of encoder/decoder layers. Defaults to 6.
        n_heads (int): number of attention heads. Defaults to 8.
        dropout (float): dropout rate (how much to drop out). Defaults to 0.1.
        dropout_shared_axes (int): axes on which to share dropout mask. Defaults to None.
        max_len (int): maximum symbol length for positional encoding. Defaults to 2048.
        mode (str): 'train' or 'eval'. Defaults to 'train'.
        ff_activation (function): the non-linearity in feed-forward layer. Defaults to tl.Relu.
        EncoderBlock (function): Returns the encoder block. Defaults to EncoderBlock.

    Returns:
        trax.layers.combinators.Serial: A Transformer model as a layer that maps
        from a tensor of tokens to activations over a set of output classes.
    """

    # Embedding, dropout, then positional encoding
    embedding_layers = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    ]

    # One independently constructed encoder block per layer
    encoder_stack = []
    for _ in range(n_layers):
        encoder_stack.append(
            EncoderBlock(
                d_model=d_model,
                d_ff=d_ff,
                n_heads=n_heads,
                dropout=dropout,
                dropout_shared_axes=dropout_shared_axes,
                mode=mode,
                ff_activation=ff_activation
            )
        )

    model = tl.Serial(
        tl.Branch(embedding_layers, tl.PaddingMask()),
        encoder_stack,
        tl.Select([0], n_in=2),
        tl.LayerNorm(),
        tl.Mean(axis=1),
        tl.Dense(n_classes),
        tl.LogSoftmax()
    )
    return model

In [64]:
# Build a single-layer encoder and display its layer structure
TransformerEncoder(n_layers=1)

Serial[
  Branch_out2[
    [Embedding_32000_512, Dropout, PositionalEncoding]
    PaddingMask(0)
  ]
  Serial_in2_out2[
    Branch_in2_out3[
      None
      Serial_in2_out2[
        LayerNorm
        Serial_in2_out2[
          Dup_out2
          Dup_out2
          Serial_in4_out2[
            Parallel_in3_out3[
              Dense_512
              Dense_512
              Dense_512
            ]
            PureAttention_in4_out2
            Dense_512
          ]
        ]
        Dropout
      ]
    ]
    Add_in2
  ]
  Serial[
    Branch_out2[
      None
      Serial[
        LayerNorm
        Dense_2048
        Relu
        Dropout
        Dense_512
        Dropout
      ]
    ]
    Add_in2
  ]
  Select[0]_in2
  LayerNorm
  Mean
  Dense_10
  LogSoftmax
]