## **NLP with RNNs and Attention**

In [1]:
import sklearn
import tensorflow as tf
from tensorflow import keras

import numpy as np
import os

import matplotlib as mpl
import matplotlib.pyplot as plt

### Char-RNN

In [None]:
## Splitting a sequence into training windows
# A window of length 5 slides over range(15) with a shift of 2,
# e.g. [0, 1, 2, 3, 4] and then [2, 3, 4, 5, 6].
# Each window is then split into an input and a target shifted by one step,
# e.g. inputs [0, 1, 2, 3] and targets [1, 2, 3, 4].

n_steps = 5
dataset = tf.data.Dataset.from_tensor_slices(tf.range(15))
# window() yields a dataset of nested datasets; drop_remainder discards short trailing windows
dataset = dataset.window(n_steps, shift=2, drop_remainder=True)
# batching each nested window by its full length turns it into a single tensor
dataset = dataset.flat_map(lambda window: window.batch(n_steps))
# shuffle the windows (buffer of 10), then split each into (inputs, targets)
dataset = dataset.shuffle(10).map(lambda window: (window[:-1], window[1:]))
dataset = dataset.batch(3).prefetch(1)
for index, (X_batch, Y_batch) in enumerate(dataset):
  print("_" * 20, "batch", index, "\nX_batch")
  print(X_batch.numpy())
  print("=" * 5, "\nY_batch")
  print(Y_batch.numpy())

____________________ batch 0 
X_batch
[[4 5 6 7]
 [0 1 2 3]
 [6 7 8 9]]
===== 
Y_batch
[[ 5  6  7  8]
 [ 1  2  3  4]
 [ 7  8  9 10]]
____________________ batch 1 
X_batch
[[10 11 12 13]
 [ 8  9 10 11]
 [ 2  3  4  5]]
===== 
Y_batch
[[11 12 13 14]
 [ 9 10 11 12]
 [ 3  4  5  6]]


In [None]:
## Loading data and preprocessing
# keras.utils.get_file downloads the corpus and returns the path of the local copy.
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
# read the whole corpus into memory as one string
with open(filepath) as f:
  shakespeare_text = f.read()

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [None]:
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [None]:
# all the characters in the text
"".join(sorted(set(shakespeare_text.lower())))

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [None]:
# tokenize the text by character
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [None]:
tokenizer.texts_to_sequences(["First"])

[[20, 6, 9, 8, 3]]

In [None]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

['f i r s t']

In [None]:
max_id = len(tokenizer.word_index) # num of distinct chars
dataset_size = tokenizer.document_count
print(max_id, dataset_size)

39 1115394


In [None]:
# Encode the whole text as a single sequence of character ids; subtracting 1
# shifts the ids down by one so they can be one-hot encoded with depth=max_id later.
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1
# keep the first 90% of the characters for training
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [None]:
n_steps = 100
window_length = n_steps + 1 # one extra character so the target can be the input shifted by one
# NOTE: this cell reassigns `dataset` from itself, so it is not idempotent -
# re-run the cell that creates `dataset` before re-running this one.
dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)

In [None]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [None]:
batch_size = 32
# shuffle the windows (buffer of 10000) and group them into batches
dataset = dataset.shuffle(10000).batch(batch_size)
# inputs: every step except the last; targets: the same windows shifted one step ahead
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [None]:
# one hot encode X_batch
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
)

In [None]:
dataset = dataset.prefetch(1)

In [None]:
for X_batch, Y_batch in dataset.take(1):
  print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


### Training the model

In [None]:
# Two stacked GRU layers followed by a per-time-step softmax over the max_id
# character classes; every time step predicts the next character.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id], 
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, 
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

# targets are integer class ids (not one-hot), hence the sparse loss
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
# A full pass over the training set would be the call below, but it takes too long to train:
# history = model.fit(dataset, steps_per_epoch=train_size // batch_size, epochs=10)
# Reduced run: 5 epochs x 1000 steps consumes exactly the 5000 batches taken here.
history = model.fit(dataset.take(5000), steps_per_epoch=1000, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Using the Model to Generate Text

In [None]:
def preprocess(texts):
  """One-hot encode a list of strings for the character-level model.

  The texts are converted to character ids with the fitted `tokenizer`,
  shifted down by one, and one-hot encoded with depth `max_id`.

  NOTE: a later cell in this notebook redefines `preprocess` with a different
  signature for the IMDB pipeline; re-run this cell before generating text.
  """
  char_ids = tokenizer.texts_to_sequences(texts)
  shifted_ids = np.array(char_ids) - 1
  return tf.one_hot(shifted_ids, max_id)

In [None]:
X_new = preprocess(["How are yo"])
Y_pred = np.argmax(model.predict(X_new), axis=-1)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

'u'

In [None]:
tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=40).numpy()

array([[0, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 2, 1, 2, 0, 2, 2, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1]])

In [None]:
def next_char(text, temperature=1):
  """Sample the character that follows `text` according to the global `model`.

  The probabilities predicted for the last time step are converted to
  log-probabilities and divided by `temperature` before sampling, so values
  below 1 sharpen the distribution and values above 1 flatten it.
  Returns the sampled character as decoded by the tokenizer.
  """
  encoded_text = preprocess([text])
  probas = model.predict(encoded_text)
  # keep only the last time step, as a (1, n_classes) slice
  last_step_proba = probas[0, -1:, :]
  scaled_logits = tf.math.log(last_step_proba) / temperature
  sampled_id = tf.random.categorical(scaled_logits, num_samples=1)
  # undo the -1 shift applied in preprocess() before decoding
  decoded = tokenizer.sequences_to_texts((sampled_id + 1).numpy())
  return decoded[0]

In [None]:
next_char("How are yo", temperature=1)

'u'

In [None]:
def complete_text(text, n_chars=50, temperature=1):
  """Extend `text` by `n_chars` sampled characters and return the result.

  Each new character is drawn with next_char() from everything generated so
  far, using the given sampling `temperature`.
  """
  completed = text
  for _step in range(n_chars):
    completed = completed + next_char(completed, temperature)
  return completed

In [None]:
print(complete_text("t", temperature=0.2))

the send the country's soul the lady and the corse.


In [None]:
print(complete_text("t", temperature=1))

treadzargns and inforce.
had thee heart for he is n


In [None]:
print(complete_text("t", temperature=2))

t, slimg
kitd catuoalewmme! by be't plucy, mygencza


### Stateful RNN

In [None]:
# Stateful variant with a batch size of 1: windows are consecutive and
# non-overlapping in their inputs (shift=n_steps) and are NOT shuffled.
# NOTE: the next cell rebuilds `dataset` with 32 parallel streams, replacing this one.
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.repeat().batch(1)
# inputs are all steps but the last; targets are the same windows shifted by one
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
)
dataset = dataset.prefetch(1)

In [None]:
batch_size = 32
# Split the training ids into batch_size consecutive parts and build one
# sequential window stream per part.
encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = []
for encoded_part in encoded_parts:
  dataset = tf.data.Dataset.from_tensor_slices(encoded_part)
  dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
  dataset = dataset.flat_map(lambda window: window.batch(window_length))
  datasets.append(dataset)
# zip draws one window from every part at each step; stacking them forms a batch
# whose i-th row always comes from the i-th part.
dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))
# split into (inputs, targets shifted by one step)
dataset = dataset.repeat().map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
)
dataset = dataset.prefetch(1)

In [None]:
# Same architecture as the stateless char-RNN, but with stateful=True on both
# GRU layers. The first layer declares a fixed batch size via batch_input_shape.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0.2,
                     batch_input_shape=[batch_size, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

In [None]:
class ResetStatesCallback(keras.callbacks.Callback):
  """Reset the model's recurrent states at the start of every epoch."""

  def on_epoch_begin(self, epoch, logs=None):
    # `logs` defaults to None to match the keras.callbacks.Callback signature
    self.model.reset_states()

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
# number of n_steps-long windows available in each of the batch_size parallel streams
steps_per_epoch = train_size // batch_size // n_steps
# the callback clears the recurrent states at the start of every epoch
history = model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=50,
                    callbacks=[ResetStatesCallback()])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# The stateful model was built for a fixed batch size; to use it with a
# different batch size we create a stateless copy with the same layer sizes.
stateless_model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

# Dropout is omitted here because it is only active during training.

In [None]:
# build the copy so its weights exist before they are overwritten
stateless_model.build(tf.TensorShape([None, None, max_id]))

# copy the trained weights over from the stateful model
stateless_model.set_weights(model.get_weights())
# rebind `model` so next_char()/complete_text() use the stateless copy
model = stateless_model

In [None]:
# hmm not so good
print(complete_text("t"))

t.

bittentio:
though he watch i mame fellow, leate


### Sentiment Analysis

In [None]:
# use IMDB dataset
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [None]:
# data comes preprocessed
X_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [None]:
word_index = keras.datasets.imdb.get_word_index()
# shift every word id by 3 to leave ids 0, 1 and 2 free for the special tokens below
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
  id_to_word[id_] = token
# decode the first 10 tokens of the first training review
" ".join([id_to_word[id_] for id_ in X_train[0][:10]])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


'<sos> this film was just brilliant casting location scenery story'

In [None]:
# can also get the txt data
import tensorflow_datasets as tfds

datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteGQSQL2/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteGQSQL2/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteGQSQL2/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
datasets.keys()

dict_keys(['test', 'train', 'unsupervised'])

In [None]:
train_size = info.splits["train"].num_examples
test_size = info.splits["test"].num_examples

In [None]:
train_size, test_size

(25000, 25000)

In [None]:
# Preview the first two training reviews (truncated to 200 characters) with their labels.
for X_batch, y_batch in datasets["train"].batch(2).take(1):
  for review, label in zip(X_batch.numpy(), y_batch.numpy()):
    print("Review:", review.decode("utf-8")[:200], "...")
    # both label captions now use the same "= <name>" format
    print("Label:", label, "= Positive" if label else "= Negative")
    print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



In [None]:
def preprocess(X_batch, y_batch):
  """Turn a batch of raw review strings into padded word tokens.

  Each review is cut with tf.strings.substr(…, 0, 300), `<br />` tags and every
  character other than letters and apostrophes are replaced by spaces, and the
  result is split on whitespace. The ragged token batch is padded to a dense
  tensor with b"<pad>". Labels are returned unchanged.

  NOTE: this shadows the character-level `preprocess(texts)` defined earlier
  in the notebook.
  """
  truncated = tf.strings.substr(X_batch, 0, 300)
  without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
  letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
  tokens = tf.strings.split(letters_only)
  padded_tokens = tokens.to_tensor(default_value=b"<pad>")
  return padded_tokens, y_batch

In [None]:
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

In [None]:
from collections import Counter

# Count how often every token (including b"<pad>") occurs in the preprocessed training set.
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
  for review in X_batch:
    # review.numpy() is an array of byte strings, one per (padded) token
    vocabulary.update(list(review.numpy()))

In [None]:
vocabulary.most_common()[:3]

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [None]:
len(vocabulary)

53893

In [None]:
vocab_size = 10000
# Keep only the vocab_size most frequent tokens, most frequent first.
# most_common(n) returns the top n directly instead of building and slicing
# the full frequency-sorted list.
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

In [None]:
# Map each kept token to its rank in the truncated vocabulary.
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"This movie was faaaaantasitic".split():
  # Use the dict default for unknown words: `get(word) or vocab_size` would also
  # replace the legitimate id 0 (the most frequent token) with vocab_size.
  print(word_to_id.get(word, vocab_size))

22
12
11
10000


In [None]:
# Build a lookup table mapping each kept token to its index in truncated_vocabulary.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
# tokens missing from the vocabulary are assigned to one of these extra buckets
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [None]:
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [None]:
def encode_words(X_batch, y_batch):
  """Replace every token in `X_batch` by its id from the vocabulary `table`; labels pass through."""
  word_ids = table.lookup(X_batch)
  return word_ids, y_batch

# repeat forever, batch by 32, tokenize/pad, then convert tokens to ids
train_set = datasets["train"].repeat().batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [None]:
for X_batch, y_batch in train_set.take(1):
  print(X_batch)
  print(y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [None]:
embed_size = 128
# Embedding rows cover the vocabulary plus the out-of-vocabulary buckets.
# mask_zero=True makes downstream layers ignore id 0; NOTE(review): this relies on
# b"<pad>" being the most frequent token and therefore mapped to id 0 - confirm
# against vocabulary.most_common().
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# train_set repeats forever, so steps_per_epoch defines one pass over the training reviews
history = model.fit(train_set, steps_per_epoch=train_size//32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Reusing Pretrained Embeddings

In [None]:
TFHUB_CACHE_DIR = os.path.join(os.curdir, "my_tfhub_cache")
os.environ["TFHUB_CACHE_DIR"] = TFHUB_CACHE_DIR

In [None]:
import tensorflow_hub as hub

# Sentence classifier built on a pretrained TF-Hub module: the hub layer takes raw
# strings (dtype=tf.string, scalar input shape) and is declared to output 50-dim vectors.
model = keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
                   dtype=tf.string, input_shape=[], output_shape=[50]),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])

In [None]:
for dirpath, dirnames, filenames in os.walk(TFHUB_CACHE_DIR):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

./my_tfhub_cache/82c4aaf4250ffb09088bd48368ee7fd00e5464fe.descriptor.txt
./my_tfhub_cache/82c4aaf4250ffb09088bd48368ee7fd00e5464fe/saved_model.pb
./my_tfhub_cache/82c4aaf4250ffb09088bd48368ee7fd00e5464fe/variables/variables.data-00000-of-00001
./my_tfhub_cache/82c4aaf4250ffb09088bd48368ee7fd00e5464fe/variables/variables.index
./my_tfhub_cache/82c4aaf4250ffb09088bd48368ee7fd00e5464fe/assets/tokens.txt


In [None]:
import tensorflow_datasets as tfds

# Reload the raw-text IMDB reviews; the hub model consumes strings directly,
# so no tokenization or id lookup is applied here.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples
batch_size = 32
train_set = datasets["train"].repeat().batch(batch_size).prefetch(1)
# the dataset repeats forever, so steps_per_epoch defines the length of an epoch
history = model.fit(train_set, steps_per_epoch=train_size // batch_size, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Automatic Translation

In [None]:
# example encoder-decoder model for translation

vocab_size = 100
embed_size = 10

In [None]:
!pip install tensorflow_addons

Collecting tensorflow_addons
[?25l  Downloading https://files.pythonhosted.org/packages/74/e3/56d2fe76f0bb7c88ed9b2a6a557e25e83e252aec08f13de34369cd850a0b/tensorflow_addons-0.12.1-cp37-cp37m-manylinux2010_x86_64.whl (703kB)
[K     |▌                               | 10kB 16.3MB/s eta 0:00:01[K     |█                               | 20kB 22.6MB/s eta 0:00:01[K     |█▍                              | 30kB 14.2MB/s eta 0:00:01[K     |█▉                              | 40kB 11.4MB/s eta 0:00:01[K     |██▎                             | 51kB 8.5MB/s eta 0:00:01[K     |██▉                             | 61kB 8.6MB/s eta 0:00:01[K     |███▎                            | 71kB 8.3MB/s eta 0:00:01[K     |███▊                            | 81kB 8.9MB/s eta 0:00:01[K     |████▏                           | 92kB 8.4MB/s eta 0:00:01[K     |████▋                           | 102kB 8.7MB/s eta 0:00:01[K     |█████▏                          | 112kB 8.7MB/s eta 0:00:01[K     |█████▋     

In [None]:
import tensorflow_addons as tfa

# Three inputs: source token ids, decoder (target-side) token ids, and the
# length of each target sequence.
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# a single embedding layer is shared by the encoder and the decoder
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

# return_state=True makes the LSTM also return its final states (state_h and state_c)
encoder = keras.layers.LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

# TrainingSampler tells the decoder at each step what the previous output was 
sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder_cell = keras.layers.LSTMCell(512)
# projects every decoder output to one score per vocabulary entry
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler,
                                                 output_layer=output_layer)
# the decoder starts from the encoder's final state
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings, initial_state=encoder_state, sequence_length=sequence_lengths
)
# turn the per-step scores into probabilities
Y_proba = tf.nn.softmax(final_outputs.rnn_output)


model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
    outputs=[Y_proba]
)

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

In [None]:
# Naive example on random data: 1000 source sequences of length 10 and
# 1000 target sequences of length 15, with token ids in [0, 100).
X = np.random.randint(100, size=10*1000).reshape(1000, 10)
Y = np.random.randint(100, size=15*1000).reshape(1000, 15)
# decoder input = target shifted right by one step, with a leading column of zeros
X_decoder = np.c_[np.zeros((1000, 1)), Y[:, :-1]]
# every target sequence has the full length of 15
seq_lengths = np.full([1000], 15)

history = model.fit([X, X_decoder, seq_lengths], Y, epochs=2)

Epoch 1/2
Epoch 2/2


### Bidirectional Recurrent Layers

In [None]:
# A plain GRU followed by a GRU wrapped in Bidirectional; the summary below shows
# the wrapped layer outputs 20 features per step (twice the wrapped GRU's 10 units).
model = keras.models.Sequential([
      keras.layers.GRU(10, return_sequences=True, input_shape=[None, 10]),
      keras.layers.Bidirectional(keras.layers.GRU(10, return_sequences=True))
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, None, 10)          660       
_________________________________________________________________
bidirectional (Bidirectional (None, None, 20)          1320      
Total params: 1,980
Trainable params: 1,980
Non-trainable params: 0
_________________________________________________________________


### Positional Encoding

In [None]:
# Positional encodings are added to the inputs of a Transformer so that the
# model keeps information about each token's position.
# One large table is precomputed and sliced down to the input's size at call time.
class PositionalEncoding(keras.layers.Layer):
  """Adds fixed sine/cosine positional encodings to its inputs.

  Args:
    max_steps: number of positions to precompute.
    max_dims: width of the encoding table; an odd value is rounded up by one.
    dtype: dtype of the layer and of the stored table.
  """
  def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
    super().__init__(dtype=dtype, **kwargs)
    # the table needs an even width so sine and cosine columns alternate
    if max_dims % 2 == 1:
      max_dims += 1
    positions, pair_index = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
    # one angle per (position, column pair); transposed to (max_steps, max_dims // 2)
    angles = (positions / 10000**(2 * pair_index / max_dims)).T
    table = np.empty((1, max_steps, max_dims))
    table[0, :, 0::2] = np.sin(angles)  # even columns
    table[0, :, 1::2] = np.cos(angles)  # odd columns
    self.positional_embedding = tf.constant(table.astype(self.dtype))
  def call(self, inputs):
    """Return `inputs` plus the table cropped to the inputs' last two dimensions."""
    input_shape = tf.shape(inputs)
    n_positions, n_features = input_shape[-2], input_shape[-1]
    return inputs + self.positional_embedding[:, :n_positions, :n_features]

In [None]:
max_steps = 201
max_dims = 512
pos_emb = PositionalEncoding(max_steps, max_dims)
PE = pos_emb(np.zeros((1, max_steps, max_dims), np.float32))[0].numpy()

In [None]:
# Input side of a simplified Transformer: shared token embeddings plus
# positional encodings for both the encoder and the decoder.

embed_size = 512; max_steps = 500; vocab_size = 10000
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
# one embedding layer is reused for both inputs
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)
# one positional-encoding layer is applied to both embedded inputs
positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)
encoder_in = positional_encoding(encoder_embeddings)
decoder_in = positional_encoding(decoder_embeddings)


In [None]:
# Encoder: six stacked self-attention layers (query and value are both Z).
Z = encoder_in
for N in range(6):
  Z = keras.layers.Attention(use_scale=True)([Z, Z])

encoder_outputs = Z
# Decoder: six blocks, each a causal self-attention followed by attention over
# the encoder outputs.
Z = decoder_in
for N in range(6):
  # with causal=True each position only attends to itself and earlier positions
  Z = keras.layers.Attention(use_scale=True, causal=True)([Z, Z])
  Z = keras.layers.Attention(use_scale=True)([Z, encoder_outputs])

# per-time-step softmax over the vocabulary
outputs = keras.layers.TimeDistributed(
    keras.layers.Dense(vocab_size, activation="softmax")
)(Z)

### Multi-Head Attention Implementation

In [None]:
K = keras.backend

class MultiHeadAttention(keras.layers.Layer):
    """Multi-head attention built on top of keras.layers.Attention.

    Called with a list `[query, value]` or `[query, value, key]`; when no key
    is given the value is used as the key. Query, value and key are each
    projected, split into `n_heads` heads, attended independently, then
    concatenated and projected back to the query's feature size.
    """
    def __init__(self, n_heads, causal=False, use_scale=False, **kwargs):
        # n_heads: number of attention heads
        # causal / use_scale: forwarded to the inner keras.layers.Attention in build()
        self.n_heads = n_heads
        self.causal = causal
        self.use_scale = use_scale
        super().__init__(**kwargs)
    def build(self, batch_input_shape):
        """Create the projection layers from the feature size of the first input."""
        self.dims = batch_input_shape[0][-1]
        self.q_dims, self.v_dims, self.k_dims = [self.dims // self.n_heads] * 3 # could be hyperparameters instead
        # Conv1D with kernel_size=1 and no bias applies the same linear map at every time step
        self.q_linear = keras.layers.Conv1D(self.n_heads * self.q_dims, kernel_size=1, use_bias=False)
        self.v_linear = keras.layers.Conv1D(self.n_heads * self.v_dims, kernel_size=1, use_bias=False)
        self.k_linear = keras.layers.Conv1D(self.n_heads * self.k_dims, kernel_size=1, use_bias=False)
        self.attention = keras.layers.Attention(causal=self.causal, use_scale=self.use_scale)
        self.out_linear = keras.layers.Conv1D(self.dims, kernel_size=1, use_bias=False)
        super().build(batch_input_shape)
    def _multi_head_linear(self, inputs, linear):
        """Project `inputs` with `linear` and fold the heads into the batch axis.

        Assumes `inputs` is rank 3, (batch, steps, features) - TODO confirm.
        Returns a tensor reshaped to (batch * n_heads, steps, -1).
        """
        # target shape: the leading axes of the inputs, then (n_heads, per-head size)
        shape = K.concatenate([K.shape(inputs)[:-1], [self.n_heads, -1]])
        projected = K.reshape(linear(inputs), shape)
        # move the head axis in front of the step axis
        perm = K.permute_dimensions(projected, [0, 2, 1, 3])
        return K.reshape(perm, [shape[0] * self.n_heads, shape[1], -1])
    def call(self, inputs):
        """Run multi-head attention on `[q, v]` or `[q, v, k]`."""
        q = inputs[0]
        v = inputs[1]
        k = inputs[2] if len(inputs) > 2 else v  # key defaults to the value
        shape = K.shape(q)
        q_proj = self._multi_head_linear(q, self.q_linear)
        v_proj = self._multi_head_linear(v, self.v_linear)
        k_proj = self._multi_head_linear(k, self.k_linear)
        # the inner Attention layer is called with [query, value, key]
        multi_attended = self.attention([q_proj, v_proj, k_proj])
        shape_attended = K.shape(multi_attended)
        # unfold the heads from the batch axis again
        reshaped_attended = K.reshape(multi_attended, [shape[0], self.n_heads, shape_attended[1], shape_attended[2]])
        # put the step axis back in front of the head axis, then concatenate the heads
        perm = K.permute_dimensions(reshaped_attended, [0, 2, 1, 3])
        concat = K.reshape(perm, [shape[0], shape_attended[1], -1])
        return self.out_linear(concat)

In [None]:
Q = np.random.rand(2, 50, 512)
V = np.random.rand(2, 80, 512)
multi_attn = MultiHeadAttention(8)
multi_attn([Q, V]).shape

TensorShape([2, 50, 512])

### Embedded Reber Grammars

Choose a particular embedded Reber grammar (such as the one represented on Jenny Orr's page), then train an RNN to identify whether a string respects that grammar or not. You will first need to write a function capable of generating a training batch containing about 50% strings that respect the grammar, and 50% that don't.

[link](https://www.willamette.edu/~gorr/classes/cs449/reber.html)

In [None]:
# Let's define the Reber grammar as a state-transition table: entry i lists the
# (production, next_state) choices available in state i; next_state None ends the string.
default_reber_grammar = [
    [("B", 1)],  # (state 0) =B=>(state 1)
    [("T", 2), ("P", 3)],  # (state 1) =T=>(state 2) or =P=>(state 3)
    [("S", 2), ("X", 4)],  # (state 2) =S=>(state 2) or =X=>(state 4)
    [("T", 3), ("V", 5)],  # (state 3) =T=>(state 3) or =V=>(state 5)
    [("X", 3), ("S", 6)],  # (state 4) =X=>(state 3) or =S=>(state 6)
    [("P", 4), ("V", 6)],  # (state 5) =P=>(state 4) or =V=>(state 6)
    [("E", None)] # (state 6) =E => (terminal state)
]

# Embedded grammar: a production may itself be a grammar (a list), which
# generate_string() expands recursively. The T/P chosen after the initial B
# must be matched by the same letter right before the final E.
embedded_reber_grammar = [
    [("B", 1)],
    [("T", 2), ("P", 3)],
    [(default_reber_grammar, 4)],  # after T: inner Reber string, then state 4
    [(default_reber_grammar, 5)],  # after P: inner Reber string, then state 5
    [("T", 6)],
    [("P", 6)],
    [("E", None)], 
]

def generate_string(grammar):
  """Generate one random string by walking `grammar` from state 0.

  `grammar` is a list indexed by state; each entry is a list of
  (production, next_state) choices, one of which is picked uniformly at
  random. A production that is itself a list is treated as a nested grammar
  and expanded recursively. The walk ends when next_state is None.
  """
  pieces = []
  next_state = 0
  while next_state is not None:
    transitions = grammar[next_state]
    symbol, next_state = transitions[np.random.randint(len(transitions))]
    # nested grammars are expanded into their own generated string
    pieces.append(generate_string(grammar=symbol) if isinstance(symbol, list) else symbol)
  return "".join(pieces)

In [None]:
# try generating
for _ in range(25):
    print(generate_string(default_reber_grammar), end=" ")

BPVPSE BPTVVE BPVVE BTXSE BPTTTTVPXTTVPXVVE BTXSE BTSSXSE BTSXXVVE BTXSE BPTVVE BTSSXXTVVE BPVVE BTXXVVE BPTTTVPSE BTSSSSXXTVVE BPTTTVPXVVE BTXSE BTXSE BTXSE BPVVE BTXXVPXVVE BTXSE BTXXTVPSE BTXXVPXTTTVVE BTXSE 

In [None]:
# generate with embedded reber grammar
for _ in range(25):
    print(generate_string(embedded_reber_grammar), end=" ")

BTBPVVETE BPBPTVPXVPXVVEPE BTBPVVETE BTBPTTVVETE BTBPTVPSETE BPBPTTVPSEPE BTBTXSETE BTBTSXSETE BPBTSSXSEPE BTBPTTVPSETE BTBTXXTTTTTVPSETE BPBPTVPSEPE BPBTXXVPXVVEPE BTBTXXVPXVPSETE BTBTSSSSSSSSXSETE BTBTSXXTTVPXTVPSETE BTBTSXSETE BTBPVVETE BPBPTTTTTTTTVVEPE BPBPVVEPE BTBTXXTTTVVETE BPBPVVEPE BTBTXSETE BTBTXSETE BPBTXXTTTTTVPSEPE 

In [None]:
## now generate corrupted strings
possible_chars = "BEPSTVX"

def generate_corrupted_string(grammar, chars=possible_chars):
  """Generate a string from `grammar`, then replace one character at random.

  The replacement is drawn from `chars` excluding the character originally at
  the chosen position, so the result always differs from the generated string
  in exactly one position.
  """
  valid_string = generate_string(grammar)
  position = np.random.randint(len(valid_string))
  original_char = valid_string[position]
  # candidates are sorted so the choice does not depend on set iteration order
  candidates = sorted(set(chars) - set(original_char))
  replacement = np.random.choice(candidates)
  prefix, suffix = valid_string[:position], valid_string[position + 1:]
  return prefix + replacement + suffix

In [None]:
for _ in range(25):
    print(generate_corrupted_string(embedded_reber_grammar), end=" ")

BPBPTEVEPE BPBPTVVEEE BTBPTVPXVVETB BTBPVVETS BTBTSTSETE BTBTSSSXXVPXBTVVETE BTBPTTTVPXTVVBTE BPBPVPSBPE BTBTXXTTTTVPPETE BPXPTVPSEPE BTBPVPXTTVVEVE BTBVTVVETE BTBTSSSXXVEETE BPBPTTBPXTTVVEPE BTVPVPXTTVVETE BTBTXXTSPXTVPSETE BTBPVPSETS BPBESSSXSEPE BTBPVPXTTVPSXTE BTBPXSETE SPBPTTVPXTVPXTTVPXTTVVEPE STBTSXSETE BVBPTTVPSETE BTBTSXSXXVVETE BTBPTTTTVPSEPE 

In [None]:
def string_to_ids(s, chars=possible_chars):
  """Return the index in `chars` of every character of `s`, in order."""
  return list(map(chars.index, s))

In [None]:
string_to_ids("BTTTXXXVETET")

[0, 4, 4, 4, 6, 6, 6, 5, 1, 4, 1, 4]

In [None]:
def generate_dataset(size):
  """Build a labelled dataset of `size` embedded-Reber strings.

  The first size // 2 samples are valid strings (label 1.0) and the rest are
  corrupted strings (label 0.0). Returns (X, y), where X is a ragged tensor
  of character ids and y an array with one [label] row per string.
  """
  n_good = size // 2
  n_bad = size - n_good
  good_strings = [string_to_ids(generate_string(embedded_reber_grammar))
                  for _ in range(n_good)]
  bad_strings = [string_to_ids(generate_corrupted_string(embedded_reber_grammar))
                 for _ in range(n_bad)]
  # strings have different lengths, hence a ragged tensor
  X = tf.ragged.constant(good_strings + bad_strings, ragged_rank=1)
  labels = [[1.]] * len(good_strings) + [[0.]] * len(bad_strings)
  y = np.array(labels)
  return X, y

In [None]:
X_train, y_train = generate_dataset(10000)
X_valid, y_valid = generate_dataset(2000)

In [None]:
X_train[0]

<tf.Tensor: shape=(11,), dtype=int32, numpy=array([0, 4, 0, 2, 4, 5, 2, 3, 1, 4, 1], dtype=int32)>

In [None]:
y_train[0]

array([1.])

In [None]:
embedding_size = 5

# Binary classifier over ragged (variable-length) sequences of character ids:
# embedding -> GRU -> sigmoid probability that the string respects the grammar.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=[None], dtype=tf.int32, ragged=True),
    keras.layers.Embedding(input_dim=len(possible_chars), output_dim=embedding_size),
    keras.layers.GRU(30),
    keras.layers.Dense(1, activation="sigmoid")
])
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
optimizer = keras.optimizers.SGD(learning_rate=0.02, momentum=0.95, nesterov=True)
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20


  "shape. This may consume a large amount of memory." % value)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Two strings that differ only in the second-to-last character (T vs P);
# both start with "BP".
test_strings = ["BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE",
                "BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE"]
X_test = tf.ragged.constant([string_to_ids(s) for s in test_strings], ragged_rank=1)

y_proba = model.predict(X_test)
print()
print("Estimated probability that these are Reber strings:")
for index, string in enumerate(test_strings):
    # y_proba has one [probability] row per string
    print("{}: {:.2f}%".format(string, 100 * y_proba[index][0]))


Estimated probability that these are Reber strings:
BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE: 0.34%
BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE: 99.95%


### Encoder-Decoder model to convert date string from "April 22, 2019" to "2019-04-22"

In [None]:
# randomly use days between 1000-01-01 and 9999-12-31
from datetime import date

months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def random_dates(n_dates):
  """Draw `n_dates` random dates between 1000-01-01 and 9999-12-31, inclusive.

  Returns a pair of lists (x, y): x holds the dates formatted like
  "April 22, 2019" (zero-padded day) and y the same dates in ISO format
  ("2019-04-22").
  """
  min_date = date(1000, 1, 1).toordinal()
  max_date = date(9999, 12, 31).toordinal()

  # randint's upper bound is exclusive, so add 1 to make max_date itself reachable
  ordinals = np.random.randint(max_date - min_date + 1, size=n_dates) + min_date
  dates = [date.fromordinal(ordinal) for ordinal in ordinals]

  # month names come from `months` rather than strftime("%B")
  x = [months[dt.month - 1] + " " + dt.strftime("%d, %Y") for dt in dates]
  y = [dt.isoformat() for dt in dates]
  return x, y

In [None]:
n_dates = 3
x_example, y_example = random_dates(n_dates)
print("{:25s}{:25s}".format("Input", "Target"))
print("-" * 50)
for idx in range(n_dates):
  print("{:25s}{:25s}".format(x_example[idx], y_example[idx]))

Input                    Target                   
--------------------------------------------------
September 05, 4900       4900-09-05               
October 14, 3704         3704-10-14               
September 12, 3268       3268-09-12               


In [None]:
# list of all possible characters in inputs
input_chars = "".join(sorted(set("".join(months) + "0123456789, ")))
input_chars

' ,0123456789ADFJMNOSabceghilmnoprstuvy'

In [None]:
# all possible output chars
output_chars = "0123456789-"

In [None]:
# convert a string to a list of character ids
def date_str_to_ids(date_str, chars=input_chars):
  """Return the index in `chars` of every character of `date_str`, in order."""
  return list(map(chars.index, date_str))

In [None]:
# input-side example: each id is the character's position in input_chars
date_str_to_ids(x_example[1], input_chars)

[18, 22, 34, 30, 21, 23, 32, 0, 3, 6, 1, 0, 5, 9, 2, 6]

In [None]:
# target-side example: each id is the character's position in output_chars ("-" is 10)
date_str_to_ids(y_example[0], output_chars)

[4, 9, 0, 0, 10, 0, 9, 10, 0, 5]

In [None]:
def prepare_date_strs(date_strs, chars=input_chars):
  """Encode a batch of date strings as one dense integer tensor.

  Each string is mapped to character ids with `date_str_to_ids`, every id is
  shifted up by 1, and the ragged batch is converted to a dense tensor
  (shorter rows are filled with `to_tensor`'s default value).

  Args:
    date_strs: list of date strings.
    chars: alphabet used to look up the character ids.
  """
  id_rows = tf.ragged.constant(
      [date_str_to_ids(date_str, chars) for date_str in date_strs],
      ragged_rank=1)
  # shift by 1 so that no real character uses id 0
  shifted_rows = id_rows + 1
  return shifted_rows.to_tensor()

def create_dataset(n_dates):
  """Sample `n_dates` random dates and return their `(inputs, targets)` tensors.

  Inputs are encoded with `input_chars`, targets with `output_chars`.
  """
  input_strs, target_strs = random_dates(n_dates)
  inputs = prepare_date_strs(input_strs, input_chars)
  targets = prepare_date_strs(target_strs, output_chars)
  return inputs, targets

In [None]:
# train / validation / test sets, each sampled by its own call to create_dataset
X_train, Y_train = create_dataset(10000)
X_valid, Y_valid = create_dataset(2000)
X_test, Y_test = create_dataset(2000)

In [None]:
# first encoded target; ids are shifted by 1 in prepare_date_strs, so "-" appears as 11
Y_train[0]

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([ 8,  8,  2,  1, 11,  1,  7, 11,  1, 10], dtype=int32)>

In [None]:
# basic seq2seq model
embedding_size = 32
max_output_length = Y_train.shape[1]  # number of time steps in each target sequence

# encoder: embeds the input character ids and feeds them to an LSTM
encoder = keras.models.Sequential([
      # + 1 because prepare_date_strs shifts every character id up by 1
      keras.layers.Embedding(input_dim=len(input_chars) + 1,
                             output_dim=embedding_size,
                             input_shape=[None]),
      keras.layers.LSTM(128)
])

# decoder: emits, at every time step, a softmax over the (shifted) output ids
decoder = keras.models.Sequential([
      keras.layers.LSTM(128, return_sequences=True),
      keras.layers.Dense(len(output_chars) + 1, activation="softmax")
])

# the encoder's output vector is repeated once per output time step
# and that repeated sequence is the decoder's input
model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(max_output_length),
    decoder
])

optimizer = keras.optimizers.Nadam()
# sparse loss: the targets are integer ids, not one-hot vectors
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit(X_train, Y_train, epochs=20, validation_data=(X_valid, Y_valid))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# convert output ids back to strings
def ids_to_date_strs(ids, chars=output_chars):
  """Decode sequences of (shifted) character ids into strings.

  Id 0 is rendered as "?"; id `i >= 1` is rendered as `chars[i - 1]`.

  Args:
    ids: iterable of id sequences.
    chars: alphabet the ids refer to.

  Returns:
    A list with one decoded string per sequence.
  """
  symbols = "?" + chars
  decoded = []
  for sequence in ids:
    decoded.append("".join(symbols[index] for index in sequence))
  return decoded

In [None]:
# try the trained model on two hand-written dates: take the most likely id
# at every time step and decode the ids back to text
X_new = prepare_date_strs(["September 17, 2009", "July 14, 1789"])

ids = np.argmax(model.predict(X_new), axis=-1)
for date_str in ids_to_date_strs(ids):
  print(date_str)

2009-09-17
1789-07-14


In [None]:
# same check with a batch whose longest string is shorter than before; the months
# in the printed output are wrong
# NOTE(review): presumably because this batch is narrower than the training inputs,
# which the padding helper defined further down addresses
X_new = prepare_date_strs(["May 02, 2020", "July 14, 1789"])

ids = np.argmax(model.predict(X_new), axis=-1)
for date_str in ids_to_date_strs(ids):
  print(date_str)

2020-12-02
1789-09-14


In [None]:
# width of the training inputs; new inputs will be padded up to this length
max_input_length = X_train.shape[1]
max_input_length

18

In [None]:
def prepare_date_strs_padded(date_strs):
  """Encode `date_strs` and right-pad the batch with zeros to `max_input_length`.

  A batch that is already at least `max_input_length` wide is returned unchanged.
  """
  X = prepare_date_strs(date_strs)
  n_missing = max_input_length - X.shape[1]
  if n_missing > 0:
    # pad only the time axis, and only on the right
    X = tf.pad(X, [[0, 0], [0, n_missing]])
  return X

def convert_date_strs(date_strs):
  """Convert date strings with the current global `model`.

  The inputs are encoded and padded, the most likely id is taken at every
  output time step, and the ids are decoded back to strings.
  """
  probas = model.predict(prepare_date_strs_padded(date_strs))
  return ids_to_date_strs(np.argmax(probas, axis=-1))

In [None]:
# same two dates as the failing example above, this time padded to max_input_length
convert_date_strs(["May 02, 2020", "July 14, 1789"])


['2020-05-02', '1789-07-14']

#### Second version. Feeding the shifted targets to the decoder

Instead of feeding the decoder a simple repetition of the encoder's output vector, we can feed it the target sequence, shifted by one time step to the right. This way, at each time step the decoder will know what the previous target character was. This should help us tackle more complex sequence-to-sequence problems.

Since the first output character of each target sequence has no previous character, we will need a new token to represent the start-of-sequence (sos).

During inference, we won't know the target, so what will we feed the decoder? We can just predict one character at a time, starting with an sos token, then feeding the decoder all the characters that were predicted so far (we will look at this in more detail later in this notebook).

But if the decoder's LSTM expects to get the previous target as input at each step, how shall we pass it the vector output by the encoder? Well, one option is to ignore the output vector, and instead use the encoder's LSTM state as the initial state of the decoder's LSTM (which requires that the encoder's LSTM have the same number of units as the decoder's LSTM).

In [None]:
# build the decoder inputs by shifting the target sequences one step to the right
# start-of-sequence id: one past the largest shifted character id
sos_id = len(output_chars) + 1

def shifted_output_sequences(Y):
  """Return `Y` shifted right by one step, with `sos_id` as the first column.

  The last column of `Y` is dropped, so the result has the same shape as `Y`.
  """
  n_sequences = len(Y)
  sos_column = tf.fill(dims=(n_sequences, 1), value=sos_id)
  all_but_last = Y[:, :-1]
  return tf.concat([sos_column, all_but_last], axis=1)

In [None]:
# decoder inputs (targets shifted right, starting with sos) for each split
X_train_decoder = shifted_output_sequences(Y_train)
X_valid_decoder = shifted_output_sequences(Y_valid)
X_test_decoder = shifted_output_sequences(Y_test)

In [None]:
# every row starts with sos_id (12 in the output below)
X_train_decoder

<tf.Tensor: shape=(10000, 10), dtype=int32, numpy=
array([[12,  8,  8, ...,  7, 11,  1],
       [12,  3, 10, ...,  3, 11,  3],
       [12,  5,  1, ...,  1, 11,  3],
       ...,
       [12,  5,  7, ...,  5, 11,  3],
       [12,  9,  5, ...,  8, 11,  1],
       [12,  9,  6, ...,  3, 11,  3]], dtype=int32)>

In [None]:
# using functional API
encoder_embedding_size = 32
decoder_embedding_size = 32
lstm_units = 128

# encoder: only the final LSTM states (h, c) are kept; the output itself is discarded
encoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)
encoder_embedding = keras.layers.Embedding(
    input_dim=len(input_chars) + 1,
    output_dim=encoder_embedding_size)(encoder_input)
_, encoder_state_h, encoder_state_c = keras.layers.LSTM(
    lstm_units, return_state=True)(encoder_embedding)
encoder_state = [encoder_state_h, encoder_state_c]

# decoder: receives the shifted targets; + 2 covers id 0, the shifted
# character ids and the extra sos id
decoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)
decoder_embedding = keras.layers.Embedding(
    input_dim=len(output_chars) + 2,
    output_dim=decoder_embedding_size)(decoder_input)
# the decoder LSTM starts from the encoder's final states
decoder_lstm_output = keras.layers.LSTM(lstm_units, return_sequences=True)(
    decoder_embedding, initial_state=encoder_state)
# + 1 only: the sos id is an input token and has no output class of its own
decoder_output = keras.layers.Dense(len(output_chars) + 1, activation="softmax")(decoder_lstm_output)

model = keras.models.Model(inputs=[encoder_input, decoder_input],
                           outputs=[decoder_output])

optimizer = keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit([X_train, X_train_decoder], Y_train, epochs=10,
                    validation_data=([X_valid, X_valid_decoder], Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# use the model to predict, one output character at a time
sos_id = len(output_chars) + 1

def predict_date_strs(date_strs):
  """Decode `date_strs` greedily with the current global two-input `model`.

  The decoder input starts as a single sos column. At each step it is
  zero-padded to `max_output_length`, the model is run on the whole batch,
  and the most likely id at the current position is appended.

  Returns:
    A list with one decoded string per input (the sos column is dropped).
  """
  X = prepare_date_strs_padded(date_strs)
  decoded = tf.fill(dims=(len(X), 1), value=sos_id)
  for step in range(max_output_length):
    # the decoder input has a fixed width; not-yet-predicted positions are zeros
    n_missing = max_output_length - decoded.shape[1]
    decoder_input = tf.pad(decoded, [[0, 0], [0, n_missing]])
    probas = model.predict([X, decoder_input])
    # keep only the prediction for the current position
    next_ids = tf.argmax(probas[:, step:step + 1], axis=-1, output_type=tf.int32)
    decoded = tf.concat([decoded, next_ids], axis=1)
  return ids_to_date_strs(decoded[:, 1:])

In [None]:
# step-by-step decoding with the shifted-target model trained above
predict_date_strs(["July 14, 1789", "May 01, 2020"])

['1789-07-14', '2020-05-01']

In [None]:
# TensorFlow Addons provides the tfa.seq2seq classes used in the next cell
# NOTE(review): the version is not pinned, so a re-run may install a different release
!pip install tensorflow_addons



In [None]:
# using TF-Addon's seq2seq implementation

import tensorflow_addons as tfa

encoder_embedding_size = 32
decoder_embedding_size = 32
units = 128

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
# NOTE(review): sequence_lengths is never used in the visible part of the notebook
# (it is neither passed to the decoder nor listed in the model inputs)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

encoder_embeddings = keras.layers.Embedding(
    len(input_chars) + 1, encoder_embedding_size)(encoder_inputs)

# the embedding layer is kept in a variable because the inference sampler
# built in a later cell reuses it as its embedding_fn
decoder_embedding_layer = keras.layers.Embedding(
    len(output_chars) + 2, decoder_embedding_size)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

encoder = keras.layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

# decoder_cell and output_layer are also reused by the inference decoder later on
decoder_cell = keras.layers.LSTMCell(units)
# no activation here: the softmax is applied separately below
output_layer = keras.layers.Dense(len(output_chars) + 1)

decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler, output_layer=output_layer)

# the decoder starts from the encoder's final states
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings,
    initial_state=encoder_state)

Y_proba = keras.layers.Activation("softmax")(final_outputs.rnn_output)

# this rebinds the global `model`, which predict_date_strs reads at call time
model = keras.models.Model(inputs=[encoder_inputs, decoder_inputs],
                           outputs=[Y_proba])
optimizer = keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit([X_train, X_train_decoder], Y_train, epochs=15,
                    validation_data=([X_valid, X_valid_decoder], Y_valid))


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# predict_date_strs reads the global `model`, which is now the TF-Addons model
predict_date_strs(["July 14, 1789", "May 01, 2020"])

['1789-07-14', '2020-05-01']

In [None]:
# faster prediction using GreedyEmbeddingSampler
# the inference decoder reuses decoder_embedding_layer, decoder_cell and output_layer
# from the training model, so it uses the weights trained above
inference_sampler = tfa.seq2seq.sampler.GreedyEmbeddingSampler(
    embedding_fn=decoder_embedding_layer)
inference_decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
    decoder_cell, inference_sampler, output_layer=output_layer,
    maximum_iterations=max_output_length)
# one sos token per sequence in the batch
batch_size = tf.shape(encoder_inputs)[:1]
start_tokens = tf.fill(dims=batch_size, value=sos_id)
# NOTE(review): end_token=0 is the id that no real output character uses after
# the +1 shift — confirm this is the intended stop condition
final_outputs, final_state, final_sequence_lengths = inference_decoder(
    start_tokens,
    initial_state=encoder_state,
    start_tokens = start_tokens,
    end_token=0
)

# takes only the encoder inputs and returns the sampled ids directly
inference_model = keras.models.Model(inputs=[encoder_inputs],
                                     outputs=[final_outputs.sample_id])

In [None]:
def fast_predict_date_strs(date_strs):
    """Convert date strings with a single call to `inference_model`.

    The inputs are encoded and padded, and the ids returned by the model
    are decoded back to strings.
    """
    padded_inputs = prepare_date_strs_padded(date_strs)
    return ids_to_date_strs(inference_model.predict(padded_inputs))

In [None]:
# same two dates as before, decoded with one inference_model.predict call
fast_predict_date_strs(["July 14, 1789", "May 01, 2020"])

['1789-07-14', '2020-05-01']

### Using the most recent language model (GPT)

In [3]:
# transformers provides the pretrained GPT model and tokenizer used below
# NOTE(review): the version is not pinned; the log below shows 4.4.1 being installed
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/d8/5144b0712f7f82229a8da5983a8fbb8d30cec5fbd5f8d12ffe1854dcea67/transformers-4.4.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 5.4MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 21.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 24.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=ead23

In [5]:
### load pretrained model. using OpenAI's GPT
from transformers import TFOpenAIGPTLMHeadModel

# NOTE: this rebinds the global `model`; convert_date_strs and predict_date_strs
# read `model` at call time, so they no longer use a date model after this cell
model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-gpt")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=656.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466312920.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing TFOpenAIGPTLMHeadModel.

All the layers of TFOpenAIGPTLMHeadModel were initialized from the model checkpoint at openai-gpt.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFOpenAIGPTLMHeadModel for predictions without further training.


In [6]:
# NOTE(review): presumably optional dependencies of the GPT tokenizer loaded in
# the next cell — confirm; versions are not pinned (the log below shows ftfy 5.9)
!pip install spaCy
!pip install ftfy

Collecting ftfy
[?25l  Downloading https://files.pythonhosted.org/packages/04/06/e5c80e2e0f979628d47345efba51f7ba386fe95963b11c594209085f5a9b/ftfy-5.9.tar.gz (66kB)
[K     |████████████████████████████████| 71kB 3.4MB/s 
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
  Created wheel for ftfy: filename=ftfy-5.9-cp37-none-any.whl size=46451 sha256=489b197e07e1ebd29da4ad6e6c468a7bac36348a93c3b5598b13dea87a9c11cc
  Stored in directory: /root/.cache/pip/wheels/5e/2e/f0/b07196e8c929114998f0316894a61c752b63bfa3fdd50d2fc3
Successfully built ftfy
Installing collected packages: ftfy
Successfully installed ftfy-5.9


In [7]:
# specialized tokenizer for GPT
from transformers import OpenAIGPTTokenizer

# NOTE: this rebinds `tokenizer`, which held the character-level Keras Tokenizer
# created in the Char-RNN section at the top of the notebook
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=815973.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=458495.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1272610.0, style=ProgressStyle(descript…




In [8]:
# encode the prompt text with the tokenizer; the result is a batch of one
# sequence of token ids, returned as a TensorFlow tensor
prompt_text = "Is this a dagger which I see before me, the handle"
encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False,
                                  return_tensors="tf")
encoded_prompt

<tf.Tensor: shape=(1, 12), dtype=int32, numpy=
array([[ 544,  616,  246, 6180,  984,  249,  788,  781,  510,  240,  481,
        2861]], dtype=int32)>

In [11]:
# generate 5 different sentences, each starting with the prompt text, followed by 40 additional tokens
num_sequences = 5
length = 40

# max_length counts the prompt tokens too, hence the prompt length is added
# (the output below has 12 + 40 = 52 tokens per sequence)
# NOTE(review): top_k=0 presumably disables top-k filtering so that only top_p
# (nucleus) sampling applies, and repetition_penalty=1.0 presumably means no
# penalty — confirm against the transformers generate() documentation
generated_sequences = model.generate(
    input_ids=encoded_prompt,
    do_sample=True,
    max_length=length + len(encoded_prompt[0]),
    temperature=0.7,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    num_return_sequences=num_sequences,
)

generated_sequences

<tf.Tensor: shape=(5, 52), dtype=int32, numpy=
array([[  544,   616,   246,  6180,   984,   249,   788,   781,   510,
          240,   481,  2861,   498,   984,   249,  1578,   500,   547,
          828,   240,   249,   699,   507,   544,   595,   239,   507,
          544,   246,  6180,   240,   481,  1164,  6180,   249,   604,
         1295,   781,   240,   488,  1340,   249,   699,   507,   595,
          239,   249,   699,   525,   507,   544,   595],
       [  544,   616,   246,  6180,   984,   249,   788,   781,   510,
          240,   481,  2861,   544,  2203,   240,   488,   481, 10607,
          544,  1374,   239, 40477,   249,   890,   609,   240,   488,
          655,  4276,   246,   762,   240,   568,   487,   544,   595,
          246,   762,   240,   562,   487,   544,  2482,   500,   246,
         1301,  5625,   240,   246,  2910,  1205,   609],
       [  544,   616,   246,  6180,   984,   249,   788,   781,   510,
          240,   481,  2861,   498,   984,   544,   246,

In [12]:
# decode each generated sequence (prompt + continuation) back to text and
# print it, followed by an 80-dash separator line
for sequence in generated_sequences:
  text = tokenizer.decode(sequence, clean_up_tokenization_spaces=True)
  print(text)
  print("-" * 80)

is this a dagger which i see before me, the handle of which i hold in my hand, i know it is not. it is a dagger, the same dagger i have seen before, and yet i know it not. i know that it is not
--------------------------------------------------------------------------------
is this a dagger which i see before me, the handle is empty, and the hilt is gone. 
 i look up, and there stands a man, but he is not a man, for he is dressed in a black robe, a hood pulled up
--------------------------------------------------------------------------------
is this a dagger which i see before me, the handle of which is a curved sword, and in the middle is the blade of a sword which is made of the same metal. " 
 " yes, " said gandalf, " it is a sword which
--------------------------------------------------------------------------------
is this a dagger which i see before me, the handle of which is in fact a dagger and which, when i look, is in fact a sword. 
 "'here is my sword,'" i say, looking at t