In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time

In [None]:
# Download the GPT-3 175B sample dump; the returned local path is what the
# later cell opens and reads.
path_to_file = tf.keras.utils.get_file(
    fname='175b_samples.jsonl',
    origin='https://raw.githubusercontent.com/openai/gpt-3/master/175b_samples.jsonl',
)

Downloading data from https://raw.githubusercontent.com/openai/gpt-3/master/175b_samples.jsonl


In [None]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): no other cell visible in this notebook reads from or writes to
# /content/drive (checkpoints go to ./training_checkpoints) — confirm whether
# this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the downloaded file as bytes and decode it as UTF-8. The `with` block
# closes the file handle as soon as the read finishes; the previous bare
# `open(...).read()` left it open until garbage collection.
with open(path_to_file, 'rb') as samples_file:
  text = samples_file.read().decode(encoding='utf-8')

# NOTE(review): the file is JSON Lines and is used here undecoded, so the
# corpus keeps its JSON quoting (a leading `"` and literal backslash + `n`
# pairs instead of real newlines). Consider decoding each line with
# `json.loads` before training — confirm the intended corpus format.
print(f'Length of text: {len(text)} characters')

Length of text: 4061374 characters


In [None]:
# Peek at the first 250 characters of the corpus.
print(text[:250])

"Glacier Ridge Christian School\n\nGlacier Ridge Christian School is a private Christian school in Johnstown, Ohio. It was founded in the fall of 1999 by Gary and Tammy Smith.\n\nThe school started with just 13 students in grades 5-8. By the end of t


In [None]:
# Character-level vocabulary: the distinct characters of the corpus, sorted.
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))

674 unique characters


In [None]:
# Two toy strings used to demonstrate the character <-> ID round trip.
example_texts = ['abcdefg', 'xyz']

# Split each string into its UTF-8 characters; the result is ragged because
# the two strings have different lengths.
chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [None]:
# Lookup layer that maps character strings to integer IDs, using the corpus
# vocabulary built above. mask_token=None: no mask token is configured.
ids_from_chars = preprocessing.StringLookup(
    vocabulary=list(vocab), mask_token=None)

In [None]:
# Convert the example characters to their integer IDs.
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[67, 68, 69, 70, 71, 72, 73], [90, 91, 92]]>

In [None]:
# Inverse lookup (ID -> character). It is built from the forward layer's own
# vocabulary so both directions agree on every ID.
chars_from_ids = preprocessing.StringLookup(
    invert=True,
    mask_token=None,
    vocabulary=ids_from_chars.get_vocabulary())

In [None]:
# Round trip: map the example IDs back to characters (rebinds `chars`).
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [None]:
# Join the characters of each row back into a single byte string.
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [None]:
def text_from_ids(ids):
  """Map character IDs back to characters and join them along the last axis."""
  decoded_chars = chars_from_ids(ids)
  return tf.strings.reduce_join(decoded_chars, axis=-1)

In [None]:
# Encode the whole corpus as one flat vector of character IDs.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(4061374,), dtype=int64, numpy=array([ 4, 41, 78, ..., 14,  4,  1])>

In [None]:
# Dataset that yields the corpus one character ID at a time.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [None]:
# Show the first ten characters of the stream.
# The loop variable is deliberately not named `ids`: cell variables are
# notebook globals, so reusing that name overwrote the example `ids` tensor
# from the lookup demo and changed what re-running those earlier cells did.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

"
G
l
a
c
i
e
r
 
R


In [None]:
# Number of input characters per training example.
seq_length = 100
# How many chunks of seq_length + 1 characters fit in the corpus.
# NOTE(review): not referenced by any other cell visible in this notebook.
examples_per_epoch = len(text)//(seq_length+1)

In [None]:
# Group the ID stream into chunks of seq_length + 1 IDs; the extra ID lets
# each chunk be split into an input and a one-step-shifted target later.
# drop_remainder=True discards a final chunk that would be shorter.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first chunk as individual characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'"' b'G' b'l' b'a' b'c' b'i' b'e' b'r' b' ' b'R' b'i' b'd' b'g' b'e'
 b' ' b'C' b'h' b'r' b'i' b's' b't' b'i' b'a' b'n' b' ' b'S' b'c' b'h'
 b'o' b'o' b'l' b'\\' b'n' b'\\' b'n' b'G' b'l' b'a' b'c' b'i' b'e' b'r'
 b' ' b'R' b'i' b'd' b'g' b'e' b' ' b'C' b'h' b'r' b'i' b's' b't' b'i'
 b'a' b'n' b' ' b'S' b'c' b'h' b'o' b'o' b'l' b' ' b'i' b's' b' ' b'a'
 b' ' b'p' b'r' b'i' b'v' b'a' b't' b'e' b' ' b'C' b'h' b'r' b'i' b's'
 b't' b'i' b'a' b'n' b' ' b's' b'c' b'h' b'o' b'o' b'l' b' ' b'i' b'n'
 b' ' b'J' b'o'], shape=(101,), dtype=string)


In [None]:
# Show the first five chunks joined back into byte strings.
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b'"Glacier Ridge Christian School\\n\\nGlacier Ridge Christian School is a private Christian school in Jo'
b'hnstown, Ohio. It was founded in the fall of 1999 by Gary and Tammy Smith.\\n\\nThe school started with'
b' just 13 students in grades 5-8. By the end of the first year, the school had grown to 65 students in'
b' those same grades. As the school has continued to grow, so has the number of teachers.\\n\\nBy the yea'
b'r 2009, Glacier Ridge had expanded to offer a preschool through 12th grade education, and the school '


In [None]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one step.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so target[i] is the element that
    follows input[i].
    """
    return sequence[:-1], sequence[1:]

In [None]:
# Sanity check of the split on a plain Python list of characters.
split_input_target(list("Ded Security"))

(['D', 'e', 'd', ' ', 'S', 'e', 'c', 'u', 'r', 'i', 't'],
 ['e', 'd', ' ', 'S', 'e', 'c', 'u', 'r', 'i', 't', 'y'])

In [None]:
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [None]:
# Show one (input, target) pair as text; the target is the input shifted
# left by one character.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'"Glacier Ridge Christian School\\n\\nGlacier Ridge Christian School is a private Christian school in J'
Target: b'Glacier Ridge Christian School\\n\\nGlacier Ridge Christian School is a private Christian school in Jo'


In [None]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64

# Size of the shuffle buffer, in sequences.
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from `dataset`
# itself. The previous `dataset = dataset.shuffle(...).batch(...)` consumed its
# own output, so running this cell a second time batched already-batched data.
# Starting from `sequences` gives the same result on every run.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

## Build The Model

This section defines the model as a `keras.Model` subclass (for details, see [Making new Layers and Models via subclassing](https://www.tensorflow.org/guide/keras/custom_layers_and_models)).

This model has three layers:

* `tf.keras.layers.Embedding`: The input layer. A trainable lookup table that maps each character ID to a vector with `embedding_dim` dimensions.
* `tf.keras.layers.GRU`: A type of RNN with size `units=rnn_units`. (You can also use an LSTM layer here.)
* `tf.keras.layers.Dense`: The output layer, with `vocab_size` outputs. It outputs one logit for each character in the vocabulary. These logits are the model's unnormalized log-likelihoods for each character.

In [None]:
# Number of IDs the lookup layer can emit. This is taken from the layer, not
# from `len(vocab)`: the layer's vocabulary has one entry more than `vocab`
# (the "[UNK]" token), and the embedding input size and the number of output
# logits must cover every ID. `len(vocab)` was one too small.
vocab_size = len(ids_from_chars.get_vocabulary())

# Size of each character embedding vector.
embedding_dim = 256

# Number of GRU units.
rnn_units = 1024

In [None]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense logits."""

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the three layers.

    Args:
      vocab_size: Number of distinct character IDs; used as the embedding
        input size and as the number of output logits.
      embedding_dim: Size of each character embedding vector.
      rnn_units: Number of GRU units.
    """
    # Fix: the original `super().__init__(self)` passed the instance a second
    # time as a stray positional argument to `tf.keras.Model.__init__`.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences: emit an output for every time step.
    # return_state: also return the final state so generation can carry it
    # over from one call to the next.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    # No activation: the layer outputs raw logits.
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of character IDs.

    Args:
      inputs: Integer character IDs.
      states: Optional GRU state to start from; when None the GRU's initial
        state is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [None]:
# The vocabulary size is taken from the lookup layer so that the model covers
# every ID the layer can produce.
model = MyModel(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
# Run the untrained model on one batch to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 675) # (batch_size, sequence_length, vocab_size)


In [None]:
# Layer-by-layer parameter counts.
model.summary()

Model: "my_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     multiple                  172800    
                                                                 
 gru_2 (GRU)                 multiple                  3938304   
                                                                 
 dense_2 (Dense)             multiple                  691875    
                                                                 
Total params: 4,802,979
Trainable params: 4,802,979
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Sample one character ID per time step from the logits of the first example
# in the batch, then drop the trailing num_samples axis.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()

In [None]:
# One sampled character ID per time step of the first example.
sampled_indices

array([629,  93, 350, 508, 143, 148,  71, 242, 656, 366, 448, 260, 406,
       507, 319, 366,  30,  54, 583,  68, 202, 494, 568, 669, 587, 435,
       297, 149, 430, 171, 673,  42, 651, 463, 227, 237,  43, 563, 599,
        47, 549, 237,  30, 590,  16, 213, 272, 639, 243,  71, 334, 672,
       408, 630, 468, 151, 232, 661, 388, 468, 441, 206, 484, 441,  56,
       313, 569,  28, 426, 155, 175, 188,  88, 510, 429, 567, 486, 421,
       268,  95, 170, 148, 344, 194,  19, 599, 194, 413, 505,  94, 648,
       204, 186,  78, 469, 315, 579, 237, 102, 324])

In [None]:
# Decode the first input sequence and the IDs sampled from the untrained model.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b'f it you want to read. When I read the story in the paper, I didn\xe2\x80\x99t try to understand the process by'

Next Char Predictions:
 b'\xe8\xbf\x91{\xe5\x8d\x88\xe6\xb0\xb8\xc3\xad\xc3\xb3e\xe2\x86\x91\xe9\x9b\xbb\xe5\x92\x8c\xe6\x87\x82\xe3\x82\xaa\xe5\xae\xb9\xe6\xaf\x94\xe4\xbd\xbf\xe5\x92\x8c<T\xe8\x83\xbdb\xd0\xa1\xe6\x9d\x82\xe7\xb6\xb2\xef\xbc\x88\xe8\x8b\xa5\xe5\xbc\xb7\xe4\xba\x86\xc3\xb4\xe5\xbb\x8a\xc5\x81\xef\xbf\xbdH\xe9\x98\xbb\xe6\x8d\x9f\xe2\x80\x8e\xe2\x80\xa6I\xe7\xae\xa1\xe8\xa6\xa7M\xe7\xa0\xb4\xe2\x80\xa6<\xe8\x99\x9a.\xd0\xb9\xe3\x83\xa9\xe9\x81\x93\xe2\x86\x92e\xe5\x86\x99\xef\xbc\x9a\xe5\xaf\xbc\xe8\xbf\x98\xe6\x8f\x92\xc3\xb6\xe2\x80\x99\xe9\xa2\x86\xe5\xa4\xa7\xe6\x8f\x92\xe6\x80\xa7\xd0\xb1\xe6\x98\xaf\xe6\x80\xa7V\xe4\xbc\x97\xe7\xba\xa7:\xe5\xba\x8f\xc3\xbb\xc5\x8d\xc5\xbav\xe6\xb2\xa1\xe5\xba\xa6\xe7\xb4\xaf\xe6\x9c\x80\xe5\xb8\xb8\xe3\x83\x8a}\xc4\xbe\xc3\xb3\xe5\x8a\xa8\xcc\x811\xe8\xa6\xa7\xcc\x81\xe5\xb1\x81\xe6\xae\xb5|\xe9\x97\xae\xd0\

In [None]:
# from_logits=True because the model's Dense layer has no activation and
# therefore outputs raw logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Loss of the untrained model on the example batch.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 100, 675)  # (batch_size, sequence_length, vocab_size)
Mean loss:         6.5158863


In [None]:
# Exponential of the mean loss; for the untrained model the recorded value is
# close to the number of output classes.
tf.exp(mean_loss).numpy()

675.79266

In [None]:
# Configure training with the Adam optimizer and the loss defined above.
model.compile(optimizer='adam', loss=loss)

In [None]:
# Directory where the checkpoints are written.
checkpoint_dir = './training_checkpoints'

# Checkpoint file name template; "{epoch}" is left as a placeholder in the
# path handed to the callback.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit; save_weights_only=True requests weights only.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Number of passes over the training dataset.
EPOCHS = 20

In [None]:
# Train the model, with the checkpoint callback attached.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Generate text

In [None]:
class OneStep(tf.keras.Model):
  """Wraps a trained model so that each call samples one next character."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model and lookup layers and build the "[UNK]" logit mask.

    Args:
      model: Model called with `inputs`, `states` and `return_state=True`.
      chars_from_ids: Lookup layer mapping IDs to characters.
      ids_from_chars: Lookup layer mapping characters to IDs.
      temperature: Divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive mask over the vocabulary: -inf at the "[UNK]" ID and zero
    # everywhere else, so "[UNK]" can never be sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    neg_inf_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf')] * len(unk_ids),
        dense_shape=[vocab_len])
    self.prediction_mask = tf.sparse.to_dense(neg_inf_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: String tensor holding the text to continue.
      states: Optional model state returned by a previous call.

    Returns:
      A `(predicted_chars, states)` tuple.
    """
    # Strings -> characters -> dense tensor of IDs.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep only the last time step, apply the temperature, then add the mask
    # that rules out "[UNK]".
    last_step_logits = logits[:, -1, :]
    last_step_logits = last_step_logits / self.temperature
    last_step_logits = last_step_logits + self.prediction_mask

    sampled_ids = tf.random.categorical(last_step_logits, num_samples=1)
    sampled_ids = tf.squeeze(sampled_ids, axis=-1)

    return self.chars_from_ids(sampled_ids), states

In [None]:
# Single-step sampler around the trained model, using the default temperature.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [None]:
# Generate 1000 characters after a prompt and time the run.
start = time.time()
# No state yet; the model uses the GRU's initial state when states is None.
states = None
# Prompt text, as a batch containing one string.
next_char = tf.constant(['Model prompt:'])
result = [next_char]

# Each step feeds the previously sampled character and the returned state
# back into the sampler.
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the prompt and all sampled characters into one string per batch
# entry, then print the first entry as decoded text.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Model prompt: Wathing Service Lings and GAP coming Patsia States Northure and Pantle School name\n\nKanik Gambrode zokkee\n\n\n\nCake as Iraq's company\n\nCassing Wilsons Tools\n\nB&W by alword Visiel Mccenico he bordere\n\n\n\nReddoire : 9 €\n\n\n**3.9grating!\n\nRembected States (HCLP) Power Players [ for Scott. The song at the Solement Donsky School for\n\nThese particle ancoman Sage Pages report is lying about the higher-hit-alcone holding company, with Runsiap to file the Gip-of-30 children of Tomby. It was the first movie of the sea as another choose fee in Learning causes of money to all the shareholders, makes to write by this first risk of discrimination.\n\nBut there are many disorder to the Chinatowic Stephen Matt sport of Nigerca's 'made you how long battle art farms for Frangfights and may sink by joining and surprising Him Knee.[3] There are allows for the Matthew and training effectively, but not without against a controller when it still spend God (adows you two things 

In [None]:
# Generate 1000 characters after a second prompt and time the run.
start = time.time()
# No state yet; the model uses the GRU's initial state when states is None.
states = None
# Prompt text, as a batch containing one string.
next_char = tf.constant(['TESTE:'])
result = [next_char]

# Each step feeds the previously sampled character and the returned state
# back into the sampler.
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
# Fix: print the decoded text of the first batch entry, as the previous
# generation cell does. Printing `result` directly showed the tf.Tensor repr
# with escaped bytes instead of readable text.
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

tf.Tensor([b"TESTE:\\n\\n(A) (A Catchar de Jerecone, Jone Bander\\n\\nThe Em Doutt showed that Trump is a great red by the Hampathy (nigeo Kates. This maximum sensey having want an unhappy with the highest life with all the tweets stay in a corporatorium on the Japanese exer-imperiable. So if you\xe2\x80\x99re driffy.\\n\\nWhy public software gays aren\xe2\x80\x99t in your sexual parameters. This will vocal clicking and vitamine is similar to easily trip to ash.\\n\\n\\n\\nDoes anything you\xe2\x80\x99ll fail together a treat few other concerns outside. Not just in this chain as you are wharqued and didn\xe2\x80\x99t get any what to dafe to figure. And you could emotion, and that of the hord, she always questions we all window science. Vandoone es una forma de college, acquass, a majority dans le campus de r\xc3\xa9dactieline. Just qui con historie, ma C'LTo, Ammus Lomishi is locate.\\n\\n\xe2\x80\xa2 Source: Croteceatic dots of CNN.V Agent (UBC), 1998.7\\n\\nJane 4, 2018 this time, th