In [1]:
!pip install tensorflow_text
!pip install sentencePiece
import tensorflow as tf
import tensorflow_text as tx
import numpy as np
import io
import tqdm
import datetime 
import pathlib



In [2]:
path = tf.keras.utils.get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

In [3]:
m = 128
vocab_size = 2000

In [4]:
import requests
url = 'https://s3.amazonaws.com/text-datasets/nietzsche.txt'

r = requests.get(url)

filepath = "nietzsche.txt"

with open(filepath, "wb") as file:
    file.write(r.content)

with open('nietzsche.txt','r') as file:
    sample_text = file.read()

In [5]:
import sentencepiece as sp

sp.SentencePieceTrainer.train(
    input='nietzsche.txt', model_prefix='tokenizer_model', model_type="unigram", vocab_size=vocab_size)

In [6]:
# deserialize the trained model file to load it in the correct format
trained_tokenizer_model = tf.io.gfile.GFile('tokenizer_model.model', "rb").read()

# load the model as a tokenizer that can be used inside a tensorflow model
tokenizer = tx.SentencepieceTokenizer(
    model=trained_tokenizer_model, out_type=tf.int32, nbest_size=-1, alpha=1, reverse=False,
    add_bos=False, add_eos=False, return_nbest=False, name=None
)

In [7]:
tokens = tokenizer.tokenize(sample_text)
print(tokens)

tf.Tensor([184 254  96 ... 766 787  12], shape=(171498,), dtype=int32)


In [8]:
#tokenizer.detokenize(tokens)

In [9]:
#print(tokens)

tf.Tensor([184 254  96 ... 766 787  12], shape=(171498,), dtype=int32)


In [10]:
width_tokens = tx.sliding_window(tokens, m+1)

text_dataset = tf.data.Dataset.from_tensor_slices(width_tokens)
text_dataset

<TensorSliceDataset shapes: (129,), types: tf.int32>

In [11]:
def prepare(ds):   
  ds = ds.map(lambda data: (data[:m], data[-1]))
  
  # cache 
  ds = ds.cache()
  # shuffle, batch, prefetch our dataset
  ds = ds.shuffle(5000)
  ds = ds.batch(256)
  ds = ds.prefetch(1024)

  # split dataset into data and target   

  return ds

In [12]:
text_dataset = prepare(text_dataset)
text_dataset

<PrefetchDataset shapes: ((None, 128), (None,)), types: (tf.int32, tf.int32)>

In [13]:
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Reshape, Dropout, LayerNormalization, LeakyReLU, Embedding, MultiHeadAttention, GlobalAveragePooling1D, Add

In [14]:
class Encoding_layer(Model):

  def __init__(self):
    super(Encoding_layer, self).__init__()

    self.embedding = Embedding(input_dim=vocab_size, output_dim=m)

  def call(self, x):

    indices = tf.range(0, m)

    a = self.embedding(x)
    b = self.embedding(indices)
    c = a + b
    return c

class TransformerBlock(Model):
  def __init__(self):
    super(TransformerBlock, self).__init__()
                       
    self.Attention = MultiHeadAttention(2, m)
    self.Drop1 = Dropout(0.1)
    self.Drop2 = Dropout(0.1)

    self.Dense1 = Dense(128, activation='relu')
    self.Dense2 = Dense(m)
    
    self.LayNorm1 = LayerNormalization(epsilon=1e-6)
    self.LayNorm2 = LayerNormalization(epsilon=1e-6)

    

  def call(self, x):
    
    in_out = self.Attention(x,x)
    in_out = self.Drop1(in_out)
    in_out = self.LayNorm1(in_out)
    in_out = Add()([in_out, x])
    in_out = self.LayNorm2(self.Drop2(self.Dense2(self.Dense1(in_out))))
    in_out = Add()([in_out, x])

    return in_out

In [15]:
class MyModel(Model):
  def __init__(self):
    super(MyModel, self).__init__()

    self.optimizer = tf.keras.optimizers.Adam()
    self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)   
    
    self.metrics_list = [
                    tf.keras.metrics.Mean(name="loss"),
                    tf.keras.metrics.CategoricalAccuracy(name="acc")
                    #tf.keras.metrics.TopKCategoricalAccuracy(3,name="top-3-acc") 
                   ]

    self.layer_list = [
      Encoding_layer(),
      TransformerBlock(),
      GlobalAveragePooling1D(),
      Dense(vocab_size)
    ] 
    
  def call(self, x):
    
    for layer in self.layer_list:

      x = layer(x)
    
    return x

  def reset_metrics(self):
      
    for metric in self.metrics:
      metric.reset_states()
          
  @tf.function
  def train_step(self, data):
      
      #print(tf.shape(data))
      x, target = data
      
      with tf.GradientTape() as tape:
          predictions = self(x)
          
          loss = self.loss_function(target, predictions) + tf.reduce_sum(self.losses)
      
      gradients = tape.gradient(loss, self.trainable_variables)
      self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
      
      # update loss metric
      self.metrics[0].update_state(loss)
      
      # for all metrics except loss, update states (accuracy etc.)
      for metric in self.metrics[1:]:
          metric.update_state(target, predictions)
      # Return a dictionary mapping metric names to current value
      return {m.name: m.result() for m in self.metrics}

In [16]:
# initialize models 
model = MyModel()

# hyperparameters
BATCH_SIZE = 256

# Training for more than 300 epochs would be required for meaningful results 
EPOCHS = 10

In [17]:
# Define saving location for log
hyperparameter_string= f"Trial:01_BATCH:{BATCH_SIZE}_EPOCH:{EPOCHS}_Adam"
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

train_log_path = f"logs/{hyperparameter_string}/{current_time}/train"

# log writer for training metrics
train_summary_writer = tf.summary.create_file_writer(train_log_path)

In [18]:
# Training Loop
def training(model, train_ds, epochs):

  model = model
  for epoch in range(epochs):

    print(f"Epoch {epoch}:")

    # Training:

    for data in train_ds:
        metrics = model.train_step(data)

    # print the metrics
    print([f"{key}: {value}" for (key, value) in zip(list(metrics.keys()), list(metrics.values()))])

    # logging the validation metrics to the log file which is used by tensorboard
    with train_summary_writer.as_default():
        for metric in model.metrics:
            tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)

    # reset all metrics (requires a reset_metrics method in the model)
    model.reset_metrics()

    print("\n")

In [19]:
# train the model 
training(model, text_dataset, EPOCHS)

Epoch 0:
['loss: 6.1695380210876465', 'acc: 0.004481531213968992']


Epoch 1:
['loss: 5.956798076629639', 'acc: 0.005129252560436726']


Epoch 2:
['loss: 5.274999618530273', 'acc: 0.003956351894885302']


Epoch 3:
['loss: 5.054451942443848', 'acc: 0.005006710533052683']


Epoch 4:
['loss: 4.917806625366211', 'acc: 0.004784968215972185']


Epoch 5:
['loss: 4.809929847717285', 'acc: 0.00373460934497416']


Epoch 6:
['loss: 4.7243242263793945', 'acc: 0.004096399527043104']


Epoch 7:
['loss: 4.648908615112305', 'acc: 0.002655073767527938']


Epoch 8:
['loss: 4.580646514892578', 'acc: 0.003676256164908409']


Epoch 9:
['loss: 4.516909599304199', 'acc: 0.0031977591570466757']




In [20]:
# save the model with a meaningful name
model.save_weights(f"saved_model_{hyperparameter_string}", save_format="tf")

In [21]:
# instantiate a new model 
loaded_model = MyModel()

# load the model weights to continue training. 
loaded_model.load_weights(f"saved_model_{hyperparameter_string}");

In [22]:
def generate_text(input, top_k=100):

  input = tokenizer.tokenize(input)
  #print(input)

  padding_size = m - len(input)

  paddings = tf.constant([[0, 0,], [padding_size, 0]])

  input =  tf.pad([input], paddings, "CONSTANT")
  #print(input.shape)

  output = model(input)

  output1 = tokenizer.detokenize(tf.cast(output, dtype=tf.int32))
  #print(output1)
  #output = tf.expand_dims(, axis=0)
  #print(input.shape)

  logits, indices = tf.math.top_k(output, top_k)
  #print(indices)
  output = tf.random.categorical(logits,1)
  #print(output.shape)
  #print(output)

  output = tf.cast(output, dtype=tf.int32)
  output = tokenizer.detokenize(output)

  return output

In [None]:
text = 'What is the next word that should be'

generate_text(text)