# Homework 11

## 2 Assignment Transformers

Task: implement a Transformer architecture model (instead of an RNN model) that predicts a categorical distribution over possible next tokens such that sampling from this distribution leads to plausible next tokens. 
Implement a decoder-block based generative language model in order to use its autoregressive property to train it on prediction errors of all tokens in the input sequence. 

The model will take a fixed number of input tokens from a text and predict the distribution over the vocabulary for the next token.

## 2.1 Dataset, preprocessing and tokenization


In [1]:
# useful imports 
import tensorflow as tf
import tensorflow_text as tf_txt
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import defaultdict
import datetime
import tqdm
import sentencepiece as sp
import io
%load_ext tensorboard

Dataset of choice: Harry Potter Book 1 (downloaded from https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt)


In [2]:
# open the txt file
hp_raw = open("Harry_Potter_1_Sorcerers_Stone.txt", "r")  
# read file
data = hp_raw.read()  

In [3]:
# convert to lower case
data = data.lower()
# delete special characters, only alphanumeric values and white space/linebreaks remain
# (we keep whitespace/linebreaks for the tokenizer later)
data = re.sub("['.,;\-!?%$\"]", "", data)

In [4]:
# test 
data[0:100]

'harry potter and the sorcerers stone\n\n\nchapter one\n\nthe boy who lived\n\nmr and mrs dursley of number '

In [5]:
# create new txt file with preprocessed harry potter text for tokenizer
f = open("harrypotter.txt", "w")
f.write(data)
f.close()

In [6]:
# hyperparameter: vocabulary size
VOCAB_SIZE = 4242

In [7]:
# train tokenizer on preprocessed harry potter text
sp.SentencePieceTrainer.train(
    input='harrypotter.txt', model_prefix='tokenizer_model', model_type="unigram", vocab_size=VOCAB_SIZE)

In [8]:
# deserialize the trained model file to load it in the correct format
trained_tokenizer_model = tf.io.gfile.GFile('tokenizer_model.model', "rb").read()

# load the model as a tokenizer that can be used inside a tensorflow model
tokenizer = tf_txt.SentencepieceTokenizer(
    model=trained_tokenizer_model, out_type=tf.int32, nbest_size=-1, alpha=1, reverse=False,
    add_bos=False, add_eos=False, return_nbest=False, name=None
)

In [9]:
# test tokenizer
tokens = tokenizer.tokenize("magic is real")
print(tokens)
print(tokenizer.detokenize(tokens))
# because it's fun
tokens = tokenizer.tokenize("you are a wizard harry")
print(tokens)
print(tokenizer.detokenize(tokens))

tf.Tensor([226  82 980], shape=(3,), dtype=int32)
tf.Tensor(b'magic is real', shape=(), dtype=string)
tf.Tensor([ 15  85   6 280  10], shape=(5,), dtype=int32)
tf.Tensor(b'you are a wizard harry', shape=(), dtype=string)


We want to have input sequences of length m tokens (m should be between 32 and 256 - here: seq_length); for this we use tf text.sliding window and pass the tokenized text and the width m + 1 as arguments

In [10]:
# hyperparameter: sequence length
SEQ_LENGTH = 142

In [11]:
# read harry potter file
hp = open("harrypotter.txt", "r")  
data = hp.read()  
# tokenize
tokenized_data = tokenizer.tokenize(data)
# get sequence windows of size = seq_length
sequences = tf_txt.sliding_window(tokenized_data, width=SEQ_LENGTH + 1, axis=-1)

In [12]:
sequences.shape

TensorShape([86307, 143])

In [13]:
# create dataset out of sequences
hp_ds = tf.data.Dataset.from_tensor_slices(sequences)

In [14]:
# shape of one datapoint = one sequence
iterator = iter(hp_ds)
iterator.get_next()

<tf.Tensor: shape=(143,), dtype=int32, numpy=
array([  10,  134,    4,    3,  725,    8,  171,  738,  372,   45,    3,
        157,   78, 1274,  159,    4,  294,  239,    9,  653,  341,  771,
        523,   37, 1317,    5,  168,   23,   24,   37, 1939, 1134, 1034,
         15,   79,  173,   24,   37,    3,  153,  132,   15,   41,  758,
          5,   31, 1831,   14,  183,  445,  116, 1467,  155,   24,   73,
         68,  906,   30,  597, 2131,  159,  239,   11,    3, 1436, 1020,
          9,    6, 3045,  302,  672,  790,    8,  148,  203, 1629,    7,
         11,    6,  428, 2597,   46,  297,   30,  576,  192,  676,  424,
        976,    7,  126,   40,    6,   79,  247, 1498,  294,  239,   11,
       1112,    4, 1915,  161,    4,   19,  357,  815,    3,  454,    6,
       3865,    9,  676,  148,  167,   14,   79, 1673,   26,   47, 1071,
         48,  173,    9,   74,  104, 2713,   16,   72, 1458,  452, 2015,
          8,  507, 1980,   21,    3, 1587, 2488, 1793,    8,    3,  257])>

In [15]:
# out of the sequence with length m+1, the first m tokens are the inputs and the last token is the target
hp_ds = hp_ds.map(lambda seq: tf.split(sequences, [SEQ_LENGTH, 1], -1))

In [16]:
# shape of one datapoint = one sequence (input tokens + target token)
iterator = iter(hp_ds)
iterator.get_next()

(<tf.Tensor: shape=(86307, 142), dtype=int32, numpy=
 array([[  10,  134,    4, ..., 1793,    8,    3],
        [ 134,    4,    3, ...,    8,    3,  257],
        [   4,    3,  725, ...,    3,  257,   19],
        ...,
        [  10,  827,   16, ...,   30,  105,   49],
        [ 827,   16,   56, ...,  105,   49, 1330],
        [  16,   56,  272, ...,   49, 1330,    3]])>,
 <tf.Tensor: shape=(86307, 1), dtype=int32, numpy=
 array([[ 257],
        [  19],
        [   6],
        ...,
        [1330],
        [   3],
        [ 243]])>)

In [17]:
# hyperparameter: batch size
BATCH_SIZE = 32

In [18]:
# shuffle, batch, prefetch
hp_ds = hp_ds.cache().shuffle(1000).batch(BATCH_SIZE).prefetch(20)

## 2.2 The Model Components


### 2.2.1 The Embedding

In [19]:
# hyperparameter: embedding dimensionality
# somethingbetween 64 and 256
EMBED_DIM = 142

In [20]:
class Embedding(tf.keras.layers.Layer):
    
    def __init__(self):
        # each input token index is mapped to a vector that is looked up from a table
        self.embed_token = tf.keras.layers.Embedding(VOCAB_SIZE, EMBED_DIM)
        # positional embedding 
        self.embed_pos = tf.keras.layers.Embedding(SEQ_LENGTH, EMBED_DIM)
        
    def call(self, token_seq):
        # indices to look up the positional code for each sub-word
        indices = tf.range(0, SEQ_LENGTH)
        # feed into embedding layers
        token_embed = self.embed_token(token_seq)
        idx_embed = self.embed_pos(indices)
        # concatenate ?
        seq_embed = token_embed + idx_embed
        
        return seq_embed

### 2.2.2 The TransformerBlock Layer


In [21]:
class TransformerBlock(tf.keras.layers.Layer):
    
    def __init__(self):
        
        self.attention = tf.keras.layers.MultiHeadAttention(num_heads=3, key_dim=EMBED_DIM)
        self.dense1 = tf.keras.layers.Dense(142, activation='relu')
        self.dense2 = tf.keras.layers.Dense(EMBED_DIM)
        self.dropout1 = tf.keras.layers.Dropout(0.1)
        self.dropout2 = tf.keras.layers.Dropout(0.1)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        
    def call(self, inputs, training=True):
        x = self.attention(query=inputs, value=inputs, use_causal_mask=True)
        x = self.dropout1(x, training=training)
        x = inputs + x
        x = self.layernorm1(x)
        ln_out = x
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dropout2(x, training=training)
        x = x + ln_out
        x = layernorm2(x)
        return x     

### 2.2.3 The subclassed Model

In [22]:
class TransformerModel(tf.keras.Model):
    
    def __init__(self, tokenizer):
        super().__init__()
        
        self.tokenizer = tokenizer
        
        self.embed_layer = Embedding()
        self.transf_blocks = TransformerBlock()
        self.dense = tf.keras.layers.Dense(VOCAB_SIZE)
        
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        self.metrics_list = [
                        tf.keras.metrics.Mean(name="loss"),
                        tf.keras.metrics.CategoricalAccuracy(name="acc"),
                        tf.keras.metrics.TopKCategoricalAccuracy(3,name="top-3-acc")
                       ]        
        
    @tf.function 
    def call(self, data):
        x = self.embed_layer(data)
        x = self.transf_blocks(x)
        x = self.dense(x)
        return x
          
    def reset_metrics(self): 
        for metric in self.metrics_list:
            metric.reset_states()
          
    @tf.function
    def train_step(self, data):
        with tf.GradientTape() as tape:
            predictions = self(data, training=True)
            loss = self.loss_function(data, predictions) + tf.reduce_sum(self.losses)
        
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        
        # update loss metric
        self.metrics_list[0].update_state(loss)
        
        # for all metrics except loss, update states (accuracy etc.)
        for metric in self.metrics_list[1:]:
            metric.update_state(data,predictions)

        # Return a dictionary mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}
        
        
    # TO DO - I don't really understand the single steps in the hw description ...
    # method receives a text prompt and the desired output length and returns a continuation of the prompt of a specified length  
    def generate_text(self, prompt, output_length, top_k):
        
        # repeat until desired output length is achieved
        for i in range(len(output_length)):
            
            # tokenize text input (=prompt)
            tokenized_prompt = self.tokenizer.tokenize(prompt)
            # add an extra batch dimension
            text_input = tf.expand_dims(tokenized_prompt)
            # pad?
            text_input = tf.pad(text_input)
            # feed into model?
            output = self.call(text_input)
            # returns two tensors, one with the top k highest logits and another with the corresponding token indices
            vals, idx = tf.math.top_k(output, k=top_k, sorted=True)
            # sample one token from the top k distribution
            # this can be done with tf.random.categorical on the last time-step in the sequence of logits that your model outputs
            # ? 
            sample = tf.random.categorical(vals, 1)
            # index the tensor with the corresponding token index (using the sampled index to index the tensor that contains the corresponding token indices)
            # ?
            new_token = idx[sample]
            # concatenate the token to the sequence
            new_text = text_input + new_token
            # if necessary truncate the length of the input (e.g. by indexing with [-self.max_len:]), and repeat until the desired length is reached.

        
        # detokenize 
        new_text = self.tokenizer.detokenize(new_text)
        return new_text
            

## 2.3 Training

In [23]:
# Define where to save the log
config_name= "config_name"
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

train_log_path = f"logs/{config_name}/{current_time}/train"
val_log_path = f"logs/{config_name}/{current_time}/val"

# log writer for training metrics
train_summary_writer = tf.summary.create_file_writer(train_log_path)

# log writer for validation metrics
val_summary_writer = tf.summary.create_file_writer(val_log_path)

In [24]:
def training_loop(model, prompt, train_ds, epochs, train_summary_writer, val_summary_writer):
    
    for epoch in range(epochs):
        print(f"Epoch {epoch}:")
        
        # Training:
        
        for data in tqdm.tqdm(train_ds, position=0, leave=True):
            metrics = model.train_step(data)
            
            # logging the validation metrics to the log file which is used by tensorboard
            with train_summary_writer.as_default():
                for metric in model.metrics:
                    tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)

        
        print([f"{key}: {value.numpy()}" for (key, value) in metrics.items()])
    
        model.reset_metrics()    
        
        # Validation:
        # test the text generator
        # ? 
        gen_text = model.generate_text(prompt, 5, 3)
        print(gen_text)
        print("\n")
        
        # why do we need validation dataset?

In [25]:
%tensorboard --logdir logs/

Reusing TensorBoard on port 6006 (pid 14956), started 0:29:27 ago. (Use '!kill 14956' to kill it.)

In [26]:
# hyperparameter: number of epochs
# between 100 and 600 epochs depending on the text used
NUM_EPOCHS = 420

In [27]:
# instantiate the model
model = TransformerModel(tokenizer)
starting_prompt = 'Hogwards is'

In [28]:
# run the training loop 
training_loop(model=model, 
                prompt = starting_prompt,
                train_ds=hp_ds, 
                #val_ds=val_ds, # to do?
                epochs=NUM_EPOCHS, 
                train_summary_writer=train_summary_writer, 
                val_summary_writer=val_summary_writer)

Epoch 0:


  0%|                                                                                         | 0/2698 [00:54<?, ?it/s]


ResourceExhaustedError: {{function_node __wrapped__IteratorGetNext_output_types_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[86307,142] and type int32 on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node split}}]] [Op:IteratorGetNext]