Transformer chatbot based on https://gitlab.com/Deutschman/chat-bot/-/blob/b8cc6052ed252a61730d009fe6d79ae281cbc9c2/transformer_chatbot.ipynb

Attention is all you need - explained
https://www.youtube.com/watch?v=S27pHKBEp30

Motivation for transformers: 
* eliminate need to compute hidden states  
* difficult for RNNs to develop long range dependencies  
* attention: focuses on hidden state information  
* index hidden states using a softmax fxn  
* Ouput of ONE token is ONE sample (update all weights in ONE step)  
* Positional encodings: uses sine fxn (continuous binary encoding)  
* Attention over input sequence -> in essence, pick and choose what you like  
* Keys, values, queries  
* Keys and values -> output by encoding of initial sentence  
* Query -> output by encoding of target sentence  
* Query * Key -> angle between vectors   
** if key and query actually align, they will have a large dot product  
*** softmax -> means one key is selected (magnifies difference between vectors)  
* Induced distribution, multiply by values  
* almost like an indexing scheme  

Illustrated transformer model guide: http://jalammar.github.io/illustrated-transformer/

Training tips for the transformer model: https://arxiv.org/pdf/1804.00247.pdf

BLEU Scores: https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213

Top p-sampling (instead of greedy):
https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313

In [98]:
%cd C:\Users\claudia.nguyen\Downloads

C:\Users\claudia.nguyen\Downloads


In [110]:
%pip install --upgrade pip

Requirement already up-to-date: pip in c:\users\claudia.nguyen\anaconda3\lib\site-packages (20.2.2)
Note: you may need to restart the kernel to use updated packages.


## Import packages

In [45]:
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import tensorflow as tf
tf.random.set_seed(1234)
AUTO = tf.data.experimental.AUTOTUNE
import tensorflow_datasets as tfds
import os
import re
import numpy as np
from time import time
import matplotlib.pyplot as plt
import pandas as pd
import json
from operator import itemgetter
import pickle

# for GPU vs TPU capability
strategy = tf.distribute.get_strategy()

In [46]:
print(tf.__version__)

2.0.0


## Hyperparameters

In [47]:
# Maximum sentence length
MAX_LENGTH = 30

# For tf.data.Dataset
BATCH_SIZE = int(64 * strategy.num_replicas_in_sync)
BUFFER_SIZE = 20000

# For Transformer
NUM_LAYERS = 6
D_MODEL = 512
NUM_HEADS = 8
UNITS = 2048
DROPOUT = 0.1

EPOCHS = 100

## Import and preprocess conversations

In [48]:
# This method gets a list of all of the .json files to open in the relevant directory
def getJSONFiles():
    f = []
    for root, dirs, files in os.walk("inbox"):
        for file in files:
            if file.endswith(".json"):
                 f.append(os.path.join(root, file))
    return f        

# This method just checks to see if message is from me
def fromClaud(msg):
    if msg['sender_name'] == 'Claudia Nguyen': return True
    else: return False 

# Empty list of query-answer pairs
Qs = []
As = []
# Write the .json messages to qa_pairs
for fl in getJSONFiles():
    data = json.load(open(fl))
    sortedData = sorted(data['messages'], key=itemgetter('timestamp_ms')) 
    # need to sort data by timestamp
    query, response = '', '' # empty strings for initial query, response pairs
    last_msg_frm_claud = False
    n = 0
    # order all messages in timestamp order
    # need to make sure that the messages line up across json files    
    for msg in sortedData:
        n+=1
        # need this try-catch here for images and non-text messages like stickers
        try:
            content = msg['content']
        except KeyError: 
            continue
        
        if fromClaud(msg): # if this message is from claud
            if last_msg_frm_claud: # and last message was from claud
                response += content+'\n '
            else: 
                response = content+'\n '
            last_msg_frm_claud = True
        else: # if this message is not from claud
            if last_msg_frm_claud: # and last message is from claud
                Qs.append(query) # store previous record of query, response
                As.append(response)
                query = content + '\n ' # assign new query
            else: # else append to old query with newline
                query +=  content + '\n '
            last_msg_frm_claud = False

In [49]:
def textPreprocess(input_text):
      def removeAccents(input_text):
        strange='ąćęłńóśżź'
        ascii_replacements='acelnoszz'
        translator=str.maketrans(strange,ascii_replacements)
        return input_text.translate(translator)

      def removeSpecial(input_text):
        special='[^A-Za-z0-9 ]+'
        return re.sub(special, '', input_text)

      def removeTriplicated(input_text):
        return re.compile(r'(.)\1{2,}', re.IGNORECASE).sub(r'\1', input_text)

      return removeTriplicated(removeSpecial(removeAccents(input_text.lower())))

In [50]:
questions = [textPreprocess(str(x)) for x in Qs]
answers = [textPreprocess(str(x)) for x in As]

In [51]:
print('Sample question: {}'.format(questions[200]))
print('Sample answer: {}'.format(answers[200])) 

Sample question: lmao lmao lol same 
Sample answer: suhail didnt sufail me 


In [None]:
# Build tokenizer using tfds for both questions and answers
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size=2**13)

# Define start and end token to indicate the start and end of a sentence
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]

# Vocabulary size plus start and end token
VOCAB_SIZE = tokenizer.vocab_size + 2 

In [None]:
# Tokenize, filter and pad sentences
def tokenize_and_filter(inputs, outputs):
    tokenized_inputs, tokenized_outputs = [], []
  
    for (sentence1, sentence2) in zip(inputs, outputs):
    # tokenize sentence
        sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
        sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN
        # check tokenized sentence max length
        if len(sentence1) <= MAX_LENGTH and len(sentence2) <= MAX_LENGTH:
            tokenized_inputs.append(sentence1)
            tokenized_outputs.append(sentence2)
  
    # pad tokenized sentences
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_inputs, maxlen=MAX_LENGTH, padding='post')
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_outputs, maxlen=MAX_LENGTH, padding='post')
  
    return tokenized_inputs, tokenized_outputs

questions, answers = tokenize_and_filter(questions, answers) 

In [None]:
# decoder inputs use the previous target as input
# remove START_TOKEN from targets
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': questions,
        'dec_inputs': answers[:, :-1]
    },
    {
        'outputs': answers[:, 1:]
    },
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) 

In [None]:
print('Vocab size: {}'.format(VOCAB_SIZE))
print('Number of samples: {}'.format(len(questions))) 
print(dataset) 

## Defining attention functions

### Scaled dot product attention

In [None]:
def scaled_dot_product_attention(query, key, value, mask):
    """Calculate the attention weights."""
    matmul_qk = tf.matmul(query, key, transpose_b=True)

    # scale matmul_qk
    depth = tf.cast(tf.shape(key)[-1], tf.float32)
    logits = matmul_qk / tf.math.sqrt(depth)

    # add the mask to zero out padding tokens
    if mask is not None:
        logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k)
    attention_weights = tf.nn.softmax(logits, axis=-1)

    output = tf.matmul(attention_weights, value)

    return output 

### Multi-head attention

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):

    def __init__(self, d_model, num_heads, name="multi_head_attention"):
        super(MultiHeadAttention, self).__init__(name=name)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.query_dense = tf.keras.layers.Dense(units=d_model)
        self.key_dense = tf.keras.layers.Dense(units=d_model)
        self.value_dense = tf.keras.layers.Dense(units=d_model)

        self.dense = tf.keras.layers.Dense(units=d_model)

    def split_heads(self, inputs, batch_size):
        inputs = tf.reshape(
            inputs, shape=(batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])

    def call(self, inputs):
        query, key, value, mask = inputs['query'], inputs['key'], inputs[
            'value'], inputs['mask']
        batch_size = tf.shape(query)[0]

        # linear layers
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)

        # split heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        # scaled dot-product attention
        scaled_attention = scaled_dot_product_attention(query, key, value, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # concatenation of heads
        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))

        # final linear layer
        outputs = self.dense(concat_attention)
        return outputs 

### Masking

In [None]:
def create_padding_mask(x):
    mask = tf.cast(tf.math.equal(x, 0), tf.float32)
    # (batch_size, 1, 1, sequence length)
    return mask[:, tf.newaxis, tf.newaxis, :] 

def create_look_ahead_mask(x):
    seq_len = tf.shape(x)[1]
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    padding_mask = create_padding_mask(x)
    return tf.maximum(look_ahead_mask, padding_mask) 

### Positional Encoding

More info on positional encoding here: https://github.com/tensorflow/examples/blob/master/community/en/position_encoding.ipynb  

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):

    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        angles = 1 / tf.pow(10000.0, (2.0 * (i // 2.0)) / tf.cast(d_model, tf.float32))
        return position * angles

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(
            position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
            i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
            d_model=d_model)
        # apply sin to even index in the array
        sines = tf.math.sin(angle_rads[:, 0::2])
        # apply cos to odd index in the array
        cosines = tf.math.cos(angle_rads[:, 1::2])

        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, tf.float32)

    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :] 

### Encoder Layer

In [None]:
def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    attention = MultiHeadAttention(
      d_model, num_heads, name="attention")({
          'query': inputs,
          'key': inputs,
          'value': inputs,
          'mask': padding_mask
      })
    attention = tf.keras.layers.Dropout(rate=dropout)(attention)
    attention = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(inputs + attention)

    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(attention + outputs)

    return tf.keras.Model(
      inputs=[inputs, padding_mask], outputs=outputs, name=name) 

### Encoder

In [None]:
def encoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            name="encoder"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)

    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)
    for i in range(int(num_layers)):
        outputs = encoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name="encoder_layer_{}".format(i),
        )([outputs, padding_mask])

    return tf.keras.Model(
      inputs=[inputs, padding_mask], outputs=outputs, name=name)

### Decoder Layer

In [None]:
def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
    look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name="look_ahead_mask")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    attention1 = MultiHeadAttention(
      d_model, num_heads, name="attention_1")(inputs={
          'query': inputs,
          'key': inputs,
          'value': inputs,
          'mask': look_ahead_mask
      })
    attention1 = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(attention1 + inputs)

    attention2 = MultiHeadAttention(
      d_model, num_heads, name="attention_2")(inputs={
          'query': attention1,
          'key': enc_outputs,
          'value': enc_outputs,
          'mask': padding_mask
      })
    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(attention2 + attention1)

    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(outputs + attention2)

    return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=outputs,
      name=name) 

### Decoder

In [None]:
def decoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            name='decoder'):
    inputs = tf.keras.Input(shape=(None,), name='inputs')
    enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
    look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name='look_ahead_mask')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)

    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

    for i in range(int(num_layers)):
        outputs = decoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name='decoder_layer_{}'.format(i),
        )(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask])

    return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=outputs,
      name=name)

### Create Model

In [None]:
def transformer(vocab_size,
                num_layers,
                units,
                d_model,
                num_heads,
                dropout,
                name="transformer"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

    enc_padding_mask = tf.keras.layers.Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='enc_padding_mask')(inputs)
    # mask the future tokens for decoder inputs at the 1st attention block
    look_ahead_mask = tf.keras.layers.Lambda(
      create_look_ahead_mask,
      output_shape=(1, None, None),
      name='look_ahead_mask')(dec_inputs)
    # mask the encoder outputs for the 2nd attention block
    dec_padding_mask = tf.keras.layers.Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='dec_padding_mask')(inputs)

    enc_outputs = encoder(
      vocab_size=vocab_size,
      num_layers=num_layers,
      units=units,
      d_model=d_model,
      num_heads=num_heads,
      dropout=dropout,
    )(inputs=[inputs, enc_padding_mask])

    dec_outputs = decoder(
      vocab_size=vocab_size,
      num_layers=num_layers,
      units=units,
      d_model=d_model,
      num_heads=num_heads,
      dropout=dropout,
    )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

    outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

    return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name) 

## Training

In [None]:
def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))

    loss = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true, y_pred)

    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)

    return tf.reduce_mean(loss)

In [None]:
# custom learning rate
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

### Initialize Model

In [None]:
# clear backend
tf.keras.backend.clear_session()

learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
    # ensure labels have shape (batch_size, MAX_LENGTH - 1)
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred) 

In [None]:
# initialize and compile model within strategy scope
with strategy.scope():
    model = transformer(
      vocab_size=VOCAB_SIZE,
      num_layers=NUM_LAYERS,
      units=UNITS,
      d_model=D_MODEL,
      num_heads=NUM_HEADS,
      dropout=DROPOUT)

    model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

model.summary()

### Fit Model

In [None]:
import datetime

checkpoint_path = "training_2/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

logdir = os.path.join("logs"
                    , "{}_L {}_D {}_H {}_U {}_D".format(NUM_LAYERS, D_MODEL, NUM_HEADS, UNITS, DROPOUT)
                    , datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# Tensorboard callback
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True)

model.fit(dataset, epochs=EPOCHS, callbacks = [cp_callback,tensorboard_callback]) 

In [34]:
load_status = model.load_weights("training_2/cp.ckpt")

### Evaluation

In [None]:
## https://towardsdatascience.com/hyperparameter-optimization-for-optimum-transformer-models-b95a32b70949

In [35]:
def evaluate(sentence, model):
    sentence = textPreprocess(sentence)

    sentence = tf.expand_dims(
      START_TOKEN + tokenizer.encode(sentence) + END_TOKEN, axis=0)

    output = tf.expand_dims(START_TOKEN, 0)

    for i in range(MAX_LENGTH):
        predictions = model(inputs=[sentence, output], training=False)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, END_TOKEN[0]):
            break

        # concatenated the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0)


def predict(sentence,model):
    prediction = evaluate(sentence,model)

    predicted_sentence = tokenizer.decode(
      [i for i in prediction if i < tokenizer.vocab_size])

    print('Input: {}'.format(sentence))
    print('Output: {}'.format(predicted_sentence))

    return predicted_sentence 

In [36]:
loaded_model = model

In [37]:
output = predict("hey!!!!", loaded_model) 

Input: hey!!!!
Output: umm yes 


### Claudbot

In [38]:
def evaluateInput(model):
    input_sentence = ''
    while(1):
        # Get input sentence
        input_sentence = input('> ')
        # Check if it is quit case
        if input_sentence == 'q' or input_sentence == 'quit': break
        output_words = evaluate(input_sentence,model)
        predicted_sentence = tokenizer.decode([i for i in output_words if i < tokenizer.vocab_size])
        print('Claudbot: {}'.format(predicted_sentence))


In [39]:
evaluateInput(loaded_model)

> hey
Claudbot: umm yes 
> ?
Claudbot: you sent a link tom hiddleston in this foto us this foto you 
> hmm
Claudbot:  
> interesting..
Claudbot: the first years are hilarious but the days where the winner and loses lol but its like is hard to talk to me 
> q


## Tensorboard

In [42]:
# load tensorboard extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [43]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 14308), started 0:02:48 ago. (Use '!kill 14308' to kill it.)