# Mazu Talk
Mazu Talk is a GPT-style, Transformer-based decoder. The code is adapted from two sources:
* the [GPT tutorial](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) by Apoorv Nandan available on the Keras website.
* Generative Deep Learning, 2nd edition, by David Foster (O’Reilly), 2023.

## Install libraries and dependencies

In [None]:
# !pip install -U deep-translator
# !poetry add deep-translator   # for poetry usage

# from deep_translator import GoogleTranslator

In [1]:
%load_ext autoreload
%autoreload 2
import glob
import numpy as np
import json
import re
import string
from IPython.display import display, HTML
import os

import tensorflow as tf
from tensorflow.keras import layers, models, losses, callbacks, saving

2024-04-07 21:59:05.568574: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Set min. log level for TF to mute warnings
# NOTE(review): in this notebook the assignment appears after `import tensorflow`
# (imports cell above). Presumably the variable has to be set before that import
# to influence TensorFlow's C++ log output - TODO confirm and move if needed.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

## Parameters

In [14]:
# Vocabulary cap for the TextVectorization layer; also used as the input size of
# the token embedding and the width of the final softmax layer.
VOCAB_SIZE = 50000
# Sequence length fed to the model. The vectoriser emits MAX_LEN + 1 tokens so
# that input and target can be offset by one position. Also the size of the
# position-embedding table.
MAX_LEN = 80
# Width of the token/position embeddings and of the Transformer block output.
EMBEDDING_DIM = 256
# key_dim passed to the MultiHeadAttention layer.
KEY_DIM = 256
# Number of attention heads.
N_HEADS = 2
# Units in the hidden feed-forward layer of the Transformer block.
FEED_FORWARD_DIM = 256
# NOTE(review): not referenced in the cells visible here - confirm it is still needed.
VALIDATION_SPLIT = 0.2
# NOTE(review): not referenced in the cells visible here, so shuffling and
# sampling are currently unseeded - confirm whether it should be applied.
SEED = 42
# When True, previously saved weights are loaded before training.
LOAD_MODEL = False
# Poems per batch in the tf.data pipeline.
BATCH_SIZE = 32
# Number of epochs passed to gpt.fit.
EPOCHS = 5
# How many times the batched dataset is repeated to form the training set.
DATASET_REPETITIONS = 5

## Load the data
The Chinese poems are sourced from:
* https://www.kaggle.com/datasets/qianboao/chinesepoetrydataset
* https://github.com/chinese-poetry/chinese-poetry

In [None]:
# Open file containing Chinese poetry
# with open('/app/data/chinese-poetry/chinese_poems.txt', 'r') as f:
#     zh_poems = f.readlines()
    
# print(zh_poems[:5])
# print(len(zh_poems))

### Translate Chinese poems to Swedish

In [None]:
# Translate the Chinese poems to Swedish in batches and save every batch as JSON.
#
# NOTE: this cell uses `GoogleTranslator` and `zh_poems`, which are only defined
# by the currently commented-out lines in the install and load cells above.
# Enable those lines before running this cell on a fresh kernel.

# Instantiate the Google Translator
translator = GoogleTranslator(source='zh-CN', target='sv')

# Create the output directory up front, so a finished batch is never lost
# because the folder is missing when it is time to write the file
os.makedirs("./zh_poems_sv", exist_ok=True)

# Step through the Chinese poems in batches of 1000 poems each
# TODO: start by testing a smaller set of batches, then get the rest
# for i in range(5000, 305000, 1000):
for i in range(0, 5000, 1000):
    print(f"Translating batch {i}")
    # Create list to store the translations in
    zh_poems_sv = []
    for poem in zh_poems[i: i + 1000]:
        try:
            # Translate one poem and append it to the list above
            zh_poems_sv.append(translator.translate(poem))
        except Exception as exc:
            # Best effort: a poem that fails is skipped and the batch continues.
            # Catching Exception (not a bare except) keeps the loop interruptible,
            # and the message now says what went wrong.
            print(f"Error: Could not translate a poem: {exc!r}")
    # Save the batch as a json file (zero-padded start/end index in the name)
    with open("./zh_poems_sv/zh_poems_sv_%06d_%06d.json" % (i, i + 1000), 'w') as f:
        json.dump(zh_poems_sv, f)
    print("Done!")


### Load Swedish translations from saved files

In [3]:
# Find all the saved translation batches. glob returns paths in an arbitrary,
# file-system dependent order, so sort them to keep poem indices (and therefore
# the examples shown further down) the same on every run and machine.
file_list = sorted(glob.glob("/app/data/zh_poems_sv/*.json"))
print(f"Found {len(file_list)} files")

# Put the file contents in a list (each file holds one JSON list of poems)
translations_sv = []
for file in file_list:
    with open(file, 'r') as f:
        translations_sv.extend(json.load(f))

# Print some examples of the list
print(f"Found {len(translations_sv)} poems")
translations_sv[:2]

Found 5 files
Found 5000 poems


['Vem känner inte till våren när en tjänsteman degraderas? Han kan fortfarande vara full efter att ha lämnat Guo. Silverpennan jagar Bao Xie och Ximen skriver en mening för att imitera molnet. Dongyuan vågar läsa Bai Pengxi och Nanmu borde arbeta med tusentals par. Det finns nya dikter kvar att tigga. Jag är inte alls ond, jag är rädd att jag hörs över hela himlen genom att slå på mitt horn.',
 'Glaset utanför bambun är tio hektar brett, med glaserade plattor ristade högt och lågt. Höstvinden blåser genom den kalldoftande jianjian, ensam och vacker, den svala månen är kall i gryningen. Den glada atmosfären är lika hög som att gå ut av samhället, men vem kan se charmen och sederna hos Yi. Jag skämtar om att jag kysser min moster idag, och Taihua sjunger högt. Han räknas inte med.']

### Concatenate the data

In [4]:
# Only one data source is loaded in this notebook, so the complete corpus is the
# list of Swedish translations itself (same list object, not a copy).
complete_data = translations_sv
len(complete_data)

5000

## Tokenize the data

In [5]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    """Surround every punctuation character and newline with single spaces.

    The punctuation set is escaped before it is placed inside the regex
    character class. Unescaped, the sequence ``\\]`` in ``string.punctuation``
    is read as an escaped bracket, which silently leaves the backslash out of
    the set.

    Args:
        s: The text to pad.

    Returns:
        The text with each punctuation character / newline separated from its
        neighbours by one space, and every run of spaces collapsed to one.
    """
    s = re.sub(f"([{re.escape(string.punctuation)}\n])", r" \1 ", s)
    # Collapse the double spaces created by the padding above
    s = re.sub(" +", " ", s)
    return s

# Apply the punctuation padding to every poem in the corpus
text_data = [pad_punctuation(x) for x in complete_data]

In [6]:
# Display an example of a poem after punctuation padding
example_data = text_data[25]
example_data

'Den utsökta målarpaviljongen går in i Hongming och den vaga Hongmingen går in i Taiqing . Den lila luften rör sig bort , och de gröna molnen och dimman lyser steg för steg . Vågorna av persikoblomningar tränger igenom tre berg och sköldpaddshornsskärmen är sju hög Plommonfen är utvisad från den jordiska världen , Vem visste att det fanns Peng Ying i världen ? '

In [11]:
# Convert to a Tensorflow Dataset: one element per poem, grouped into batches.
# The shuffle is applied after batching, so it reorders whole batches.
text_ds = tf.data.Dataset.from_tensor_slices(text_data)
text_ds = text_ds.batch(BATCH_SIZE)
text_ds = text_ds.shuffle(1000)

In [12]:
# Create a vectorisation layer that lower-cases the text and maps every token
# to an integer id
vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower",
    output_mode="int",
    # One token more than MAX_LEN, so input and target can be shifted by one
    output_sequence_length=MAX_LEN + 1,
)

In [13]:
# Adapt the layer to the training set (this builds the vocabulary from the poems)
vectorize_layer.adapt(text_ds)
# Vocabulary as a list; the position of a word in the list is its token id
vocab = vectorize_layer.get_vocabulary()

2024-04-07 22:01:20.126113: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [15]:
# Display some token:word mappings (the ten lowest token ids)
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: och
3: .
4: ,
5: är
6: att
7: i
8: det
9: jag


In [16]:
# Display the same example converted to ints.
# The layer was configured with output_sequence_length=MAX_LEN + 1, so the
# result always has that many ids.
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[   11  2411 17704    60    67     7 10323     2    11  4257 20577    60
    67     7  8788     3    11   274   459   253    30    74     4     2
    12    51    40     2   374   106   692    16   692     3   202    13
 16883  2132   747    53   243     2 15063     5   985   265  9414     5
  6669    32    11  3373    48     4    73   680     6     8   369  1545
  1179     7    48    37     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0]


## Create the Training Set

In [17]:
# Create the training set of poems and the same text shifted by one word
def prepare_inputs(text):
    """Turn a batch of raw poems into (input, target) token-id tensors.

    The batch is vectorised with `vectorize_layer`. The input drops the last
    token of every sequence and the target drops the first, so the target at
    position t is the token that follows the input at position t.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    inputs_without_last = token_ids[:, :-1]
    targets_without_first = token_ids[:, 1:]
    return inputs_without_last, targets_without_first


# Tokenise every batch and repeat the whole dataset DATASET_REPETITIONS times,
# so a single Keras epoch makes that many passes over the poems.
train_ds = text_ds.map(prepare_inputs).repeat(DATASET_REPETITIONS)

In [18]:
# Take the first batch of the training set as an (input, target) pair
example_input_output = train_ds.take(1).get_single_element()
# Example Input: first sequence of the input tensor
example_input_output[0][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([10446,     5,  1144,     2,  2279,     2,  1451,     5,   100,
          88,    32,  7000,     3,  7129,   324,     5,    46,  1111,
          20,  6256,     2,  6813,     5,    20,   288,  9429, 19231,
         853,    19,    68,     4,     2,     8,    27,  5300,  3790,
           4,     2,    12,  6250,  7770,  2870,  1509, 22717,     8,
          27,  1509,  6225,     7,  4720,     4,   996,   996,    21,
          25,     6,   220,   219,    85,   329,    37,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0])>

In [19]:
# Example Output: first sequence of the target tensor, i.e. the example input
# shifted by one token
example_input_output[1][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([    5,  1144,     2,  2279,     2,  1451,     5,   100,    88,
          32,  7000,     3,  7129,   324,     5,    46,  1111,    20,
        6256,     2,  6813,     5,    20,   288,  9429, 19231,   853,
          19,    68,     4,     2,     8,    27,  5300,  3790,     4,
           2,    12,  6250,  7770,  2870,  1509, 22717,     8,    27,
        1509,  6225,     7,  4720,     4,   996,   996,    21,    25,
           6,   220,   219,    85,   329,    37,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0])>

## Create the Causal Attention Mask function

In [20]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """Build a causal attention mask of shape [batch_size, n_dest, n_src].

    Entry [b, i, j] is set when ``i >= j - n_src + n_dest``. For
    ``n_dest == n_src`` this means destination position i may only look at
    source positions j <= i.

    Args:
        batch_size: Number of copies of the mask along the first axis.
        n_dest: Number of destination (query) positions.
        n_src: Number of source (key) positions.
        dtype: Type the mask is cast to.

    Returns:
        The mask, tiled along the batch axis.
    """
    dest_positions = tf.range(n_dest)[:, None]
    src_positions = tf.range(n_src)
    visible = dest_positions >= src_positions - n_src + n_dest
    single_mask = tf.reshape(tf.cast(visible, dtype), [1, n_dest, n_src])
    # Tile multiples are [batch_size, 1, 1]: replicate along the batch axis only
    repeats = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(single_mask, repeats)


# Show a 10x10 example mask. It is transposed for display, so each column
# corresponds to one destination position and each row to a source position.
np.transpose(causal_attention_mask(1, 10, 10, dtype=tf.int32)[0])

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int32)

## Create a Transformer Block layer

In [21]:
class TransformerBlock(layers.Layer):
    """Decoder block: causal multi-head self-attention followed by a two-layer
    feed-forward network, each with dropout, a residual connection and layer
    normalisation.

    Args:
        num_heads: Number of attention heads.
        key_dim: `key_dim` handed to the MultiHeadAttention layer.
        embed_dim: Width of the attention output and of the block output. The
            residual additions require the block input to have this width too.
        ff_dim: Units in the hidden (ReLU) feed-forward layer.
        dropout_rate: Dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: Standard Keras layer arguments (for example `name` or
            `dtype`), forwarded to the base class. They are required for
            `from_config` to work, because `get_config` includes them.
    """

    def __init__(
        self, num_heads, key_dim, embed_dim, ff_dim, dropout_rate=0.1, **kwargs
    ):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        self.attn = layers.MultiHeadAttention(
            num_heads, key_dim, output_shape=embed_dim
        )
        self.dropout_1 = layers.Dropout(self.dropout_rate)
        self.ln_1 = layers.LayerNormalization(epsilon=1e-6)
        self.ffn_1 = layers.Dense(self.ff_dim, activation="relu")
        self.ffn_2 = layers.Dense(self.embed_dim)
        self.dropout_2 = layers.Dropout(self.dropout_rate)
        self.ln_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        """Run the block on `inputs`.

        Returns:
            A tuple `(output, attention_scores)`: the block output and the
            scores returned by the attention layer.
        """
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        # Self-attention: query and key/value lengths are both seq_len
        causal_mask = causal_attention_mask(
            batch_size, seq_len, seq_len, tf.bool
        )
        attention_output, attention_scores = self.attn(
            inputs,
            inputs,
            attention_mask=causal_mask,
            return_attention_scores=True,
        )
        attention_output = self.dropout_1(attention_output)
        # Residual connection around the attention sub-layer
        out1 = self.ln_1(inputs + attention_output)
        ffn_1 = self.ffn_1(out1)
        ffn_2 = self.ffn_2(ffn_1)
        ffn_output = self.dropout_2(ffn_2)
        # Residual connection around the feed-forward sub-layer
        return (self.ln_2(out1 + ffn_output), attention_scores)

    def get_config(self):
        """Return the constructor arguments needed to re-create the layer."""
        config = super().get_config()
        config.update(
            {
                "key_dim": self.key_dim,
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "dropout_rate": self.dropout_rate,
            }
        )
        return config

## Create Token and Position Embedding

In [22]:
class TokenAndPositionEmbedding(layers.Layer):
    """Embed token ids and add a learned embedding of each token's position.

    Args:
        max_len: Number of rows in the position-embedding table; inputs are
            therefore expected to be at most this many tokens long.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Standard Keras layer arguments (for example `name` or
            `dtype`), forwarded to the base class. They are required for
            `from_config` to work, because `get_config` includes them.
    """

    def __init__(self, max_len, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        """Return token embeddings of `x` plus the embeddings of positions
        0 .. (length of the last axis of `x`) - 1."""
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings are broadcast over the leading axes of x
        return x + positions

    def get_config(self):
        """Return the constructor arguments needed to re-create the layer."""
        config = super().get_config()
        config.update(
            {
                "max_len": self.max_len,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

## Build the Transformer Model

In [23]:
# Integer token ids; the sequence length is left open
inputs = layers.Input(shape=(None,), dtype=tf.int32)
# Token embedding plus position embedding
x = TokenAndPositionEmbedding(MAX_LEN, VOCAB_SIZE, EMBEDDING_DIM)(inputs)
# A single Transformer block, which also hands back its attention scores
x, attention_scores = TransformerBlock(
    N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FORWARD_DIM
)(x)
# Softmax over the vocabulary at every position
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
# The attention scores are exposed as a second model output
gpt = models.Model(inputs=inputs, outputs=[outputs, attention_scores])
# Only the first output (token probabilities) gets a loss; the attention
# output is given None
gpt.compile("adam", loss=[losses.SparseCategoricalCrossentropy(), None])

In [24]:
# Print the layer-by-layer summary of the model
gpt.summary()

In [25]:
# Optionally resume from weights saved during an earlier training run. The path
# is the one the ModelCheckpoint callback in this notebook writes to; the
# previous "./checkpoint/checkpoint.ckpt" is never produced by this notebook.
if LOAD_MODEL:
    gpt.load_weights("./checkpoint/checkpoint..weights.h5")

## Train the Transformer

In [26]:
# Create a TextGenerator callback
class TextGenerator(callbacks.Callback):
    """Keras callback that prints a sampled text at the end of every epoch.

    Args:
        index_to_word: Vocabulary list in which the position of a word is its
            token id.
        top_k: Currently unused by the sampling code; kept so existing calls
            keep working.
    """

    def __init__(self, index_to_word, top_k=10):
        super().__init__()
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def _token_id(self, word):
        """Return the token id of `word`: exact match first, then the
        lower-cased form, and finally 1 as the fallback id.

        The lower-case fallback matters because the vocabulary in this notebook
        is lower-cased, so a capitalised prompt word such as "Vatten" would
        otherwise always end up as the fallback id.
        """
        if word in self.word_to_index:
            return self.word_to_index[word]
        return self.word_to_index.get(word.lower(), 1)

    def sample_from(self, probs, temperature):
        """Rescale `probs` with `temperature` and draw one index from them.

        Returns:
            A tuple `(sampled_index, rescaled_probs)`.
        """
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend `start_prompt` one sampled token at a time and print the result.

        Generation stops when the sequence holds `max_tokens` tokens or when
        token id 0 is sampled.

        Args:
            start_prompt: Whitespace-separated words to start from.
            max_tokens: Upper limit for the total number of tokens.
            temperature: Sampling temperature handed to `sample_from`.

        Returns:
            A list with one dict per generated token, holding the prompt so far
            ("prompt"), the sampling distribution ("word_probs") and the slice
            `att[0, :, -1, :]` of the model's attention output ("atts").
        """
        start_tokens = [self._token_id(x) for x in start_prompt.split()]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y, att = self.model.predict(x, verbose=0)
            # Sample from the distribution predicted for the last position
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append(
                {
                    "prompt": start_prompt,
                    "word_probs": probs,
                    "atts": att[0, :, -1, :],
                }
            )
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a sample generated from a fixed Swedish prompt."""
        self.generate("Vatten och luft", max_tokens=80, temperature=1.0)

In [27]:
# Create a model save checkpoint (weights only, saved once per epoch)
# NOTE(review): the double dot in the file name looks unintentional - confirm
# before renaming, since anything that loads the weights must use the same path.
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint..weights.h5",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

# Write training logs for TensorBoard to ./logs
tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Callback that prints a generated sample after every epoch; it uses the
# vocabulary to map between words and token ids
text_generator = TextGenerator(vocab)

In [28]:
# Train the model; after every epoch the weights are saved, TensorBoard logs
# are written and a sample text is printed by the callbacks below
gpt.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

Epoch 1/5


I0000 00:00:1712527518.932882   12254 service.cc:145] XLA service 0x7fd940019cb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1712527518.932917   12254 service.cc:153]   StreamExecutor device (0): NVIDIA RTX A2000 8GB Laptop GPU, Compute Capability 8.6
2024-04-07 22:05:18.981426: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
W0000 00:00:1712527519.075204   12254 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
2024-04-07 22:05:19.207731: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8906




















I0000 00:00:1712527535.861462   12254 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m112/785[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m37s[0m 56ms/step - loss: 6.9594

W0000 00:00:1712527542.699245   12255 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert






















[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - loss: 4.7119






















































































generated text:
Vatten och luft har smakar på dagen när man ser orden . den vita yao - gamla seden är full av höst . blommorna är smärtsamt att månens själ , och täckt av doftande vinden och tomma floden miluo i fara är lätta goda hästar och flyger av röda fisken och redo att göra dig om liu lågor . vinden öste ner på musiken i snön . 

[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 270ms/step - loss: 4.7107
Epoch 2/5
[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 2.3364





























generated text:
Vatten och luft har gett pärlor subtila kang blivit mer än att se fram emot bröstet . han tittar på många sjukdomar ber om himlens port , och en vårutflykt kan resa utan bambu , om hästen planteras sparsamt för dag . du kan inte med dig lycklig blir du kan inte röra dig i skymningen . det är inte lätt , om du tänker utan pengar på de tre år . 

[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 85ms/step - loss: 2.3361
Epoch 3/5
[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 1.0863
generated text:
Vatten och luft har fallit är lång , och den sorglösa i klipprötterna är flitig och stenen är oregerlig , men hängande . vad konstigt att tänka på den till shaoyang börjar , och det är särskilt märkligt . 

[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 58ms/step - loss: 1.0860
Epoch 4/5
[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.5210
generated text:
Vatten och 

<keras.src.callbacks.history.History at 0x7fda46930090>