In [None]:
import sys
sys.path.append("..") # for sibling import

import compyute as cp

# Example 5.2

### Language Model: Neural network

The bigram model is able to predict the following character by looking at the previous one. For better predictions it helps to not only consider one character for a prediction. In this example a neural network is used that uses multiple characters for predictions.

### Step 1: Prepare data
Like in the bigram model, the tinyshakespeare dataset is used. (https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt)

In [None]:
# Read the whole corpus into memory as a single string. The encoding is given
# explicitly so the decoded text does not depend on the platform's default locale.
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read()

### Step 2: Tokenization

This time, a Byte-Pair-Encoding tokenizer is used to allow for more information to be passed into the neural net without increasing the context size.

In [None]:
from compyute.preprocessing.text import BPETokenizer, save_tokenizer, load_tokenizer

# One-time setup, kept for reference: uncomment these three lines to fit a new
# BPE tokenizer on `data` (vocab_size=400) and save it to "nn_tokenizer.cp".
# tokenizer = BPETokenizer()
# tokenizer.fit(data, vocab_size=400)
# save_tokenizer(tokenizer, "nn_tokenizer.cp")
# Load the tokenizer from the file the setup lines above would write.
tokenizer = load_tokenizer("nn_tokenizer.cp")

# Last expression of the cell, so the notebook displays the vocabulary size.
tokenizer.vocab_size

In [None]:
# Encode the text with the tokenizer and keep only the first 5000 entries of
# the result; all training windows below are drawn from this slice.
data_enc = tokenizer.encode(data)[:5000]
# Last expression of the cell, so the notebook displays the encoded length.
len(data_enc)

### Step 3: Build dataset
In this example a larger `block_size` is now used.

In [None]:
# Number of (context, label) windows sampled from the encoded text.
num_samples = 10000
# Number of consecutive encoded entries in each context window.
block_size = 32

In [None]:
# One row per sample: X holds a context window, y the same window shifted one
# position to the right.
X = cp.zeros((num_samples, block_size))
y = cp.zeros((num_samples, block_size))

# Random window start positions; the upper bound leaves room for the shifted
# label window at the end of the encoded text.
rand_indices = cp.random.uniform_int((num_samples,), 0, len(data_enc) - block_size - 1)

for row, start in enumerate(rand_indices):
    stop = start + block_size
    X[row] = data_enc[start:stop]
    y[row] = data_enc[start + 1 : stop + 1]

# First 90% of the samples are used for training, the remainder for validation.
n = int(len(X) * 0.9)

# Only the last column of y is kept as the target of each window.
X_train, X_val = X.int()[:n], X.int()[n:]
y_train, y_val = y.int()[:n, -1], y.int()[n:, -1]

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_val.shape=}")
print(f"{y_val.shape=}")

### Step 4: Build the neural network structure

As our first layer, again, an `Embedding` layer is used. It is followed by a stack of linear layers.

In [None]:
import compyute.nn as nn

# Sizes used by the layers below.
vocab_size = tokenizer.vocab_size
embed_dims = 30
n_hidden = 256

# Embedding -> Flatten -> three (Linear, Layernorm, Tanh) stacks -> output Linear.
model = nn.SequentialModel([
    nn.Embedding(vocab_size, embed_dims),
    nn.Flatten(),

    # The first Linear takes block_size*embed_dims inputs, i.e. the flattened
    # embeddings of one whole context window.
    # presumably the bias is disabled because the Layernorm that follows makes
    # it redundant — TODO confirm
    nn.Linear(block_size*embed_dims, n_hidden, use_bias=False),
    nn.Layernorm((n_hidden,)),
    nn.Tanh(),

    nn.Linear(n_hidden, n_hidden, use_bias=False),
    nn.Layernorm((n_hidden,)),
    nn.Tanh(),

    nn.Linear(n_hidden, n_hidden, use_bias=False),
    nn.Layernorm((n_hidden,)),
    nn.Tanh(),
    
    # Output layer with one unit per vocabulary entry.
    nn.Linear(n_hidden, vocab_size)
])

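`Compyute` also includes a few methods to decay the learning rate; one of them, `CosineLR`, is used as a training callback in Step 5. Before training, a summary of the model is shown: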

In [None]:
# Summarize the model; the arguments are presumably the per-sample input shape
# and the input dtype — TODO confirm against nn.model_summary.
nn.model_summary(model, (block_size,), "int32")

### Step 5: Train the model

In [None]:
from compyute.nn.trainer import callbacks, optimizers, losses, metrics, Trainer

# Bundle the model with its optimizer, loss, metric and callbacks.
# NOTE(review): the loss keyword below is spelled `loss_functon` (no "i"),
# unlike `metric_function` — confirm it matches the Trainer signature.
trainer = Trainer(
    model=model,
    optimizer=optimizers.AdamW(lr=1e-2),
    loss_functon=losses.Crossentropy(),
    metric_function=metrics.Accuracy(),
    # presumably decays the learning rate from 1e-2 towards lr_min until
    # epoch 5, which is also the number of training epochs — TODO confirm
    callbacks=[callbacks.lr_decay.CosineLR(lr_min=1e-3, until_epoch=5)]
)

In [None]:
# presumably makes the layers keep their outputs (`.y`) so that the analysis
# cells in Step 6 can read activations and their gradients — TODO confirm
model.retain_values = True
# Train for 5 epochs with a batch size of 256, passing the validation split as `val_data`.
history = trainer.train(X_train, y_train, epochs=5, verbose=2, val_data=(X_val, y_val), batch_size=256)

In [None]:
# !pip install matplotlib

In [None]:
import matplotlib.pyplot as plt

def plot_history(losses, scores, label):
    """Plot a loss curve and a score curve in one figure.

    Args:
        losses: Sequence of loss values; plotted against 1..len(losses).
        scores: Sequence of score values; plotted against 1..len(scores).
        label: Prefix used in the figure title and the legend entries.
    """
    plt.figure(figsize=(10, 3))
    # Both curves are drawn the same way, each against its own 1-based x axis.
    for series in (losses, scores):
        steps = cp.arange(start=1, stop=len(series) + 1).to_numpy()
        plt.plot(steps, series, linewidth=1)
    plt.title(f"{label} history")
    plt.legend([f"{label}_loss", f"{label}_score"])
    plt.grid(color="gray", linestyle="--", linewidth=0.5)

In [None]:
plot_history(trainer.state["train_losses"], trainer.state["train_scores"], "train")

In [None]:
plot_history(trainer.state["val_losses"], trainer.state["val_scores"], "val")

### Step 6: Analyze the model
Using different plots, the model's performance and training behaviour can be analyzed (e.g. checking for overfitting).

If the `normal` weight initialization method is used, the **tanh** activations get saturated very fast and the gradients "die out". Using other initializers, such as `kaiming_normal`, counteracts this behaviour. Furthermore, the initial loss is lower and the model therefore does not waste time correcting unnecessarily high weight values in the beginning. (Analysis inspired by Andrej Karpathy - highly recommend checking out his videos on YouTube.)

In [None]:
import matplotlib.pyplot as plt
import numpy

def plot_distrbution(ys):
    """Plot the value distribution of several arrays in one figure.

    For every entry, the mean and standard deviation are printed and a
    density histogram with `n_hidden` bins is drawn as a line.

    Args:
        ys: Iterable of `(label, array)` pairs; `label` is used in the
            printed statistics and in the legend.
    """
    plt.figure(figsize=(15, 3))
    legends = []
    for label, array in ys:
        mean = array.mean()
        std = array.std()
        print(f"{label:10s} | mean {mean:9.4f} | std {std:9.4f}")
        density, edges = numpy.histogram(array, bins=n_hidden, density=True)
        # histogram returns one more edge than bins; drop the last edge so
        # every density value is plotted at the left edge of its bin.
        plt.plot(edges[:-1], density, linewidth=1)
        legends.append(f"{label:s}")
    plt.grid(color="gray", linestyle="--", linewidth=0.5)
    plt.legend(legends)

##### Weights

In [None]:
# Collect (label, weights as NumPy array) for every Linear layer; the label is
# the class name followed by the layer's position in the container.
lin_weights = []
for idx, module in enumerate(model.child_modules[0].child_modules):
    kind = module.__class__.__name__
    if kind == "Linear":
        lin_weights.append((f"{kind}{idx}", module.w.cpu().to_numpy()))
plot_distrbution(lin_weights)

##### Activations

In [None]:
# Collect (label, output as NumPy array) for every Tanh layer; the label is
# the class name followed by the layer's position in the container.
tanh_activations = []
for idx, module in enumerate(model.child_modules[0].child_modules):
    kind = module.__class__.__name__
    if kind == "Tanh":
        tanh_activations.append((f"{kind}{idx}", module.y.cpu().to_numpy()))
plot_distrbution(tanh_activations)

In [None]:
# For every Tanh layer, build a boolean mask that is True wherever the
# absolute value of the output exceeds 0.99.
tanh_saturations = []
for idx, module in enumerate(model.child_modules[0].child_modules):
    kind = module.__class__.__name__
    if kind == "Tanh":
        saturated = module.y.cpu().abs().to_numpy() > 0.99
        tanh_saturations.append((f"{kind}{idx}", saturated))

# Show each mask as its own gray-scale image, without axis ticks or tick labels.
for label, mask in tanh_saturations:
    plt.figure(figsize=(4, 4))
    plt.imshow(mask, cmap="gray")
    plt.xlabel(label)
    plt.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)
    plt.show()

As mentioned, the gradients of saturated neurons get very close to zero. If that happens for all batches, then the neuron is not learning and it is considered dead (white pixels in the plot). By using the Kaiming He initialization method this can be reduced.

##### Gradients

In [None]:
# Collect (label, weight gradient as NumPy array) for every Linear layer.
# The gradient is moved with .cpu() before conversion, as the weight and
# activation cells do, so this cell treats device placement the same way.
weight_gradients = [
    (f"{l.__class__.__name__}{i}", l.w.grad.cpu().to_numpy())
    for i, l in enumerate(model.child_modules[0].child_modules)
    if l.__class__.__name__ == "Linear"
]
plot_distrbution(weight_gradients)

In [None]:
# Collect (label, output gradient as NumPy array) for every Tanh layer.
# The gradient is moved with .cpu() before conversion, as the weight and
# activation cells do, so this cell treats device placement the same way.
activation_gradients = [
    (f"{l.__class__.__name__}{i}", l.y.grad.cpu().to_numpy())
    for i, l in enumerate(model.child_modules[0].child_modules)
    if l.__class__.__name__ == "Tanh"
]
plot_distrbution(activation_gradients)