In [None]:
import compyute as cp

In [None]:
# Run on the GPU when compyute reports one as available; otherwise use the CPU.
if cp.engine.gpu_available():
    device = "cuda"
else:
    device = "cpu"
# Bare expression so the notebook displays the selected device.
device

# Example 5.2

### Language Model: Neural network

The bigram model predicts the next character by looking only at the previous one. Predictions improve when more than a single character is taken into account. In this example, a neural network is used that considers multiple characters for each prediction.

### Step 1: Prepare data
Like in the bigram model, the tinyshakespeare dataset is used. (https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt)

In [None]:
# Read the tinyshakespeare text file and keep only its first 100,000 characters.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale encoding.
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read()[:100000]

### Step 2: Tokenization

This time, a Byte-Pair-Encoding tokenizer is used to allow for more information to be passed into the neural net without increasing the context size.

In [None]:
from compyute.preprocessing.text import BPETokenizer

# Create the byte-pair-encoding tokenizer; it is fitted or restored from a
# saved state in the following cells.
tokenizer = BPETokenizer()

In [None]:
# train new tokenizer: fit it on the loaded text with vocab_size=1024, then
# save its state dict to "tokenizer.cp" so it can be loaded again later
tokenizer.fit(data, vocab_size=1024)
cp.save(tokenizer.get_state_dict(), "tokenizer.cp")

In [None]:
# load tokenizer: read the state stored in "tokenizer.cp" and restore it into
# the existing tokenizer instance
tokenizer_state = cp.load("tokenizer.cp")
tokenizer.load_state_dict(tokenizer_state)

In [None]:
# Display the tokenizer's vocabulary size.
tokenizer.vocab_size

In [None]:
# Encode the text with the tokenizer and display the length of the encoded sequence.
data_enc = tokenizer.encode(data)
len(data_enc)

### Step 3: Build dataset
In this example a larger `block_size` is now used.

In [None]:
# Number of consecutive encoded tokens that make up one input sample.
block_size = 32

In [None]:
# Cut the encoded text into non-overlapping windows of `block_size` tokens (X).
# The target of each window is the single token that directly follows it (y).
# A window is only usable if that following token exists, so the sample count
# is (len - 1) // block_size. Using len // block_size indexes one element past
# the end whenever len(data_enc) is an exact multiple of block_size.
n_samples = (len(data_enc) - 1) // block_size
X = cp.stack([data_enc[i * block_size : (i + 1) * block_size] for i in range(n_samples)])
y = cp.stack([data_enc[(i + 1) * block_size] for i in range(n_samples)])

# Shuffle the samples; the second value returned by shuffle is used to index y
# so that inputs and targets stay paired.
X, idx = cp.random.shuffle(X)
y = y[idx]

# The first 90% of the shuffled samples are used for training, the rest for validation.
n = int(len(X) * 0.90)

X_train = X.to_int()[:n]
y_train = y.to_int()[:n]
X_val = X.to_int()[n:]
y_val = y.to_int()[n:]

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_val.shape=}")
print(f"{y_val.shape=}")

### Step 4: Build the neural network structure

As our first layer, again, an `Embedding` layer is used. It is followed by a stack of linear layers.

In [None]:
import compyute.nn as nn

# Network sizes: the output covers the tokenizer vocabulary, every token is
# embedded into `embed_dims` values and each hidden block has `n_hidden` units.
vocab_size = tokenizer.vocab_size
embed_dims = 10
n_hidden = 128

# Embedding -> flatten -> five tanh DenseBlocks -> linear output over the vocabulary.
model = nn.Sequential(
    nn.Embedding(vocab_size, embed_dims),
    nn.Flatten(),
    # The first hidden block takes the flattened embeddings of one input window.
    nn.DenseBlock(block_size * embed_dims, n_hidden, activation="tanh", weight_init="kaiming_normal"),
    # Four further hidden blocks of identical shape, created in order.
    *(
        nn.DenseBlock(n_hidden, n_hidden, activation="tanh", weight_init="kaiming_normal")
        for _ in range(4)
    ),
    nn.Linear(n_hidden, vocab_size),
)
model.to_device(device)

In [None]:
# Build a summary of the model for an int32 input of shape (block_size,) and print it.
summary = cp.nn.utils.get_module_summary(model, input_shape=(block_size,), input_dtype="int32")
print(summary)

### Step 5: Train the model

To avoid overfitting the model, the `EarlyStopping`-Callback can be used.

In [None]:
from compyute.nn import losses, metrics, optimizers
from compyute.nn.trainer import Trainer
from compyute.nn.trainer.callbacks import History, ProgressBar

# Callback instance kept in its own variable (presumably so that the recorded
# values can be inspected after training — confirm against the compyute docs).
history = History()

# Trainer setup: Adam optimizer with default arguments, cross-entropy loss and
# accuracy metric; the callbacks are the History instance above and a progress bar.
trainer = Trainer(
    model=model,
    optimizer=optimizers.Adam(),
    loss=losses.CrossEntropy(),
    metric=metrics.Accuracy(),
    callbacks=[history,ProgressBar()]
)

In [None]:
# Train for one epoch with a batch size of 32, passing (X_val, y_val) as validation data.
# NOTE(review): retain_values() presumably keeps module outputs and their gradients
# (`l.y`, `l.y.grad`) available after training, which the analysis cells read — confirm.
with model.retain_values():
    trainer.train(X_train, y_train, epochs=1, val_data=(X_val, y_val), batch_size=32)

### Step 6: Analyze the model
Using different plots, the model's performance and training behaviour can be analyzed (e.g. checking for overfitting).

In [None]:
# !pip install matplotlib

In [None]:
import matplotlib.pyplot as plt

def plot_distrbution(ys):
    """Plot the value distributions of several tensors in a single figure.

    ``ys`` is an iterable of ``(label, tensor)`` pairs. For each pair the mean
    and standard deviation are printed and a density histogram with
    ``n_hidden`` bins (module-level variable) is drawn as a line. The labels
    are used as the figure legend.
    """
    plt.figure(figsize=(20, 4))
    names = []
    for label, tensor in ys:
        mean, std = cp.mean(tensor).item(), cp.std(tensor).item()
        print(f"{label:10s} | mean {mean:9.4f} | std {std:9.4f}")
        density, edges = cp.histogram(tensor, bins=n_hidden, density=True)
        # The last edge is dropped so x and y line up (histogram presumably
        # returns one more edge than bins, as in NumPy — confirm).
        plt.plot(edges[:-1], density, linewidth=1)
        names.append(f"{label:s}")
    plt.legend(names)

##### Weights

In [None]:
# Collect every model parameter with more than one axis (1-D parameters, presumably
# biases, are skipped), labelled with its index in model.get_parameters() and moved
# to the CPU; then plot their distributions.
lin_weights = [(str(i), p.to_cpu()) for i, p in enumerate(model.get_parameters()) if p.n_axes > 1]
plot_distrbution(lin_weights)

##### Activations

In [None]:
# Outputs (`l.y`) of all modules labelled "DenseBlock", moved to the CPU and named
# by label plus module index. The last module is sliced off before filtering.
tanh_activations = [
    (f"{l.label}{i}", l.y.to_cpu())
    for i, l in enumerate(model.modules[:-1])
    if l.label == "DenseBlock"
]
plot_distrbution(tanh_activations)

In [None]:
# One boolean mask per DenseBlock: True where the absolute output exceeds 0.99.
tanh_saturations = [
    (f"{l.label}{i}", cp.abs(l.y.to_cpu()) > 0.99)
    for i, l in enumerate(model.modules)
    if l.label == "DenseBlock"
]

# Show each mask as a grayscale image in its own figure, with the block name as
# the x-axis label and all ticks and tick labels hidden.
for label, image in tanh_saturations:
    plt.imshow(image, cmap="gray")
    plt.xlabel(label)
    plt.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)
    plt.show()

As mentioned, the gradients of saturated neurons get very close to zero. If that happens for all batches, the neuron is not learning and is considered dead (white pixels in the plot).

##### Gradients

In [None]:
# Gradients (`p.grad`) of every model parameter with more than one axis, labelled with
# the parameter's index and moved to the CPU; then plot their distributions.
lin_weight_grads = [(str(i), p.grad.to_cpu()) for i, p in enumerate(model.get_parameters()) if p.n_axes > 1]
plot_distrbution(lin_weight_grads)

In [None]:
# Gradients stored on the outputs (`l.y.grad`) of all modules labelled "DenseBlock",
# named by label plus module index. They are moved to the CPU before plotting, as in
# the other analysis cells, which all call .to_cpu() on the tensors they plot.
activation_gradients = [
    (f"{l.label}{i}", l.y.grad.to_cpu())
    for i, l in enumerate(model.modules[:-1])
    if l.label == "DenseBlock"
]
plot_distrbution(activation_gradients)