In [None]:
import walnut

# Example 6

### Character level language model

The goal of this model is to be able to generate text that is similar to the training data.

### Step 1: Prepare data
You will need to download the dataset from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt and save it as `tinyshakespeare.txt` in the *data* directory.

In [None]:
# Load the whole corpus as one lower-cased string.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open("data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read().lower()

To build a vocabulary of tokens, the `CharacterTokenizer` is used. In this step, the `fit()` method extracts the tokens from the previously imported data. Here, a token is represented by a single character.

In [None]:
from walnut.preprocessing.text import CharacterTokenizer

# build the character-level vocabulary from the loaded text
tknzr = CharacterTokenizer()
tknzr.fit(data)
# last expression of the cell, so the notebook displays the vocabulary size
tknzr.vocab_size

Next up, we need to build the training dataset. `num_samples` is the number of samples (input blocks) that are drawn from the data for the neural network. `block_size` defines how many characters are considered when trying to predict the following one.

In [None]:
# how many (context, target) pairs are drawn from the text
num_samples = 100_000

# number of characters in the context used to predict the next character
block_size = 8

To use the dataset for training, the following steps must be performed:
- choose random samples from the data by randomly selecting a sequence of `block_size` (here 8) characters as the input and the character that follows it (here the 9th) as the target
- samples and characters must then be encoded using the tokenizer
- all samples must then be one-hot-encoded

In [None]:
import numpy as np
from walnut import Tensor
from walnut.preprocessing.encoding import list_one_hot_encode

# Pre-allocate zero-filled buffers for the one-hot encoded samples:
# one row per sample, and for the inputs one slot per context position.
X_array = np.zeros((num_samples, block_size, tknzr.vocab_size))
Y_array = np.zeros((num_samples, tknzr.vocab_size))

# Draw the start position of every block. The exclusive upper bound leaves
# room for the block itself plus the target character right after it.
rand_indices = np.random.randint(0, len(data) - block_size, (num_samples,))

for row, start in enumerate(rand_indices):
    stop = start + block_size

    # the block is the model input, the character following it is the target;
    # both are mapped to token indices by the tokenizer
    context_enc = tknzr.encode(data[start:stop])
    label_enc = tknzr.encode(data[stop])

    # one-hot encode the indices and store them in the buffers
    X_array[row] = list_one_hot_encode(context_enc, tknzr.vocab_size)
    Y_array[row] = list_one_hot_encode(label_enc, tknzr.vocab_size)

# wrap the filled buffers as integer tensors
X = Tensor(X_array, dtype="int")
Y = Tensor(Y_array, dtype="int")

In [None]:
# Split boundaries: the first 80 % of the samples are used for training,
# the next 10 % for validation and the remainder for testing.
n1 = int(num_samples * 0.8)
n2 = int(n1 + num_samples * 0.1)

x_train, x_val, x_test = X[:n1], X[n1:n2], X[n2:]
y_train, y_val, y_test = Y[:n1], Y[n1:n2], Y[n2:]

# show the resulting shapes as a sanity check
print(f"{x_train.shape=}")
print(f"{y_train.shape=}")
print(f"{x_val.shape=}")
print(f"{y_val.shape=}")
print(f"{x_test.shape=}")
print(f"{y_test.shape=}")

### Step 2: Build the neural network structure

As our first layer, an `Embedding` is used. It assigns each token an n-dimensional vector. The vector's components are learned and updated during the training process.

In [None]:
import walnut.nn as nn

# Sequential network: an embedding layer, a layer normalization, four hidden
# Linear layers and an output Linear layer with one unit per vocabulary entry.
model = nn.Sequential(layers=[
    # input is one one-hot vector per context character;
    # NOTE(review): 30 is presumably the embedding dimension -- confirm against the walnut API
    nn.layers.Embedding(30, input_shape=(block_size, tknzr.vocab_size)),
    nn.layers.Layernorm(),
    # hidden layers, each configured with 100, act="tanh" and norm="layer"
    nn.layers.Linear(100, act="tanh", norm="layer"),
    nn.layers.Linear(100, act="tanh", norm="layer"),
    nn.layers.Linear(100, act="tanh", norm="layer"),
    nn.layers.Linear(100, act="tanh", norm="layer"),
    # output layer sized to the vocabulary, with a softmax activation
    nn.layers.Linear(tknzr.vocab_size, act="softmax", norm="layer")
])

In [None]:
# configure the model with the Adam optimizer (l_r=1e-3), the cross-entropy loss and the accuracy metric
model.compile(nn.optimizers.Adam(l_r=1e-3), nn.losses.Crossentropy(), nn.metrics.Accuracy())

In [None]:
# last expression of the cell, so the notebook displays the model's representation
model

### Step 3: Train the model

Since there are usually quite a large number of classes (=tokens) in language models, the training process is slower.

In [None]:
# Train on the training split for 10 epochs with batches of 32 samples,
# passing the validation split as val_data. Two histories are returned,
# one for training and one for validation.
# NOTE(review): reset_params=False presumably keeps the current parameters
# instead of re-initializing them before training -- confirm against the walnut API
train_hist, val_hist = model.train(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    verbose="reduced",
    val_data=(x_val, y_val),
    reset_params=False)

The training loss above only represents one batch. Below the model is evaluated on the entire training data.

In [None]:
# loss and accuracy measured on the complete training split
loss, accuracy = model.evaluate(x_train, y_train)
print(f"training loss {loss:.4f}")
print(f"training accuracy {accuracy:.4f}")

Because `kaiming_he` is used as the initialization method, the activations do not saturate (i.e. they do not end up in the flat regions of the tanh function).

In [None]:
# Collect a copy of `y.data` from every Tanh layer, keyed by the layer's
# 1-based position in the model and its class name.
acts = {
    f"{pos + 1} {layer.__class__.__name__}": layer.y.data.copy()
    for pos, layer in enumerate(model.layers)
    if layer.__class__.__name__ == "Tanh"
}

# plot the collected values as distributions
nn.analysis.plot_distrbution(
    acts,
    figsize=(15, 3),
    title="activation distribution",
    bins=200,
)

Also, because layer normalization is used, the activation gradients follow a normal distribution, even in deeper layers.

In [None]:
# Collect a copy of `y.grad` from every Tanh layer, keyed by the layer's
# 1-based position in the model and its class name.
act_gradients = {
    f"{pos + 1} {layer.__class__.__name__}": layer.y.grad.copy()
    for pos, layer in enumerate(model.layers)
    if layer.__class__.__name__ == "Tanh"
}

# plot the collected gradients as distributions
nn.analysis.plot_distrbution(
    act_gradients,
    figsize=(15, 3),
    title="activation gradient distribution",
    bins=200,
)

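According to A. Karpathy (https://www.youtube.com/watch?v=P6sfmUTpUmc), the ratio between the size of the parameter updates and the size of the parameters themselves should be around 1e-3 for a suitable learning rate. The cell below prints the base-10 logarithm of this ratio (standard deviation of the updates divided by the standard deviation of the weights) for each linear layer, so values close to -3 are desirable. If the ratio is much higher or lower, the learning rate should be adjusted.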

In [None]:
import math

# For every Linear layer, print log10 of (std of the stored weight "delta")
# divided by (std of the weights). Other layer types are skipped.
for position, layer in enumerate(model.layers):
    if isinstance(layer, nn.layers.Linear):
        update_std = Tensor(layer.w.params["delta"]).std()
        weight_std = layer.w.std()
        val = (update_std / weight_std).log10().item()
        print(f"{layer.__class__.__name__} {position}: {val:.4f}")

### Step 4: Evaluate the model

In [None]:
# loss and accuracy measured on the held-out test split
loss, accuracy = model.evaluate(x_test, y_test)
print(f"loss {loss:.4f}")
print(f"accuracy {accuracy:.4f}")

# high score: accuracy 0.043 after 10000 epochs (25 min training time)

### Step 5: Generate text
To see the model in action, it is given a starting sequence. It is then used to generate $n$ characters, each time using the previously generated characters as input. The better the model is trained, the more sensible the output will be. This can take quite a lot of training though.

In [None]:
# Seed the context window with block_size copies of token index 1.
# NOTE(review): this assumes index 1 is the " " token of the fitted
# vocabulary -- confirm against the tokenizer.
context_list = [1] * block_size
context_enc = list_one_hot_encode(context_list, tknzr.vocab_size)
context = walnut.expand_dims(Tensor(context_enc, dtype="int"), 0)

# Generate 500 characters, feeding each prediction back into the context.
for i in range(500):
    pred = model(context)  # get model prediction for a character
    index = walnut.choice(pred)  # choose following character
    print(tknzr.decode([index]), end="")

    # Slide the window: drop the oldest token and append the new one.
    # Slicing also covers a window of length 1 (result is just [index]),
    # so no special case is needed.
    context_list = context_list[1:] + [index]
    context_enc = list_one_hot_encode(context_list, tknzr.vocab_size)
    context = walnut.expand_dims(Tensor(context_enc, dtype="int"), 0)