In [None]:
import walnut

# Example 6

### Character level language model

The goal of this model is to be able to generate text that is similar to the training data.

### Step 1: Prepare data
You will need to download the dataset from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt and save it as `tinyshakespeare.txt` in the *data* directory.

In [None]:
# Read the whole corpus into memory as a single string. The encoding is pinned
# so that the result does not depend on the platform's default locale.
with open("data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    data = f.read()

To build a vocabulary of tokens, the `CharacterTokenizer` is used. In this step, the `fit()` method extracts tokens from the previously imported data. Here, a token is represented by a single character.

In [None]:
from walnut.preprocessing.text import CharacterTokenizer

# Build the character-level vocabulary from the raw text.
tknzr = CharacterTokenizer()
tknzr.fit(data)
# Last expression of the cell: displays the size of the fitted vocabulary.
tknzr.vocab_size

Encode the data

In [None]:
# Encode the full text with the fitted tokenizer.
data_enc = tknzr.encode(data)
# Preview only the first 100 encoded entries instead of dumping everything.
data_enc[:100]

Next up, we need to build the training dataset. `num_samples` represents the number of input samples for the neural network. `block_size` defines how many characters are considered when trying to predict the following one.

In [None]:
num_samples = 100000  # number of (context, target) samples drawn from the text
block_size = 8  # number of context characters used to predict the next one

To use the dataset for training, the following steps must be performed:
- choose random samples from the data by randomly selecting a sequence of 8 characters for training and the 9th character as the target
- one-hot-encode tokens

In [None]:
import numpy as np
from walnut.preprocessing.encoding import one_hot_encode

# Fixed seed for a local generator, so that Restart & Run All draws the same
# training samples every time without touching NumPy's global random state.
SAMPLING_SEED = 42
rng = np.random.default_rng(SAMPLING_SEED)

# initialize tensors with zeros
X = walnut.zeros((num_samples, block_size, tknzr.vocab_size))
Y = walnut.zeros((num_samples, tknzr.vocab_size))

# Randomly choose the start indices of the blocks in the original data. The
# upper bound is exclusive, so index + block_size (the label position) is
# always a valid position.
# NOTE(review): the bound uses len(data) while data_enc is indexed; this relies
# on the tokenizer producing exactly one token per character - confirm.
rand_indices = rng.integers(0, len(data) - block_size, size=(num_samples,))

for i, index in enumerate(rand_indices):
    # get the context characters and the label (the character that follows them)
    context = data_enc[index : index + block_size]
    label = walnut.match_dims(data_enc[index + block_size], 1)

    # one-hot-encode indices and add to the tensors
    X[i] = one_hot_encode(context, tknzr.vocab_size).data
    Y[i] = one_hot_encode(label, tknzr.vocab_size).data

Train/val/test split

In [None]:
# Split the sampled data into 80 % training, 10 % validation and 10 % test
# samples. The samples were drawn at random positions of the text, so slicing
# them sequentially is sufficient; no additional shuffling is required.
n_train = int(0.8 * num_samples)
n_val = int(0.1 * num_samples)

x_train, y_train = X[:n_train], Y[:n_train]
x_val, y_val = X[n_train : n_train + n_val], Y[n_train : n_train + n_val]
# Everything after the validation slice is held out for the final evaluation.
x_test, y_test = X[n_train + n_val :], Y[n_train + n_val :]

print(f"{x_train.shape=}")
print(f"{y_train.shape=}")
print(f"{x_val.shape=}")
print(f"{y_val.shape=}")
print(f"{x_test.shape=}")
print(f"{y_test.shape=}")

### Step 2: Build the neural network structure

As our first layer, an `Embedding` is used. It assigns each token an n-dimensional vector. The vector's components are learned and updated during the training process.

In [None]:
import walnut.nn as nn
from walnut.nn.layers import *
from walnut.nn.inits import kaiming_normal, get_gain

vocab_size = tknzr.vocab_size
embed_dims = 10
n_hidden = 256

# Initial weights for the five bias-free hidden layers, drawn with the gain
# that belongs to their tanh activation. The first layer takes
# block_size*embed_dims inputs, the remaining four take n_hidden inputs.
gain = get_gain("tanh")
hidden_in_dims = [block_size*embed_dims] + 4 * [n_hidden]
wl1, wl2, wl3, wl4, wl5 = (
    kaiming_normal((fan_in, n_hidden), gain) for fan_in in hidden_in_dims
)

# Input stage: token embedding, normalization and reshaping for the linear layers.
model_layers = [
    Embedding(vocab_size, embed_dims),
    Layernorm((block_size, embed_dims)),
    Reshape(),
]

# Hidden stage: one Linear -> Layernorm -> Tanh group per prepared weight tensor.
for fan_in, w_init in zip(hidden_in_dims, (wl1, wl2, wl3, wl4, wl5)):
    model_layers += [
        Linear(fan_in, n_hidden, use_bias=False, weights=w_init),
        Layernorm((n_hidden,)),
        Tanh(),
    ]

# Output stage: project onto the vocabulary and apply softmax.
model_layers += [Linear(n_hidden, vocab_size), Softmax()]

model = nn.Sequential(model_layers)

In [None]:
# Configure the model for training: optimizer, loss function and metric are
# created first and then handed to compile() in that order.
optimizer = nn.optimizers.Adam(l_r=4e-3)
loss_fn = nn.losses.Crossentropy()
metric = nn.metrics.Accuracy()

model.compile(optimizer, loss_fn, metric)

In [None]:
# Last expression of the cell: show the model's representation in the notebook.
model

### Step 3: Train the model

Since there are usually quite a large number of classes (=tokens) in language models, the training process is slower.

In [None]:
# Training hyperparameters, passed to train() below.
epochs = 20000
batch_size = 64

# train() returns two loss histories, unpacked here as training and validation history.
train_loss_hist, val_loss_hist = model.train(x_train, y_train, epochs=epochs, batch_size=batch_size)

### Step 4: Evaluate the model

In [None]:
# Evaluate the trained model on x_test / y_test; evaluate() yields a (loss, accuracy) pair.
loss, accuracy = model.evaluate(x_test, y_test)
print(f'loss {loss:.4f}')
print(f'accuracy {100*accuracy:.2f}')  # accuracy is multiplied by 100 for display

# Reference result noted by the author: 49.00 (20000e, b8, n64, 10d)
# NOTE(review): presumably the accuracy value of an earlier run together with its settings - confirm.

### Step 5: Generate text
To see the model in action, it is given a starting sequence. Then it is used to generate $n$ characters, using the previous characters as input. The better the model is trained, the more sensible the output will be. This can take quite a lot of training though.

In [None]:
# Starting context: a window of block_size token indices, all set to 1.
context = walnut.ones((block_size,)).astype("int")

num_chars = 1000  # number of characters to generate

for _ in range(num_chars):
    # one-hot-encode the current window and add a leading (fake) batch dimension
    context_enc = walnut.expand_dims(one_hot_encode(context, tknzr.vocab_size), 0)

    # run the model on the window and let walnut.choice select an index from its prediction
    pred = model(context_enc)
    index = walnut.choice(pred)

    # decode the selected index and print it right away, without a line break
    next_char = tknzr.decode(walnut.expand_dims(index, 0))
    print(next_char, end="")

    # slide the window: append the new index at the end, then drop the oldest entry
    context = context.append(index, axis=0).astype("int")[1:]