In [3]:
import torch
import random

"\nThe MLP implemented in this notebook is based on the design from\nthe paper 'A Neural Probabilistic Language Model' (Bengio et al. 2003)\nhttps://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf\n\nKaiming initialisation is implemented based on 'Delving Deep into Rectifiers'\n(He et al. 2015) https://arxiv.org/pdf/1502.01852.pdf\n\nBatch normalisation is implemented based on the paper 'Batch\nNormalization: Accelerating Deep Network Training by Reducing Internal\nCovariate Shift' (Ioffe & Szegedy 2015) https://arxiv.org/pdf/1502.03167.pdf\n"

In [4]:
# Load the names, one per line. Empty entries are dropped: with a plain
# split('\n') a trailing newline in the file produces an empty "name", which
# would turn into a spurious '...' -> '.' training example.
with open('names.txt', 'r', encoding='utf-8') as f:
    text = [name for name in f.read().split('\n') if name]

# Vocabulary: '.' is the start/end-of-name token and always gets index 0.
# It is removed from the data-derived set so it can never appear twice.
chars = ['.'] + sorted(set(''.join(text)) - {'.'})
vocab_size = len(chars)

# Mapping from characters to integers and vice versa.
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = dict(enumerate(chars))

In [5]:
class Linear:
    """Fully connected layer computing ``x @ weights (+ bias)``.

    Weights are drawn from a standard normal distribution and scaled by
    ``1 / sqrt(nin)``; the bias, when enabled, starts at zero.

    Args:
        nin: Number of input features.
        nout: Number of output features.
        bias: Whether to add a bias vector to the output.
    """

    def __init__(self, nin: int, nout: int, bias: bool = True):
        kaiming_init = 1 / nin**0.5
        self.weights = torch.randn((nin, nout)) * kaiming_init
        self.bias = torch.zeros(nout) if bias else None

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """Applies the layer to ``x`` (last dimension must be ``nin``).

        The result is also stored on ``self.out`` so it can be inspected
        after the forward pass.
        """
        out = x @ self.weights
        if self.bias is not None:
            out = out + self.bias
        self.out = out
        return self.out

    def parameters(self) -> list[torch.Tensor]:
        """Returns the trainable tensors: the weights and, if present, the bias."""
        if self.bias is None:
            return [self.weights]
        return [self.weights, self.bias]


class BatchNorm1D:
    """Batch normalisation layer.

    In training mode the input is normalised with the statistics of the
    current batch (taken over dimension 0) and exponential running averages
    of those statistics are updated. In evaluation mode (``training = False``)
    the running averages are used instead.

    Args:
        dims: Number of features being normalised.
        epsilon: Small constant added to the variance for numerical stability.
        momentum: Weight of the current batch in the running-average update.
    """

    def __init__(self, dims: int, epsilon: float = 1e-5, momentum: float = 0.1):
        self.epsilon = epsilon
        self.momentum = momentum
        self.training = True
        # Learnable scale and shift (these are what parameters() returns).
        self.gamma = torch.ones(dims)
        self.beta = torch.zeros(dims)
        # Running statistics: updated in the forward pass, not by gradients.
        # The 'varience' spelling is kept because the attribute name is part
        # of the class's visible interface.
        self.mean_live = torch.zeros(dims)
        self.varience_live = torch.ones(dims)

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """Normalises ``x`` and applies the learnable scale and shift.

        Args:
            x: Input tensor whose first dimension is the batch
                (assumed shape ``(batch, dims)``).

        Returns:
            The normalised tensor, also stored on ``self.out``.

        Raises:
            ValueError: In training mode when the batch holds fewer than two
                examples; the batch variance is then undefined and would turn
                the running variance into NaN.
        """
        if self.training:
            if x.shape[0] < 2:
                raise ValueError(
                    'BatchNorm1D needs at least 2 examples per batch in training mode, '
                    f'got {x.shape[0]}'
                )
            x_mean = x.mean(0, keepdim=True)  # Batch mean.
            # Batch variance; torch's default applies Bessel's correction.
            x_variance = x.var(0, keepdim=True)
        else:
            x_mean = self.mean_live
            x_variance = self.varience_live
        # Normalise to zero mean and unit variance, then scale and shift.
        x_norm = (x - x_mean) / torch.sqrt(x_variance + self.epsilon)
        self.out = self.gamma * x_norm + self.beta
        # Update the running statistics.
        if self.training:
            with torch.no_grad():
                # squeeze(0) drops the kept batch dimension so the running
                # statistics keep the shape they were initialised with.
                self.mean_live = (
                    (1 - self.momentum) * self.mean_live
                    + self.momentum * x_mean.squeeze(0)
                )
                self.varience_live = (
                    (1 - self.momentum) * self.varience_live
                    + self.momentum * x_variance.squeeze(0)
                )
        return self.out

    def parameters(self) -> list[torch.Tensor]:
        """Returns the trainable tensors: gamma (scale) and beta (shift)."""
        return [self.gamma, self.beta]

class Tanh:
    """Element-wise tanh activation; has no trainable parameters."""

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """Applies tanh to ``x`` and stores the result on ``self.out``."""
        self.out = torch.tanh(x)
        return self.out

    def parameters(self) -> list[torch.Tensor]:
        """Returns an empty list: the activation has nothing to train."""
        return []

In [6]:
# Hyperparameters
# -- Data and model shape.
block_size = 3           # Context length (characters).
embedding_dims = 10      # Number of dimensions for the embedding space.
hidden_layer_size = 100  # Number of neurons in the hidden layer.
# -- Optimisation.
batch_size = 32          # Number of examples to process at a time in training.
max_steps = 200_000      # Total number of training steps.
init_lr = 0.1            # Initial learning rate.
final_lr = 0.01          # Final learning rate.

def build_dataset(text: list[str], verbose: bool = False) -> tuple[torch.Tensor, torch.Tensor]:
    """Builds the (context, next character) pairs used to train the model.

    Every word is terminated with '.', and each of its characters becomes a
    target whose input is the `block_size` characters preceding it (padded
    with index 0, i.e. '.', at the start of the word).

    Args:
        text: The words to encode.
        verbose: When True, print every word and every context/target pair.
            Off by default: on a full dataset this floods the notebook output.

    Returns:
        A tuple ``(X, Y)`` of integer tensors: ``X`` has shape
        ``(num_examples, block_size)`` and ``Y`` has shape ``(num_examples,)``.
    """
    X, Y = [], []
    for word in text:
        context = [0] * block_size # Padding the context with initial '.' characters
        if verbose:
            print(f'word: {word}')
        for char in word + '.':
            ix = char_to_int[char]
            X.append(context)
            Y.append(ix)
            if verbose:
                print(f'context: {"".join(int_to_char[i] for i in context)} -> target: {int_to_char[ix]}')
            context = context[1:] + [ix] # Slide the window: drop the oldest index, append the newest.
    # Explicit dtype and shape keep the result well-formed for empty input too.
    inputs = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
    targets = torch.tensor(Y, dtype=torch.long)
    return inputs, targets

# Shuffle a copy of the names with a dedicated, seeded generator: the split is
# then reproducible, and re-running this cell neither reshuffles `text` in
# place nor changes which names land in which split.
SPLIT_SEED = 42
names_shuffled = random.Random(SPLIT_SEED).sample(text, len(text))

# 80% train / 10% validation / 10% test.
n1 = int(0.8 * len(names_shuffled))
n2 = int(0.9 * len(names_shuffled))

x_train, y_train = build_dataset(names_shuffled[:n1])
x_val, y_val = build_dataset(names_shuffled[n1:n2])
x_test, y_test = build_dataset(names_shuffled[n2:])

In [7]:
# Seed torch's global RNG so that the initialisation below (and anything run
# after it on a fresh Restart & Run All) is reproducible.
MODEL_SEED = 2147483647
torch.manual_seed(MODEL_SEED)

# Embedding matrix: one row of `embedding_dims` values per character.
C = torch.randn((vocab_size, embedding_dims))

# MLP: a stack of Linear -> BatchNorm1D -> Tanh blocks, followed by a
# Linear -> BatchNorm1D output block that produces one logit per character.
# NOTE(review): every Linear here feeds a BatchNorm1D, whose mean subtraction
# cancels a constant per-feature bias during training, so bias=False would
# likely save parameters without changing the model. Left as-is so the
# parameter count stays unchanged.
n_hidden_blocks = 5
layer_widths = [embedding_dims * block_size] + [hidden_layer_size] * n_hidden_blocks
layers = []
for fan_in, fan_out in zip(layer_widths[:-1], layer_widths[1:]):
    layers += [Linear(fan_in, fan_out), BatchNorm1D(fan_out), Tanh()]
layers += [Linear(hidden_layer_size, vocab_size), BatchNorm1D(vocab_size)]

# Initialisations.
with torch.no_grad():
    layers[-1].gamma *= 0.1 # Make last layer less confident.
    for layer in layers[:-1]: # Every Linear layer gets the 5/3 gain.
        if isinstance(layer, Linear):
            layer.weights *= 5/3

# Collect everything that is trained and switch on gradient tracking.
params = [C] + [param for layer in layers for param in layer.parameters()]
for param in params:
    param.requires_grad = True

print(f'Number of parameters: {sum(param.nelement() for param in params)}')

Number of parameters: 47551


In [8]:
# Training loop: minibatch stochastic gradient descent with a single drop in
# learning rate halfway through.
lr_switch_step = max_steps / 2  # From this step onwards final_lr is used.
for i in range(max_steps):

    # Draw the indices of a random minibatch.
    idx = torch.randint(0, x_train.shape[0], (batch_size, ))
    x_batch, y_batch = x_train[idx], y_train[idx]

    # Forward pass: embed the characters, flatten each context into one
    # vector, then run it through the MLP.
    embedding = C[x_batch]
    x = embedding.view(embedding.shape[0], -1)
    for layer in layers:
        x = layer(x)
    loss = torch.nn.functional.cross_entropy(x, y_batch)

    # Backward pass: clear the previous gradients and ask autograd to keep
    # the gradients of each layer's output before back-propagating.
    for param in params:
        param.grad = None
    for layer in layers:
        layer.out.retain_grad()
    loss.backward()

    # Parameter update with the learning rate for this step.
    lr = final_lr if i >= lr_switch_step else init_lr
    for param in params:
        param.data -= lr * param.grad

In [9]:
def split_loss(split: str) -> None:
    """Evaluates the model on the chosen split and prints the loss.

    The forward pass runs under ``torch.no_grad()``: the loss is only
    reported, so no autograd graph needs to be built over the whole split.

    Call this only after the layers have been switched to evaluation mode;
    a BatchNorm1D layer still in training mode would normalise with (and
    update its running statistics from) the evaluation data.

    Args:
        split: One of 'train', 'val' or 'test'.

    Raises:
        KeyError: If ``split`` is not one of the names above.
    """
    x, y = {
        'train': (x_train, y_train),
        'val': (x_val, y_val),
        'test': (x_test, y_test)
    }[split]

    with torch.no_grad():
        # Forward pass.
        embedding = C[x]
        x = embedding.view(embedding.size(0), -1)
        for layer in layers:
            x = layer(x)
        loss = torch.nn.functional.cross_entropy(x, y)
    print(f'{split.capitalize()} Loss: {loss.item()}')

In [10]:
# Put every layer into evaluation mode before measuring the held-out loss.
for layer in layers:
    layer.training = False

# Report the test loss first, then the validation loss.
for split_name in ('test', 'val'):
    split_loss(split_name)

Test Loss: 2.0837438106536865
Val Loss: 2.0869650840759277


In [11]:
# Sample from the model.
# Relies on the layers already being in evaluation mode: in training mode
# BatchNorm1D would compute its statistics from this single-example batch.
with torch.no_grad():  # Inference only, so no autograd graph is needed.
    for _ in range(5):
        out = []
        context = [0] * block_size # Initialise context to '...'

        while True:
            # Forward pass.
            embedding = C[torch.tensor([context])]
            x = embedding.view(embedding.size(0), -1)
            for layer in layers:
                x = layer(x)
            logits = x
            probs = torch.nn.functional.softmax(logits, dim=1)

            # Sample from the distribution.
            idx = torch.multinomial(probs, num_samples=1, replacement=True).item()
            context = context[1:] + [idx] # Shift the context window.
            # Decode with the notebook's own mapping; the previous `itos`
            # name is not defined anywhere in this notebook.
            out.append(int_to_char[idx])
            if idx == 0: # If we sample '.', stop.
                break
        print(''.join(out))

yannayvin.
mala.
laii.
sabishah.
greigholline.
