In this notebook I experiment with the model's hyperparameters.

In [None]:
import math
import time
import torch
from tempfile import TemporaryDirectory
from typing import Tuple
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [None]:
# Prepare the data set
# --------------------

def read_text(path, encoding='utf-8'):
  """Return the entire contents of the text file at ``path`` as one string.

  Arguments:
      path: location of the file to read
      encoding: text encoding used to decode the file. Defaults to UTF-8 so
          the result does not depend on the platform's locale encoding.

  Returns:
      str holding the full file contents
  """
  with open(path, 'r', encoding=encoding) as f:
    return f.read()
  
# Books that make up the training corpus
text_paths = [
  "../a06_RNN_language_model/animal_farm.txt",
  "../a22_transformer_larger_text/books/alice-s-adventures-in-wonderland.txt",
  "../a22_transformer_larger_text/books/a_room_with_a_view_forster.txt",
  "../a22_transformer_larger_text/books/the_enchanted_april_elizabeth_von_arnim.txt"
]

# Load every book and glue them together, separated by a single space
texts = list(map(read_text, text_paths))
text_train = ' '.join(texts)

print("Text size: ", len(text_train))

# Character-level vocabulary: each distinct character once, in sorted order
vocab = sorted(set(text_train))

# Lookup tables: id -> character and character -> id
int_to_char = dict(enumerate(vocab))
char_to_int = {c: i for i, c in int_to_char.items()}

# Encode the whole corpus as a 1-D tensor of character ids
train_data = torch.tensor([char_to_int[c] for c in text_train])

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def batchify(data: Tensor, bsz: int) -> Tensor:
    """Lay a flat token stream out as ``bsz`` parallel columns.

    The stream is cut into ``bsz`` equally long consecutive pieces; trailing
    elements that do not fill a whole piece are dropped. Each piece becomes
    one column of the result, which is moved to ``device``.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size (number of columns)

    Returns:
        Tensor of shape ``[N // bsz, bsz]``
    """
    rows = data.size(0) // bsz
    usable = data[:rows * bsz]
    # One piece per row first, then transpose so that pieces run down the columns
    columns = usable.view(bsz, rows).transpose(0, 1).contiguous()
    return columns.to(device)

# Number of parallel character streams (columns) in the training tensor
batch_size = 50
# Reshape the flat id tensor into columns: shape ``[seq_len, batch_size]``
train_data = batchify(data=train_data, bsz=batch_size)


In [None]:
class TransformerModel(nn.Module):
    """Language model: token embedding, positional encoding, a stack of
    Transformer encoder layers, and a linear projection to per-token logits.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        """
        Arguments:
            ntoken: number of distinct tokens (embedding rows / output logits)
            d_model: embedding width used throughout the encoder
            nhead: value forwarded to ``TransformerEncoderLayer`` as ``nhead``
            d_hid: value forwarded to ``TransformerEncoderLayer`` as the
                feed-forward dimension
            nlayers: number of stacked encoder layers
            dropout: dropout probability given to the positional encoding and
                to every encoder layer
        """
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model, nhead, d_hid, dropout),
            nlayers,
        )
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Re-initialise the embedding and output projection: weights uniform
        in ``[-0.1, 0.1]``, output bias zero."""
        initrange = 0.1
        with torch.no_grad():
            self.encoder.weight.uniform_(-initrange, initrange)
            self.decoder.bias.zero_()
            self.decoder.weight.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        # Embeddings are scaled by sqrt(d_model) before positions are added
        embedded = self.encoder(src) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(encoded)


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Build an ``[sz, sz]`` mask holding ``-inf`` strictly above the main
    diagonal and ``0`` on and below it."""
    blocked = torch.full((sz, sz), float('-inf'))
    # Keep only the part above the diagonal; everything else becomes zero
    return blocked.triu(diagonal=1)
  
  
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position signal to its input, then applies dropout.

    A table ``pe`` of shape ``[max_len, 1, d_model]`` is precomputed and stored
    as a buffer: even feature indices hold sines, odd feature indices hold
    cosines of ``position * div_term``.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """
        Arguments:
            d_model: size of the feature dimension (even or odd)
            dropout: dropout probability applied after adding the encoding
            max_len: longest sequence length the table supports
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # shape [max_len, ceil(d_model / 2)]
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd index than even index, so
        # trim the angles to the number of cosine columns actually available.
        pe[:, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x``

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` the table was
                built with
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)} "
                "of the positional encoding"
            )
        x = x + self.pe[:seq_len]
        return self.dropout(x)

In [None]:
# Maximum number of time steps in one training window
bptt = 30

def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
  """Cut one input window and its next-step targets out of ``source``.

  The window starts at row ``i`` and spans at most ``bptt`` rows; it is
  shortened near the end so that every input row still has a following row
  to serve as its target.

  Args:
      source: Tensor indexed along its first dimension by time step
          (presumably the ``[seq_len, batch_size]`` output of ``batchify``)
      i: index of the first row of the window

  Returns:
      tuple ``(data, target)``: ``data`` is rows ``i .. i+window-1`` of
      ``source``; ``target`` is the same rows shifted forward by one,
      flattened to 1-D
  """
  window = min(bptt, len(source) - 1 - i)
  stop = i + window
  inputs = source[i:stop]
  labels = source[i + 1:stop + 1].reshape(-1)
  return inputs, labels

In [None]:
# Model hyperparameters
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 400  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 8  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 8  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability

# Build the model and move it to the selected device
model = TransformerModel(
    ntoken=ntokens,
    d_model=emsize,
    nhead=nhead,
    d_hid=d_hid,
    nlayers=nlayers,
    dropout=dropout,
).to(device)

# Total number of trainable parameters (shown as the cell's output)
sum(param.numel() for param in model.parameters() if param.requires_grad)

In [None]:
criterion = nn.CrossEntropyLoss()
lr = 0.001  # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

def train(model: nn.Module) -> float:
    """Run one pass over ``train_data`` and return the mean per-batch loss.

    The data is walked in windows of ``bptt`` rows. For every window the
    model is run with a causal mask, the cross-entropy loss is
    back-propagated, gradients are clipped to a norm of 0.5 and the optimizer
    takes one step.

    NOTE(review): the module-level ``optimizer`` is bound to the parameters of
    the module-level ``model``; passing a different model here would compute
    gradients for it but step the other model's parameters.

    Arguments:
        model: the network to train

    Returns:
        float, sum of the per-batch losses divided by the number of batches

    Raises:
        ValueError: if ``train_data`` has fewer than two rows, so no batch
            can be formed
    """
    model.train()  # turn on train mode
    full_mask = generate_square_subsequent_mask(bptt).to(device)
    total_loss = 0.0
    num_batches = 0

    for i in range(0, train_data.size(0) - 1, bptt):
        optimizer.zero_grad()
        data, targets = get_batch(train_data, i)
        seq_len = data.size(0)
        # The final window may be shorter than bptt; slice the mask to match
        # without overwriting the full-size mask.
        src_mask = full_mask[:seq_len, :seq_len]
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_data is too short to form a single batch")

    return total_loss / num_batches


In [None]:
epochs = 50
start_time = time.time()

# Train for a fixed number of epochs and report the mean loss after each one.
# Nothing is written to disk, so no temporary directory is needed.
for epoch in range(1, epochs + 1):
  train_loss = train(model)
  print(f'Epoch {epoch:3d}: {train_loss:5.4f}')


print(f"Training time: {round(time.time() - start_time)} seconds")


```
Epoch   1: 2.1162
Epoch   2: 1.7529
Epoch   3: 1.6641
Epoch   4: 1.6140
Epoch   5: 1.5809
Epoch   6: 1.5563
Epoch   7: 1.5358
Epoch   8: 1.5219
Epoch   9: 1.5088
Epoch  10: 1.4986
Epoch  11: 1.4891
Epoch  12: 1.4808
Epoch  13: 1.4737
Epoch  14: 1.4686
Epoch  15: 1.4626
Epoch  16: 1.4574
Epoch  17: 1.4527
Epoch  18: 1.4483
Epoch  19: 1.4441
Epoch  20: 1.4406
Epoch  21: 1.4366
Epoch  22: 1.4334
Epoch  23: 1.4306
Epoch  24: 1.4278
Epoch  25: 1.4235
Epoch  26: 1.4225
Epoch  27: 1.4200
Epoch  28: 1.4177
Epoch  29: 1.4151
Epoch  30: 1.4126
Epoch  31: 1.4115
Epoch  32: 1.4102
Epoch  33: 1.4069
Epoch  34: 1.4059
Epoch  35: 1.4037
Epoch  36: 1.4019
Epoch  37: 1.4001
Epoch  38: 1.3996
Epoch  39: 1.3981
Epoch  40: 1.3970
Epoch  41: 1.3961
Epoch  42: 1.3942
Epoch  43: 1.3924
Epoch  44: 1.3921
Epoch  45: 1.3913
Epoch  46: 1.3891
Epoch  47: 1.3891
Epoch  48: 1.3881
Epoch  49: 1.3867
Epoch  50: 1.3856
Training time: 360 seconds

```

In [None]:
def generate_sequence(model, initial_text, n_elements, temperature=1.0):
  """Sample ``n_elements`` characters from ``model``, continuing ``initial_text``.

  At every step the last ``bptt`` characters generated so far are fed to the
  model, the logits for the final position are divided by ``temperature``,
  and the next character is drawn from the resulting softmax distribution.

  Args:
      model: trained network, called as ``model(input, src_mask)``
      initial_text: prompt; every character must be a key of ``char_to_int``
      n_elements: number of characters to generate
      temperature: positive scaling factor for the logits; smaller values make
          the distribution sharper, larger values flatter

  Returns:
      str, the prompt followed by the generated characters

  Raises:
      ValueError: if ``temperature`` is not positive, or if ``initial_text``
          is empty while ``n_elements`` is positive
      KeyError: if ``initial_text`` contains a character missing from
          ``char_to_int``
  """
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  model.eval()  # Set the model to evaluation mode

  generated_sequence = [char_to_int[c] for c in initial_text]
  if n_elements > 0 and not generated_sequence:
    raise ValueError("initial_text must contain at least one character")

  # Full-size causal mask; each step takes a slice of it. It must not be
  # overwritten with a smaller slice, because the context grows from
  # len(initial_text) up to bptt and the mask has to grow with it.
  full_mask = generate_square_subsequent_mask(bptt).to(device)

  # Use torch.no_grad() to prevent gradient calculations during text generation
  with torch.no_grad():
    for _ in range(n_elements):
      # Last bptt ids as a column: shape [seq_len, 1] (batch of one)
      input_tensor = torch.tensor(generated_sequence[-bptt:]).unsqueeze(1).to(device)
      seq_len = input_tensor.size(0)
      src_mask = full_mask[:seq_len, :seq_len]

      # Evaluate the model
      output = model(input_tensor, src_mask)

      # Logits of the last position, scaled by temperature to control randomness
      logits = output[-1, 0, :] / temperature

      # Convert the logits into probabilities and sample the next element
      probabilities = torch.softmax(logits, dim=-1)
      next_element = torch.multinomial(probabilities, num_samples=1).item()

      generated_sequence.append(next_element)

  return ''.join(int_to_char[c] for c in generated_sequence)

generated = generate_sequence(model, "We are not like that", 300, 0.8)
print(generated)

## Sample output

We are not like that; and think they had personed again them to greet of the first London weeks might learn before the house she was never assument pieces and the other train from an opportunity, and the looked at the least thing with a solemnly cup of other way and got up it off on and rent down with the moment the fr
