## Prepare the dataset

In [None]:
# Import necessary packages
import torch
import torch.nn as nn
import math
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

# Create the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


# Read the text file.
# The encoding is given explicitly so decoding does not depend on the platform
# default (the text contains non-ASCII characters such as typographic quotes).
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open("../a06_RNN_language_model/animal_farm.txt", "r", encoding="utf-8") as f:
  text = f.read()

text = text[:5000]  # make the text shorter for testing

# Use the tokenizer to convert the text to tokens
tokens = tokenizer.tokenize(text)

# Convert tokens to IDs
int_text = tokenizer.convert_tokens_to_ids(tokens)

# Create input and target sequences: every target window is its input window
# shifted one token to the right (next-token prediction).
sequence_length = 300
num_windows = len(int_text) - sequence_length
if num_windows <= 0:
  # Fail here with a clear message instead of producing empty tensors that
  # only break later inside the training loop.
  raise ValueError(
    f"Text has {len(int_text)} tokens; more than sequence_length={sequence_length} are required"
  )

X = [int_text[i:i + sequence_length] for i in range(num_windows)]
y = [int_text[i + 1:i + sequence_length + 1] for i in range(num_windows)]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Convert lists to tensors of shape [num_windows, sequence_length] on the target device
X = torch.tensor(X, dtype=torch.long).to(device)
y = torch.tensor(y, dtype=torch.long).to(device)


## Create Transformer model

In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position encodings to sequence-first inputs, then applies dropout.

  Arguments:
      d_model: size of the last (feature) dimension of the inputs; may be even or odd.
      dropout: dropout probability applied after the encodings are added.
      max_len: number of positions for which encodings are precomputed.

  The encodings are stored in the non-trainable buffer ``pe`` of shape
  ``[max_len, 1, d_model]`` so they broadcast over the batch dimension.
  """

  def __init__(self, d_model, dropout=0.1, max_len=5000):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)

    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    angles = position * div_term  # [max_len, ceil(d_model / 2)]

    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one cosine column fewer than sine columns,
    # so the angles are trimmed to the number of odd-indexed feature slots.
    pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
    # Insert a singleton batch axis: [max_len, d_model] -> [max_len, 1, d_model]
    self.register_buffer('pe', pe.unsqueeze(1))

  def forward(self, x):
    """
    Arguments:
        x: Tensor, shape ``[seq_len, batch_size, d_model]``

    Returns:
        Tensor of the same shape as ``x``: ``x`` plus the encodings of the
        first ``seq_len`` positions, passed through dropout.
    """
    x = x + self.pe[:x.size(0), :]
    return self.dropout(x)


class TransformerModel(nn.Module):
  """Token-level language model: embedding -> positional encoding -> Transformer encoder -> vocabulary logits."""

  def __init__(self, vocab_size, d_model, num_heads, hidden_dim, num_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoder = PositionalEncoding(d_model, dropout)

    encoder_layer = nn.TransformerEncoderLayer(
      d_model=d_model,
      nhead=num_heads,
      dim_feedforward=hidden_dim,
      dropout=dropout,
    )
    self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

    self.output_layer = nn.Linear(d_model, vocab_size)
    self.init_weights()

  def init_weights(self):
    """Re-initialise embedding and output weights uniformly in [-0.1, 0.1] and zero the output bias."""
    initrange = 0.1
    with torch.no_grad():
      self.embedding.weight.uniform_(-initrange, initrange)
      self.output_layer.bias.zero_()
      self.output_layer.weight.uniform_(-initrange, initrange)

  def forward(self, x, mask=None):
    """
    Arguments:
        x: Tensor of token ids, shape ``[seq_len, batch_size]``
        mask: optional Tensor, shape ``[seq_len, seq_len]``, passed to the encoder as its attention mask

    Returns:
        output Tensor of shape ``[seq_len, batch_size, ntoken]``
    """
    embedded = self.embedding(x)
    positioned = self.pos_encoder(embedded)
    hidden = self.transformer_encoder(positioned, mask)
    logits = self.output_layer(hidden)
    return logits

## Instantiate the Transformer model

In [None]:
# Set the hyperparameters
vocab_size = len(tokenizer.get_vocab())
print(vocab_size)
print(len(set(int_text)))
d_model = 128
num_heads = 2
num_layers = 1
hidden_dim = 128
dropout = 0.1
learning_rate = 0.001
batch_size = 10

# Instantiate the model.
# Keyword arguments are used deliberately: the constructor takes hidden_dim
# before num_layers, so passing them positionally in the order they are
# declared above would silently swap the two values.
model = TransformerModel(
  vocab_size=vocab_size,
  d_model=d_model,
  num_heads=num_heads,
  hidden_dim=hidden_dim,
  num_layers=num_layers,
  dropout=dropout,
)
model = model.to(device)

# Use the CrossEntropyLoss as our loss function
loss_function = nn.CrossEntropyLoss()

# Use the Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Train the model

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import time

class TextDataset(Dataset):
  """Pairs each input sequence with the target sequence stored at the same index."""

  def __init__(self, input_data, target_data):
    # Both containers are kept as given and indexed in parallel.
    self.input_data, self.target_data = input_data, target_data

  def __len__(self):
    """Number of examples, taken from the input container."""
    return len(self.input_data)

  def __getitem__(self, idx):
    """Return the ``(input, target)`` pair stored at ``idx``."""
    sample = self.input_data[idx]
    label = self.target_data[idx]
    return sample, label
      
      
def generate_square_subsequent_mask(sz):
  """Build an ``sz`` x ``sz`` float mask that is ``-inf`` strictly above the diagonal and zero elsewhere."""
  blocked = torch.full((sz, sz), float('-inf'))
  # triu(diagonal=1) keeps only the entries above the main diagonal and zeroes the rest.
  return blocked.triu(diagonal=1)

# Create the dataset
text_dataset = TextDataset(X, y)

# Create the DataLoader
# NOTE(review): with shuffle=False each batch holds consecutive windows that
# differ by a single token - confirm this ordering is intended.
data_loader = DataLoader(text_dataset, batch_size=batch_size, shuffle=False)

# Set the number of epochs
num_epochs = 5

# Every window holds exactly `sequence_length` tokens, so the causal mask is
# identical for all batches: build it and move it to the device once instead
# of once per batch.
mask = generate_square_subsequent_mask(sequence_length).to(device)

# Training loop
for epoch in range(num_epochs):
  start_epoch = time.time()
  model.train()
  epoch_loss = 0

  for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
    optimizer.zero_grad()
    # The DataLoader yields [batch_size, seq_len]; the model expects [seq_len, batch_size]
    input_batch = input_batch.transpose(0, 1)  # Transpose the input
    target_batch = target_batch.transpose(0, 1)  # Transpose the target

    # Forward pass
    output = model(input_batch, mask)
    loss = loss_function(output.reshape(-1, vocab_size), target_batch.reshape(-1))  # Flatten the outputs and targets

    # Backward pass
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()

  # Print the average loss for this epoch
  print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(data_loader):.4f}, time: {time.time() - start_epoch:.2f} s")


## Training output

```
Epoch 1/5, Loss: 7.4910, time: 9.43 s
Epoch 2/5, Loss: 5.7837, time: 8.86 s
Epoch 3/5, Loss: 5.7779, time: 8.87 s
Epoch 4/5, Loss: 5.6356, time: 8.88 s
Epoch 5/5, Loss: 5.6284, time: 8.90 s
```

## Evaluate the model

In [None]:
def generate_text(model, seed_text, num_chars, temperature=1.0):
  """Sample a continuation of ``seed_text`` from the language model, one token at a time.

  Arguments:
      model: module called as ``model(tokens, mask)`` with ``tokens`` of shape
          ``[seq_len, batch_size]`` and returning logits of shape
          ``[seq_len, batch_size, vocab]``.
      seed_text: prompt string; it is tokenized with the notebook's ``tokenizer``.
      num_chars: number of tokens to sample (the name is historical; the
          units are tokenizer tokens, not characters).
      temperature: value the logits are divided by before the softmax.

  Returns:
      The decoded string for the seed tokens followed by the sampled tokens.

  Raises:
      ValueError: if tokens are requested but ``seed_text`` yields no tokens.
  """
  # Set the model to evaluation mode
  model.eval()

  # Convert the seed text into a list of token ids
  tokens = tokenizer.tokenize(seed_text)
  input_sequence = tokenizer.convert_tokens_to_ids(tokens)
  if num_chars > 0 and not input_sequence:
    raise ValueError("seed_text must contain at least one token")

  # Use torch.no_grad() to prevent gradient calculations during text generation
  with torch.no_grad():
    # Generate 'num_chars' tokens
    for _ in range(num_chars):
      # Keep at most the last `sequence_length` tokens as context
      context = input_sequence[-sequence_length:]

      # The model is sequence-first, so the context is shaped [seq_len, 1]
      # (one sequence in the batch), matching the layout used in training.
      input_tensor = torch.tensor(context, dtype=torch.long).unsqueeze(1).to(device)

      # Apply the same causal mask as in training
      mask = generate_square_subsequent_mask(len(context)).to(device)
      output = model(input_tensor, mask)

      # Logits for the token following the last context position, scaled by
      # the temperature to control the randomness of the generated text
      logits = output[-1, 0, :] / temperature

      # Convert the logits into probabilities using softmax
      probabilities = torch.softmax(logits, dim=-1)

      # Sample the id of the next token using the probabilities
      next_token_id = torch.multinomial(probabilities, num_samples=1).item()
      input_sequence.append(next_token_id)

  # Decode the full sequence of token ids back into text
  generated_text = tokenizer.decode(input_sequence, clean_up_tokenization_spaces=True)
  return generated_text

# Prompt and length for the demo generation
seed_text = "We are not like that."
num_chars_to_generate = 300

# Sample a continuation at temperature 0.8 and show it
generated_text = generate_text(
  model,
  seed_text,
  num_chars_to_generate,
  temperature=0.8,
)
print(generated_text)

## Sample output

```
We are not like that.er and was and with squeezed the the man asleep Nevertheless front duck them, to, for cat,, it beast was, Benjamin had Mr which filed, at of itlings the, white usually, they so,.,llielings have tempered. to. a to, with to to Mo into the that wholings,ing to. to who a a duck white ’lover would. placench nearly s, Last and you White. s thelings not asleep foolish.,. who, Iedly Sundays would animals in doily God have came of big which was a saw a that Sundays., usually a would to da came which the.,. speaking white., their to the to the ; was who foolish the white of of. you nestled of, he Mr given the by them he laugh he the place to dide to it,ly theood be on it ;. trap of and animalsdock p did usually think that not Jones on raised to p tailbly, who a the to still lost his by a of., say who to inside to all the, of, of the,ing asleep, have at, and place t had a to of the sort, at animals of, Boxlings, devoted began She her at they and had, trap, got to down Nevertheless of to, not goat been never She dream to was at her for, to If immediately,,, in foring da and Sundays of my srod Jones not stripe asleep,ing laugh the
```

```
We are not like that. have I the, he had the the side wall themselves the ae, Icing had on, fee last the place ribbons ’ took p man promptly it da. ’ a where lump saying and throughout to lumpcing the cat. man, with. they came and ’ drew the herself She, as ’ was foolish be to r door, moment squeezed usualbly in with ribbons strange Box C had for for he, say attention side. At. dream and to it and, had who last Word fell of never tobly the. youlover the,, of. place of made was farm or to that At find to and and stripe and. the But it as animals saw. down have, word, was in their Major and, had place, she the where. and not the. whitee,,lover, have of, the ; have began, to there be did asleep and a foolish fashion for throat to barn a, squeezed. comfortable. draw listening Major were thatrrede forilyint foolish, to finally and nestled years man down Mo inside that. looked throughout Mo the with, and the it pigeon who place saying they da, them understandradeaven.,, finally to present fore all and not pretty who. looked to that a the,, fee God to throughout had fee round asked, ’ted never that or herself, finally, Mr would where where don p the Jones, madetive nestled theing as a pu to I foolish they last who attention a Mom and
```