## Prepare the dataset

In [None]:
# Import necessary packages
import torch
import torch.nn as nn
import math
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

# Create the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


# Read the text file.  The encoding is spelled out so decoding does not depend
# on the platform's locale default (the book contains non-ASCII characters such
# as curly apostrophes).
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open("../a06_RNN_language_model/animal_farm.txt", "r", encoding="utf-8") as f:
  text = f.read()

text = text[:5000]  # make the text shorter for testing

# Use the tokenizer to convert the text to tokens
tokens = tokenizer.tokenize(text)

# Convert tokens to IDs
int_text = tokenizer.convert_tokens_to_ids(tokens)

# Create input and target sequences: every window of `sequence_length` token
# IDs is one input, and the same window shifted right by one token is its
# target (next-token prediction at every position).
sequence_length = 300
num_windows = len(int_text) - sequence_length
X = [int_text[i:i + sequence_length] for i in range(num_windows)]
y = [int_text[i + 1:i + sequence_length + 1] for i in range(num_windows)]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Convert lists to tensors of shape (num_windows, sequence_length) and move the
# whole dataset to the training device up front.
X = torch.tensor(X, dtype=torch.long).to(device)
y = torch.tensor(y, dtype=torch.long).to(device)


## Create Transformer model

In [None]:
class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position information to a sequence-first tensor.

  A table of shape ``(max_len, 1, d_model)`` is precomputed once and stored as
  the non-trainable buffer ``pe``.  Even feature columns hold sines and odd
  feature columns hold cosines of the position scaled by geometrically
  decreasing frequencies.

  Args:
    d_model: Size of the feature dimension (even or odd).
    dropout: Dropout probability applied after the encoding is added.
    max_len: Number of positions for which encodings are precomputed.
  """

  def __init__(self, d_model, dropout=0.1, max_len=5000):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)

    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    angles = position * div_term  # (max_len, ceil(d_model / 2))

    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns, so
    # the cosine table is trimmed to fit instead of raising a shape error.
    pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

    # Singleton middle axis so the table broadcasts over dim 1 of the input.
    self.register_buffer('pe', pe.unsqueeze(1))

  def forward(self, x):
    """Return ``dropout(x + pe[:x.size(0)])``.

    Positions are indexed along dim 0 of ``x``, so ``x`` must be laid out
    sequence-first, i.e. ``(seq_len, batch, d_model)``, with
    ``seq_len <= max_len``.
    """
    x = x + self.pe[:x.size(0), :]
    return self.dropout(x)


class TransformerModel(nn.Module):
  """Language model: embedding -> positional encoding -> encoder stack -> vocab logits.

  The encoder layers are created with PyTorch's default ``batch_first=False``
  and the positional encoding indexes positions along dim 0, so dim 0 of the
  input is treated as the sequence axis: pass token IDs shaped
  ``(seq_len, batch)``.

  Args:
    vocab_size: Number of distinct token IDs (embedding rows / output logits).
    d_model: Embedding and encoder feature size.
    num_heads: Attention heads per encoder layer.
    hidden_dim: Width of each encoder layer's feed-forward sub-layer.
    num_layers: Number of stacked encoder layers.
    dropout: Dropout probability for the positional encoding and encoder layers.
  """

  def __init__(self, vocab_size, d_model, num_heads, hidden_dim, num_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoder = PositionalEncoding(d_model, dropout)

    encoder_layer = nn.TransformerEncoderLayer(
      d_model=d_model,
      nhead=num_heads,
      dim_feedforward=hidden_dim,
      dropout=dropout,
    )
    self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    self.output_layer = nn.Linear(d_model, vocab_size)
    self.init_weights()

  def init_weights(self):
    """Re-initialise embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
    bound = 0.1
    nn.init.uniform_(self.embedding.weight, -bound, bound)
    nn.init.zeros_(self.output_layer.bias)
    nn.init.uniform_(self.output_layer.weight, -bound, bound)

  def forward(self, x, mask=None):
    """Map token IDs to per-position logits over the vocabulary.

    Args:
      x: Integer token IDs; dim 0 is the sequence axis.
      mask: Optional attention mask forwarded unchanged to the encoder.

    Returns:
      Logits with the same leading dimensions as ``x`` and a trailing
      dimension of size ``vocab_size``.
    """
    embedded = self.pos_encoder(self.embedding(x))
    encoded = self.transformer_encoder(embedded, mask=mask)
    return self.output_layer(encoded)

## Instantiate the Transformer model

In [None]:
# Set the hyperparameters
vocab_size = len(tokenizer.get_vocab())
d_model = 128
num_heads = 2
num_layers = 1
hidden_dim = 128
dropout = 0.1
learning_rate = 0.001
batch_size = 16

# Instantiate the model.  Arguments are passed by keyword: the constructor
# takes hidden_dim *before* num_layers, so a positional call that lists them
# the other way round silently swaps the two values.
model = TransformerModel(
  vocab_size=vocab_size,
  d_model=d_model,
  num_heads=num_heads,
  hidden_dim=hidden_dim,
  num_layers=num_layers,
  dropout=dropout,
)
model = model.to(device)

# Use the CrossEntropyLoss as our loss function
loss_function = nn.CrossEntropyLoss()

# Use the Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Train the model

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import time

class TextDataset(Dataset):
  """Map-style dataset pairing each input sequence with its target sequence."""

  def __init__(self, input_data, target_data):
    # Both containers are stored as given and indexed in parallel.
    self.input_data, self.target_data = input_data, target_data

  def __len__(self):
    """Number of samples, taken from the input container."""
    return len(self.input_data)

  def __getitem__(self, idx):
    """Return the ``(input, target)`` pair stored at ``idx``."""
    sample = self.input_data[idx]
    label = self.target_data[idx]
    return sample, label
      
      
def generate_square_subsequent_mask(sz):
  """Build an ``sz`` x ``sz`` causal attention mask.

  Entries strictly above the main diagonal are ``-inf``; the diagonal and
  everything below it are ``0``.
  """
  all_blocked = torch.full((sz, sz), float('-inf'))
  # Keep only the strict upper triangle; triu zero-fills the rest.
  return torch.triu(all_blocked, diagonal=1)

# Create the dataset
text_dataset = TextDataset(X, y)

# Create the DataLoader
data_loader = DataLoader(text_dataset, batch_size=batch_size, shuffle=True)

# Set the number of epochs
num_epochs = 5

# The model is sequence-first: its encoder layers use the default
# batch_first=False and the positional encoding indexes positions along dim 0.
# The causal mask must therefore be (seq_len, seq_len), not (batch, batch).
# Every window in X has the same length, so the mask is built once.
causal_mask = generate_square_subsequent_mask(X.size(1)).to(device)

# Training loop
for epoch in range(num_epochs):
  start_epoch = time.time()
  model.train()
  epoch_loss = 0

  for input_batch, target_batch in data_loader:
    optimizer.zero_grad()

    # The DataLoader yields (batch, seq_len); transpose to (seq_len, batch) so
    # attention runs along each text window rather than across unrelated
    # samples in the batch.
    src = input_batch.t()
    tgt = target_batch.t()

    # Forward pass: output is (seq_len, batch, vocab_size)
    output = model(src, causal_mask)
    # reshape (not view) because the transposed target is non-contiguous;
    # logits and targets are flattened in the same (seq, batch) order.
    loss = loss_function(output.reshape(-1, vocab_size), tgt.reshape(-1))

    # Backward pass
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()

  # Print the average loss for this epoch
  print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(data_loader):.4f}, time: {time.time() - start_epoch:.2f} s")


```
Epoch 1/5, Loss: 6.6443, time: 9.45 s
Epoch 2/5, Loss: 5.4509, time: 8.87 s
Epoch 3/5, Loss: 5.4365, time: 8.84 s
Epoch 4/5, Loss: 5.4426, time: 8.92 s
Epoch 5/5, Loss: 5.4368, time: 8.82 s
```

## Evaluate the model

In [None]:
def generate_text(model, seed_text, num_chars, temperature=1.0):
  """Sample a continuation of ``seed_text`` from the model, one token at a time.

  Args:
    model: Trained model; called as ``model(ids, mask)`` with ``ids`` shaped
      ``(seq_len, 1)`` and expected to return ``(seq_len, 1, vocab)`` logits.
    seed_text: Prompt string; tokenized with the module-level ``tokenizer``.
    num_chars: Number of sampling steps.  Despite the name, each step appends
      one *token* (a word piece), not one character.
    temperature: Divides the logits before softmax; values below 1 make
      sampling more conservative, values above 1 more random.

  Returns:
    The decoded prompt plus sampled tokens, or ``seed_text`` unchanged when
    ``num_chars`` is not positive.
  """
  # Nothing to sample: hand the prompt back untouched.
  if num_chars <= 0:
    return seed_text

  # Set the model to evaluation mode (disables dropout)
  model.eval()

  # Tokenize the prompt once and keep working on token IDs.  Decoding and
  # re-tokenizing on every step is wasteful and does not always round-trip.
  token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(seed_text))

  # Use torch.no_grad() to prevent gradient calculations during text generation
  with torch.no_grad():
    for _ in range(num_chars):
      # Sequence-first layout with a batch of one: (seq_len, 1).  The model's
      # encoder treats dim 0 as the sequence axis.
      input_tensor = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(1)
      seq_len = input_tensor.size(0)

      # Causal mask: -inf strictly above the diagonal, 0 elsewhere, so each
      # position attends only to itself and earlier positions.
      mask = torch.triu(
        torch.full((seq_len, seq_len), float('-inf'), device=device),
        diagonal=1,
      )

      output = model(input_tensor, mask)

      # Logits for the token following the last position, scaled by temperature
      logits = output[-1, :, :] / temperature

      # Convert the logits into probabilities and sample the next token ID
      probabilities = torch.softmax(logits, dim=-1)
      next_token_id = torch.multinomial(probabilities, num_samples=1).item()

      token_ids.append(next_token_id)

  # Decode the full ID sequence back to text once, at the end
  return tokenizer.decode(token_ids, clean_up_tokenization_spaces=True)

# Prompt to continue, and the number of sampling steps to run after it
seed_text = "We are not like that."
num_chars_to_generate = 200

# A temperature below 1 sharpens the sampling distribution
generated_text = generate_text(
  model,
  seed_text,
  num_chars_to_generate,
  temperature=0.8,
)
print(generated_text)

```
We are not like that.,. for it side word heard had have themselves horses the for, filed on he the usually cut thein,,, in the. he say her, last he to a The came Mr and flies the,. openly intelligence of the mare was Benjamin as the the to powers the, old usually, he Box for he the,,, and, they not,. Mr who make themselves theird, an of, the, int had les, he,. -, lings, ing to raised and was should wise the. that c animals window, horses down back the great as in and flies. powers, immediately thatlover which though on, in her some many he in, two to came a never a Box underlover where ’,., theer place of Thes animal to, s way was, had, ’ would given themselves and he very, er, the he he the white At as was,. a draw be had tempered duck appearance him animals., red say
```