<a href="https://colab.research.google.com/github/dogukartal/IBM_AI_Labs/blob/main/Generative%20AI%20Language%20Modeling%20with%20Transformers/Decoder_Causal_Language_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decoder Causal Language Models
---

## Setup


In [None]:
!pip install -qq torch==2.0.0
!pip install -Uqq portalocker>=2.0.0
!pip install -qq torchtext==0.15.1
!pip install -qq torchdata==0.6.0
!pip install -qq matplotlib
!pip install -qq transformers

In [None]:
from torchtext.datasets import multi30k, Multi30k
from torch.utils.data import DataLoader
import torch
from typing import Iterable, List
import matplotlib.pyplot as plt
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torchtext.datasets import IMDB,PennTreebank
from transformers import GPT2Tokenizer
import time
from torch.optim import Adam

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [None]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

## Text pipeline
### Dataset

Some datasets:
* [PennTreebank](https://pytorch.org/text/0.8.1/datasets.html#penntreebank)
* [WikiText-2](https://pytorch.org/text/0.8.1/datasets.html#wikitext-2)
* [WikiText103](https://pytorch.org/text/0.8.1/datasets.html#wikitext103)



In [None]:
# Load the dataset
train_iter, val_iter = IMDB()

data_itr = iter(train_iter)

next(data_itr)

(1,
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwee


### Preprocessing data

- **Special Symbols and Indices**: Initializes special tokens (`<unk>`, `<pad>`, and an empty string for EOS) with their corresponding indices (`0`, `1`, and `2`).
    - `UNK_IDX`: Index for unknown words.
    - `PAD_IDX`: Index used for padding shorter sentences in a batch to ensure uniform length.
    - `EOS_IDX`: Index representing the end of a sentence.

In [None]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, EOS_IDX = 0, 1, 2
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<|endoftext|>' ]

In [None]:
# Generator function iterating through a dataset , tokenizing each data sample and yields one tokenized sample at a time
def yield_tokens(data_iter):

    for _,data_sample in data_iter:
        yield  tokenizer(data_sample)

tokenizer = get_tokenizer("basic_english")

# Constructs a vocabulary from the tokenized dataset
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=special_symbols, special_first=True)

# Sets a default index for tokens not found in the vocabulary
vocab.set_default_index(UNK_IDX)

# Converts a given text into a sequence of indices based on the built vocabulary
text_to_index = lambda text: [vocab(token) for token in tokenizer(text)]

# Transforms a sequence of indices back into a readable string
index_to_en = lambda seq_en: " ".join([vocab.get_itos()[index] for index in seq_en])

In [None]:
# Check
index_to_en(torch.tensor([0,1,2]))

'<unk> <pad> <|endoftext|>'

### Collate function


In [None]:
# This function generates a random text sample(src_sequence) and its subsequent sequence(tgt_sequence) from a given text for language model training
def get_sample(block_size, text):
    # Determine the length of the input text
    sample_leg = len(text)
    # Calculate the stopping point for randomly selecting a sample
    # This ensures the selected sample doesn't exceed the text length
    random_sample_stop = sample_leg - block_size


    # Check if a random sample can be taken (if the text is longer than block_size)
    if random_sample_stop >= 1:
        # Randomly select a starting point for the sample
        random_start = torch.randint(low=0, high=random_sample_stop, size=(1,)).item()
        # Define the endpoint of the sample
        stop = random_start + block_size

        # Create the input and target sequences
        src_sequence = text[random_start:stop]
        tgt_sequence = text[random_start + 1:stop + 1]

    # Handle the case where the text length is exactly equal or less the block size
    elif random_sample_stop <= 0:
        # Start from the beginning and use the entire text
        random_start = 0
        stop = sample_leg
        src_sequence = text[random_start:stop]
        tgt_sequence = text[random_start + 1:stop]
        # Append an empty string to maintain sequence alignment
        tgt_sequence.append( '<|endoftext|>')

    return src_sequence, tgt_sequence

In [None]:
BATCH_SIZE = 1

batch_of_tokens = []

for i in range(BATCH_SIZE):
  _, text = next(iter(train_iter))
  batch_of_tokens.append(tokenizer(text))

text = batch_of_tokens[0][0:100]
text[0:10]

['i',
 'rented',
 'i',
 'am',
 'curious-yellow',
 'from',
 'my',
 'video',
 'store',
 'because']

In [None]:
block_size = 10
src_sequences, tgt_sequence = get_sample(block_size, text)

# Check if it is shifted
print("src: ",src_sequences)
print("tgt: ",tgt_sequence)

src:  ['1967', '.', 'i', 'also', 'heard', 'that', 'at', 'first', 'it', 'was']
tgt:  ['.', 'i', 'also', 'heard', 'that', 'at', 'first', 'it', 'was', 'seized']


In [None]:
## Creates batches of source (`src_batch`) and target (`tgt_batch`) sequences from a dataset for training NLP models
# Initialize empty lists to store source and target sequences
src_batch, tgt_batch = [], []

# Define the batch size
BATCH_SIZE = 2

# Loop to create batches of source and target sequences
for i in range(BATCH_SIZE):
    # Retrieve the next data point from the training iterator
    _,text = next(iter(train_iter))

    # Generate source and target sequences using the get_sample function
    src_sequence_text, tgt_sequence_text = get_sample(block_size, tokenizer(text))

    # Convert source and target sequences to tokenized vocabulary indices
    src_sequence_indices = vocab(src_sequence_text)
    tgt_sequence_indices = vocab(tgt_sequence_text)

    # Convert the sequences to PyTorch tensors with dtype int64
    src_sequence = torch.tensor(src_sequence_indices, dtype=torch.int64)
    tgt_sequence = torch.tensor(tgt_sequence_indices, dtype=torch.int64)

    # Append the source and target sequences to their respective batches
    src_batch.append(src_sequence)
    tgt_batch.append(tgt_sequence)

    # Print the output for every 2nd sample (adjust as needed)
    print(f"Sample {i}:")
    print("Source Sequence (Text):", src_sequence_text)
    print("Source Sequence (Indices):", src_sequence_indices)
    print("Source Sequence (Shape):", src_sequence.shape)
    print("Target Sequence (Text):", tgt_sequence_text)
    print("Target Sequence (Indices):", tgt_sequence_indices)
    print("Target Sequence (Shape):", tgt_sequence.shape)

Sample 0:
Source Sequence (Text): ['and', 'race', 'issues', 'in', 'the', 'united', 'states', '.', 'in', 'between']
Source Sequence (Indices): [7, 1610, 1462, 14, 4, 2671, 1768, 3, 14, 259]
Source Sequence (Shape): torch.Size([10])
Target Sequence (Text): ['race', 'issues', 'in', 'the', 'united', 'states', '.', 'in', 'between', 'asking']
Target Sequence (Indices): [1610, 1462, 14, 4, 2671, 1768, 3, 14, 259, 1743]
Target Sequence (Shape): torch.Size([10])
Sample 1:
Source Sequence (Text): ['named', 'lena', 'who', 'wants', 'to', 'learn', 'everything', 'she', 'can', 'about']
Source Sequence (Indices): [831, 6788, 49, 518, 10, 901, 287, 68, 59, 52]
Source Sequence (Shape): torch.Size([10])
Target Sequence (Text): ['lena', 'who', 'wants', 'to', 'learn', 'everything', 'she', 'can', 'about', 'life']
Target Sequence (Indices): [6788, 49, 518, 10, 901, 287, 68, 59, 52, 161]
Target Sequence (Shape): torch.Size([10])


In [None]:
BLOCK_SIZE = 30

def collate_batch(batch):
    src_batch, tgt_batch = [], []
    for _,_textt in batch:
      src_sequence,tgt_sequence = get_sample(BLOCK_SIZE,tokenizer(_textt))
      src_sequence = vocab(src_sequence)
      tgt_sequence = vocab(tgt_sequence)
      src_sequence = torch.tensor(src_sequence, dtype=torch.int64)
      tgt_sequence = torch.tensor(tgt_sequence, dtype=torch.int64)
      src_batch.append(src_sequence)
      tgt_batch.append(tgt_sequence)


    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=False)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=False)

    return src_batch.to(DEVICE), tgt_batch.to(DEVICE)

In [None]:
BATCH_SIZE = 1
dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_iter , batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

### Iterating through data samples

In [None]:
dataset = iter(dataloader)

for sample in range(5):
  src, trt = next(dataset)
  print("sample", sample)
  print("sorce:", index_to_en(src))
  print("\n")
  print("target:", index_to_en(trt))
  print("\n")

sample 0
sorce: a child of five for more then ten minutes . princess is lovely , but should be tongueless , cause actress don ' t know how to carry a role


target: child of five for more then ten minutes . princess is lovely , but should be tongueless , cause actress don ' t know how to carry a role .


sample 1
sorce: articulate the overwhelming power of this 90-minute waste of time if i were having a three-way with jessica alba and jessica biel in front of a tv and blood of


target: the overwhelming power of this 90-minute waste of time if i were having a three-way with jessica alba and jessica biel in front of a tv and blood of the


sample 2
sorce: when ww3 was a real possibility - nay probability - this film would have terrified me , but after the cold war ended so had the dangers of nuclear war


target: ww3 was a real possibility - nay probability - this film would have terrified me , but after the cold war ended so had the dangers of nuclear war which


sample 3
sorce: hey , 

In [None]:
for  src, trt in dataset:
    print(trt.shape)
    print(src.shape)
    print(index_to_en(src[0,:]))
    print(index_to_en(trt[0,:]))
    break

torch.Size([30, 1])
torch.Size([30, 1])
to
my


In [None]:
print("source:", index_to_en(src))
print("target:", index_to_en(trt))

source: to my house last night . i feared the worst knowing its reputation , and it was as god-awful as i ' d anticipated . this is a mexican-made mess
target: my house last night . i feared the worst knowing its reputation , and it was as god-awful as i ' d anticipated . this is a mexican-made mess ,


### Masking

In transformers, masking is crucial for ensuring certain positions are not attended to. The function ```generate_square_subsequent_mask``` produces an upper triangular matrix, which ensures that during decoding, a token can't attend to future tokens of target.


In [None]:
def generate_square_subsequent_mask(sz, device=DEVICE):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

The ```create_mask function```, on the other hand, generates source masks, based on the provided source sequence.


In [None]:
def create_mask(src,device=DEVICE):
    src_seq_len = src.shape[0]
    src_mask = generate_square_subsequent_mask(src_seq_len)
    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    return src_mask, src_padding_mask

In [None]:
#Replace first four tokens with PAD token so we can also check how pad tokens are masked using padding_mask
src[0:4] = PAD_IDX

In [None]:
mask, padding_mask = create_mask(src)
src

tensor([[    1],
        [    1],
        [    1],
        [    1],
        [  320],
        [    3],
        [   13],
        [ 9212],
        [    4],
        [  153],
        [ 1622],
        [  112],
        [ 2371],
        [    5],
        [    7],
        [   12],
        [   18],
        [   23],
        [ 4310],
        [   23],
        [   13],
        [    8],
        [  216],
        [ 6340],
        [    3],
        [   15],
        [   11],
        [    6],
        [32578],
        [  582]], device='cuda:0')

In [None]:
mask

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
         -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
         -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
         -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
         -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
         -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -i

In [None]:
padding_mask

tensor([[ True,  True,  True,  True, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False]],
       device='cuda:0')

### Positional encoding

GPT uses trainable positional encodings unlike fixed positional encodings such as sinusoidal encodings.

Trainable positional encodings are implemented as a set of learnable parameters, one for each position in the input sequence. These parameters have the same dimensionality as the token embeddings. During training, the model updates the positional encoding parameters along with the other model parameters to capture the positional information more effectively.


In [None]:
# add positional information to the input tokens
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

### Token embedding
Token embedding, also known as word embedding or word representation, is a way to convert words or tokens from a text corpus into numerical vectors in a continuous vector space where the numerical values represent various linguistic properties of the word, such as its meaning, context, or relationships with other words.


In [None]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

## Custom GPT model architecture

- **Initialization (`__init__`)**: The constructor takes several parameters including `embed_size`, `vocab_size`, `num_heads`, `num_layers`, `max_seq_len`, and `dropout`. It initializes the embedding layer, positional encoding, transformer encoder layers, and a linear layer (`lm_head`) for generating logits over the vocabulary.

- **Weight initialization (`init_weights`)**: This method initializes the weights of the model for better training convergence. The Xavier uniform initialization is used, which is a common practice for initializing weights in deep learning.

- **Decoder (`decoder`)**: Although named `decoder`, this method currently functions as the forward pass through the transformer encoder layers, followed by the generation of logits for the language modeling task. It handles the addition of positional encodings to the embeddings and applies a mask if necessary.

- **Forward pass (`forward`)**: This method is similar to the `decoder` method and defines the forward computation of the model. It processes the input through embedding layers, positional encoding, transformer encoder layers, and produces the final output using the `lm_head`.

- **Mask generation**: Both `decoder` and `forward` methods contain logic to generate a square causal mask if no source mask is provided. This mask ensures that the prediction for a position does not depend on the future tokens in the sequence, which is important for the autoregressive nature of GPT models.

In [None]:
class CustomGPTModel(nn.Module):
    def __init__(self, embed_size,vocab_size, num_heads, num_layers, max_seq_len=500,dropout=0.1):

        super().__init__()

        self.init_weights()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size, dropout=dropout)

        print("Embedding size: ", embed_size)

        # Remaining layers are part of the TransformerDecoder
        encoder_layers = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.embed_size = embed_size
        self.lm_head = nn.Linear(embed_size, vocab_size)

    def init_weights(self):
      for p in self.parameters():
          if p.dim() > 1:
              nn.init.xavier_uniform_(p)

    def create_mask(src,device=DEVICE):
        src_seq_len = src.shape[0]
        src_mask = nn.Transformer.generate_square_subsequent_mask(src_seq_len)
        src_padding_mask = (src == PAD_IDX).transpose(0, 1)
        return src_mask,src_padding_mask

    def decoder(self, x,src_mask):
        seq_length = x.size(0)

        # Add positional embeddings to the input embeddings
        x = self.embed(x)* math.sqrt(self.embed_size)
        x = self.positional_encoding(x)

        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask, src_padding_mask = create_mask(x)

        output = self.transformer_encoder(x, src_mask)
        logits = self.lm_head(x)
        return logits

    def forward(self,x,src_mask=None,key_padding_mask=None):

        seq_length = x.size(0)

        # Add positional embeddings to the input embeddings
        x = self.embed(x)* math.sqrt(self.embed_size) #src = self.embedding(src) * math.sqrt(self.d_model)
        x = self.positional_encoding(x)

        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask, src_padding_mask = create_mask(x)

        output = self.transformer_encoder(x, src_mask,key_padding_mask)
        x = self.lm_head(x)

        return x

### Model configuration and initialization

- `ntokens`: The total number of unique tokens in the vocabulary, which the model will use to represent words.
- `emsize`: The size of each embedding vector. In this model, each word will be represented by a 200-dimensional vector.
- `nlayers`: The number of transformer encoder layers in the model. We are using two layers in this configuration.
- `nhead`: The number of attention heads in the multi-head attention mechanism. The model will use two attention heads.
- `dropout`: A regularization technique where randomly selected neurons are ignored during training to prevent overfitting. Here, we set the dropout probability to 0.2.


In [None]:
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability

model = CustomGPTModel(embed_size=emsize, num_heads=nhead, num_layers=nlayers, vocab_size=ntokens,dropout=dropout).to(DEVICE)

Embedding size:  200


### Prompting

In [None]:
def encode_prompt(prompt, block_size=BLOCK_SIZE):
    # Handle None prompt
    while prompt is None:
        prompt = input("Sorry, prompt cannot be empty. Please enter a valid prompt: ")

    tokens = tokenizer(prompt)
    number_of_tokens = len(tokens)

    # Handle long prompts
    if number_of_tokens > block_size:
        tokens = tokens[-block_size:]  # Keep last block_size characters

    prompt_indices = vocab(tokens)
    prompt_encoded = torch.tensor(prompt_indices, dtype=torch.int64).reshape(-1, 1)
    return prompt_encoded

In [None]:
print(index_to_en(encode_prompt(None)))

Sorry, prompt cannot be empty. Please enter a valid prompt: Hello
hello


In [None]:
print(index_to_en(encode_prompt("This is a prompt to get model generate next words." ) ))

this is a prompt to get model generate next words .


In [None]:
prompt_encoded = encode_prompt("This is a prompt to get model generate next words.").to(DEVICE)
prompt_encoded

tensor([[   15],
        [   11],
        [    6],
        [33700],
        [   10],
        [   86],
        [ 2076],
        [ 5673],
        [  388],
        [  665],
        [    3]], device='cuda:0')

In [None]:
logits = model.decoder(prompt_encoded,src_mask=None).to(DEVICE)

In [None]:
# 11 tokens per output, single batch, logit values
logits.shape

torch.Size([11, 1, 68813])

In [None]:
logits = logits.transpose(0, 1)
logits.shape

torch.Size([1, 11, 68813])

In [None]:
logit_preiction = logits[:,-1]
logit_preiction.shape

torch.Size([1, 68813])

In [None]:
_, next_word_index = torch.max(logit_preiction, dim=1)
next_word_index

tensor([56984], device='cuda:0')

In [None]:
index_to_en(next_word_index)

'original\x97is'

## Autoregressive text generation
In decoder models, we simply append the output to the input to generate the next response. We stop this process when we encounter the end-of-sequence tag <|endoftext|> or if the input becomes too large.

In [None]:
prompt = "this is the beginning of"

In [None]:
prompt_encoded = encode_prompt(prompt).to(DEVICE)
print("Device for prompt_encoded:", prompt_encoded.shape)

Device for prompt_encoded: torch.Size([5, 1])


In [None]:
max_new_tokens = 10

In [None]:
for i in range(max_new_tokens):
    logits = model.decoder(prompt_encoded,src_mask=None)
    logits = logits.transpose(0, 1)
    print(" ")
    print(f"Shape of logits at step {i}: {logits.shape}")

    logit_preiction = logits[:, -1]
    print(f"Shape of logit_prediction at step {i}: {logit_preiction.shape}")

    next_token_encoded = torch.argmax(logit_preiction, dim=-1).reshape(-1, 1)
    print(f"Shape of next_token_encoded at step {i}: {next_token_encoded.shape}")

    prompt_encoded = torch.cat((prompt_encoded, next_token_encoded), dim=0).to(DEVICE)
    print(f"Sequence for step {i}: {[index_to_en(j) for j in prompt_encoded]}")
    print(f"Shape of prompt_encoded after concatenation at step {i}: {prompt_encoded.shape}")

 
Shape of logits at step 0: torch.Size([1, 5, 68813])
Shape of logit_prediction at step 0: torch.Size([1, 68813])
Shape of next_token_encoded at step 0: torch.Size([1, 1])
Sequence for step 0: ['this', 'is', 'the', 'beginning', 'of', 'establishing']
Shape of prompt_encoded after concatenation at step 0: torch.Size([6, 1])
 
Shape of logits at step 1: torch.Size([1, 6, 68813])
Shape of logit_prediction at step 1: torch.Size([1, 68813])
Shape of next_token_encoded at step 1: torch.Size([1, 1])
Sequence for step 1: ['this', 'is', 'the', 'beginning', 'of', 'establishing', 'unsuspensful']
Shape of prompt_encoded after concatenation at step 1: torch.Size([7, 1])
 
Shape of logits at step 2: torch.Size([1, 7, 68813])
Shape of logit_prediction at step 2: torch.Size([1, 68813])
Shape of next_token_encoded at step 2: torch.Size([1, 1])
Sequence for step 2: ['this', 'is', 'the', 'beginning', 'of', 'establishing', 'unsuspensful', 'barrages']
Shape of prompt_encoded after concatenation at step 2: 

In [None]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, EOS_IDX = 0, 1, 2
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<|endoftext|>' ]
BLOCK_SIZE

30

In [None]:
# Auto-regressive Language Model text generation
def generate(model, prompt=None, max_new_tokens=500, block_size=BLOCK_SIZE, vocab=vocab, tokenizer=tokenizer):
    # Move model to the specified device (e.g., GPU or CPU)
    model.to(DEVICE)

    # Encode the input prompt using the provided encode_prompt function
    prompt_encoded = encode_prompt(prompt).to(DEVICE)
    tokens = []

    # Generate new tokens up to max_new_tokens
    for _ in range(max_new_tokens):
        # Decode the encoded prompt using the model's decoder
        logits = model(prompt_encoded,src_mask=None,key_padding_mask=None)

        # Transpose the logits to bring the sequence length to the first dimension
        logits = logits.transpose(0, 1)

        # Select the logits of the last token in the sequence
        logit_prediction = logits[:, -1]

        # Choose the most probable next token from the logits(greedy decoding)
        next_token_encoded = torch.argmax(logit_prediction, dim=-1).reshape(-1, 1)

        # If the next token is the end-of-sequence (EOS) token, stop generation
        if next_token_encoded.item() == EOS_IDX:
            break

        # Append the next token to the prompt_encoded and keep only the last 'block_size' tokens
        prompt_encoded = torch.cat((prompt_encoded, next_token_encoded), dim=0)[-block_size:]

        # Convert the next token index to a token string using the vocabulary
        # Move the tensor back to CPU for vocab lookup if needed
        token_id = next_token_encoded.to('cpu').item()
        tokens.append(vocab.get_itos()[token_id])

    # Join the generated tokens into a single string and return
    return ' '.join(tokens)

In [None]:
generate(model,prompt="this is the beginning of",max_new_tokens=30,vocab=vocab,tokenizer=tokenizer)

'haggis- hadha hurl-buckets mid-flight gregarious spookiness alerting classes millionaire- entanglements amoeba-like >>ff no-good remo stuck automotive neuroticism middlebrow long-anticipated stridence blithe postmark falak service moroccan poe bad--wrong includes newbies sartre'

### Decoding the differences: Training vs. inference

The key difference between the training and inference stages lies in the inputs to the decoder. During training, the decoder benefits from exposure to the ground truth--receiving the exact target sequence tokens incrementally through a technique known as "teacher forcing."

To start the training, first create a Cross Entropy Loss object. The loss will not consider PAD tokens.


In [None]:
from torch.nn import CrossEntropyLoss

loss_fn = CrossEntropyLoss(ignore_index=PAD_IDX)

# Create mask
src, tgt=next(iter(dataloader))
mask, padding_mask = create_mask(src)

logits = model(src,src_mask=mask,key_padding_mask=padding_mask)
print(logits.shape)

torch.Size([30, 1, 68813])


In [None]:
print("Output shape: ", logits.shape)
print("Source shape: ", src)

Output shape:  torch.Size([30, 1, 68813])
Source shape:  tensor([[  127],
        [  269],
        [    3],
        [   23],
        [   20],
        [   93],
        [ 2541],
        [    9],
        [    4],
        [66930],
        [  166],
        [   95],
        [ 2195],
        [    8],
        [  621],
        [    5],
        [   13],
        [   91],
        [   64],
        [   77],
        [    6],
        [11306],
        [  436],
        [   16],
        [   70],
        [   23],
        [   85],
        [  788],
        [   10],
        [   12]], device='cuda:0')


In [None]:
tgt
print(tgt.shape)

torch.Size([30, 1])


In [None]:
print(logits.reshape(-1, logits.shape[-1]).shape)
print(tgt.reshape(-1).shape)

torch.Size([30, 68813])
torch.Size([30])


In [None]:
# Calculate the loss
loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt.reshape(-1))
print(loss.item())

36.47765350341797


In [None]:
def evaluate(model: nn.Module, eval_data) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for src,tgt in eval_data:
            tgt = tgt.to(DEVICE)
            #seq_len = src.size(0)
            logits = model(src,src_mask=None,key_padding_mask=None)
            total_loss +=  loss_fn(logits.reshape(-1, logits.shape[-1]), tgt.reshape(-1)).item()
    return total_loss / (len(list(eval_data)) - 1)

In [None]:
evaluate(model, val_dataloader)

35.02987586185881

## Training the model

- **Optimizer**: Initializes an ADAM optimizer.

Within the `train` function:
- The model is set to train mode, which enables dropout and batch normalization layers.
- A loop iterates over the training data, which is loaded in batches. For each batch:
    - The source (`src`) and target (`tgt`) sequences are extracted.
    - The model performs a forward pass to get logits.
    - The logits are reshaped for loss calculation.
    - The loss is computed using `loss_fn`, which likely refers to a loss function such as cross-entropy that measures the difference between the predicted logits and the target sequences.
- Gradient clipping is applied to prevent exploding gradients, which is common in training deep neural networks.
- The optimizer updates the model parameters based on the computed gradients.

Logging occurs every `10000` steps, or when reaching a specific batch (batch `42060` is hardcoded as an example). During logging:

- The average loss and the perplexity (a measure of how well the probability model predicts a sample) are calculated and printed, providing insights into the model's performance.
- The elapsed time per batch since the last log interval is measured and reported, giving an indication of training efficiency.



In [None]:
optimizer = Adam(model.parameters(), lr=1e-2, weight_decay=0.01, betas=(0.9, 0.999))
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10000, gamma=0.9)

def train(model: nn.Module,train_data) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 10000
    start_time = time.time()

    num_batches = len(list(train_data)) // block_size
    for batch,srctgt in enumerate(train_data):
        src= srctgt[0]
        tgt= srctgt[1]
        logits = model(src,src_mask=None)
        logits_flat = logits.reshape(-1, logits.shape[-1])
        loss = loss_fn(logits_flat, tgt.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss.item()

        if (batch % log_interval == 0 and batch > 0) or batch==42060:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            #cur_loss = total_loss / log_interval
            cur_loss = total_loss / batch
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch//block_size:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.4f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            start_time = time.time()

    return total_loss

In [None]:
best_val_loss = float('inf')
epochs = 30
Train_losses = []
Val_losses = []
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train_loss = train(model,dataloader)
    val_loss = evaluate(model, val_dataloader)
    val_ppl = math.exp(val_loss)
    Train_losses.append(train_loss)
    Val_losses.append(val_loss)

    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
        f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'model_best_val_loss.pt')

| epoch   1 |  1000/ 1250 batches | lr 0.0100 | ms/batch  7.08 | loss  8.33 | ppl  4147.98
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 165.01s | valid loss  8.18 | valid ppl  3586.16
-----------------------------------------------------------------------------------------
| epoch   2 |  1000/ 1250 batches | lr 0.0100 | ms/batch  7.05 | loss  8.22 | ppl  3718.75
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 165.08s | valid loss  8.18 | valid ppl  3574.39
-----------------------------------------------------------------------------------------
| epoch   3 |  1000/ 1250 batches | lr 0.0100 | ms/batch  7.07 | loss  8.22 | ppl  3705.70
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 165.28s | valid loss  8.10 | valid ppl  3309.81
----------------------------------------------------

In [None]:
# Calculate the number of epochs (assuming the lengths of train_losses and val_losses are equal)
num_epochs = len(Train_losses)

# Create a figure and a set of subplots
fig, ax = plt.subplots()

# Plot the training losses
ax.plot(range(num_epochs), Train_losses, label='Training Loss', color='blue')

# Plot the validation losses
ax.plot(range(num_epochs), Val_losses, label='Validation Loss', color='orange')

# Set the x-axis label
ax.set_xlabel('Epoch')

# Set the y-axis label
ax.set_ylabel('Loss')

# Set the title of the plot
ax.set_title('Training and Validation Losses')

# Add a legend to the plot
ax.legend()

# Show the plot
plt.show()

![loss_gpt.png](https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/V1Fda63Q4CrNfgT5g1HfVQ.png)


## Loading the saved model

In [None]:
#!wget 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/kyn1_OsXrzjef0xihlsXmg.pt'
#model.load_state_dict(torch.load('kyn1_OsXrzjef0xihlsXmg.pt',map_location=torch.device('cpu')))

In [None]:
print(generate(model,prompt="the movie was",max_new_tokens=10,vocab=vocab,tokenizer=tokenizer))