<a href="https://colab.research.google.com/github/chandrasai-Durgapu/llm-basic-predict-next-word/blob/main/llm_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A large language model (LLM) uses a deep learning architecture that learns relationships between words through self-attention. The goal of the small language model built here is to predict the next word of a sentence.

Here are the six main components we’ll cover:

Tokenization,
Embedding Layer,
Positional Encoding,
Self-Attention,
Transformer Block,
Full Language Model.


In [155]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

In [156]:
class SimpleLLM(nn.Module):
    """Next-word model: token embedding, positional encoding, transformer blocks, vocabulary projection.

    Args:
        vocab_size: Size of the embedding table and width of the output logits.
        embedding_dim: Width of the token vectors passed between layers.
        hidden_dim: Forwarded to every TransformerBlock.
        num_layers: Number of TransformerBlock instances chained together.
        num_heads: Forwarded to every TransformerBlock.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_heads):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim)
        blocks = [
            TransformerBlock(embedding_dim, hidden_dim, num_heads)
            for _ in range(num_layers)
        ]
        self.transformer_blocks = nn.Sequential(*blocks)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Turn a tensor of token ids into vocabulary logits for every position."""
        embedded = self.embedding(x)
        # The positional encoding is applied with dims 0 and 1 swapped, then
        # the original layout is restored for the transformer blocks.
        positioned = self.positional_encoding(embedded.transpose(0, 1)).transpose(0, 1)
        return self.output(self.transformer_blocks(positioned))

In [157]:
class TransformerBlock(nn.Module):
    """Self-attention then a two-layer ReLU feed-forward net, each followed by residual add + LayerNorm.

    Args:
        embedding_dim: Width of the input and output vectors.
        hidden_dim: Width of the feed-forward net's inner layer.
        num_heads: Forwarded to SelfAttention.
    """

    def __init__(self, embedding_dim, hidden_dim, num_heads):
        super().__init__()
        self.attention = SelfAttention(embedding_dim, num_heads)
        expand = nn.Linear(embedding_dim, hidden_dim)
        activation = nn.ReLU()
        project = nn.Linear(hidden_dim, embedding_dim)
        self.feed_forward = nn.Sequential(expand, activation, project)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        # The attention layer also returns its weights; only the values are used here.
        attended, _weights = self.attention(x)
        after_attention = self.norm1(x + attended)
        return self.norm2(after_attention + self.feed_forward(after_attention))

In [158]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        embedding_dim: Size of the last input dimension; queries, keys and
            values are all projected to this same size.
        num_heads: Accepted so callers can pass it, but not used by this
            layer, which computes a single attention head.
    """

    def __init__(self, embedding_dim, num_heads):
        super(SelfAttention, self).__init__()
        self.Query = nn.Linear(embedding_dim, embedding_dim)
        self.Key = nn.Linear(embedding_dim, embedding_dim)
        self.Value = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        """Attend every position of ``x`` to every position of ``x``.

        Args:
            x: 3-D tensor (``torch.bmm`` requires exactly three dimensions);
                the last dimension must equal ``embedding_dim``.

        Returns:
            Tuple ``(attention_values, attention_weights)``. The weights are
            softmax-normalised over the last dimension.
        """
        queries = self.Query(x)
        keys = self.Key(x)
        values = self.Value(x)
        # Scale by sqrt(feature size) with a plain Python float, so no tensor
        # is allocated on every call.
        scores = torch.bmm(queries, keys.transpose(1, 2)) / (x.size(-1) ** 0.5)
        # This name was previously misspelled, so the lines below raised NameError.
        attention_weights = torch.softmax(scores, dim=-1)
        attention_values = torch.bmm(attention_weights, values)
        return attention_values, attention_weights

In [159]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a sequence-first tensor.

    Even feature columns hold sines and odd columns hold cosines of
    ``position * div_term``. The table is stored as the non-trainable
    buffer ``pe``.

    Args:
        embedding_dim: Size of the last dimension of the tensors to encode.
            Both even and odd sizes are supported.
        max_sql_len: Number of positions precomputed; ``pe`` has only this
            many rows, so longer sequences are not supported.
    """

    def __init__(self, embedding_dim, max_sql_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_sql_len, embedding_dim)
        position = torch.arange(0, max_sql_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        angles = position * div_term  # Shape: [max_sql_len, ceil(embedding_dim / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # With an odd embedding_dim there is one fewer odd column than even
        # column, so drop the surplus frequency instead of failing on a shape
        # mismatch. For even sizes the slice keeps every column.
        pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
        pe = pe.unsqueeze(1)  # Shape: [max_sql_len, 1, embedding_dim]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(0)`` positions.

        Args:
            x: Tensor of shape [sequence_length, batch_size, embedding_dim].

        Returns:
            Tensor with the same shape as ``x``.
        """
        # pe is [max_sql_len, 1, embedding_dim]: slice it to the sequence
        # length and let dim 1 broadcast over the batch.
        return x + self.pe[:x.size(0), :, :]

In [160]:
class Embedding(nn.Module):
    """Thin wrapper around ``nn.Embedding`` that maps token ids to vectors.

    Args:
        vocab_size: Number of rows in the lookup table.
        embedding_dim: Length of each row.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )

    def forward(self, x):
        """Look up the vector for every id in ``x``."""
        token_vectors = self.embeddings(x)
        return token_vectors

In [161]:
def tokenize(text, vocab):
    """Split ``text`` on whitespace and map each word to its id in ``vocab``.

    Words missing from ``vocab`` get the id stored under ``"<UNK>"``. That
    key is looked up for every word, so it must exist whenever ``text``
    contains at least one word.
    """
    return [vocab.get(token, vocab["<UNK>"]) for token in text.split()]

In [162]:
class TransformerBlock(nn.Module):
    """Self-attention then a two-layer ReLU feed-forward net, each followed by residual add + LayerNorm.

    Args:
        embedding_dim: Width of the input and output vectors.
        hidden_dim: Width of the feed-forward net's inner layer.
        num_heads: Forwarded to SelfAttention.
    """

    def __init__(self, embedding_dim, hidden_dim, num_heads):
        super().__init__()
        self.attention = SelfAttention(embedding_dim, num_heads)
        expand = nn.Linear(embedding_dim, hidden_dim)
        activation = nn.ReLU()
        project = nn.Linear(hidden_dim, embedding_dim)
        self.feed_forward = nn.Sequential(expand, activation, project)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        # The attention layer also returns its weights; only the values are used here.
        attended, _weights = self.attention(x)
        after_attention = self.norm1(x + attended)
        return self.norm2(after_attention + self.feed_forward(after_attention))

In [163]:
class SimpleLLM(nn.Module):
    """Next-word model: token embedding, positional encoding, transformer blocks, vocabulary projection.

    Args:
        vocab_size: Size of the embedding table and width of the output logits.
        embedding_dim: Width of the token vectors passed between layers.
        hidden_dim: Forwarded to every TransformerBlock.
        num_layers: Number of TransformerBlock instances chained together.
        num_heads: Forwarded to every TransformerBlock.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_heads):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim)
        blocks = [
            TransformerBlock(embedding_dim, hidden_dim, num_heads)
            for _ in range(num_layers)
        ]
        self.transformer_blocks = nn.Sequential(*blocks)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Turn a tensor of token ids into vocabulary logits for every position."""
        embedded = self.embedding(x)
        # The positional encoding is applied with dims 0 and 1 swapped, then
        # the original layout is restored for the transformer blocks.
        positioned = self.positional_encoding(embedded.transpose(0, 1)).transpose(0, 1)
        return self.output(self.transformer_blocks(positioned))

In [164]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        embedding_dim: Size of the last input dimension; queries, keys and
            values are all projected to this same size.
        num_heads: Accepted but never read by this layer.
    """

    def __init__(self, embedding_dim, num_heads):
        super().__init__()
        self.Query = nn.Linear(embedding_dim, embedding_dim)
        self.Key = nn.Linear(embedding_dim, embedding_dim)
        self.Value = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        """Return ``(attention_values, attention_weights)`` for a 3-D input ``x``."""
        q, k, v = self.Query(x), self.Key(x), self.Value(x)
        # Dot products are divided by sqrt(feature size) before the softmax.
        scale = torch.tensor(x.size(-1), dtype=torch.float32).sqrt()
        attention_weights = (torch.bmm(q, k.transpose(1, 2)) / scale).softmax(dim=-1)
        return torch.bmm(attention_weights, v), attention_weights

In [165]:
# Word -> integer id table. Ids follow list order, so "<UNK>" (the fallback
# used by tokenize for any word not listed) ends up with id 25.
vocab = {
    word: index
    for index, word in enumerate([
        "hello", "world", "how", "are", "you",
        "good", "morning", "evening", "night",
        "friend", "nice", "to", "meet", "learning",
        "AI", "is", "fun", "great", "awesome",
        "day", "doing", "today", "hope", "all",
        "well", "<UNK>",
    ])
}

# Model hyperparameters.
vocab_size = len(vocab)  # 26 entries, "<UNK>" included
embedding_dim = 16
hidden_dim = 32
num_layers = 2
num_heads = 4

model = SimpleLLM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_heads=num_heads,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training sentences. Words absent from vocab (for example "have", "a",
# "what") are mapped to "<UNK>" by tokenize.
data = [
    "hello world how are you",
    "how are you hello world",
    "good morning friend",
    "nice to meet you",
    "learning AI is fun",
    "have a great day",
    "hope you are doing well",
    "AI is awesome",
    "what are you doing today",
    "good evening to all"
]

tokenized_data = [tokenize(line, vocab) for line in data]

# One optimizer step per (prefix, next word) pair: the model reads the first
# i tokens of a sentence and is scored on token i.
for epoch in range(100):
    for sentence in tokenized_data:
        for i in range(1, len(sentence)):
            input_seq = torch.tensor([sentence[:i]])  # shape [1, i]
            target = torch.tensor([sentence[i]], dtype=torch.long)  # shape [1]
            optimizer.zero_grad()
            output = model(input_seq)
            # Only the logits at the last input position are compared with the target.
            loss = criterion(output[:, -1], target)
            loss.backward()
            optimizer.step()
    # The value reported is the loss of the last pair processed in this
    # epoch, not an average over the epoch.
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

Epoch 0, Loss: 4.461989402770996
Epoch 10, Loss: 1.92235267162323
Epoch 20, Loss: 0.6763872504234314
Epoch 30, Loss: 0.24692803621292114
Epoch 40, Loss: 0.12351427227258682
Epoch 50, Loss: 0.06678084284067154
Epoch 60, Loss: 0.041416414082050323
Epoch 70, Loss: 0.02816096507012844
Epoch 80, Loss: 0.020215198397636414
Epoch 90, Loss: 0.015071799978613853


In [166]:
def predict_next_word(model, tokenizer, vocab, input_text, top_k=1):
    """Return the ``top_k`` most likely next words for ``input_text``, best first.

    Args:
        model: Module called with a ``[1, seq_len]`` long tensor of token
            ids. It must return a 3-D tensor of logits whose second
            dimension is the sequence position and whose last dimension is
            indexed by vocabulary id.
        tokenizer: Callable ``tokenizer(text, vocab)`` returning a list of
            integer token ids.
        vocab: Mapping from word to token id; inverted here to turn the
            predicted ids back into words.
        input_text: Text whose continuation is predicted.
        top_k: Number of candidates wanted. Capped at the number of logits
            the model produces, so asking for more than the vocabulary
            holds returns every word ranked.

    Returns:
        List of words ordered from most to least likely.

    Raises:
        ValueError: If ``input_text`` produces no tokens.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    model.eval()  # Set the model to evaluation mode
    tokenized_input = tokenizer(input_text, vocab)
    if len(tokenized_input) == 0:
        # An empty id list would become a float tensor and fail inside the
        # model with an unrelated-looking error; report the real cause.
        raise ValueError("input_text produced no tokens; cannot predict a next word")

    with torch.no_grad():
        # Explicit integer dtype, plus a leading batch dimension of size 1.
        input_tensor = torch.tensor(tokenized_input, dtype=torch.long).unsqueeze(0)
        # Logits for the last token in the sequence.
        last_token_logits = model(input_tensor)[:, -1, :]
        # Softmax preserves ordering, so ranking the raw logits selects the
        # same words without computing probabilities that are never returned.
        k = min(top_k, last_token_logits.size(-1))
        _, top_k_indices = torch.topk(last_token_logits, k)

    # Convert indices back to words.
    idx_to_word = {idx: word for word, idx in vocab.items()}
    return [idx_to_word[index] for index in top_k_indices[0].tolist()]

# Example usage: show the three most likely continuations of each prompt.
for input_sentence in ("hello world how are", "good morning"):
    predicted_words = predict_next_word(model, tokenize, vocab, input_sentence, top_k=3)
    print(f"Input: {input_sentence}")
    print(f"Predicted next words: {predicted_words}")

Input: hello world how are
Predicted next words: ['you', 'doing', 'to']
Input: good morning
Predicted next words: ['friend', 'doing', 'all']
