In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter

# Step 1: Prepare Data

# Example data
data = [
    "Transformers have revolutionized the field of natural language processing.",
    "They are widely used in tasks such as translation, summarization, and text generation.",
    "Attention mechanisms allow transformers to focus on relevant parts of the input sequence.",
    "The architecture consists of multiple layers of self-attention and feedforward networks.",
    "Pretrained transformer models like BERT, GPT, and T5 have achieved state-of-the-art results.",
    "PyTorch makes it easy to implement deep learning models, including transformers.",
    "Training large transformer models requires significant computational resources.",
    "The transformer model can handle long-range dependencies effectively.",
    "Researchers are constantly exploring ways to improve transformer architectures.",
    "Self-attention mechanisms enable parallel computation over input sequences.",
    "BERT stands for Bidirectional Encoder Representations from Transformers.",
    "GPT models are generative and can produce human-like text.",
    "Transformers have been successfully applied to fields beyond NLP, such as vision.",
    "The concept of attention was first introduced in machine translation tasks.",
    "Multi-head attention allows the model to attend to different parts of the input in parallel.",
    "Sequence-to-sequence models can be built using transformer encoders and decoders.",
    "Transformers use positional encoding to retain the order of the input sequence.",
    "Optimizing transformers requires advanced techniques like learning rate scheduling.",
    "Transfer learning with transformers has significantly reduced training time for many tasks.",
    "OpenAI's GPT-3 model has demonstrated remarkable text generation capabilities.",
    "Transformers are known for their scalability and flexibility in handling various data types.",
    "With enough data, transformers can generalize well across multiple domains.",
    "Fine-tuning pretrained transformers on specific tasks yields excellent results.",
    "The rise of transformers has shifted the focus from recurrent networks to attention-based models.",
    "The concept of self-attention underpins the strength of the transformer architecture.",
    "Large-scale transformer models require careful optimization to avoid overfitting.",
    "Natural language understanding tasks like question answering have benefited from transformers.",
    "Advances in transformer research continue to push the boundaries of AI and machine learning.",
    "Hybrid models combining transformers with other architectures are being explored for enhanced performance.",
    "Applications of transformers in speech processing and computer vision are gaining traction."
]


# Tokenizer: splits sentences into words by spaces
def tokenize(sentence):
    return sentence.lower().split()

# Build a vocabulary from the dataset
def build_vocab(data):
    counter = Counter()
    for sentence in data:
        counter.update(tokenize(sentence))
    vocab = {word: i+4 for i, word in enumerate(counter.keys())}  # Reserve 0, 1, 2, 3 for special tokens
    vocab["<unk>"] = 0
    vocab["<pad>"] = 1
    vocab["<bos>"] = 2
    vocab["<eos>"] = 3
    return vocab

vocab = build_vocab(data)
vocab_size = len(vocab)

# Encode the data into indices
def encode(sentence, vocab):
    tokens = ["<bos>"] + tokenize(sentence) + ["<eos>"]
    return [vocab.get(token, vocab["<unk>"]) for token in tokens]

encoded_data = [encode(sentence, vocab) for sentence in data]

# Step 2: Prepare Batches

# Pad sequences to the same length
def pad_sequence(sequences, pad_value):
    max_len = max(len(seq) for seq in sequences)
    return [seq + [pad_value] * (max_len - len(seq)) for seq in sequences]

# Convert the data into batches
def create_batches(encoded_data, batch_size, pad_value):
    batches = []
    for i in range(0, len(encoded_data), batch_size):
        batch = encoded_data[i:i+batch_size]
        batch = pad_sequence(batch, pad_value)
        batches.append(torch.tensor(batch, dtype=torch.long).T)  # Transpose to (sequence_length, batch_size)
    return batches

batch_size = 2
batches = create_batches(encoded_data, batch_size, vocab["<pad>"])

# Step 3: Define the Transformer Model

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, hidden_dim, num_layers):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_encoder = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer = nn.TransformerEncoder(self.pos_encoder, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(0.1)
        self.embed_size = embed_size

    def forward(self, src):
        # src shape: [sequence_length, batch_size]
        src = self.embedding(src) * torch.sqrt(torch.tensor(self.embed_size, dtype=torch.float32))
        src = self.dropout(src)
        memory = self.transformer(src)
        out = self.fc_out(memory)
        return out

# Step 4: Train the Model

def train(model, data_batches, optimizer, criterion, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in data_batches:
            optimizer.zero_grad()
            # Shift target for teacher forcing
            input_seq = batch[:-1]  # All except the last token
            target_seq = batch[1:]  # All except the first token
            output = model(input_seq)
            
            # Use reshape instead of view to flatten the tensors
            loss = criterion(output.reshape(-1, vocab_size), target_seq.reshape(-1))
            
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(data_batches):.4f}')


# Model parameters
embed_size = 64
num_heads = 4
hidden_dim = 128
num_layers = 2
num_epochs = 25
learning_rate = 0.001

# Initialize model, optimizer, and loss function
model = TransformerModel(vocab_size, embed_size, num_heads, hidden_dim, num_layers)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])

# Train the model
train(model, batches, optimizer, criterion, num_epochs)

# Step 5: Generate Text

def generate_text(model, start_token, max_length=20):
    model.eval()
    src = torch.tensor([vocab[start_token]], dtype=torch.long).unsqueeze(1)
    generated_tokens = [start_token]
    
    for _ in range(max_length):
        output = model(src)
        next_token = torch.argmax(output[-1, :], dim=1).item()
        generated_tokens.append(list(vocab.keys())[next_token])
        if list(vocab.keys())[next_token] == "<eos>":
            break
        src = torch.cat([src, torch.tensor([[next_token]], dtype=torch.long)], dim=0)
    
    return ' '.join(generated_tokens)

# Generate some text
generated_sentence = generate_text(model, "<bos>")
print("Generated Text:", generated_sentence)


Epoch [1/25], Loss: 5.2547
Epoch [2/25], Loss: 4.6993
Epoch [3/25], Loss: 4.1982
Epoch [4/25], Loss: 3.6537
Epoch [5/25], Loss: 3.2897
Epoch [6/25], Loss: 2.9637
Epoch [7/25], Loss: 2.6938
Epoch [8/25], Loss: 2.4230
Epoch [9/25], Loss: 2.2165
Epoch [10/25], Loss: 2.0562
Epoch [11/25], Loss: 1.8169
Epoch [12/25], Loss: 1.7134
Epoch [13/25], Loss: 1.5065
Epoch [14/25], Loss: 1.3765
Epoch [15/25], Loss: 1.2551
Epoch [16/25], Loss: 1.1304
Epoch [17/25], Loss: 1.0319
Epoch [18/25], Loss: 0.9174
Epoch [19/25], Loss: 0.8565
Epoch [20/25], Loss: 0.7645
Epoch [21/25], Loss: 0.7196
Epoch [22/25], Loss: 0.6694
Epoch [23/25], Loss: 0.5860
Epoch [24/25], Loss: 0.5511
Epoch [25/25], Loss: 0.4982
Generated Text: <bos> encoder representations from gpt generative produce human-like significant the time across domains. field model transfer with achieved training time across
