# Домашнее задание: Генератор текста на базе Transformer

## 1. Архитектура модели

### Создайте класс `GeneratorTransformer`, который авторегрессивно генерирует продолжение текста. Обучите его на книгах или любых других текстах, которые вы найдёте в интернете.

Класс `GeneratorTransformer` реализован в файле `generator.py`.


In [None]:
from datasets import load_dataset
from tokenizers import Tokenizer
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Fixed-size token windows over WikiText-2 for next-token prediction.

    The train split of ``wikitext-2-raw-v1`` is joined into one string,
    tokenized once, and cut into non-overlapping windows of ``max_length``
    tokens. Every window is wrapped as ``<s> + window + </s>``, so a stored
    sequence holds ``max_length + 2`` ids and each item returned by
    ``__getitem__`` holds ``max_length + 1`` ids.

    A trailing remainder shorter than ``max_length`` is dropped so that all
    items have the same length and can be batched without padding.
    """

    def __init__(self, tokenizer_path: str, max_length: int = 128):
        """Load the tokenizer and the corpus, then build the sequences.

        Args:
            tokenizer_path: Path to a ``tokenizers`` JSON file.
            max_length: Number of corpus tokens per window (excluding the
                added ``<s>`` / ``</s>``). Must be positive.

        Raises:
            ValueError: If ``max_length`` is not positive.
        """
        self.tokenizer = Tokenizer.from_file(tokenizer_path)
        self.tokenizer.add_special_tokens(["<s>", "</s>", "<pad>"])
        self.max_length = max_length

        dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
        raw_text = "\n\n".join(dataset["text"])
        tokens = self.tokenizer.encode(raw_text).ids

        bos = self.tokenizer.token_to_id("<s>")
        eos = self.tokenizer.token_to_id("</s>")

        self.sequences = self._chunk_tokens(tokens, max_length, bos, eos)

    @staticmethod
    def _chunk_tokens(tokens, max_length: int, bos: int, eos: int) -> list:
        """Cut ``tokens`` into full windows wrapped in ``bos`` / ``eos``.

        Every complete window is kept, including one that ends exactly at
        the last token; only an incomplete tail is discarded.
        """
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")

        # Last index at which a full window still fits. The "+ 1" below makes
        # that index inclusive, so a corpus whose length is an exact multiple
        # of max_length does not lose its final window.
        last_start = len(tokens) - max_length
        return [
            [bos] + tokens[start:start + max_length] + [eos]
            for start in range(0, last_start + 1, max_length)
        ]

    def __len__(self) -> int:
        """Return the number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, idx: int) -> dict:
        """Return the input/target pair for sequence ``idx``.

        ``target_ids`` is ``input_ids`` shifted one position to the left, so
        position ``t`` of the target is the token that follows position ``t``
        of the input.
        """
        seq = torch.tensor(self.sequences[idx], dtype=torch.long)
        return {
            "input_ids": seq[:-1],
            "target_ids": seq[1:]
        }


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch.utils.data import DataLoader

# Smoke test: build the dataset and check the tensor shapes of a single batch.
dataset = TextDataset(
    tokenizer_path="transformer_basics/mistral_tokenizer.json",
    max_length=128,
)
loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

for batch in loader:
    # Only the first batch is inspected; the loop is left immediately after.
    print(*(batch[key].shape for key in ("input_ids", "target_ids")))
    break

torch.Size([1, 129]) torch.Size([1, 129])


In [10]:
from generator import GeneratorTransformer

## 4. Обучение

In [11]:
from tqdm import tqdm
from torch.amp import autocast, GradScaler

def train_model(model, dataset, num_epochs=3, batch_size=8, lr=1e-4, save_path="generator.pt"):
    """Train ``model`` on ``dataset`` and write one checkpoint per epoch.

    Each epoch runs a full pass over a shuffled ``DataLoader``, prints the
    mean batch loss, prints a sample generated from the prompt
    ``"In the future"``, and saves ``model.state_dict()``.

    Args:
        model: Module exposing ``device``, ``pad_token_id`` and
            ``generate(prompt, max_out_tokens=...)``. Calling it on a batch of
            ids is assumed to return logits shaped (batch, seq, vocab)
            -- TODO confirm against ``GeneratorTransformer``.
        dataset: Dataset whose items are dicts with ``"input_ids"`` and
            ``"target_ids"`` tensors of equal length.
        num_epochs: Number of passes over the data.
        batch_size: Batch size used by the ``DataLoader``.
        lr: Learning rate for Adam.
        save_path: Checkpoint path template. The epoch suffix is inserted
            before the extension, e.g. ``"generator.pt"`` produces
            ``"generator_epoch1.pt"``; ``".pt"`` is used when the path has no
            extension.

    Returns:
        None.
    """
    import os  # local import so the block does not depend on other cells

    device = torch.device(model.device)
    # float16 autocast and loss scaling are only meaningful on CUDA; on any
    # other device both are disabled and training runs in full precision.
    use_amp = device.type == "cuda"

    model = model.to(device)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Positions whose target is the padding id do not contribute to the loss.
    criterion = torch.nn.CrossEntropyLoss(ignore_index=model.pad_token_id)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scaler = GradScaler(device=device.type, enabled=use_amp)

    checkpoint_root, checkpoint_ext = os.path.splitext(save_path)
    checkpoint_ext = checkpoint_ext or ".pt"

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        progress = tqdm(dataloader, desc=f"Epoch {epoch+1}")
        # An explicit step counter is used for the running mean because
        # tqdm only refreshes its ``n`` attribute when the bar is redrawn.
        for step, batch in enumerate(progress, start=1):
            input_ids = batch["input_ids"].to(device)
            target_ids = batch["target_ids"].to(device)

            optimizer.zero_grad()

            with autocast(device_type=device.type, dtype=torch.float16, enabled=use_amp):
                logits = model(input_ids)
                # Flatten all leading dimensions so every position is scored
                # as an independent classification over the last dimension.
                loss = criterion(logits.view(-1, logits.size(-1)), target_ids.view(-1))

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            progress.set_postfix(loss=total_loss / step)

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print(f"Epoch {epoch+1} finished. Loss: {total_loss / max(len(dataloader), 1):.4f}")

        # Sample generation to eyeball progress; no gradients are needed.
        model.eval()
        with torch.no_grad():
            sample = model.generate("In the future", max_out_tokens=50)
        print("Generated:", sample)

        torch.save(model.state_dict(), f"{checkpoint_root}_epoch{epoch+1}{checkpoint_ext}")

    print("Training complete.")



In [12]:

# Tokenizer handed to the model. The special tokens are registered here as
# well because TextDataset loads its own Tokenizer instance from the file.
tokenizer = Tokenizer.from_file("transformer_basics/mistral_tokenizer.json")
tokenizer.add_special_tokens(["<s>", "</s>", "<pad>"])

dataset = TextDataset(
    tokenizer_path="transformer_basics/mistral_tokenizer.json",
    max_length=128,
)

# Model hyperparameters and special-token ids, collected in one place.
model_config = {
    "vocab_size": tokenizer.get_vocab_size(),
    "d_model": 256,
    "num_heads": 8,
    "d_ff": 512,
    "num_layers": 4,
    "pad_token_id": tokenizer.token_to_id("<pad>"),
    "bos_token_id": tokenizer.token_to_id("<s>"),
    "eos_token_id": tokenizer.token_to_id("</s>"),
    "max_len": 128,
    "tokenizer": tokenizer,
    "device": "cuda",
}
model = GeneratorTransformer(**model_config)

train_model(model, dataset, num_epochs=7, batch_size=32)


Epoch 1: 100%|██████████| 687/687 [00:34<00:00, 19.63it/s, loss=6.87]


Epoch 1 finished. Loss: 6.8463
Generated: In the future of symbolvert , in providedative wifeorous for the minor . Later and conditions wasv of theyley spread parking — tro announced . highlighted , within calledott , and yield are the international Vill to one named ofatch important becomes Bon writing history Som


Epoch 2: 100%|██████████| 687/687 [00:34<00:00, 19.86it/s, loss=6.01]


Epoch 2 finished. Loss: 5.9890
Generated: In the future concept on 28484 Society , spiniviaedistics , an most Kil Ver , in 500reation to 1 , 1 , although in the originalfl , Dylan . 


 The priorities = =


Epoch 3: 100%|██████████| 687/687 [00:34<00:00, 19.88it/s, loss=5.73]


Epoch 3 finished. Loss: 5.7130
Generated: In the future most capture . When Beco is approachedmentanistent due to bemate in an honorberrygeon . Compet for amounts as a nightiba and fewnaments 's own Sweet of Nelson blest Museum as a species of the shle of them vocals


Epoch 4: 100%|██████████| 687/687 [00:34<00:00, 19.99it/s, loss=5.51]


Epoch 4 finished. Loss: 5.5150
Generated: In the future records into the left of the maximum to anboard and Fame . Reaga of the New Zealand such as the company of the Qufully , of this period ( tenons ) , once designed , Christathens SAels a troops a spiritualau Pan


Epoch 5: 100%|██████████| 687/687 [00:34<00:00, 19.99it/s, loss=5.36]


Epoch 5 finished. Loss: 5.3634
Generated: In the future of the valley . Its ends because siming had been made being think in the Paris and him to a opening . However , " He translate on August , " Ithe interests rayly lular 's evident worked of most involved the cap , but


Epoch 6: 100%|██████████| 687/687 [00:34<00:00, 19.98it/s, loss=5.25]


Epoch 6 finished. Loss: 5.2395
Generated: In the future of the Detroit after by fish to its predecessial crAL , in an 159 . The following order reached the Spanish to the alminated Bay of the roadensive , the bottom . 


 Beyon and following episode director


Epoch 7: 100%|██████████| 687/687 [00:34<00:00, 19.88it/s, loss=5.13]


Epoch 7 finished. Loss: 5.1347
Generated: In the future of its words , a heart decided in 2ndspian Tower the subsequentong , the 3 million years of 1971 Medal in the Oriior School . After a result round that season to Twain later became a General arg
Training complete.
