In [4]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import tiktoken

import sys
import os
project_root = os.path.abspath('..')

if project_root not in sys.path:
    sys.path.append(project_root)
from model import GPTModel
from utils import create_dataloader_v1

## Wczytanie i konfiguracja modelu

In [5]:
# Hyperparameters handed to GPTModel and to the data loaders in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # vocabulary size
    context_length=256,   # context length
    emb_dim=768,          # embedding dimension
    n_heads=12,           # attention heads
    n_layers=12,          # layers
    drop_rate=0.1,        # dropout rate
    qkv_bias=False,       # NOTE(review): presumably the bias flag of the query/key/value projections - confirm in model.py
)

In [6]:
# Seed PyTorch's RNG before building the model (presumably so the initial weights are repeatable).
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Put the model in evaluation mode; as the cell's last expression this also displays the module.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [7]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the ids as a tensor.

    ``<|endoftext|>`` is passed to the tokenizer as an allowed special token.
    The returned tensor has an extra leading axis of size 1 in front of the
    encoded ids.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return torch.unsqueeze(torch.tensor(ids), 0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids into text.

    Axis 0 is squeezed away (when it has size 1) before the ids are converted
    to a plain Python list and handed to ``tokenizer.decode``.
    """
    ids = token_ids.squeeze(0).tolist()
    return tokenizer.decode(ids)


In [8]:
from utils import generate_text_simple

# Prompt used to seed text generation in the following cells.
start_context = "Every effort moves you"
# Tokenizer obtained from tiktoken's "gpt2" encoding; reused for all encode/decode calls below.
tokenizer = tiktoken.get_encoding("gpt2")

In [9]:
# Encode the prompt, ask the model for up to 10 new tokens, then decode and print the result.
prompt_ids = text_to_token_ids(start_context, tokenizer)
token_ids = generate_text_simple(
    model=model,
    idx=prompt_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)
generated_text = token_ids_to_text(token_ids, tokenizer)
print("Tekst wynikowy:\n", generated_text)

Tekst wynikowy:
 Every effort moves you rentingetic minion mobilized Macicone warrantyuler anim supra


## Obliczanie straty

### Ładowanie tekstu

In [10]:
# Read the entire text file into memory as a single UTF-8 decoded string.
file_path = "the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read() 

In [11]:
# Report the size of the loaded text: character count and token count under `tokenizer`.
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Liczba znaków:", total_characters)
print("Liczba tokenów:", total_tokens)

Liczba znaków: 20479
Liczba tokenów: 5145


### Podział danych

In [12]:
# Split by character position: the first 90% of the text is for training, the remainder for validation.
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [13]:
# Keyword arguments common to both loaders: window length and stride both equal the
# configured context length.
# NOTE(review): the keyword is spelled `max_lenght`; it presumably mirrors the parameter
# name in utils.create_dataloader_v1 - confirm there before renaming.
_shared_loader_kwargs = dict(
    batch_size=2,
    max_lenght=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    num_workers=0,
)

# Training loader: shuffled, incomplete final batch dropped.
train_loader = create_dataloader_v1(
    train_data, drop_last=True, shuffle=True, **_shared_loader_kwargs
)

# Validation loader: original order, final batch kept.
val_loader = create_dataloader_v1(
    val_data, drop_last=False, shuffle=False, **_shared_loader_kwargs
)

In [14]:
print("Szkoleniowy mechanizm ładujący:")
# Walk the training loader once and print the shape of every (input, target) pair it yields.
for x, y in train_loader:
    print(x.shape, y.shape)

Szkoleniowy mechanizm ładujący:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [15]:
print("\nWalidacyjny mechanizm ładujący:")
# Walk the validation loader once and print the shape of every (input, target) pair it yields.
for x, y in val_loader:
    print(x.shape, y.shape)


Walidacyjny mechanizm ładujący:
torch.Size([2, 256]) torch.Size([2, 256])


In [16]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Both tensors are moved to ``device`` first. The logits returned by
    ``model`` have their first two axes merged and the targets are fully
    flattened before being passed to ``torch.nn.functional.cross_entropy``.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    flat_logits = torch.flatten(model(inputs), 0, 1)
    flat_targets = torch.flatten(targets)
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

In [17]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first batches of ``data_loader``.

    Args:
        data_loader: sized iterable yielding ``(input_batch, target_batch)`` pairs.
        model: model forwarded to ``calc_loss_batch``.
        device: device forwarded to ``calc_loss_batch``.
        num_batches: maximum number of batches to evaluate. ``None`` means
            every batch; larger values are clamped to ``len(data_loader)``.

    Returns:
        The mean of the per-batch loss values as a Python float, or NaN when
        there is nothing to average (empty loader, or ``num_batches`` <= 0).
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    limit = available if num_batches is None else min(num_batches, available)
    if limit <= 0:
        # A zero or negative budget evaluates no batches; return NaN like the
        # empty-loader case instead of dividing by zero.
        return float("nan")

    total_loss = 0.
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= limit:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / limit

In [18]:
# Show whether PyTorch reports the MPS backend as available on this machine.
print(torch.backends.mps.is_available())

True


In [20]:
# Use the MPS backend when PyTorch reports it as available, otherwise fall back to the CPU.
# The check goes through torch.backends.mps.is_available(), the documented availability
# query, which is also present on PyTorch releases that do not expose torch.mps.is_available().
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# Move the model's parameters to the chosen device; as the last expression this displays the module.
model.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [21]:
# Baseline losses before any training; gradient tracking is switched off for the evaluation.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print("Strata zbioru szkoleniowego: ", train_loss)
print("Strata zbioru walidacyjnego:", val_loss)

Strata zbioru szkoleniowego:  10.602880239486694
Strata zbioru walidacyjnego: 10.764405250549316


## Prosta pętla szkoleniowa

In [27]:
def train_model_simple(model, train_loader, val_loader, optimizer,
                       device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Each step zeroes the gradients, computes the batch loss with
    ``calc_loss_batch``, back-propagates and calls ``optimizer.step()``.
    Every ``eval_freq`` steps (including step 0) the losses are measured with
    ``evaluate_model`` and recorded; after each epoch a text sample is printed
    via ``generate_and_print_sample``.

    Args:
        model: model to optimise.
        train_loader: iterable of ``(input_batch, target_batch)`` training pairs.
        val_loader: loader passed to ``evaluate_model`` for validation loss.
        optimizer: optimizer exposing ``zero_grad()`` and ``step()``.
        device: device forwarded to the loss/evaluation helpers.
        num_epochs: number of passes over ``train_loader``.
        eval_freq: evaluate whenever ``global_step % eval_freq == 0``.
        eval_iter: batch budget forwarded to ``evaluate_model``.
        start_context: prompt for the per-epoch sample.
        tokenizer: tokenizer forwarded to ``generate_and_print_sample``.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with one
        entry per evaluation point, in the order the evaluations happened.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []
    # global_step starts at -1 so the first optimisation step is numbered 0.
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(
                input_batch, target_batch, model, device
            )

            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                # Record each loss in its own history so the three returned
                # lists stay aligned entry by entry.
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Epoka {epoch+1} (krok {global_step:06d}): "
                      f"Strata zbioru szkoleniowego {train_loss:.3f}, "
                      f"Strata zbioru walidacyjnego {val_loss:.3f}")
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )
    return train_losses, val_losses, track_tokens_seen

In [23]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Measure training and validation loss with gradients disabled.

    The model is put into eval mode, ``calc_loss_loader`` is run on the
    training loader and then the validation loader (each limited to
    ``eval_iter`` batches), and the model is switched back to train mode.

    Returns:
        ``(train_loss, val_loss)`` as a tuple.
    """
    model.eval()
    with torch.no_grad():
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses

In [24]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate a continuation of ``start_context`` and print it on one line.

    The model is put into eval mode, the prompt is encoded and moved to
    ``device``, and ``generate_text_simple`` is asked for 50 new tokens with
    gradients disabled. The context size is taken from the first dimension of
    ``model.pos_emb.weight``. Newlines in the decoded text are replaced with
    spaces before printing, and the model is returned to train mode.
    """
    model.eval()
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)

    with torch.no_grad():
        generated_ids = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
        )

    single_line = token_ids_to_text(generated_ids, tokenizer).replace("\n", " ")
    print(single_line)
    model.train()

In [28]:
# Re-seed and build a fresh model so this run starts from newly initialised weights.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0004,
    weight_decay=0.1,
)

num_epochs = 10
# Evaluate every 5 steps on at most 5 batches per loader; a sample is printed after each epoch.
train_losses, val_losses, tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context="Every effort moves you",
    tokenizer=tokenizer,
)

Epoka 1 (krok 000000): Strata zbioru szkoleniowego 9.625, Strata zbioru walidacyjnego 9.780
Epoka 1 (krok 000005): Strata zbioru szkoleniowego 7.848, Strata zbioru walidacyjnego 8.166
Every effort moves you,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Epoka 2 (krok 000010): Strata zbioru szkoleniowego 6.665, Strata zbioru walidacyjnego 7.018
Epoka 2 (krok 000015): Strata zbioru szkoleniowego 6.098, Strata zbioru walidacyjnego 6.504
Every effort moves you,,,nd,,nd,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Epoka 3 (krok 000020): Strata zbioru szkoleniowego 5.679, Strata zbioru walidacyjnego 6.422
Every effort moves youarrassarrassarrassarrassarrassarrassarrassppelarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrassarrass
Epoka 4 (krok 000025): Strata zbioru szkoleniowego 5.548, Strata zbioru