In [1]:
import math

import tokenizer
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from datasets import load_dataset
from transformers import BertTokenizer
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
class InputEmbeddingNew(nn.Module):
    """Token-id embedding whose output is scaled by ``sqrt(d_model)``.

    Args:
        d_model: Length of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Look up the embeddings for the ids in ``x`` and multiply them by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        embedded = self.embedding(x)
        return embedded * scale



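Создаём эмбеддинг-слой, который превращает номер (индекс) слова в вектор фиксированной размерности `d_model`.
Полученный вектор умножается на `sqrt(d_model)`: такое масштабирование улучшает обучение модели (так сделано в статье «Attention Is All You Need»).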



In [52]:
class PositionalEncodingNew(nn.Module):
    """Adds fixed sinusoidal position encodings to its input and then applies dropout.

    The table is precomputed once for ``max_len`` positions and stored as the
    buffer ``pe`` with shape ``[1, max_len, d_model]``.

    Args:
        d_model: Size of the last dimension of the input (even or odd).
        max_len: Number of positions to precompute.
        dropout: Dropout probability applied after the addition.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        # Table of shape [max_len, d_model]; row `pos` holds the encoding of position `pos`.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the last frequency is dropped for the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``, then apply dropout."""
        # `pe` is a buffer, not a parameter, so no gradient flows into it.
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)

Добавляет информацию о позиции токена во входной последовательности с помощью синусоид.

## Формулы:
PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) 

PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

Где:
- `pos` — позиция токена.
- `i` — индекс размерности.

Позволяет модели использовать положение слов.

In [53]:
class FeedForward(nn.Module):
    """Two linear layers with ReLU and dropout in between: d_model -> d_ff -> d_model.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        expand = nn.Linear(d_model, d_ff)
        activation = nn.ReLU()
        drop = nn.Dropout(dropout)
        contract = nn.Linear(d_ff, d_model)
        # Positions 0..3 inside the Sequential determine the state_dict keys.
        self.net = nn.Sequential(expand, activation, drop, contract)

    def forward(self, x):
        """Run ``x`` through the stacked layers."""
        out = self.net(x)
        return out



Обычная нейросеть: два линейных слоя, с ReLU посередине.
 
## Формула:

FFN(x) = max(0, xW₁ + b₁)W₂ + b₂

- Первая линейная трансформация увеличивает размерность до `d_ff`, затем применяется ReLU, после чего вторая линейная трансформация возвращает размерность к `d_model`.

In [54]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Feature size of the inputs and of the output; must be divisible by ``heads``.
        heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        assert d_model % heads == 0

        self.d_k = d_model // heads
        self.heads = heads

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` to ``k``/``v``.

        Args:
            q, k, v: Tensors whose first dimension is the batch and whose last
                dimension is ``d_model``.
            mask: Optional tensor broadcastable to the score tensor
                ``[batch, heads, q_len, k_len]``; positions where ``mask == 0``
                are excluded from attention.

        Returns:
            Tensor of shape ``[batch, q_len, d_model]``.
        """
        batch_size = q.size(0)

        def split_heads(x, layer):
            # Project, then reshape [batch, len, d_model] -> [batch, heads, len, d_k].
            return layer(x).view(batch_size, -1, self.heads, self.d_k).transpose(1, 2)

        q = split_heads(q, self.w_q)
        k = split_heads(k, self.w_k)
        v = split_heads(v, self.w_v)

        # Similarity of every query with every key; dividing by sqrt(d_k) keeps the values in a stable range.
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Use the most negative finite value rather than -inf: masked positions still get
            # zero weight, but a fully masked row becomes a uniform distribution instead of NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        # Turn the scores into probabilities over the keys.
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        context = torch.matmul(attn, v)
        # Merge the heads back: [batch, heads, len, d_k] -> [batch, len, heads * d_k].
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.heads * self.d_k)
        return self.w_o(context)

Позволяет модели "смотреть" на разные части входа одновременно с разных "углов зрения".

## Формулы:
1. Входные преобразования:
Q = XW^Q, K = XW^K, V = XW^V


2. Считаем "внимание":
Attention(Q, K, V) = softmax(QKᵀ / sqrt(d_k)) V


3. Разделение на головы (h голов):
MultiHead(Q, K, V) = Concat(head₁, ..., head_h) W^O

где:

head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)

- Каждая голова работает с проекциями меньшей размерности (d_k = d_model / heads).
- Маска используется для запрета определенных позиций

In [55]:
class EncoderBlock(nn.Module):
    """One pre-norm encoder layer: self-attention and feed-forward, each wrapped in a residual.

    Args:
        d_model: Feature size.
        heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability used in both sub-layers and on the residual branches.
    """

    def __init__(self, d_model, heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)

        self.norm1, self.norm2 = nn.LayerNorm(d_model), nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the layer to ``x``; ``mask`` is forwarded to the self-attention."""
        # Sub-layer 1: normalise, attend to itself, add back to the residual stream.
        normed = self.norm1(x)
        attended = self.self_attn(normed, normed, normed, mask)
        x = x + self.dropout(attended)

        # Sub-layer 2: normalise, feed-forward, add back.
        transformed = self.ff(self.norm2(x))
        return x + self.dropout(transformed)


Один слой энкодера = self-attention + feed-forward.

- Нормализация перед каждым блоком (pre-norm).
- Остаточные связи стабилизируют обучение.

In [56]:
class Encoder(nn.Module):
    """Stack of ``N`` independent encoder layers followed by a final LayerNorm.

    Args:
        layer: Prototype layer; it is deep-copied ``N`` times so that every
            layer in the stack owns its own parameters. The prototype itself
            is not registered in the stack.
        N: Number of layers.
        d_model: Feature size normalised by the final LayerNorm.
    """

    def __init__(self, layer, N, d_model):
        super().__init__()
        import copy

        # Reusing the same module object N times would tie the weights of all layers,
        # so each position gets its own copy. Copies start with identical values.
        self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(N)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        """Pass ``x`` through every layer in order, then normalise the result."""
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


Последовательное применение `N` энкодер-блоков.

Каждый блок применяет self-attention и feed-forward над входом, и результат нормализуется в конце.


In [57]:
class DecoderBlock(nn.Module):
    """One pre-norm decoder layer: self-attention, cross-attention and feed-forward.

    Args:
        d_model: Feature size.
        heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability used in the sub-layers and on the residual branches.
    """

    def __init__(self, d_model, heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)

        self.norm1, self.norm2, self.norm3 = (
            nn.LayerNorm(d_model),
            nn.LayerNorm(d_model),
            nn.LayerNorm(d_model),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the layer to ``x``.

        ``tgt_mask`` is forwarded to the self-attention and ``src_mask`` to the
        cross-attention, whose keys and values are ``memory``.
        """
        # Self-attention over the decoder stream.
        queries = self.norm1(x)
        x = x + self.dropout(self.self_attn(queries, queries, queries, tgt_mask))

        # Cross-attention: queries from the decoder stream, keys/values from `memory`.
        queries = self.norm2(x)
        x = x + self.dropout(self.cross_attn(queries, memory, memory, src_mask))

        # Position-wise feed-forward.
        return x + self.dropout(self.ff(self.norm3(x)))
 

Один слой декодера. Включает:
1. Masked self-attention
2. Cross-attention (с encoder output)
3. Feed-forward

Self-attention (только на предыдущие слова)

Cross-attention (на выход энкодера)

`x = x + Dropout(SelfAttention(LayerNorm(x)))`

`x = x + Dropout(CrossAttention(LayerNorm(x), memory))`

`x = x + Dropout(FeedForward(LayerNorm(x)))`

- `masked self-attention`: маскирует будущие токены.
- `cross-attention`: позволяет декодеру использовать информацию из энкодера.

In [58]:
class Decoder(nn.Module):
    """Stack of ``N`` independent decoder layers followed by a final LayerNorm.

    Args:
        layer: Prototype layer; it is deep-copied ``N`` times so that every
            layer in the stack owns its own parameters. The prototype itself
            is not registered in the stack.
        N: Number of layers.
        d_model: Feature size normalised by the final LayerNorm.
    """

    def __init__(self, layer, N, d_model):
        super().__init__()
        import copy

        # Reusing the same module object N times would tie the weights of all layers,
        # so each position gets its own copy. Copies start with identical values.
        self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(N)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Pass ``x`` through every layer in order (all receive the same ``memory`` and masks), then normalise."""
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

Применяет `N` блоков декодера.

На каждом шаге декодирования используется как вход самого декодера, так и закодированное представление из энкодера.

In [59]:
class ProjectionLayer(nn.Module):
    """Linear map from ``d_model`` features to vocabulary log-probabilities.

    Args:
        d_model: Size of the incoming feature vectors.
        vocab_size: Number of output classes.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.proj = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        """Return ``log_softmax`` over the last dimension of the projected input."""
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)

Проецирует выход декодера на размерность словаря и применяет логарифм от softmax.

## Формула:
P(y_t | ...) = log_softmax(Linear(output), dim=-1)

Где:
- `Linear(output)` — логиты по словарю.
- `log_softmax` — логарифм вероятностей для обучения с использованием `NLLLoss`.

На выходе — логарифм вероятностей каждого слова в словаре.

In [60]:
class Transformer(nn.Module):
    """Encoder-decoder model assembled from the hand-written blocks of this notebook.

    Instantiating the class requires ``InputEmbedding``, ``PositionalEncoding``,
    ``EncoderBlock``, ``DecoderBlock``, ``Encoder``, ``Decoder`` and
    ``ProjectionLayer`` to be defined already.

    Args:
        src_vocab_size: Size of the source embedding table.
        tgt_vocab_size: Size of the target embedding table and of the output layer.
        d_model: Feature size used throughout the model.
        N: Number of layers passed to both ``Encoder`` and ``Decoder``.
        heads: Number of attention heads per block.
        d_ff: Hidden size of the feed-forward sub-layers.
        dropout: Dropout probability.
        max_len: Number of positions precomputed by the positional encoding.
        share_weights: If true and both vocabularies have the same size, the
            target embedding reuses the weight of the output projection.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, N=6, heads=8, d_ff=2048, dropout=0.1, max_len=5000, share_weights=True):
        super().__init__()

        src_tokens = InputEmbedding(d_model, src_vocab_size)
        src_positions = PositionalEncoding(d_model, max_len, dropout)
        self.src_embed = nn.Sequential(src_tokens, src_positions)

        tgt_tokens = InputEmbedding(d_model, tgt_vocab_size)
        tgt_positions = PositionalEncoding(d_model, max_len, dropout)
        self.tgt_embed = nn.Sequential(tgt_tokens, tgt_positions)

        enc_prototype = EncoderBlock(d_model, heads, d_ff, dropout)
        dec_prototype = DecoderBlock(d_model, heads, d_ff, dropout)

        self.encoder = Encoder(enc_prototype, N, d_model)
        self.decoder = Decoder(dec_prototype, N, d_model)

        self.projection = ProjectionLayer(d_model, tgt_vocab_size)

        # Tie the target embedding to the output projection (only when the sizes match).
        tie_weights = share_weights and src_vocab_size == tgt_vocab_size
        if tie_weights:
            tgt_tokens.embedding.weight = self.projection.proj.weight

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder."""
        embedded = self.src_embed(src)
        return self.encoder(embedded, src_mask)

    def decode(self, tgt, memory, src_mask, tgt_mask):
        """Embed ``tgt`` and run it through the decoder against ``memory``."""
        embedded = self.tgt_embed(tgt)
        return self.decoder(embedded, memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src``, decode ``tgt`` against the result, and return the projection output."""
        memory = self.encode(src, src_mask)
        decoded = self.decode(tgt, memory, src_mask, tgt_mask)
        return self.projection(decoded)
        


## Компоненты:
1. Эмбеддинг + позиционная энкодировка (src & tgt).
2. Энкодер: `N` EncoderBlock.
3. Декодер: `N` DecoderBlock.
4. Линейная проекция в словарь.




In [2]:
import math
import torch
import torch.nn as nn


class InputEmbedding(nn.Module):
    """Embedding table for token ids; looked-up vectors are multiplied by ``sqrt(d_model)``.

    Args:
        d_model: Length of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the scaled embeddings of the token ids in ``x``."""
        vectors = self.embedding(x)
        return vectors * math.sqrt(self.d_model)


class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to its input and then applies dropout.

    The table is precomputed once for ``max_len`` positions and stored as the
    buffer ``pe`` with shape ``[1, max_len, d_model]``.

    Args:
        d_model: Size of the last dimension of the input (even or odd).
        max_len: Number of positions to precompute.
        dropout: Dropout probability applied after the addition.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the last frequency is dropped for the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``, then apply dropout."""
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Sequence-to-sequence model built around ``torch.nn.Transformer`` (batch-first).

    Token ids are embedded with ``InputEmbedding`` + ``PositionalEncoding``,
    passed through ``nn.Transformer`` and projected to log-probabilities over
    the target vocabulary.

    Args:
        src_vocab_size: Size of the source embedding table.
        tgt_vocab_size: Size of the target embedding table and of the output layer.
        d_model: Feature size used throughout the model.
        nhead: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Hidden size of the feed-forward sub-layers.
        dropout: Dropout probability.
        max_len: Number of positions precomputed by the positional encoding.
        share_weights: If true and both vocabularies have the same size, the
            target embedding reuses the weight of the output projection.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=1024, dropout=0.1, max_len=5000, share_weights=True):
        super().__init__()

        def make_embedding(vocab_size):
            # Token embedding followed by positional encoding.
            return nn.Sequential(
                InputEmbedding(d_model, vocab_size),
                PositionalEncoding(d_model, max_len, dropout),
            )

        self.src_embed = make_embedding(src_vocab_size)
        self.tgt_embed = make_embedding(tgt_vocab_size)

        transformer_config = dict(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.Transformer(**transformer_config)

        self.projection = nn.Linear(d_model, tgt_vocab_size)

        # Tie the target embedding to the output projection (only when the sizes match).
        tie_weights = share_weights and src_vocab_size == tgt_vocab_size
        if tie_weights:
            self.tgt_embed[0].embedding.weight = self.projection.weight

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None,
                src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Return log-probabilities over the target vocabulary for every position of ``tgt``.

        All mask arguments are forwarded unchanged to ``nn.Transformer``.
        """
        decoded = self.transformer(
            src=self.src_embed(src),
            tgt=self.tgt_embed(tgt),
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=memory_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )

        logits = self.projection(decoded)
        return logits.log_softmax(dim=-1)


In [3]:
# Load the "en-ru" configuration of the "Helsinki-NLP/opus-100" dataset via `datasets.load_dataset`.
dataset = load_dataset("Helsinki-NLP/opus-100", "en-ru")

In [4]:
# Keep only the first 50,000 training examples; the other splits are left as loaded.
dataset["train"] = dataset["train"].select(range(50000))


In [6]:
# Show the available splits, their features and row counts.
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})


In [5]:
import sentencepiece as spm
from transformers import MarianTokenizer

# Source-side tokenizer, taken from the "opus-mt-en-ru" checkpoint; applied to the English texts.
src_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
# Target-side tokenizer, taken from the "opus-mt-ru-en" checkpoint; applied to the Russian texts.
tgt_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")


In [6]:
import torch
from torch.utils.data import DataLoader


def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: Batch mapping whose ``"translation"`` entry is a sequence of
            dicts with ``"en"`` (source) and ``"ru"`` (target) texts.

    Returns:
        The source tokenizer's output with an extra ``"labels"`` entry holding
        the target token ids. Both sides are truncated/padded to 128 tokens.
    """
    pairs = examples["translation"]
    english = [pair["en"] for pair in pairs]
    russian = [pair["ru"] for pair in pairs]

    # Identical settings for both sides: fixed length of 128 tokens.
    tokenizer_kwargs = dict(max_length=128, truncation=True, padding="max_length")

    model_inputs = src_tokenizer(english, **tokenizer_kwargs)
    target_encoding = tgt_tokenizer(russian, **tokenizer_kwargs)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


# Apply preprocess_function to every split; batched=True means it receives batches of examples.
tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
# Vocabulary sizes reported by the tokenizers.
# NOTE(review): the model cell reads `vocab_size` from the tokenizers directly; these two
# names do not appear to be used in the cells shown here — confirm before removing.
src_vocab_size = src_tokenizer.vocab_size
tgt_vocab_size = tgt_tokenizer.vocab_size
d_model = 256      # feature size of the model
nhead = 4          # number of attention heads
num_layers = 4     # used for both the encoder and the decoder stack
batch_size = 32    # batch size of the train and validation DataLoaders
num_epochs = 100   # upper bound on epochs; training may stop earlier via early stopping

In [8]:



# Build the model; dim_feedforward, dropout, max_len and share_weights keep the
# defaults declared by TransformerModel.
model = TransformerModel(
    src_vocab_size=src_tokenizer.vocab_size,
    tgt_vocab_size=tgt_tokenizer.vocab_size,
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_layers,
    num_decoder_layers=num_layers
)


# The model returns log-probabilities (log_softmax), hence NLLLoss; padded target
# positions are excluded from the loss.
criterion = nn.NLLLoss(ignore_index=tgt_tokenizer.pad_token_id)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)


In [9]:
# Total number of elements across all tensors yielded by model.parameters().
pytorch_total_params = sum(p.numel() for p in model.parameters())

print(pytorch_total_params)

39445558


In [20]:
# Print the module tree of the model.
# NOTE(review): the saved output of this cell shows 128-dimensional embeddings and 2 encoder
# layers, which does not match the configuration cell (d_model = 256, num_layers = 4); the
# output looks stale from an earlier run — re-run the cell before sharing.
print(model)

TransformerModel(
  (src_embed): Sequential(
    (0): InputEmbedding(
      (embedding): Embedding(62518, 128)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (tgt_embed): Sequential(
    (0): InputEmbedding(
      (embedding): Embedding(62518, 128)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
          (linear1): Linear(in_features=128, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=128, bias=True)
          (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((128,), eps=1e-05

In [10]:
class EarlyStopping:
    """Tracks the best validation loss and signals when it has stopped improving.

    Args:
        patience: Number of consecutive non-improving steps after which
            ``step`` starts returning ``True``.
        min_delta: Minimum decrease relative to the best loss that counts as
            an improvement.
    """

    def __init__(self, patience=5, min_delta=0.001):
        self.patience, self.min_delta = patience, min_delta
        self.best_loss = np.inf
        self.counter = 0

    def step(self, val_loss):
        """Record ``val_loss`` and return ``True`` once the patience is exhausted."""
        threshold = self.best_loss - self.min_delta
        improved = val_loss < threshold

        if improved:
            # New best: remember it and restart the count of bad steps.
            self.best_loss = val_loss
        self.counter = 0 if improved else self.counter + 1

        return self.counter >= self.patience

In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU, and move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

TransformerModel(
  (src_embed): Sequential(
    (0): InputEmbedding(
      (embedding): Embedding(62518, 256)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (tgt_embed): Sequential(
    (0): InputEmbedding(
      (embedding): Embedding(62518, 256)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=1024, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-

In [12]:
def log(msg):
    """Print ``msg`` and append it as one line to the module-level ``log_file``.

    ``log_file`` must already be open when this function is called; it is
    flushed after every message.
    """
    print(msg)
    line = msg + "\n"
    log_file.write(line)
    log_file.flush()


def generate_square_subsequent_mask(sz):
    """Return an ``sz x sz`` float mask with ``-inf`` above the main diagonal and 0 elsewhere."""
    blocked = torch.full((sz, sz), float("-inf"))
    # Keep only the strictly upper triangle; everything else becomes 0.
    return blocked.triu(diagonal=1)


def collate_fn(batch):
    """Stack a list of tokenized examples into ``(src, tgt)`` long tensors on the module-level ``device``.

    Args:
        batch: Sequence of examples, each providing ``"input_ids"`` and ``"labels"``.

    Returns:
        Tuple ``(src, tgt)`` built from ``"input_ids"`` and ``"labels"`` respectively.
    """
    src_ids = [example["input_ids"] for example in batch]
    tgt_ids = [example["labels"] for example in batch]

    src = torch.tensor(src_ids, dtype=torch.long)
    tgt = torch.tensor(tgt_ids, dtype=torch.long)
    return src.to(device), tgt.to(device)


In [13]:

# Training batches are reshuffled every epoch; validation batches keep the dataset order.
# Both use collate_fn to turn lists of examples into (src, tgt) tensors.
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=batch_size, collate_fn=collate_fn)



In [14]:
from torch.cuda.amp import GradScaler, autocast
import os
import torch
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pad_sequence
# Gradient scaler for mixed-precision training; train() reads this module-level object.
scaler = GradScaler()
# File used by the module-level log() helper. train() opens its own "training.log" instead.
# NOTE(review): this handle is not closed anywhere in the cells shown here.
log_file = open("training_log.txt", "w", encoding="utf-8")
# NOTE(review): train() keeps its own local best_val_loss, so this module-level value is
# not updated by it.
best_val_loss = float("inf")

def train(model, criterion, optimizer, num_epochs=10, device="cuda", log_interval=1, train_dataloader = train_dataloader, val_dataloader = val_dataloader, patience=7):
    """Train ``model`` with teacher forcing, validating and checkpointing after every epoch.

    Relies on module-level names: ``src_tokenizer``, ``tgt_tokenizer``,
    ``scaler``, ``generate_square_subsequent_mask`` and ``EarlyStopping``.

    Args:
        model: Called as ``model(src, tgt_input, tgt_mask=..., src_key_padding_mask=...,
            tgt_key_padding_mask=..., memory_key_padding_mask=...)`` and expected to
            return per-position scores suitable for ``criterion``.
        criterion: Loss called with ``[batch * len, vocab]`` scores and ``[batch * len]`` targets.
        optimizer: Optimizer over the model parameters.
        num_epochs: Maximum number of epochs.
        device: Device (string or ``torch.device``) the batches are moved to.
        log_interval: A training log line is written every ``log_interval`` batches.
        train_dataloader: Yields ``(src, tgt)`` id tensors for training.
        val_dataloader: Yields ``(src, tgt)`` id tensors for validation.
        patience: Epochs without validation improvement before stopping early.

    Returns:
        ``(train_losses, val_losses)``: per-epoch token-weighted training loss and
        per-epoch mean of the validation batch losses.

    Side effects:
        Sets ``tgt_tokenizer.bos_token_id`` to 1, writes ``training.log``, and saves
        ``checkpoints/epoch_<n>.pt`` every epoch plus ``checkpoints/best_model.pt``
        whenever the validation loss improves.
    """
    train_losses, val_losses = [], []
    best_val_loss = float("inf")
    early_stopping = EarlyStopping(patience=patience)
    os.makedirs("checkpoints", exist_ok=True)

    # Mixed precision is only enabled on CUDA; on any other device autocast is a no-op.
    device_type = torch.device(device).type
    use_amp = device_type == "cuda"

    # Token id 1 is used as the begin-of-sequence marker for the decoder input.
    # Set once, so that validation never depends on the training loop having run first.
    tgt_tokenizer.bos_token_id = 1
    bos_id = tgt_tokenizer.bos_token_id
    src_pad_id = src_tokenizer.pad_token_id
    tgt_pad_id = tgt_tokenizer.pad_token_id

    def compute_loss(src, tgt):
        """Run one teacher-forced forward pass; return (loss, number of non-pad target tokens)."""
        # Decoder input is the target shifted right by one: [BOS, t0, ..., t_{L-2}].
        bos_column = torch.full((tgt.size(0), 1), bos_id, dtype=torch.long, device=device)
        tgt_input = torch.cat([bos_column, tgt[:, :-1]], dim=1)
        # Position i of the decoder input has seen t0..t_{i-1}, so it must predict t_i:
        # the expected output is the unshifted target, with the same length as the input.
        tgt_output = tgt

        tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        src_padding_mask = (src == src_pad_id)
        tgt_padding_mask = (tgt_input == tgt_pad_id)

        output = model(
            src, tgt_input,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            # Keep cross-attention from attending to padded source positions.
            memory_key_padding_mask=src_padding_mask,
        )
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
        non_pad_tokens = (tgt_output != tgt_pad_id).sum().item()
        return loss, non_pad_tokens

    # Explicit encoding: the log messages contain Cyrillic text.
    log_file = open("training.log", "w", encoding="utf-8")

    def log(msg):
        print(msg)
        log_file.write(msg + "\n")
        log_file.flush()

    try:
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            total_tokens = 0

            for batch_idx, (src, tgt) in enumerate(train_dataloader):
                src, tgt = src.to(device), tgt.to(device)

                optimizer.zero_grad()

                with torch.autocast(device_type=device_type, enabled=use_amp):
                    loss, non_pad_tokens = compute_loss(src, tgt)

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

                # Weight each batch by its number of real tokens so the running mean is per token.
                total_tokens += non_pad_tokens
                total_loss += loss.item() * non_pad_tokens

                if batch_idx % log_interval == 0:
                    log(
                        f"[Эпоха {epoch+1}/{num_epochs}] "
                        f"[Батч {batch_idx+1}/{len(train_dataloader)}] "
                        f"Потеря: {loss.item():.4f} | Средняя потеря: {total_loss/max(total_tokens, 1):.4f}"
                    )

            avg_train_loss = total_loss / max(total_tokens, 1)
            train_losses.append(avg_train_loss)

            model.eval()
            val_loss = 0
            with torch.no_grad():
                for src, tgt in val_dataloader:
                    src = src.to(device)
                    tgt = tgt.to(device)
                    loss, _ = compute_loss(src, tgt)
                    val_loss += loss.item()

            avg_val_loss = val_loss / max(len(val_dataloader), 1)
            val_losses.append(avg_val_loss)

            log(f"Эпоха {epoch+1} все: Train loss = {avg_train_loss:.4f}, Val loss = {avg_val_loss:.4f}")

            # Early stopping compares the same per-epoch average that is logged and
            # used for the best-model checkpoint.
            if early_stopping.step(avg_val_loss):
                print(f"Стоп  обучения на эпохе {epoch+1} (без улучшений {patience} эпох)")
                break

            checkpoint_path = f"checkpoints/epoch_{epoch+1}.pt"
            torch.save({
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "train_loss": train_losses,
                "val_loss": val_losses,
            }, checkpoint_path)
            log(f"Сохранено: {checkpoint_path}")

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model.state_dict(), "checkpoints/best_model.pt")
                log("Сохранена новая модель")
    finally:
        # Close the log even when training is interrupted or raises.
        log_file.close()

    return train_losses, val_losses

  scaler = GradScaler()


In [18]:
# Run training with the objects built above; log_interval=1 writes a log line for every batch.
train_losses, val_losses = train(
    model=model, 
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
    device=device,
    log_interval=1
)

[Эпоха 1/100] [Батч 1/1563] Потеря: 7.8852 | Средняя потеря: 7.8852
[Эпоха 1/100] [Батч 2/1563] Потеря: 8.1183 | Средняя потеря: 8.0119
[Эпоха 1/100] [Батч 3/1563] Потеря: 7.8970 | Средняя потеря: 7.9787
[Эпоха 1/100] [Батч 4/1563] Потеря: 8.0195 | Средняя потеря: 7.9901
[Эпоха 1/100] [Батч 5/1563] Потеря: 7.7092 | Средняя потеря: 7.9380
[Эпоха 1/100] [Батч 6/1563] Потеря: 7.9387 | Средняя потеря: 7.9381
[Эпоха 1/100] [Батч 7/1563] Потеря: 7.7699 | Средняя потеря: 7.9192
[Эпоха 1/100] [Батч 8/1563] Потеря: 7.8881 | Средняя потеря: 7.9146
[Эпоха 1/100] [Батч 9/1563] Потеря: 8.0773 | Средняя потеря: 7.9352
[Эпоха 1/100] [Батч 10/1563] Потеря: 8.0103 | Средняя потеря: 7.9455
[Эпоха 1/100] [Батч 11/1563] Потеря: 7.9267 | Средняя потеря: 7.9440
[Эпоха 1/100] [Батч 12/1563] Потеря: 7.8797 | Средняя потеря: 7.9377
[Эпоха 1/100] [Батч 13/1563] Потеря: 8.0557 | Средняя потеря: 7.9471
[Эпоха 1/100] [Батч 14/1563] Потеря: 7.7477 | Средняя потеря: 7.9336
[Эпоха 1/100] [Батч 15/1563] Потеря: 7.8629

KeyboardInterrupt: 