In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# ===== 1. Dataset =====
class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, src_tokenizer, tgt_tokenizer, max_len=50):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.src_tok = src_tokenizer
        self.tgt_tok = tgt_tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src = self.src_tok(self.src_texts[idx], truncation=True, max_length=self.max_len, padding='max_length', return_tensors='pt')
        tgt = self.tgt_tok(self.tgt_texts[idx], truncation=True, max_length=self.max_len, padding='max_length', return_tensors='pt')
        return src['input_ids'].squeeze(), tgt['input_ids'].squeeze()

In [3]:
# ===== 2. Encoder-Decoder model =====
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        outputs, hidden = self.rnn(x)
        return outputs, hidden


class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        output, hidden = self.rnn(x, hidden)
        logits = self.fc(output)
        return logits, hidden


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        _, hidden = self.encoder(src)
        logits, _ = self.decoder(tgt[:, :-1], hidden)
        return logits

In [4]:
# ===== 3. –û–±—É—á–µ–Ω–∏–µ =====
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        logits = model(src, tgt)
        loss = criterion(logits.reshape(-1, logits.size(-1)), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

In [None]:
from transformers import MarianTokenizer, MarianMTModel, Trainer, TrainingArguments
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset

In [6]:
data = {
    "en": ["Hello world!", "How are you?", "Good morning"],
    "ru": ["–ü—Ä–∏–≤–µ—Ç, –º–∏—Ä!", "–ö–∞–∫ –¥–µ–ª–∞?", "–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ"]
}
dataset = Dataset.from_dict(data).train_test_split(test_size=0.2)

# ===== 2. –¢–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è =====
model_name = "Helsinki-NLP/opus-mt-en-ru"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [7]:
def preprocess(batch):
    model_inputs = tokenizer(batch["en"], truncation=True, padding="max_length", max_length=64)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["ru"], truncation=True, padding="max_length", max_length=64)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(preprocess, batched=True, remove_columns=["en", "ru"])

Map:   0%|          | 0/2 [00:00<?, ? examples/s]



Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [8]:
args = TrainingArguments(
    output_dir="./mt_checkpoints",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    learning_rate=3e-5
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer
)

trainer.train()

comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key
  trainer = Trainer(


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 9.439791679382324, 'eval_runtime': 0.7696, 'eval_samples_per_second': 1.299, 'eval_steps_per_second': 1.299, 'epoch': 1.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.163665771484375, 'eval_runtime': 0.037, 'eval_samples_per_second': 27.028, 'eval_steps_per_second': 27.028, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.531033992767334, 'eval_runtime': 0.025, 'eval_samples_per_second': 40.012, 'eval_steps_per_second': 40.012, 'epoch': 3.0}
{'train_runtime': 15.48, 'train_samples_per_second': 0.388, 'train_steps_per_second': 0.194, 'train_loss': 9.056751251220703, 'epoch': 3.0}


TrainOutput(global_step=3, training_loss=9.056751251220703, metrics={'train_runtime': 15.48, 'train_samples_per_second': 0.388, 'train_steps_per_second': 0.194, 'total_flos': 101695094784.0, 'train_loss': 9.056751251220703, 'epoch': 3.0})