**Перевод с английского на немецкий с использованием моделей Seq2Seq**
Модели Sequence to Sequence, также называемые моделями encoder-decoder, представляют собой семейство моделей, которые обычно обучают 2 рекуррентные нейронные сети.
Первая RNN, encoder, обучается получать входной текст и кодировать его последовательно.
Вторая RNN, decoder, получает эту закодированную последовательность и выполняет сопоставление с текстом.

Метод обучения 2 RNN вместе был представлен Cho et al. в https://arxiv.org/pdf/1406.1078v3.pdf.

Этапы:

Импорт и загрузка данных
Токенизация
Создание кодировщика RNN
Создание декодера RNN
Настройка и обучение
Оценка

In [None]:
# Немецко-Английский seq2seq перевод с attention (PyTorch)

import pandas as pd
import unicodedata
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Загрузка данных
df = pd.read_csv('/content/deu.txt',delimiter='\t',header=None)



In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221533 entries, 0 to 221532
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       221533 non-null  object
 1   1       221533 non-null  object
 2   2       221533 non-null  object
dtypes: object(3)
memory usage: 5.1+ MB


In [None]:
df.head()

Unnamed: 0,0,1,2
0,Go.,Geh.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,Hallo!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Hi.,Grüß Gott!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
3,Run!,Lauf!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
4,Run.,Lauf!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...


In [None]:
df.columns = ['English','German','Source']
df.head()

Unnamed: 0,English,German,Source
0,Go.,Geh.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,Hallo!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Hi.,Grüß Gott!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
3,Run!,Lauf!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
4,Run.,Lauf!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...


In [None]:
pairs = list(zip(df['German'], df['English']))


In [None]:
# Очистка строк
def normalize_string(s):
    s = s.lower().strip()
    s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    s = re.sub(r"([.!?])", r" \1", s)  # Исправлено регулярное выражение
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)  # Исправлено регулярное выражение
    return s.strip()

In [None]:
pairs = [[normalize_string(s) for s in pair] for pair in pairs]

# Класс языка
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def add_sentence(self, sentence):
        for word in sentence.split(" "):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

# Создание словарей
input_lang = Lang("German")
output_lang = Lang("English")

for pair in pairs:
    input_lang.add_sentence(pair[0])
    output_lang.add_sentence(pair[1])

In [None]:
pairs

[['geh .', 'go .'],
 ['hallo !', 'hi .'],
 ['gru gott !', 'hi .'],
 ['lauf !', 'run !'],
 ['lauf !', 'run .'],
 ['potzdonner !', 'wow !'],
 ['donnerwetter !', 'wow !'],
 ['feuer !', 'fire !'],
 ['hilfe !', 'help !'],
 ['zu hulf !', 'help !'],
 ['stopp !', 'stop !'],
 ['warte !', 'wait !'],
 ['warte .', 'wait .'],
 ['fang an .', 'begin .'],
 ['mach weiter .', 'go on .'],
 ['hallo !', 'hello !'],
 ['beeil dich !', 'hurry !'],
 ['schnell !', 'hurry !'],
 ['ich versteckte mich .', 'i hid .'],
 ['ich habe mich versteckt .', 'i hid .'],
 ['ich rannte .', 'i ran .'],
 ['ich verstehe .', 'i see .'],
 ['aha .', 'i see .'],
 ['ich probiere es .', 'i try .'],
 ['ich hab gewonnen !', 'i won !'],
 ['ich habe gewonnen !', 'i won !'],
 ['entspann dich .', 'relax .'],
 ['feuer !', 'shoot !'],
 ['schie !', 'shoot !'],
 ['lacheln !', 'smile .'],
 ['frag mich !', 'ask me .'],
 ['fragt mich !', 'ask me .'],
 ['fragen sie mich !', 'ask me .'],
 ['angriff !', 'attack !'],
 ['attacke !', 'attack !'],
 ['zum 

In [None]:
# Создание словарей
input_lang = Lang("German")
output_lang = Lang("English")

for pair in pairs:
    input_lang.add_sentence(pair[0])
    output_lang.add_sentence(pair[1])

# Преобразование в тензоры
def indexes_from_sentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ') if word in lang.word2index] + [EOS_token]

def tensor_from_sentence(lang, sentence):
    indexes = indexes_from_sentence(lang, sentence)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensors_from_pair(pair):
    input_tensor = tensor_from_sentence(input_lang, pair[0])
    target_tensor = tensor_from_sentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

# Attention Encoder

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Attention Decoder
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=20):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Изменено для работы с разными длинами последовательностей
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)

        # Добавлена проверка и корректировка размеров
        if encoder_outputs.size(0) < self.max_length:
            pad_size = self.max_length - encoder_outputs.size(0)
            padding = torch.zeros(pad_size, encoder_outputs.size(1), device=device)
            encoder_outputs = torch.cat((encoder_outputs, padding), dim=0)

        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

In [None]:
# Обучение
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=20):
    encoder_hidden = encoder.init_hidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])
        decoder_input = target_tensor[di]  # Teacher forcing

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

# Настройки
hidden_size = 512
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

learning_rate = 0.001

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
scheduler_enc = optim.lr_scheduler.StepLR(encoder_optimizer, step_size=500, gamma=0.9)
scheduler_dec = optim.lr_scheduler.StepLR(decoder_optimizer, step_size=500, gamma=0.9)
criterion = nn.NLLLoss()

training_pairs = [tensors_from_pair(random.choice(pairs)) for _ in range(5000)]


for iter in range(1, 4501):
    training_pair = training_pairs[iter % len(training_pairs)]
    input_tensor = training_pair[0]
    input_tensor = input_tensor[:20]
    target_tensor = training_pair[1]

    loss = train(input_tensor, target_tensor, encoder, decoder,
                 encoder_optimizer, decoder_optimizer, criterion)

    if iter % 500 == 0:
        print(f"Iteration {iter} Loss: {loss:.4f}")

# Оценка
def evaluate(encoder, decoder, sentence, max_length=20):
    with torch.no_grad():
        input_tensor = tensor_from_sentence(input_lang, normalize_string(sentence))
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.init_hidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                break
            else:
                decoded_words.append(output_lang.index2word.get(topi.item(), '?'))

            decoder_input = topi.squeeze().detach()

        return ' '.join(decoded_words)

# Примеры перевода
print(evaluate(encoder, decoder, "fass ihn !"))
print(evaluate(encoder, decoder, "hallo !"))
print(evaluate(encoder, decoder, "geh ."))

Iteration 500 Loss: 4.1398
Iteration 1000 Loss: 3.5290
Iteration 1500 Loss: 6.4386
Iteration 2000 Loss: 3.7889
Iteration 2500 Loss: 3.1112
Iteration 3000 Loss: 5.2343
Iteration 3500 Loss: 3.6988
Iteration 4000 Loss: 1.2024
Iteration 4500 Loss: 3.1881
call him .
pick .
that s your mother .


In [None]:
# Примеры перевода
print(evaluate(encoder, decoder, "lass es bleiben"))
print(evaluate(encoder, decoder, "ich habe angst"))
print(evaluate(encoder, decoder, "lauf"))

let you go back .
i ve answered your plan .
when s tom is a good cook .


# Задание 1
# Улучшение обучения

*   Увеличьте количество эпох
*   Добавьте планировщик скорости обучения
*   Измените ращмер скрытого слоя
*   Добавьте валидацию во время обучения
*   Используйте пакетную обработку
*   Добавьте регуляризацию
*   Улучшите обработку данных

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def normalize_string(s):
    return s.lower().strip()

token_pad = 0
class TranslationDataset(Dataset):
    def __init__(self, pairs, input_lang, output_lang, max_length=20):
        self.pairs = [(normalize_string(p[0]), normalize_string(p[1])) for p in pairs]
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.max_length = max_length

    def __len__(self): return len(self.pairs)
    def __getitem__(self, idx):
        inp, tgt = self.pairs[idx]
        inp_idx = [self.input_lang.word2index[w] for w in inp.split(' ') if w in self.input_lang.word2index][:self.max_length] + [EOS_token]
        tgt_idx = [self.output_lang.word2index[w] for w in tgt.split(' ') if w in self.output_lang.word2index][:self.max_length] + [EOS_token]
        return torch.tensor(inp_idx), torch.tensor(tgt_idx)

    @staticmethod
    def collate_fn(batch):
        inputs, targets = zip(*batch)
        input_lens = [len(x) for x in inputs]
        target_lens = [len(x) for x in targets]
        inputs_pad = nn.utils.rnn.pad_sequence(inputs, padding_value=token_pad)
        targets_pad = nn.utils.rnn.pad_sequence(targets, padding_value=token_pad)
        return inputs_pad, input_lens, targets_pad, target_lens

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, dropout=dropout_p)

    def forward(self, inputs, hidden=None):
        embedded = self.embedding(inputs)
        outputs, hidden = self.gru(embedded, hidden)
        return outputs, hidden

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=20):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(hidden_size * 2, max_length)
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        seq_len, batch_size, _ = encoder_outputs.size()
        if seq_len > self.max_length:
            encoder_outputs = encoder_outputs[:self.max_length]
            seq_len = self.max_length
        elif seq_len < self.max_length:
            pad_size = self.max_length - seq_len
            padding = torch.zeros(pad_size, batch_size, self.hidden_size, device=encoder_outputs.device)
            encoder_outputs = torch.cat((encoder_outputs, padding), dim=0)

        embedded = self.embedding(input).unsqueeze(1)
        embedded = self.dropout(embedded)
        concat = torch.cat((embedded.squeeze(1), hidden[0]), dim=1)
        attn_weights = F.softmax(self.attn(concat), dim=1)
        enc_perm = encoder_outputs.permute(1,0,2)
        context = torch.bmm(attn_weights.unsqueeze(1), enc_perm).squeeze(1)
        output = torch.cat((embedded.squeeze(1), context), dim=1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output.squeeze(0)), dim=1)
        return output, hidden, attn_weights


hidden_size = 256
batch_size = 64
num_epochs = 7
learning_rate = 0.001
weight_decay = 1e-5

train_pairs, val_pairs = train_test_split(pairs, test_size=0.1, random_state=42)
train_dataset = TranslationDataset(train_pairs, input_lang, output_lang)
val_dataset   = TranslationDataset(val_pairs, input_lang, output_lang)
train_loader  = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=TranslationDataset.collate_fn)
val_loader    = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=TranslationDataset.collate_fn)

encoder = EncoderRNN(input_lang.n_words, hidden_size, dropout_p=0.1).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1, max_length=20).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler_enc = optim.lr_scheduler.StepLR(encoder_optimizer, step_size=1000, gamma=0.9)
scheduler_dec = optim.lr_scheduler.StepLR(decoder_optimizer, step_size=1000, gamma=0.9)
criterion = nn.NLLLoss(ignore_index=token_pad)

for epoch in range(1, num_epochs+1):
    encoder.train(); decoder.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}", unit="batch")
    for inputs, in_lens, targets, t_lens in loop:
        inputs, targets = inputs.to(device), targets.to(device)
        encoder_optimizer.zero_grad(); decoder_optimizer.zero_grad()
        encoder_outputs, encoder_hidden = encoder(inputs)
        decoder_input = torch.full((inputs.size(1),), SOS_token, dtype=torch.long, device=device)
        decoder_hidden = encoder_hidden
        loss = 0
        max_t = max(t_lens)
        for t in range(max_t):
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, targets[t])
            decoder_input = targets[t]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1)
        encoder_optimizer.step(); decoder_optimizer.step()
        total_loss += loss.item() / max_t
        loop.set_postfix(train_loss=(loss.item()/max_t))

    scheduler_enc.step(); scheduler_dec.step()
    avg_train_loss = total_loss / len(train_loader)

    encoder.eval(); decoder.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, in_lens, targets, t_lens in tqdm(val_loader, desc="Validation", unit="batch"):
            inputs, targets = inputs.to(device), targets.to(device)
            enc_out, enc_hid = encoder(inputs)
            dec_in = torch.full((inputs.size(1),), SOS_token, dtype=torch.long, device=device)
            dec_hid = enc_hid
            loss = 0
            max_t = max(t_lens)
            for t in range(max_t):
                dec_out, dec_hid, _ = decoder(dec_in, dec_hid, enc_out)
                loss += criterion(dec_out, targets[t])
                dec_in = targets[t]
            val_loss += loss.item() / max_t
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch} completed - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

Epoch 1/7: 100%|██████████| 3116/3116 [02:43<00:00, 19.10batch/s, train_loss=2.17]
Validation: 100%|██████████| 347/347 [00:05<00:00, 62.39batch/s]


Epoch 1 completed - Train Loss: 2.7984 - Val Loss: 2.1954


Epoch 2/7: 100%|██████████| 3116/3116 [02:39<00:00, 19.56batch/s, train_loss=2.08]
Validation: 100%|██████████| 347/347 [00:05<00:00, 65.01batch/s]


Epoch 2 completed - Train Loss: 1.8861 - Val Loss: 1.8072


Epoch 3/7: 100%|██████████| 3116/3116 [02:38<00:00, 19.66batch/s, train_loss=1.34]
Validation: 100%|██████████| 347/347 [00:05<00:00, 63.86batch/s]


Epoch 3 completed - Train Loss: 1.4791 - Val Loss: 1.6124


Epoch 4/7: 100%|██████████| 3116/3116 [02:38<00:00, 19.72batch/s, train_loss=1.15]
Validation: 100%|██████████| 347/347 [00:05<00:00, 68.89batch/s]


Epoch 4 completed - Train Loss: 1.2291 - Val Loss: 1.4927


Epoch 5/7: 100%|██████████| 3116/3116 [02:38<00:00, 19.62batch/s, train_loss=0.733]
Validation: 100%|██████████| 347/347 [00:05<00:00, 60.73batch/s]


Epoch 5 completed - Train Loss: 1.0622 - Val Loss: 1.4256


Epoch 6/7: 100%|██████████| 3116/3116 [02:38<00:00, 19.69batch/s, train_loss=1.46]
Validation: 100%|██████████| 347/347 [00:05<00:00, 67.81batch/s]


Epoch 6 completed - Train Loss: 0.9421 - Val Loss: 1.3879


Epoch 7/7: 100%|██████████| 3116/3116 [02:39<00:00, 19.57batch/s, train_loss=0.625]
Validation: 100%|██████████| 347/347 [00:05<00:00, 67.05batch/s]

Epoch 7 completed - Train Loss: 0.8599 - Val Loss: 1.3573





In [None]:
def tensor_from_sentence(lang, sentence):
    idxs = [lang.word2index.get(w, None) for w in normalize_string(sentence).split()]
    idxs = [i for i in idxs if i is not None] + [EOS_token]
    return torch.tensor(idxs, dtype=torch.long, device=device).view(-1,1)


def evaluate_sentence(sentence):
    encoder.eval(); decoder.eval()
    with torch.no_grad():
        input_tensor = tensor_from_sentence(input_lang, sentence)
        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_input = torch.tensor([SOS_token], device=device)
        decoder_hidden = encoder_hidden
        decoded = []
        for _ in range(decoder.max_length):
            output, decoder_hidden, attn = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topi = output.argmax(1)
            if topi.item() == EOS_token:
                break
            decoded.append(output_lang.index2word[topi.item()])
            decoder_input = topi
    return ' '.join(decoded)

examples = ["hallo welt", "ich liebe dich", "auf wiedersehen"]
for sent in examples:
    print(f"DE: {sent} -> EN: {evaluate_sentence(sent)}")

DE: hallo welt -> EN: hello to be seen in the world .
DE: ich liebe dich -> EN: i love you love with me .
DE: auf wiedersehen -> EN: goodbye to see if it was so we see again then happen again .


# Задание 2
# Создание модели seq2seq

*   Выберите датасет для решения задачи перевода с русского  на английский, немецкий,...)
*   Создайте и обучите 2 модели
*   Выберите лучшую модель
*   Проведите тестирование и приведите результаты.
*

In [4]:
import pandas as pd
import unicodedata
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Загрузка данных
df = pd.read_csv('/content/rus.txt',delimiter='\t',header=None)



In [5]:
df.columns = ['English','Russian','Source']
df.head()

Unnamed: 0,English,Russian,Source
0,Go.,Марш!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Иди.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Идите.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Hi.,Здравствуйте.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
4,Hi.,Привет!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [6]:
pairs = list(zip(df['Russian'], df['English']))


In [7]:
# Очистка строк
def normalize_string(s):
    s = s.lower().strip()
    s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    s = re.sub(r"([.!?])", r" \1", s)  # Исправлено регулярное выражение
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)  # Исправлено регулярное выражение
    return s.strip()

In [8]:
pairs = [[normalize_string(s) for s in pair] for pair in pairs]

# Класс языка
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def add_sentence(self, sentence):
        for word in sentence.split(" "):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

# Создание словарей
input_lang = Lang("Russian")
output_lang = Lang("English")

for pair in pairs:
    input_lang.add_sentence(pair[0])
    output_lang.add_sentence(pair[1])

In [9]:
# Создание словарей
input_lang = Lang("Russian")
output_lang = Lang("English")

for pair in pairs:
    input_lang.add_sentence(pair[0])
    output_lang.add_sentence(pair[1])

# Преобразование в тензоры
def indexes_from_sentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ') if word in lang.word2index] + [EOS_token]

def tensor_from_sentence(lang, sentence):
    indexes = indexes_from_sentence(lang, sentence)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensors_from_pair(pair):
    input_tensor = tensor_from_sentence(input_lang, pair[0])
    target_tensor = tensor_from_sentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def normalize_string(s):
    return s.lower().strip()

token_pad = 0
class TranslationDataset(Dataset):
    def __init__(self, pairs, input_lang, output_lang, max_length=20):
        self.pairs = [(normalize_string(p[0]), normalize_string(p[1])) for p in pairs]
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.max_length = max_length

    def __len__(self): return len(self.pairs)
    def __getitem__(self, idx):
        inp, tgt = self.pairs[idx]
        inp_idx = [self.input_lang.word2index[w] for w in inp.split(' ') if w in self.input_lang.word2index][:self.max_length] + [EOS_token]
        tgt_idx = [self.output_lang.word2index[w] for w in tgt.split(' ') if w in self.output_lang.word2index][:self.max_length] + [EOS_token]
        return torch.tensor(inp_idx), torch.tensor(tgt_idx)

    @staticmethod
    def collate_fn(batch):
        inputs, targets = zip(*batch)
        input_lens = [len(x) for x in inputs]
        target_lens = [len(x) for x in targets]
        inputs_pad = nn.utils.rnn.pad_sequence(inputs, padding_value=token_pad)
        targets_pad = nn.utils.rnn.pad_sequence(targets, padding_value=token_pad)
        return inputs_pad, input_lens, targets_pad, target_lens

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, dropout=dropout_p)

    def forward(self, inputs, hidden=None):
        embedded = self.embedding(inputs)
        outputs, hidden = self.gru(embedded, hidden)
        return outputs, hidden

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=20):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(hidden_size * 2, max_length)
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        seq_len, batch_size, _ = encoder_outputs.size()
        if seq_len > self.max_length:
            encoder_outputs = encoder_outputs[:self.max_length]
            seq_len = self.max_length
        elif seq_len < self.max_length:
            pad_size = self.max_length - seq_len
            padding = torch.zeros(pad_size, batch_size, self.hidden_size, device=encoder_outputs.device)
            encoder_outputs = torch.cat((encoder_outputs, padding), dim=0)

        embedded = self.embedding(input).unsqueeze(1)
        embedded = self.dropout(embedded)
        concat = torch.cat((embedded.squeeze(1), hidden[0]), dim=1)
        attn_weights = F.softmax(self.attn(concat), dim=1)
        enc_perm = encoder_outputs.permute(1,0,2)
        context = torch.bmm(attn_weights.unsqueeze(1), enc_perm).squeeze(1)
        output = torch.cat((embedded.squeeze(1), context), dim=1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output.squeeze(0)), dim=1)
        return output, hidden, attn_weights


hidden_size = 256
batch_size = 64
num_epochs = 3
learning_rate = 0.001
weight_decay = 1e-5

train_pairs, val_pairs = train_test_split(pairs, test_size=0.1, random_state=42)
train_dataset = TranslationDataset(train_pairs, input_lang, output_lang)
val_dataset   = TranslationDataset(val_pairs, input_lang, output_lang)
train_loader  = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=TranslationDataset.collate_fn)
val_loader    = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=TranslationDataset.collate_fn)

encoder = EncoderRNN(input_lang.n_words, hidden_size, dropout_p=0.1).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1, max_length=20).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler_enc = optim.lr_scheduler.StepLR(encoder_optimizer, step_size=1000, gamma=0.9)
scheduler_dec = optim.lr_scheduler.StepLR(decoder_optimizer, step_size=1000, gamma=0.9)
criterion = nn.NLLLoss(ignore_index=token_pad)

for epoch in range(1, num_epochs+1):
    encoder.train(); decoder.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}", unit="batch")
    for inputs, in_lens, targets, t_lens in loop:
        inputs, targets = inputs.to(device), targets.to(device)
        encoder_optimizer.zero_grad(); decoder_optimizer.zero_grad()
        encoder_outputs, encoder_hidden = encoder(inputs)
        decoder_input = torch.full((inputs.size(1),), SOS_token, dtype=torch.long, device=device)
        decoder_hidden = encoder_hidden
        loss = 0
        max_t = max(t_lens)
        for t in range(max_t):
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, targets[t])
            decoder_input = targets[t]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1)
        encoder_optimizer.step(); decoder_optimizer.step()
        total_loss += loss.item() / max_t
        loop.set_postfix(train_loss=(loss.item()/max_t))

    scheduler_enc.step(); scheduler_dec.step()
    avg_train_loss = total_loss / len(train_loader)

    encoder.eval(); decoder.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, in_lens, targets, t_lens in tqdm(val_loader, desc="Validation", unit="batch"):
            inputs, targets = inputs.to(device), targets.to(device)
            enc_out, enc_hid = encoder(inputs)
            dec_in = torch.full((inputs.size(1),), SOS_token, dtype=torch.long, device=device)
            dec_hid = enc_hid
            loss = 0
            max_t = max(t_lens)
            for t in range(max_t):
                dec_out, dec_hid, _ = decoder(dec_in, dec_hid, enc_out)
                loss += criterion(dec_out, targets[t])
                dec_in = targets[t]
            val_loss += loss.item() / max_t
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch} completed - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

Epoch 1/3: 100%|██████████| 5111/5111 [03:43<00:00, 22.88batch/s, train_loss=2.93]
Validation: 100%|██████████| 568/568 [00:08<00:00, 70.86batch/s]


Epoch 1 completed - Train Loss: 2.6429 - Val Loss: 2.3202


Epoch 2/3: 100%|██████████| 5111/5111 [03:42<00:00, 22.96batch/s, train_loss=1.83]
Validation: 100%|██████████| 568/568 [00:08<00:00, 70.92batch/s]


Epoch 2 completed - Train Loss: 2.1989 - Val Loss: 2.1877


Epoch 3/3: 100%|██████████| 5111/5111 [03:42<00:00, 22.95batch/s, train_loss=1.78]
Validation: 100%|██████████| 568/568 [00:07<00:00, 71.91batch/s]

Epoch 3 completed - Train Loss: 2.0442 - Val Loss: 2.1309





In [22]:
def tensor_from_sentence(lang, sentence):
    idxs = [lang.word2index.get(w, None) for w in normalize_string(sentence).split()]
    idxs = [i for i in idxs if i is not None] + [EOS_token]
    return torch.tensor(idxs, dtype=torch.long, device=device).view(-1,1)


def evaluate_sentence(sentence):
    encoder.eval(); decoder.eval()
    with torch.no_grad():
        input_tensor = tensor_from_sentence(input_lang, sentence)
        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_input = torch.tensor([SOS_token], device=device)
        decoder_hidden = encoder_hidden
        decoded = []
        for _ in range(decoder.max_length):
            output, decoder_hidden, attn = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topi = output.argmax(1)
            if topi.item() == EOS_token:
                break
            decoded.append(output_lang.index2word[topi.item()])
            decoder_input = topi
    return ' '.join(decoded)

examples = ["Я не говорю по русски"]
for sent in examples:
    print(f"RU: {sent} -> EN: {evaluate_sentence(sent)}")

RU: Я не говорю по русски -> EN: i m not going to do that .


In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def normalize_string(s):
    return s.lower().strip()

token_pad = 0
SOS_token = 1
EOS_token = 2

class TranslationDataset(Dataset):
    def __init__(self, pairs, input_lang, output_lang, max_length=20):
        self.pairs = [(normalize_string(p[0]), normalize_string(p[1])) for p in pairs]
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.max_length = max_length

    def __len__(self): return len(self.pairs)
    def __getitem__(self, idx):
        inp, tgt = self.pairs[idx]
        inp_idx = [self.input_lang.word2index[w] for w in inp.split(' ') if w in self.input_lang.word2index][:self.max_length] + [EOS_token]
        tgt_idx = [self.output_lang.word2index[w] for w in tgt.split(' ') if w in self.output_lang.word2index][:self.max_length] + [EOS_token]
        return torch.tensor(inp_idx), torch.tensor(tgt_idx)

    @staticmethod
    def collate_fn(batch):
        inputs, targets = zip(*batch)
        input_lens = [len(x) for x in inputs]
        target_lens = [len(x) for x in targets]
        inputs_pad = nn.utils.rnn.pad_sequence(inputs, padding_value=token_pad)
        targets_pad = nn.utils.rnn.pad_sequence(targets, padding_value=token_pad)
        return inputs_pad, input_lens, targets_pad, target_lens


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=num_layers, dropout=dropout_p if num_layers > 1 else 0)

    def forward(self, inputs, hidden=None):
        embedded = self.embedding(inputs)
        outputs, hidden = self.gru(embedded, hidden)
        return outputs, hidden


class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers=2, dropout_p=0.1, max_length=20):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        self.num_layers = num_layers

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(hidden_size * 2, max_length)
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=num_layers)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        seq_len, batch_size, _ = encoder_outputs.size()
        if seq_len > self.max_length:
            encoder_outputs = encoder_outputs[:self.max_length]
            seq_len = self.max_length
        elif seq_len < self.max_length:
            pad_size = self.max_length - seq_len
            padding = torch.zeros(pad_size, batch_size, self.hidden_size, device=encoder_outputs.device)
            encoder_outputs = torch.cat((encoder_outputs, padding), dim=0)

        embedded = self.embedding(input).unsqueeze(1)
        embedded = self.dropout(embedded)

        last_hidden = hidden[-1]
        concat = torch.cat((embedded.squeeze(1), last_hidden), dim=1)
        attn_weights = F.softmax(self.attn(concat), dim=1)

        enc_perm = encoder_outputs.permute(1, 0, 2)
        context = torch.bmm(attn_weights.unsqueeze(1), enc_perm).squeeze(1)

        output = torch.cat((embedded.squeeze(1), context), dim=1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)

        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output.squeeze(0)), dim=1)

        return output, hidden, attn_weights


hidden_size = 256
batch_size = 64
num_epochs = 3
learning_rate = 0.001
weight_decay = 1e-5
num_layers = 2

train_pairs, val_pairs = train_test_split(pairs, test_size=0.1, random_state=42)
train_dataset = TranslationDataset(train_pairs, input_lang, output_lang)
val_dataset   = TranslationDataset(val_pairs, input_lang, output_lang)
train_loader  = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=TranslationDataset.collate_fn)
val_loader    = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=TranslationDataset.collate_fn)

encoder = EncoderRNN(input_lang.n_words, hidden_size, num_layers=num_layers, dropout_p=0.1).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, num_layers=num_layers, dropout_p=0.1, max_length=20).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler_enc = optim.lr_scheduler.StepLR(encoder_optimizer, step_size=1000, gamma=0.9)
scheduler_dec = optim.lr_scheduler.StepLR(decoder_optimizer, step_size=1000, gamma=0.9)
criterion = nn.NLLLoss(ignore_index=token_pad)

for epoch in range(1, num_epochs+1):
    encoder.train(); decoder.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}", unit="batch")
    for inputs, in_lens, targets, t_lens in loop:
        inputs, targets = inputs.to(device), targets.to(device)
        encoder_optimizer.zero_grad(); decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(inputs)
        decoder_input = torch.full((inputs.size(1),), SOS_token, dtype=torch.long, device=device)
        decoder_hidden = encoder_hidden

        loss = 0
        max_t = max(t_lens)
        for t in range(max_t):
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, targets[t])
            decoder_input = targets[t]

        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1)
        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item() / max_t
        loop.set_postfix(train_loss=(loss.item()/max_t))

    scheduler_enc.step()
    scheduler_dec.step()
    avg_train_loss = total_loss / len(train_loader)

    encoder.eval()
    decoder.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, in_lens, targets, t_lens in tqdm(val_loader, desc="Validation", unit="batch"):
            inputs, targets = inputs.to(device), targets.to(device)
            enc_out, enc_hid = encoder(inputs)
            dec_in = torch.full((inputs.size(1),), SOS_token, dtype=torch.long, device=device)
            dec_hid = enc_hid
            loss = 0
            max_t = max(t_lens)
            for t in range(max_t):
                dec_out, dec_hid, _ = decoder(dec_in, dec_hid, enc_out)
                loss += criterion(dec_out, targets[t])
                dec_in = targets[t]
            val_loss += loss.item() / max_t

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch} completed - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")


Epoch 1/3: 100%|██████████| 5111/5111 [04:12<00:00, 20.26batch/s, train_loss=2.75]
Validation: 100%|██████████| 568/568 [00:07<00:00, 71.05batch/s]


Epoch 1 completed - Train Loss: 2.6800 - Val Loss: 2.3258


Epoch 2/3: 100%|██████████| 5111/5111 [04:11<00:00, 20.31batch/s, train_loss=2.15]
Validation: 100%|██████████| 568/568 [00:08<00:00, 68.30batch/s]


Epoch 2 completed - Train Loss: 2.2049 - Val Loss: 2.1859


Epoch 3/3: 100%|██████████| 5111/5111 [04:11<00:00, 20.32batch/s, train_loss=1.95]
Validation: 100%|██████████| 568/568 [00:07<00:00, 73.67batch/s]

Epoch 3 completed - Train Loss: 2.0405 - Val Loss: 2.1144





In [27]:
def tensor_from_sentence(lang, sentence):
    idxs = [lang.word2index.get(w, None) for w in normalize_string(sentence).split()]
    idxs = [i for i in idxs if i is not None] + [EOS_token]
    return torch.tensor(idxs, dtype=torch.long, device=device).view(-1,1)


def evaluate_sentence(sentence):
    encoder.eval(); decoder.eval()
    with torch.no_grad():
        input_tensor = tensor_from_sentence(input_lang, sentence)
        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_input = torch.tensor([SOS_token], device=device)
        decoder_hidden = encoder_hidden
        decoded = []
        for _ in range(decoder.max_length):
            output, decoder_hidden, attn = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topi = output.argmax(1)
            if topi.item() == EOS_token:
                break
            decoded.append(output_lang.index2word[topi.item()])
            decoder_input = topi
    return ' '.join(decoded)

examples = ["Я не говорю по русски"]
for sent in examples:
    print(f"RU: {sent} -> EN: {evaluate_sentence(sent)}")

RU: Я не говорю по русски -> EN: what a beautiful woman s name is .
