In [None]:
import os
import random
import zipfile
from collections import Counter

import gdown
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, Vocab
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [None]:
# Hand cached-but-unused GPU memory back to the driver before training starts.
# This is an allocator call, not a tensor operation, so wrapping it in
# torch.no_grad() changes nothing; it is only attempted when CUDA is available.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Fetch and unpack the parallel corpus. The download is skipped when the
# extracted folder is already on disk, so re-running the notebook is cheap.
os.makedirs('data', exist_ok=True)

url = 'https://drive.google.com/uc?id=1cfUt8-cJR7gWLv68ya_gZwbIZgq29F7D'
zip_file_path = 'data/1mcorpus.zip'

if not os.path.isdir('data/1mcorpus'):
    gdown.download(url, zip_file_path, quiet=False)

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall('data')
    # The archive is not needed once its contents have been extracted.
    os.remove(zip_file_path)

os.listdir('data/1mcorpus')

Downloading...
From (original): https://drive.google.com/uc?id=1cfUt8-cJR7gWLv68ya_gZwbIZgq29F7D
From (redirected): https://drive.google.com/uc?id=1cfUt8-cJR7gWLv68ya_gZwbIZgq29F7D&confirm=t&uuid=49c9f0cf-d701-4f35-bc15-efb8ffbdc973
To: /content/data/1mcorpus.zip
100%|██████████| 122M/122M [00:02<00:00, 58.6MB/s]


['corpus.en_ru.1m.ru', 'corpus.en_ru.1m.en']

In [None]:
# Upper bound on how many sentence pairs are taken from the corpus.
MAX_SENTENCES = 20000

# Read only the first MAX_SENTENCES lines of each file instead of loading the
# whole file and slicing it afterwards; zip() stops as soon as range() runs out.
with open('data/1mcorpus/corpus.en_ru.1m.en', 'r', encoding='utf-8') as f:
    english_sentences = [line for _, line in zip(range(MAX_SENTENCES), f)]

with open('data/1mcorpus/corpus.en_ru.1m.ru', 'r', encoding='utf-8') as f:
    russian_sentences = [line for _, line in zip(range(MAX_SENTENCES), f)]

# The two lists are used below as index-aligned pairs, so a length mismatch
# would silently pair up the wrong sentences.
assert len(english_sentences) == len(russian_sentences), "parallel corpus files differ in length"

print(f"Количество предложений на английском: {len(english_sentences)}")
print(f"Количество предложений на русском: {len(russian_sentences)}")

# 80/20 split in file order: the first 80% is used for training, the rest for testing.
train_size = int(0.8 * len(english_sentences))
train_english_sentences = english_sentences[:train_size]
train_russian_sentences = russian_sentences[:train_size]

test_english_sentences = english_sentences[train_size:]
test_russian_sentences = russian_sentences[train_size:]

print(f"Количество обучающих предложений: {len(train_english_sentences)}")
print(f"Количество тестовых предложений: {len(test_english_sentences)}")

Количество предложений на английском: 20000
Количество предложений на русском: 20000
Количество обучающих предложений: 16000
Количество тестовых предложений: 4000


In [None]:
# Preview the first five Russian lines (raw text, trailing newline included).
russian_sentences[:5]

['Такое развитие характера Гарри может разочаровать читателей, полюбивших его былую мстительность, но с другой стороны это преображение укрепляет позицию тех, кто не видит глубже сюжета и изображения героев.\n',
 'Решение суда (группа вернулась под крыло к Elektra Entertainment) предотвратило дальнейшие нападки со стороны неугомонного Ульриха и не позволило ему обнародовать детали нового контракта.\n',
 'Когда тебе 18 или 19 лет, легко перенимать бандитские повадки и переносить их в группу.\n',
 'А сейчас куча триьютов тем же самым BLACK SABBATH и KISS.\n',
 'Я был единственным, кто занялся копированием демо на кассете.\n']

In [None]:
# Preview the first five English lines; they are index-aligned with the Russian ones.
english_sentences[:5]

["This new development in Harry's character may be a disappointment to those readers who enjoyed his old vindictive ways, but it also reinforces the position of pro-Potter people who do not see beneath the surface appearance of the characters and plots.\n",
 'A nondisclosure clause in the final settlement (the band is back on Elektra) prevents Ulrich, an irrepressible motormouth, from providing any juicy contractual details.\n',
 "When you're 18 or 19 years old, you have that gang mentality in your band.\n",
 'Now you have Black Sabbath and Kiss tribute albums.\n',
 'I was the one who sat down and copied them.\n']

In [None]:
# Text tokenization
# NOTE(review): the 'basic_english' tokenizer is used for the Russian side as well — confirm this is intended.
russian_tokenizer = get_tokenizer('basic_english')
english_tokenizer = get_tokenizer('basic_english')

# Vocabulary construction
def build_vocab(sentences, tokenizer, min_freq=1):
    """Build a torchtext vocabulary from raw sentences.

    Args:
        sentences: Iterable of raw sentence strings.
        tokenizer: Callable that turns one sentence into a list of tokens.
        min_freq: Minimum number of occurrences a token needs to be kept.
            The default of 1 keeps every token, which is the original behavior.

    Returns:
        The vocabulary produced by ``build_vocab_from_iterator`` with the
        specials ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>`` registered.
    """
    token_lists = (tokenizer(sentence) for sentence in sentences)
    return build_vocab_from_iterator(
        token_lists,
        min_freq=min_freq,
        specials=['<unk>', '<pad>', '<bos>', '<eos>'],
    )

# Build the Russian and English vocabularies from the training split only
russian_vocab = build_vocab(train_russian_sentences, russian_tokenizer)
english_vocab = build_vocab(train_english_sentences, english_tokenizer)

# Make out-of-vocabulary lookups fall back to the <unk> index
russian_vocab.set_default_index(russian_vocab['<unk>'])
english_vocab.set_default_index(english_vocab['<unk>'])

# Report the vocabulary sizes
print(f"Размер словаря русского языка: {len(russian_vocab)}")
print(f"Размер словаря английского языка: {len(english_vocab)}")



Размер словаря русского языка: 58411
Размер словаря английского языка: 28647


In [None]:
class TranslationDataset(Dataset):
    """Parallel-sentence dataset yielding ``(source_ids, target_ids)`` tensors.

    Every sentence is tokenized, mapped through its vocabulary and wrapped in
    ``<bos>`` / ``<eos>``. The markers are required by the rest of the notebook:
    ``Seq2Seq.forward`` feeds ``trg[0]`` as the first decoder input and never
    predicts it, and ``translate_sentence`` starts decoding from ``<bos>`` and
    stops on ``<eos>``. Without them the model is never trained to emit
    ``<eos>`` and the first real target token is never predicted.

    Args:
        src_sentences: Source-language sentences.
        tgt_sentences: Target-language sentences, index-aligned with the sources.
        src_vocab: Mapping from source token to index; must contain the specials.
        tgt_vocab: Mapping from target token to index; must contain the specials.
        src_tokenizer: Callable splitting a source sentence into tokens.
        tgt_tokenizer: Callable splitting a target sentence into tokens.
    """

    def __init__(self, src_sentences, tgt_sentences, src_vocab, tgt_vocab, src_tokenizer, tgt_tokenizer):
        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __len__(self):
        """Return the number of sentence pairs (taken from the source side)."""
        return len(self.src_sentences)

    @staticmethod
    def _encode(sentence, tokenizer, vocab):
        """Return ``<bos> tokens... <eos>`` for ``sentence`` as a 1-D int64 tensor."""
        ids = [vocab['<bos>']]
        ids.extend(vocab[token] for token in tokenizer(sentence))
        ids.append(vocab['<eos>'])
        # An explicit dtype keeps index tensors integral for every input;
        # torch.tensor([]) on its own would produce a float tensor.
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the encoded ``(source, target)`` pair stored at position ``idx``."""
        src_ids = self._encode(self.src_sentences[idx], self.src_tokenizer, self.src_vocab)
        tgt_ids = self._encode(self.tgt_sentences[idx], self.tgt_tokenizer, self.tgt_vocab)
        return src_ids, tgt_ids

# Russian is the source side and English the target; both splits share the
# vocabularies and tokenizers built from the training data.
shared_dataset_kwargs = dict(
    src_vocab=russian_vocab,
    tgt_vocab=english_vocab,
    src_tokenizer=russian_tokenizer,
    tgt_tokenizer=english_tokenizer,
)
train_dataset = TranslationDataset(train_russian_sentences, train_english_sentences, **shared_dataset_kwargs)
test_dataset = TranslationDataset(test_russian_sentences, test_english_sentences, **shared_dataset_kwargs)

batch_size = 32
# The identity collate_fn hands each batch over as a plain list of (src, tgt)
# pairs; padding is done later by the training and evaluation loops.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=lambda x: x,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=lambda x: x,
)

In [None]:
class Encoder(nn.Module):
    """Embed source token ids and run them through a multi-layer GRU.

    Args:
        input_dim: Size of the source vocabulary.
        emb_dim: Embedding width.
        hid_dim: GRU hidden size.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability used on the embeddings and between GRU layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Return the GRU's final hidden state for ``src``.

        The GRU is built without ``batch_first``, so ``src`` is laid out as
        ``(seq_len, batch)``. The per-step outputs are discarded.
        """
        src_embedded = self.embedding(src)
        _, final_hidden = self.rnn(self.dropout(src_embedded))
        return final_hidden

class Decoder(nn.Module):
    """Single-step GRU decoder that scores the next target token.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Embedding width.
        hid_dim: GRU hidden size.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability used on the embeddings and between GRU layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, num_layers=n_layers, dropout=dropout)
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden):
        """Advance the decoder by one time step.

        Args:
            input: 1-D tensor of token ids, one per batch element.
            hidden: Hidden state passed on to the GRU.

        Returns:
            ``(prediction, hidden)``: vocabulary scores for the next token and
            the updated hidden state.
        """
        # Add a leading length-1 time axis, since the GRU consumes sequences.
        step_ids = input.unsqueeze(0)
        step_embedded = self.dropout(self.embedding(step_ids))
        rnn_out, hidden = self.rnn(step_embedded, hidden)
        # Remove the time axis again before projecting onto the vocabulary.
        prediction = self.fc_out(rnn_out.squeeze(0))
        return prediction, hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        encoder: Module mapping the source batch to an initial hidden state.
        decoder: Module with an ``fc_out`` layer that maps ``(input, hidden)``
            to ``(prediction, hidden)``.
        device: Device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[0] - 1`` steps and return the stacked scores.

        Args:
            src: Source batch handed to the encoder.
            trg: Target batch laid out as ``(trg_len, batch)``; row 0 is the
                first decoder input.
            teacher_forcing_ratio: Probability, drawn once per step, of feeding
                the ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape ``(trg_len, batch, vocab)``; row 0 stays all zeros
            because no prediction is made for the first position.
        """
        n_steps = trg.shape[0]
        n_batch = trg.shape[1]
        vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(n_steps, n_batch, vocab_size).to(self.device)
        hidden = self.encoder(src)

        decoder_input = trg[0, :]
        for t in range(1, n_steps):
            step_scores, hidden = self.decoder(decoder_input, hidden)
            outputs[t] = step_scores
            # Exactly one random draw per step decides the next decoder input.
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[t]
            else:
                decoder_input = step_scores.argmax(1)

        return outputs

In [None]:
# Model hyperparameters
INPUT_DIM = len(russian_vocab)
OUTPUT_DIM = len(english_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Seed Python's and PyTorch's RNGs before building the model, so that weight
# initialization, dropout masks and the teacher-forcing draws in
# Seq2Seq.forward start from the same state on every run.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters())
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=english_vocab['<pad>'])

def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Seq2seq model called as ``model(src, trg)``.
        iterator: Loader yielding batches as lists of ``(src, trg)`` tensor pairs.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to flattened scores and targets.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
    """
    model.train()
    src_pad = russian_vocab['<pad>']
    trg_pad = english_vocab['<pad>']
    running_loss = 0

    for batch in iterator:
        src_seqs, trg_seqs = zip(*batch)
        # Pad each side to the longest sequence in the batch: (seq_len, batch).
        src = nn.utils.rnn.pad_sequence(src_seqs, padding_value=src_pad).to(device)
        trg = nn.utils.rnn.pad_sequence(trg_seqs, padding_value=trg_pad).to(device)

        optimizer.zero_grad()
        scores = model(src, trg)

        # Row 0 of the scores is never filled by the model, so the loss skips
        # the first position on both sides.
        flat_scores = scores[1:].view(-1, scores.shape[-1])
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch over ``iterator`` without updating the model.

    Args:
        model: Seq2seq model called as ``model(src, trg, 0)``, i.e. with
            teacher forcing turned off.
        iterator: Loader yielding batches as lists of ``(src, trg)`` tensor pairs.
        criterion: Loss applied to flattened scores and targets.
    """
    model.eval()
    epoch_loss = 0

    # Evaluation needs no gradients; without no_grad() every batch would build
    # an autograd graph and waste memory.
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src, trg = zip(*batch)
            src = nn.utils.rnn.pad_sequence(src, padding_value=russian_vocab['<pad>']).to(device)
            trg = nn.utils.rnn.pad_sequence(trg, padding_value=english_vocab['<pad>']).to(device)

            output = model(src, trg, 0)

            # Skip position 0, which the model never predicts.
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# Number of passes over the training loader.
N_EPOCHS = 2
# Gradient-norm ceiling handed to train() for clipping.
CLIP = 1

# Train and print the mean per-batch training loss after each epoch.
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    print('Loss ', train_loss)

Loss  7.292766580581665
Loss  6.851080514907837


In [None]:
# Mean per-batch loss on the held-out test loader.
test_loss = evaluate(model, test_loader, criterion)
print(test_loss)

7.170359771728515


In [None]:
# Display the module tree of the model.
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(58411, 256)
    (rnn): GRU(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(28647, 256)
    (rnn): GRU(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=28647, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def translate_sentence(model, sentence, src_vocab, trg_vocab, src_tokenizer, device, max_len=50):
    """Greedily translate one sentence and return the predicted target tokens.

    Args:
        model: Object exposing ``encoder`` and ``decoder`` sub-modules.
        sentence: Raw source sentence.
        src_vocab: Source token-to-index mapping.
        trg_vocab: Target vocabulary supporting ``[...]`` and ``lookup_token``.
        src_tokenizer: Callable splitting ``sentence`` into tokens.
        device: Device the input tensors are moved to.
        max_len: Maximum number of decoding steps.

    Returns:
        List of predicted token strings without ``<bos>`` and without ``<eos>``.
        If ``<eos>`` is not produced within ``max_len`` steps, all ``max_len``
        predictions are returned.
    """
    model.eval()

    tokens = ['<bos>'] + list(src_tokenizer(sentence)) + ['<eos>']
    src_indexes = [src_vocab[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    eos_idx = trg_vocab['<eos>']
    trg_indexes = [trg_vocab['<bos>']]

    # Pure inference: no autograd graph is needed for encoding or decoding.
    with torch.no_grad():
        hidden = model.encoder(src_tensor)

        for _ in range(max_len):
            trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

            output, hidden = model.decoder(trg_tensor, hidden)

            pred_token = output.argmax(1).item()
            # Stop before storing <eos>. The result is then sliced only at the
            # front, so the last real token survives when <eos> never appears.
            if pred_token == eos_idx:
                break
            trg_indexes.append(pred_token)

    # Drop the leading <bos>
    return [trg_vocab.lookup_token(idx) for idx in trg_indexes[1:]]

# Translate dataset samples and print the results
def translate_and_print(model, dataset, src_vocab, trg_vocab, src_tokenizer, device, n_sentences=10):
    """Print source, model translation and reference for the first samples of ``dataset``.

    Args:
        model: Model handed to ``translate_sentence``.
        dataset: Indexable dataset of ``(src_ids, trg_ids)`` tensor pairs.
        src_vocab: Source vocabulary supporting ``lookup_token``.
        trg_vocab: Target vocabulary supporting ``lookup_token``.
        src_tokenizer: Tokenizer handed to ``translate_sentence``.
        device: Device handed to ``translate_sentence``.
        n_sentences: How many samples to show; capped at ``len(dataset)``.
    """
    model.eval()
    # Sequence markers are left out of the decoded text: they are not part of
    # the sentence, and translate_sentence adds its own <bos>/<eos> to the source.
    markers = {'<pad>', '<bos>', '<eos>'}

    def ids_to_text(ids, vocab):
        words = (vocab.lookup_token(idx.item()) for idx in ids)
        return ' '.join(word for word in words if word not in markers)

    # Capping the count avoids an IndexError on datasets shorter than n_sentences.
    for i in range(min(n_sentences, len(dataset))):
        src_sentence, trg_sentence = dataset[i]
        src_sentence_text = ids_to_text(src_sentence, src_vocab)
        trg_sentence_text = ids_to_text(trg_sentence, trg_vocab)

        translation = translate_sentence(model, src_sentence_text, src_vocab, trg_vocab, src_tokenizer, device)
        translation_sentence = ' '.join(translation)

        print(f'Исходное предложение: {src_sentence_text}')
        print(f'Перевод: {translation_sentence}')
        print(f'Ожидаемый перевод: {trg_sentence_text}')
        print()

# Show model translations for the first ten test pairs (Russian -> English).
translate_and_print(model, test_dataset, russian_vocab, english_vocab, russian_tokenizer, device, n_sentences=10)

Исходное предложение: он окончательно убедился в том , что отец решил не противодействовать естественному ходу событий иисус был <unk> решимости не прибегать к каким-либо из своих возможностей <unk> , верховного главы своей вселенной , ради собственного спасения .
Перевод: the , , the the of of the , , the , of the , of the , , the , of the , of the , of the , of the , of the , of the , of the , of the , of the , of
Ожидаемый перевод: he was at last convinced that the father intended to allow natural events to take their course he was fully determined to employ none of his sovereign power as the supreme head of a universe to save himself .

Исходное предложение: эти два человека имеют много общего , в том числе <unk> и желание <unk> американскую мощь .
Перевод: the the of the the of the the of the the of the the of the and the of the the of the and the of the the of the and the of the the of the and the of the the of the and the of the the
Ожидаемый перевод: these two men share much , 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import random

# Model hyperparameters
INPUT_DIM = len(russian_vocab)
OUTPUT_DIM = len(english_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Seed Python's and PyTorch's RNGs before building the model, so that weight
# initialization, dropout masks and the teacher-forcing draws in
# Seq2Seq.forward start from the same state on every run.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# A fresh, untrained model and optimizer are created here.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters())
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=english_vocab['<pad>'])

def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Seq2seq model called as ``model(src, trg)``.
        iterator: Loader yielding batches as lists of ``(src, trg)`` tensor pairs.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to flattened scores and targets.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
    """
    model.train()
    src_pad = russian_vocab['<pad>']
    trg_pad = english_vocab['<pad>']
    running_loss = 0

    for batch in iterator:
        src_seqs, trg_seqs = zip(*batch)
        # Pad each side to the longest sequence in the batch: (seq_len, batch).
        src = nn.utils.rnn.pad_sequence(src_seqs, padding_value=src_pad).to(device)
        trg = nn.utils.rnn.pad_sequence(trg_seqs, padding_value=trg_pad).to(device)

        optimizer.zero_grad()
        scores = model(src, trg)

        # Row 0 of the scores is never filled by the model, so the loss skips
        # the first position on both sides.
        flat_scores = scores[1:].view(-1, scores.shape[-1])
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch over ``iterator`` with gradients disabled.

    Args:
        model: Seq2seq model called as ``model(src, trg, 0)``, i.e. with
            teacher forcing turned off.
        iterator: Loader yielding batches as lists of ``(src, trg)`` tensor pairs.
        criterion: Loss applied to flattened scores and targets.
    """
    model.eval()
    src_pad = russian_vocab['<pad>']
    trg_pad = english_vocab['<pad>']
    batch_losses = []

    with torch.no_grad():
        for batch in iterator:
            src_seqs, trg_seqs = zip(*batch)
            src = nn.utils.rnn.pad_sequence(src_seqs, padding_value=src_pad).to(device)
            trg = nn.utils.rnn.pad_sequence(trg_seqs, padding_value=trg_pad).to(device)

            scores = model(src, trg, 0)

            # Skip position 0, which the model never predicts.
            flat_scores = scores[1:].view(-1, scores.shape[-1])
            flat_targets = trg[1:].view(-1)

            batch_losses.append(criterion(flat_scores, flat_targets).item())

    return sum(batch_losses) / len(iterator)

# Number of passes over the training loader.
N_EPOCHS = 2
# Gradient-norm ceiling handed to train() for clipping.
CLIP = 1

# Train and print the mean per-batch training loss after each epoch.
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    print('Loss ', train_loss)

# Translate a single sentence
def translate_sentence(model, sentence, src_vocab, trg_vocab, src_tokenizer, device, max_len=50):
    """Greedily translate one sentence and return the predicted target tokens.

    Args:
        model: Object exposing ``encoder`` and ``decoder`` sub-modules.
        sentence: Raw source sentence.
        src_vocab: Source token-to-index mapping.
        trg_vocab: Target vocabulary supporting ``[...]`` and ``lookup_token``.
        src_tokenizer: Callable splitting ``sentence`` into tokens.
        device: Device the input tensors are moved to.
        max_len: Maximum number of decoding steps.

    Returns:
        List of predicted token strings without ``<bos>`` and without ``<eos>``.
        If ``<eos>`` is not produced within ``max_len`` steps, all ``max_len``
        predictions are returned.
    """
    model.eval()

    tokens = ['<bos>'] + list(src_tokenizer(sentence)) + ['<eos>']
    src_indexes = [src_vocab[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    eos_idx = trg_vocab['<eos>']
    trg_indexes = [trg_vocab['<bos>']]

    # Pure inference: no autograd graph is needed for encoding or decoding.
    with torch.no_grad():
        hidden = model.encoder(src_tensor)

        for _ in range(max_len):
            trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

            output, hidden = model.decoder(trg_tensor, hidden)

            pred_token = output.argmax(1).item()
            # Stop before storing <eos>. The result is then sliced only at the
            # front, so the last real token survives when <eos> never appears.
            if pred_token == eos_idx:
                break
            trg_indexes.append(pred_token)

    # Drop the leading <bos>
    return [trg_vocab.lookup_token(idx) for idx in trg_indexes[1:]]

# Translate dataset samples and print the results
def translate_and_print(model, dataset, src_vocab, trg_vocab, src_tokenizer, device, n_sentences=10):
    """Print source, model translation and reference for the first samples of ``dataset``.

    Args:
        model: Model handed to ``translate_sentence``.
        dataset: Indexable dataset of ``(src_ids, trg_ids)`` tensor pairs.
        src_vocab: Source vocabulary supporting ``lookup_token``.
        trg_vocab: Target vocabulary supporting ``lookup_token``.
        src_tokenizer: Tokenizer handed to ``translate_sentence``.
        device: Device handed to ``translate_sentence``.
        n_sentences: How many samples to show; capped at ``len(dataset)``.
    """
    model.eval()
    # Sequence markers are left out of the decoded text: they are not part of
    # the sentence, and translate_sentence adds its own <bos>/<eos> to the source.
    markers = {'<pad>', '<bos>', '<eos>'}

    def ids_to_text(ids, vocab):
        words = (vocab.lookup_token(idx.item()) for idx in ids)
        return ' '.join(word for word in words if word not in markers)

    # Capping the count avoids an IndexError on datasets shorter than n_sentences.
    for i in range(min(n_sentences, len(dataset))):
        src_sentence, trg_sentence = dataset[i]
        src_sentence_text = ids_to_text(src_sentence, src_vocab)
        trg_sentence_text = ids_to_text(trg_sentence, trg_vocab)

        translation = translate_sentence(model, src_sentence_text, src_vocab, trg_vocab, src_tokenizer, device)
        translation_sentence = ' '.join(translation)

        print(f'Исходное предложение: {src_sentence_text}')
        print(f'Перевод: {translation_sentence}')
        print(f'Ожидаемый перевод: {trg_sentence_text}')
        print()

# Example usage: show model translations for the first ten test pairs.
translate_and_print(model, test_dataset, russian_vocab, english_vocab, russian_tokenizer, device, n_sentences=10)

Loss  7.280425794601441
Loss  6.774588759422302
Исходное предложение: он окончательно убедился в том , что отец решил не противодействовать естественному ходу событий иисус был <unk> решимости не прибегать к каким-либо из своих возможностей <unk> , верховного главы своей вселенной , ради собственного спасения .
Перевод: , the the the the the that the is not to the the the of the the , of the , , the the the the of the the , of the the , of the the , of the the . of the . . the . .
Ожидаемый перевод: he was at last convinced that the father intended to allow natural events to take their course he was fully determined to employ none of his sovereign power as the supreme head of a universe to save himself .

Исходное предложение: эти два человека имеют много общего , в том числе <unk> и желание <unk> американскую мощь .
Перевод: , the the the the and of the and and and the the and of the and and the the . of the . . . . . . . . . . . . . . . . . . . . . . . . . .
Ожидаемый перевод: these 