In [None]:
%matplotlib inline

In [None]:
!pip3 install torchtext==0.8.1 spacy==3.0.0

In [None]:
!python -m spacy download en_core_web_sm
!python -m spacy download ru_core_news_sm

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator, Example, Field, Dataset
from torchtext.data.metrics import bleu_score
import spacy 

In [None]:
def translate_sentence(model, sentence, english, russian, device, max_length=50):
    spacy_eng = spacy.load("en_core_web_sm")
    spacy_rus = spacy.load("ru_core_news_sm")

    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_eng(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens.insert(0, english.init_token)
    tokens.append(english.eos_token)

    text_to_indices = [english.vocab.stoi[token] for token in tokens]

    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    outputs = [russian.vocab.stoi["<sos>"]]
    for i in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == russian.vocab.stoi["<eos>"]:
            break

    translated_sentence = [russian.vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:]


In [None]:
import re
import pandas as pd

df = pd.read_csv("rus.txt", sep='\t', header=None).drop(2, axis=1)
df = df.rename(columns={0: "text", 1: "label"})
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-z ]+', '', x.lower()))
df['label'] = df['label'].apply(lambda x: re.sub(r'[^а-яё ]+', '', x.lower()))
df.sample(5)

Unnamed: 0,text,label
241863,tom told mary he despised her,том сказал мэри что презирает её
112438,i have to do something,мне надо коечто сделать
100680,lets all go together,пойдём все вместе
291986,didnt you know tom lived with us,ты не знал что том живёт с нами
305331,he cut his sister a piece of bread,он отрезал сестре кусок хлеба


In [None]:
class DataFrameDataset(Dataset):
    def __init__(self, df: pd.DataFrame, fields: list):
        super(DataFrameDataset, self).__init__(
            [Example.fromlist(list(r), fields) for i, r in df.iterrows()], 
            fields
        )

In [None]:
spacy_eng = spacy.load("en_core_web_sm")
spacy_rus = spacy.load("ru_core_news_sm")

def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


def tokenize_rus(text):
    return [tok.text for tok in spacy_rus.tokenizer(text)]


english = Field(
    tokenize=tokenize_eng, 
    lower=True,
    init_token="<sos>",
    eos_token="<eos>"
)

russian = Field(
    tokenize=tokenize_rus,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>"
)

df["text"] = df['text'].apply(lambda x: english.preprocess(x))
df["label"] = df['label'].apply(lambda x: russian.preprocess(x))

train_dataset, test_dataset = DataFrameDataset(
    df=df, 
    fields=(('src', english), ('trg', russian))
).split()

english.build_vocab(train_dataset, max_size=10000, min_freq=2)
russian.build_vocab(train_dataset, max_size=10000, min_freq=2)

In [None]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape
        trg_seq_length = trg_seq_length

        src_positions = (
            torch.arange(0, src_seq_length)
            .unsqueeze(1)
            .expand(src_seq_length, N)
            .to(self.device)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
            .to(self.device)
        )
        try:
          embed_src1 = self.src_word_embedding(src)
          embed_src2 = self.src_position_embedding(src_positions)
          embed_src = self.dropout(embed_src1 + embed_src2)
          embed_trg1 = self.trg_word_embedding(trg)
          embed_trg2 = self.trg_position_embedding(trg_positions)
          embed_trg = self.dropout(embed_trg1 + embed_trg2)
        except IndexError as e:
          print(src)
          print(embed_src1.shape)
          print(src_positions.shape)
          print(embed_src2.shape)
          raise e

        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length)
        
        trg_mask = trg_mask.to(self.device)

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = 'cpu'
num_epochs = 100
learning_rate = 3e-4
batch_size = 32

src_vocab_size = len(english.vocab)
trg_vocab_size = len(russian.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 105
forward_expansion = 4
src_pad_idx = english.vocab.stoi["<pad>"]

step = 0

train_iterator, test_iterator = BucketIterator.splits(
    (train_dataset, test_dataset),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)




In [None]:
sentence = "let me put that somewhere safe"
print(sentence)
tokens = [token.text.lower() for token in spacy_eng(sentence)]
print(tokens)
text_to_indices = [english.vocab.stoi[token] for token in tokens]
print(text_to_indices)
sentence_tensor = torch.LongTensor(text_to_indices)
print(sentence_tensor)
s = sentence_tensor.unsqueeze(1)
print(s)
print(device)
s.to(device)

let me put that somewhere safe
['let', 'me', 'put', 'that', 'somewhere', 'safe']
[142, 15, 178, 11, 802, 552]
tensor([142,  15, 178,  11, 802, 552])
tensor([[142],
        [ 15],
        [178],
        [ 11],
        [802],
        [552]])
cpu


tensor([[142],
        [ 15],
        [178],
        [ 11],
        [802],
        [552]])

In [None]:
sentence = "let me put that somewhere safe"

num_epochs = 10

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, english, russian, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()
    losses = []

    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)
        output = model(inp_data, target[:-1, :])
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        step += 1

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

[Epoch 0 / 10]
Translated example sentence: 
 ['позвольте', 'мне', 'положить', 'это', 'куданибудь', 'в', 'какоенибудь', 'место', '<eos>']




[Epoch 1 / 10]
Translated example sentence: 
 ['давай', 'я', 'положу', 'это', 'гденибудь', 'в', 'безопасности', '<eos>']
[Epoch 2 / 10]
Translated example sentence: 
 ['давай', 'я', 'положу', 'это', 'гденибудь', 'в', 'безопасности', '<eos>']
[Epoch 3 / 10]
Translated example sentence: 
 ['позвольте', 'мне', 'положить', 'это', 'гдето', 'в', 'какоенибудь', 'надёжное', 'место', '<eos>']
[Epoch 4 / 10]
Translated example sentence: 
 ['давай', 'я', 'положу', 'это', 'гденибудь', 'в', 'безопасности', '<eos>']
[Epoch 5 / 10]
Translated example sentence: 
 ['давай', 'я', 'положу', 'это', 'гденибудь', 'в', 'какоенибудь', 'место', '<eos>']
[Epoch 6 / 10]
Translated example sentence: 
 ['давайте', 'я', 'положу', 'это', 'в', 'какоенибудь', 'надёжное', 'место', '<eos>']
[Epoch 7 / 10]
Translated example sentence: 
 ['давай', 'я', 'положу', 'это', 'гдето', 'в', 'какоенибудь', 'надёжное', 'место', '<eos>']
[Epoch 8 / 10]
Translated example sentence: 
 ['давайте', 'я', 'положу', 'это', 'в', 'какоенибуд

In [117]:
def bleu(data, model, english, russian, device):
    targets = []
    outputs = []
    i = 0
    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]
        
        prediction = translate_sentence(model, src, english, russian, device)
        prediction = prediction[:-1]
        trg = list(filter(lambda x: x != ' ', trg))
        prediction = list(filter(lambda x: x != ' ', prediction))
        i += 1
        if i % 10 == 0:
          print("Original: ", src)
          print("Translate: ", trg)
          print("Prediction: ", prediction)
          print("---" * 5)

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)




In [118]:
test_for_blue = test_dataset[1:100]

In [119]:
score = bleu(test_for_blue, model, english, russian, device)
print(f"Bleu score {score * 100:.2f}")

Original:  ['were', 'you', 'with', 'anyone']
Translate:  ['ты', 'был', 'с', 'кемто']
Prediction:  ['вы', 'были', 'с', 'кемто']
---------------
Original:  ['i', 'did', 'nt', 'know', 'that', 'tom', 'was', 'a', 'friend', 'of', 'yours']
Translate:  ['я', 'не', 'знал', 'что', 'том', 'твой', 'друг']
Prediction:  ['я', 'не', 'знал', 'что', 'том', 'твой', 'друг']
---------------
Original:  ['what', 'are', 'you', 'looking', 'for']
Translate:  ['что', 'вы', 'ищете']
Prediction:  ['что', 'вы', 'ищете']
---------------
Original:  ['i', 'just', 'do', 'nt', 'want', 'tom', 'to', 'win']
Translate:  ['я', 'просто', 'не', 'хочу', 'чтобы', 'том', 'выиграл']
Prediction:  ['я', 'просто', 'не', 'хочу', 'чтобы', 'том', 'победил']
---------------
Original:  ['please', 'do', 'nt', 'judge', 'me']
Translate:  ['пожалуйста', 'не', 'суди', 'меня']
Prediction:  ['пожалуйста', 'не', 'судить', 'ко', 'мне']
---------------
Original:  ['the', 'teacher', 'told', 'me', 'to', 'stand', 'up']
Translate:  ['учитель', 'сказал