# Machine Translation: Russian-to-English & English-to-Russian

This project utilizes RNNs as well as a pre-trained GloVe embeddings for the purposes of machine translation.

In [None]:
# importing necessary libraries
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import time
import math

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Model 1: Russian-to-English

Base code was taken from Author: Sean Robertson <https://github.com/spro/practical-pytorch>_

In [None]:
# to create a unique index per word
# SOS: start of sentence
# EOS: end of sentence

SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [None]:
# turn a Unicode string to plain ASCII
# https://stackoverflow.com/a/518232/2809427

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# make lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Zа-яА-Я.!?]+", r" ", s) # added cyrillic letters
    return s

In [None]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('/content/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')[:2]] for l in lines] # added a [:2] since there were
    #copywrite notes on each line

    # Reverse pairs if needed
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [None]:
# trim the data since there are a lot of examples and we don't want this to take forever
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i'm ",
    "he is", "he's ",
    "she is", "she's ",
    "you are", "you're ",
    "we are", "we're ",
    "they are", "they're "
)


# filters
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [None]:
# full data preparation function:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse) #read the data
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs) # filter and trim
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0]) # make pairs into lists
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'rus', True)
print(random.choice(pairs))

Reading lines...
Read 479223 sentence pairs
Trimmed to 3879 sentence pairs
Counting words...
Counted words:
rus 3309
eng 1735
['ты сел не на тот поезд .', 'you are on the wrong train .']


In [None]:
#ENCODER - outputs some value for every word from the input sentence.
#For every input word the encoder outputs a
#vector and a hidden state, and uses the hidden state for the next input word.
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
#DECODER - takes the encoder output vector(s) and outputs a sequence of words to create the translation.
# using attention decoder which focuses on parts of outputs
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
# TRAINING prep - adding indexes and marking EOS
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [None]:
# TRAIN function - randomly if are gonna use
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
# to keep track of evapsed time
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

#plotting results
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [None]:
# ITERACTIONS OF TRAIN
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
# EVALUATION (similar to training but no targets - just decoder output)
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [None]:
# random evaluation
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
# TRAIN AND EVALUATE MODEL 1
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

1m 28s (- 20m 38s) (5000 6%) 2.8988
2m 51s (- 18m 37s) (10000 13%) 2.1663
4m 20s (- 17m 22s) (15000 20%) 1.6673
5m 46s (- 15m 52s) (20000 26%) 1.2768
7m 12s (- 14m 24s) (25000 33%) 0.9662
8m 37s (- 12m 56s) (30000 40%) 0.7064
10m 3s (- 11m 30s) (35000 46%) 0.5060
11m 28s (- 10m 2s) (40000 53%) 0.3697
12m 53s (- 8m 35s) (45000 60%) 0.2466
14m 18s (- 7m 9s) (50000 66%) 0.1874
15m 43s (- 5m 43s) (55000 73%) 0.1184
17m 7s (- 4m 16s) (60000 80%) 0.0844
18m 32s (- 2m 51s) (65000 86%) 0.0611
19m 57s (- 1m 25s) (70000 93%) 0.0549
21m 21s (- 0m 0s) (75000 100%) 0.0588


In [None]:
evaluateRandomly(encoder1, attn_decoder1)

> я объясняю правила .
= i am explaining the rules .
< i am explaining the rules . <EOS>

> сеичас у него дела лучше чем раньше .
= he is better off than he was .
< he is better off than he was . <EOS>

> он влиятелен .
= he is influential .
< he is powerful . <EOS>

> у нее нет стыда .
= she is shameless .
< she is shameless . <EOS>

> он пытается совмещать две работы .
= he is trying to maintain two jobs .
< he is trying to maintain two jobs . <EOS>

> он сегодня дома .
= he is at home today .
< he is at home today . <EOS>

> он красивыи .
= he is handsome .
< he is handsome . <EOS>

> боюсь ваш план не будет работать .
= i am afraid your plan will not work .
< i am afraid your plan will not work . <EOS>

> боюсь я не могу тебе помочь .
= i am afraid i can t help you .
< i am afraid i can t help you . <EOS>

> он слишком мал чтобы купаться одному .
= he is too young to go swimming alone .
< he is too young to go swimming alone . <EOS>



## Model 2: English-to-Russian with pretrained embeddings

In [None]:
# had to change this function since the order is changed
def filterPair(p):
    return len(p[1].split(' ')) < MAX_LENGTH and \
        len(p[0].split(' ')) < MAX_LENGTH and \
        p[0].startswith(eng_prefixes)

input_lang, output_lang, pairs = prepareData('eng', 'rus') # not reversed
print(random.choice(pairs))

Reading lines...
Read 479223 sentence pairs
Trimmed to 3879 sentence pairs
Counting words...
Counted words:
eng 1735
rus 3309
['you aren t any better than me .', 'ты ничуть не лучше меня .']


In [None]:
# installing GloVe

#!python -m spacy download en_core_web_lg

In [None]:
import spacy

# load spaCy's pre-trained GloVe model (glove.6B.100d)
nlp = spacy.load("en_core_web_lg")

# vocabulary size
vocab_size = len(nlp.vocab.vectors)

#  list to store the GloVe vectors
glove_vectors = []

for word in nlp.vocab.vectors.keys():
    vector = nlp.vocab.vectors[word]
    glove_vectors.append(vector)

# convert to a PyTorch tensor
pretrained_embeddings = torch.tensor(glove_vectors)

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, pretrained_embeddings=None, freeze_embeddings=True):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        if pretrained_embeddings is not None:
            # initialize the embedding layer with pre-trained embeddings
            self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=freeze_embeddings)
        else:
            # if no pre-trained embeddings are provided, create a new embedding layer
            self.embedding = nn.Embedding(input_size, hidden_size)

        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
pretrained_embeddings.shape

torch.Size([514157, 300])

In [None]:
# decoder - same
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
hidden_size = 300
# create the encoder with pre-trained embeddings
encoder2 = EncoderRNN(vocab_size, hidden_size, pretrained_embeddings, freeze_embeddings=True).to(device)
# decoder
attn_decoder2 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
# training iterations
trainIters(encoder2, attn_decoder2, 75000, print_every=5000)

1m 22s (- 19m 18s) (5000 6%) 3.5674
2m 39s (- 17m 14s) (10000 13%) 2.7551
3m 55s (- 15m 42s) (15000 20%) 2.1678
5m 13s (- 14m 23s) (20000 26%) 1.7951
6m 31s (- 13m 3s) (25000 33%) 1.4954
7m 50s (- 11m 45s) (30000 40%) 1.2399
9m 8s (- 10m 27s) (35000 46%) 1.0820
10m 28s (- 9m 10s) (40000 53%) 0.9288
11m 46s (- 7m 51s) (45000 60%) 0.8330
13m 6s (- 6m 33s) (50000 66%) 0.7654
14m 23s (- 5m 14s) (55000 73%) 0.6608
15m 42s (- 3m 55s) (60000 80%) 0.5884
17m 0s (- 2m 37s) (65000 86%) 0.5802
18m 20s (- 1m 18s) (70000 93%) 0.5337
19m 38s (- 0m 0s) (75000 100%) 0.4862


In [None]:
evaluateRandomly(encoder2, attn_decoder2)


> you aren t my mother .
= вы мне не мать .
< вы мне не мать . <EOS>

> she is in a bad mood .
= она в дурном настроении .
< у нее плохое настроение . <EOS>

> she is about your age .
= она примерно вашего возраста .
< она примерно твоего возраста . <EOS>

> we are authorized to use force if necessary .
= в случае необходимости нам разрешено применять силу .
< в случае необходимости нам разрешено применять силу . <EOS>

> she is anxious about her safety .
= она беспокоится о ее безопасности .
< она беспокоится о своеи безопасности . <EOS>

> he isn t at home is he ?
= его что нет дома ?
< его ведь нет дома ? <EOS>

> he is an old friend of mine .
= он мои старыи друг .
< он мои старыи друг . <EOS>

> he is roasting coffee beans .
= он обжаривает кофеиные зерна .
< он обжаривает кофеиные зерна . <EOS>

> i am concerned about his poor health .
= меня беспокоит его слабое здоровье .
< меня беспокоит его слабое здоровье . <EOS>

> we aren t doing anything right now .
= мы ничем не заняты с

## Input 5 well formed sentences from the English vocab to Model 2, and input the resultant translated sentences to Model 1. Display all model outputs in each case.


In [None]:
# evaluating second model
def evaluateAndShowAttentionModel2(input_sentence):
    output_words, attentions = evaluate(
        encoder2, attn_decoder2, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))


evaluateAndShowAttentionModel2("we are sorry for his mistake .")

evaluateAndShowAttentionModel2("you are a beautiful woman .")

evaluateAndShowAttentionModel2("you are too late .")

evaluateAndShowAttentionModel2("you aren t like the other kids .")

input = we are sorry for his mistake .
output = мы сожалеем о его ошибке . <EOS>
input = you are a beautiful woman .
output = ты красивая женщина . <EOS>
input = you are too late .
output = ты опоздал . <EOS>
input = you aren t like the other kids .
output = ты не такая как другие дети . <EOS>


In [None]:
# evaluating first model based on second model's output
def evaluateAndShowAttentionModel1(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))

# back to eng filters
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)

# reverse languages back
input_lang, output_lang, pairs = prepareData('eng', 'rus', True)

Reading lines...
Read 479223 sentence pairs
Trimmed to 3879 sentence pairs
Counting words...
Counted words:
rus 3309
eng 1735


In [None]:
evaluateAndShowAttentionModel1("мы сожалеем о его ошибке .")

evaluateAndShowAttentionModel1("вы красивая женщина .")

evaluateAndShowAttentionModel1("ты опоздал .")

evaluateAndShowAttentionModel1("вы не такие как другие дети .")

input = мы сожалеем о его ошибке .
output = we are sorry for his mistake . <EOS>
input = вы красивая женщина .
output = you are a beautiful woman . <EOS>
input = ты опоздал .
output = you are late . <EOS>
input = вы не такие как другие дети .
output = you aren t like the other kids . <EOS>


## Results and Assumptions

Knowing both languages, we could tell that both models perform rather well. Some differences between input and output that were observed over multiple evalutions weren't significant and still mostly remain the main meaning of the sentence. For instance, sometimes the casual word for "you" (ты) would be substituted by the formal version (вы) and vice-versa which isn't really a mistake.

Finally, through evaluations we have noticed that the input data is not perfect: there are incomplete sentence pairs that are just cropped in the middle, for instance. This is another thing that we should keep in mind when assessing the models.