In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter



In [4]:
!python -m spacy download  de_core_news_sm #en_core_web_sm


Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 16.6MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907057 sha256=e789faa78a4ff8ec993b5242bd8a2d9b3626aa3b3e164666bada1dea12d18611
  Stored in directory: /tmp/pip-ephem-wheel-cache-v8257d94/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [2]:
spacy_ger = spacy.load('de_core_news_sm')
spacy_eng = spacy.load('en_core_web_sm')

In [3]:
def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]
def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

In [4]:
german = Field(tokenize=tokenizer_ger, lower=True, init_token='<sos>', eos_token='<eos>')
english = Field(tokenize=tokenizer_eng, lower=True, init_token='<sos>', eos_token='<eos>')

In [5]:
train_data, validation_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                         fields = (german, english))

In [6]:
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

In [28]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_szie, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.embedding =  nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional = True, dropout=p)
        self.fc_hidden = nn.Linear(hidden_size*2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size*2, hidden_size)



    def forward(self, x):
        embedding = self.dropout(self.embedding(x))
        # shape of X is (seq leng, N), N is batch size
        # embedding shape == (seq_length, N, embedding_size(lets say 300))

        encoder_states, (hidden, cell) = self.rnn(embedding) 
        # output shape: (seq_length, Batch, hidden_size)
        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        cell  = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))
        # hidden shape: (2, Batch_size, hidden_szie)
        return encoder_states, hidden, cell


class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(hidden_size*2 + embedding_size, hidden_size, num_layers, dropout=p) # here we are sending the context vectore with the embedding size
        #hidden size of encoder and decoder are the same


        self.energy = nn.Linear(hidden_size*3, 1)
        # Hidden state of the deocder of previous time step + encoder_states
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, encoder_states, hidden, cell):
        # shape of x is (N), we want  (1, N), in encoder we are sending a sentence, whereas in decoder we take a single word input from the predicted output and feed it as inoput
        x = x.unsqueeze(0)
        embedding = self.dropout(self.embedding(x))
        # embedding shape is (1, N, embedding_size)

        sequence_length = encoder_states.shape[0]
        print(hidden.shape)
        h_reshaped = hidden.repeat(sequence_length, 1, 1)# as the decoder hidden state is same for all outputs(enocder states) from the encoder
        print(h_reshaped.shape)
        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
        attention = self.softmax(energy)
        # attention (seq_len, N, 1)

        attention = attention.permute(1, 2, 0)
        #(N,1 , seq_length)
        #print(attention.shape)
        encoder_states = encoder_states.permute(1, 0, 2)
        # (N, seq_length, hidden_size*2)
        #print(encoder_states.shape)
        # (N, 1, hidden_size*2)
        context_vector = torch.bmm(attention, encoder_states).permute(1, 0, 2)

        rnn_input = torch.cat((context_vector, embedding), dim=2)

        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        # shape of output is 1, N, hidden_size
        predictions = self.fc(output)
        # shape of preds is (1, N, length of vocab)
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        # teacher force ratio 50% of the time we send predicted outputs as inputs in the decoder and 50% of the time we send the actual values to the decoder inputs
        batch_size = source.shape[1] #()
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to("cuda")
        encoder_states, hidden, cell = self.encoder(source)

        x = target[0] # get start token
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            # output has the final predcitons, outputs contain the whole sentences
            outputs[t] = output

            best_guess = output.argmax(1)

            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [None]:
Blog to understand the attention mechanism implementation better
https://towardsdatascience.com/attaining-attention-in-deep-learning-a712f93bdb1e

In [17]:
# Training

In [29]:
num_epochs = 20
learning_rate = 0.001
batch_size = 64
load_model = False
device = torch.device('cuda')
input_size_encoder = len(german.vocab)
output_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024 # used in seq2seq paper

num_layers = 1
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard
writer = SummaryWriter(f'runs/loss_plot')
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, validation_data, test_data), batch_size=batch_size, sort_within_batch=True, sort_key = lambda x: len(x.src), device=device)
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(output_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

if load_model:
    load_checkpoint(torch.load('my_checkpoint.pth.ptar'), model, optimizer)

  "num_layers={}".format(dropout, num_layers))


In [17]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/f2/17/e7c588245aece7aa93f360894179374830daf60d7ed0bbb59332de3b3b61/torchtext-0.6.0-py3-none-any.whl (64kB)
[K     |████████████████████████████████| 71kB 7.6MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 19.7MB/s 
Installing collected packages: sentencepiece, torchtext
  Found existing installation: torchtext 0.3.1
    Uninstalling torchtext-0.3.1:
      Successfully uninstalled torchtext-0.3.1
Successfully installed sentencepiece-0.1.94 torchtext-0.6.0


In [40]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys


def translate_sentence(model, sentence, german, english, device, max_length=50):
    # print(sentence)

    # sys.exit()

    # Load german tokenizer
    spacy_ger = spacy.load("de_core_news_sm")

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # print(tokens)

    # sys.exit()
    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        o, hidden, cell = model.encoder(sentence_tensor)

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, o, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]


def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

In [44]:
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."
for epoch in range(num_epochs):
    print(f'Epoch [{epoch}/{num_epochs}]')

    checkpoint = {'state_dict':model.state_dict(), 'optimizer':optimizer.state_dict()}
    save_checkpoint(checkpoint)
    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()
    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)
        output = model(inp_data, target)

        # output_shape: (trg_len, batch_size, output_dim )

        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=1)
        optimizer.step()

        writer.add_scalar('Trainig Loss', loss, global_step=step)
        step += 1

Epoch [0/20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'boat', 'carrying', 'several', 'men', 'is', 'pulled', 'to', 'shore', 'by', 'a', 'large', 'team', '.', 'horses', '.', '<eos>']




Epoch [1/20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'boat', 'carrying', 'several', 'men', 'is', 'pulled', 'to', 'shore', 'by', 'a', 'large', 'team', 'of', 'horses', '.', '<eos>']
Epoch [2/20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'boat', 'carrying', 'several', 'men', 'is', 'pulled', 'to', 'shore', 'by', 'a', 'large', 'team', 'of', 'horses', '.', '<eos>']
Epoch [3/20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'boat', 'carrying', 'several', 'men', 'pulled', 'pulled', 'shore', 'by', 'shore', 'by', 'a', 'large', 'team', '.', '<eos>']
Epoch [4/20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'boat', 'carrying', 'several', 'men', 'is', 'pulled', 'to', 'shore', 'by', 'a', 'large', 'team', 'of', 'horses', '.', '<eos>']
Epoch [5/20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'boat', 'carrying', 'several', 'men', 'is', 'pulled', 'to', 'shore', 'by', 'a', 'large', 'team', 'of', 'horses', '.', '<eos>']
Epoch [6/20]


In [43]:
score = bleu(test_data, model, german, english, device)
print(f"Bleu score {score*100:.2f}")

Bleu score 22.07


In [None]:
# Attention theory
