In [1]:
'''dahyun+darwin = dahwin'''

'dahyun+darwin = dahwin'

In [2]:
!pip install spacy==2.3.4



In [38]:
# Fetch the English and German spaCy models by their short names. The recorded
# output below shows `en` resolving to en_core_web_sm 2.3.1.
!python -m spacy download en
!python -m spacy download de

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.3 MB/s eta 0:00:01
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-py3-none-any.whl size=12047087 sha256=f29dce5fbaf6e5ff044beea4c41550e60bdf55f0efc89c7a3b37b17b93f74e0e
  Stored in directory: /tmp/pip-ephem-wheel-cache-tffcbwvk/wheels/19/d6/1c/5484b95647df5d7afaf74abde458c66c1cd427e69e801fe826
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.5.0
    Uninstalling en-core-web-sm-3.5.0:
      Successfully uninstalled en-core-web-sm-3.5.0
Successfully installed en-core-web-sm-2.3.1
[38;5;2m✔ Download and installation

In [1]:
import spacy
# Check that both model packages can be loaded. The results are not stored;
# only the last expression (the German pipeline) is echoed as the cell output.
spacy.load('en_core_web_sm')
spacy.load('de_core_news_sm')

<spacy.lang.de.German at 0x7f3e48387250>

In [2]:
# Load the pipelines through the short names "de" / "en" (the same names used
# by the download commands). Both variables are reassigned later in the
# notebook using the full package names.
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")

In [None]:
# NOTE(review): these are the same two download commands as the earlier
# download cell; this copy looks redundant — confirm before removing.
!python -m spacy download en
!python -m spacy download de

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

# German and English spaCy pipelines; in this cell only their `.tokenizer`
# is used (see tokenize_ger / tokenize_eng).
spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")


def tokenize_ger(text):
    """Split German ``text`` into a list of token strings.

    Runs only the tokenizer of the module-level ``spacy_ger`` pipeline and
    returns the ``.text`` of every token it yields, in order.
    """
    pieces = []
    for token in spacy_ger.tokenizer(text):
        pieces.append(token.text)
    return pieces


def tokenize_eng(text):
    """Split English ``text`` into a list of token strings.

    Runs only the tokenizer of the module-level ``spacy_eng`` pipeline and
    returns the ``.text`` of every token it yields, in order.
    """
    pieces = []
    for token in spacy_eng.tokenizer(text):
        pieces.append(token.text)
    return pieces


# Text pipelines for each language: tokenize with spaCy, lowercase, and wrap
# every sequence in <sos> ... <eos> markers.
german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")

english = Field(
    tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)

# Multi30k split into train / validation / test. The ".de" side is paired with
# the `german` field and the ".en" side with the `english` field; later code
# reads them as `batch.src` and `batch.trg` respectively.
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)

# Vocabularies are built from the training split only, with max_size=10000 and
# min_freq=2 passed to build_vocab for both languages.
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

class Encoder(nn.Module):
    """Embeds a batch of source token indices and runs it through a multi-layer LSTM.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        embedding_size: width of each embedding vector.
        hidden_size: LSTM hidden-state width.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed to
            the LSTM as its ``dropout`` argument.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )

    def forward(self, x):
        """Return the LSTM's final ``(hidden, cell)`` state for index tensor ``x``.

        The per-step LSTM outputs are discarded; only the final state is
        returned.
        """
        token_vectors = self.embedding(x)
        rnn_input = self.dropout(token_vectors)
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state
        return hidden, cell

        
        
class Decoder(nn.Module):
    """Single-step LSTM decoder: one token index per batch element in, one score row out.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: width of each embedding vector.
        hidden_size: LSTM hidden-state width.
        output_size: width of the final linear projection (one score per output class).
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed to
            the LSTM as its ``dropout`` argument.
    """

    def __init__(self, input_size, embedding_size, hidden_size,
                 output_size, num_layers, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one step.

        ``x`` gets a leading dimension of size 1 added before embedding, and
        that dimension is squeezed off the projected output again.

        Returns:
            ``(predictions, hidden, cell)``: the projected scores and the
            updated LSTM state.
        """
        step_input = x.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_input))
        rnn_out, next_state = self.rnn(step_vectors, (hidden, cell))
        hidden, cell = next_state
        predictions = self.fc(rnn_out).squeeze(0)
        return predictions, hidden, cell
    
        
        
class SeqSeq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module whose call returns ``(hidden, cell)`` for a source batch.
        decoder: module called as ``decoder(x, hidden, cell)`` returning
            ``(scores, hidden, cell)``; it must expose its output projection
            as ``decoder.fc`` (an ``nn.Linear``), as the ``Decoder`` class in
            this file does.
    """

    def __init__(self,encoder,decoder):
        super(SeqSeq,self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self,source,target,teacher_force_ratio=0.5):
        """Run the encoder once, then decode ``target.shape[0] - 1`` steps.

        Args:
            source: source batch; dimension 1 is read as the batch size.
            target: target batch; dimension 0 is read as the target length and
                ``target[0]`` is fed to the decoder as the first input.
            teacher_force_ratio: probability, per step, of feeding the ground
                truth token ``target[t]`` as the next input instead of the
                decoder's own arg-max prediction.

        Returns:
            Tensor of shape ``(target_len, batch_size, vocab_size)``. Row 0 is
            never written and stays all zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the output width from the decoder itself and allocate on the
        # input's device, so the module does not depend on notebook globals
        # (`english`, `device`) being defined before it is called.
        target_vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )
        hidden,cell = self.encoder(source)

        # The first decoder input is the first row of the target (the start token).
        x = target[0]
        for t in range(1,target_len):
            output, hidden,cell = self.decoder(x,hidden,cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            # Teacher forcing: sometimes feed the true token, otherwise the prediction.
            x = target[t] if random.random() < teacher_force_ratio else best_guess
        return outputs

# ---- Training hyperparameters ----
num_epochs = 60
learning_rate = 0.001
batch_size = 64

# ---- Model hyperparameters ----
# Set load_model to True to restore model and optimizer state before training.
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Encoder embeds German tokens; decoder embeds and predicts English tokens,
# so its input and output sizes are both the English vocabulary size.
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
# hidden_size and num_layers are shared by the encoder and decoder; the
# encoder's final (hidden, cell) state is handed directly to the decoder.
hidden_size = 1024
num_layers = 4
enc_dropout = 0.5
dec_dropout = 0.5
# Batches are sorted internally by source-sentence length (sort_key).
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data,valid_data,test_data),
    batch_size = batch_size,
    sort_within_batch= True,
    sort_key = lambda x : len(x.src),
    device= device)

encoder_net = Encoder(input_size_encoder,encoder_embedding_size,
                      hidden_size,num_layers,enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder,decoder_embedding_size,
                      hidden_size,output_size,num_layers,dec_dropout).to(device)

model = SeqSeq(encoder_net,decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Positions holding the English <pad> index are excluded from the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
# Batch counter; incremented in the training loop but not read in this cell.
step = 0
# NOTE(review): the extension '.pth.ptar' looks like a typo for '.pth.tar' —
# confirm it matches the filename that utils.save_checkpoint writes.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
if load_model :
    load_checkpoint(torch.load('my_checkpoint.pth.ptar'),model,optimizer)
# Fixed German example that is translated at the start of every epoch.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."
# Main training loop: per epoch, checkpoint, translate the example sentence,
# then run one pass over the training iterator.
for epoch in range(num_epochs):
    print(f'epoch[{epoch}/ {num_epochs}]')
    # The checkpoint is taken at the START of the epoch, so it holds the
    # weights from before this epoch's updates; the weights produced by the
    # final epoch are not saved by this loop.
    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)
    # Switch to eval mode for the example translation.
    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    # Back to train mode for the optimisation pass.
    model.train()
    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)
        output = model(inp_data,target)
        # Drop index 0 along the first dimension (SeqSeq.forward never writes
        # outputs[0]) and flatten so each row of `output` lines up with one
        # entry of `target` for the loss.
        output = output[1:].reshape(-1,output.shape[2])
        target = target[1:].reshape(-1)
        optimizer.zero_grad()
        loss = criterion(output,target)
        # Backpropagate the loss.
        loss.backward()

        # Clip the total gradient norm to 1 to guard against exploding
        # gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Apply the parameter update.
        optimizer.step()
        step +=1

# The training loop's last mode switch is model.train(), so dropout would
# still be active here. Switch to eval mode before scoring, the same way the
# loop does before translate_sentence.
model.eval()
# NOTE(review): the slice [1:100] skips test example 0 — confirm that is intended.
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")




epoch[0/ 60]
=> Saving checkpoint
Translated example sentence: 
 ['web', 'one', 'one', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'swim', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'swim', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'swim', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds', 'holds']
epoch[1/ 60]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'man', 'is', 'a', 'a', 'a', 'shirt', 'is', 'a', 'a', 'a', '.', 'a', '.', '.', '<eos>']
epoch[2/ 60]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'young', 'dog', 'is', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '.', '<eos>']
epoch[3/ 60]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'couple', 'is', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
epoch[4/ 60]
=> Saving checkpoint
Tra

In [None]:
# Print the installed PyTorch version.
import torch
print(torch.__version__)


In [None]:
from torchtext.data.metrics import bleu_score