In [None]:
!python -m spacy download de

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 23.7MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=b94b4acc0dd6b8974cceb18d06534f5c55e6904c41c9951c68717cecdcc133bd
  Stored in directory: /tmp/pip-ephem-wheel-cache-9nf3ahf9/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

from pathlib import Path

# Load the spaCy pipelines registered under the shortcut names 'de' and 'en'.
# The tagger, parser and NER components are disabled at load time; the rest of
# this cell only uses these objects to split text into tokens.
spacy_de = spacy.load('de', disable=['tagger', 'parser', 'ner'])
spacy_en = spacy.load('en', disable=['tagger', 'parser', 'ner'])

def tokenize_de(text):
    """Split a German string into a list of token strings via ``spacy_de.tokenizer``."""
    pieces = []
    for token in spacy_de.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenize_en(text):
    """Split an English string into a list of token strings via ``spacy_en.tokenizer``."""
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces

# Seed every random number generator used in this notebook so that weight
# initialisation, dropout and batch shuffling are repeatable across
# "Restart & Run All" executions.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Source (German) field: lower-cased tokens wrapped in <sos>/<eos>.
# batch_first = False keeps tensors time-major, which is the layout the model
# code below indexes (sequence length on dimension 0).
SRC = Field(tokenize = tokenize_de,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True,
            batch_first = False)

# Target (English) field, configured the same way as the source field.
TRG = Field(tokenize = tokenize_en,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True,
            batch_first = False)

# Multi30k German/English splits (fetched on first use, see the cell output).
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (SRC, TRG))

# Vocabularies are built from the training split only; min_freq = 2 leaves out
# tokens that occur fewer than twice.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

# Padding indices are needed for the key-padding masks and for the loss.
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Upper bound on the number of greedy decoding steps in translate_sentence.
max_len=100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
     batch_size = 512,
     device = device)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a time-major embedding tensor.

    A table of shape ``(max_len, 1, d_model)`` is precomputed once and stored as
    the buffer ``pe`` (it moves with the module and is saved in the state dict,
    but is not a trainable parameter). Even feature columns hold sines and odd
    columns hold cosines of ``position * div_term``.

    Args:
        d_model: size of the feature dimension; may be even or odd.
        dropout: dropout probability applied after the positions are added.
        max_len: number of positions precomputed; inputs passed to ``forward``
            must not be longer than this along dimension 0.
    """

    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so the
        # cosine block is trimmed to fit; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Insert a broadcastable batch axis: (max_len, d_model) -> (max_len, 1, d_model).
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:len(x)])``.

        ``x`` is sliced against along dimension 0, so it is expected to be
        time-major with the feature dimension last, e.g. ``(seq_len, batch, d_model)``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Encoder-decoder Transformer for token-to-token sequence transduction.

    Source and target token ids are embedded, given sinusoidal positions, run
    through ``nn.Transformer`` and projected to target-vocabulary logits.
    All tensors are time-major (sequence length on dimension 0).

    Args:
        intoken: size of the source vocabulary.
        outtoken: size of the target vocabulary (also the logit dimension).
        hidden: embedding / model width passed to ``nn.Transformer`` as ``d_model``.
        nlayers: number of encoder layers and of decoder layers.
        dropout: dropout probability for the positional encoders and the transformer.
        nhead: number of attention heads (defaults to the previously fixed value 8).
    """

    def __init__(self, intoken, outtoken, hidden, nlayers=3, dropout=0.1, nhead=8):
        super(TransformerModel, self).__init__()

        self.encoder = nn.Embedding(intoken, hidden)
        self.pos_encoder = PositionalEncoding(hidden, dropout)
        self.decoder = nn.Embedding(outtoken, hidden)
        self.pos_decoder = PositionalEncoding(hidden, dropout)
        # NOTE(review): these two scale factors are stored but never applied in
        # forward(); they are kept only so existing attribute access keeps working.
        self.inscale = math.sqrt(intoken)
        self.outscale = math.sqrt(outtoken)

        self.transformer = nn.Transformer(d_model=hidden, nhead=nhead, num_encoder_layers=nlayers, num_decoder_layers=nlayers, dim_feedforward=2048, dropout=dropout, activation='relu')
        self.fc_out = nn.Linear(hidden, outtoken)

        # src_mask and memory_mask stay None (no attention restriction);
        # trg_mask is a lazily built, cached causal mask.
        self.src_mask = None
        self.trg_mask = None
        self.memory_mask = None

    def generate_square_subsequent_mask(self, sz, device=None):
        """Return an ``(sz, sz)`` float mask with ``-inf`` strictly above the diagonal and 0 elsewhere.

        ``device`` optionally places the mask directly on the given device.
        """
        mask = torch.triu(torch.ones(sz, sz, device=device), 1)
        mask = mask.masked_fill(mask==1, float('-inf'))
        return mask

    def make_len_mask(self, inp, index):
        """Return a boolean mask, transposed on the first two axes, that is True where ``inp == index``."""
        return (inp == index).transpose(0, 1)

    def forward(self, src, trg):
        """Compute target-vocabulary logits for every position of ``trg``.

        ``src`` and ``trg`` are integer id tensors with the sequence on dimension 0.
        Padding positions are located with the module-level ``SRC_PAD_IDX`` and
        ``TRG_PAD_IDX`` constants.
        """
        # Rebuild the cached causal mask when the target length OR the device
        # changes; a length-only check would reuse a mask living on the wrong device.
        if (self.trg_mask is None
                or self.trg_mask.size(0) != len(trg)
                or self.trg_mask.device != trg.device):
            self.trg_mask = self.generate_square_subsequent_mask(len(trg), device=trg.device)

        src_pad_mask = self.make_len_mask(src, SRC_PAD_IDX)
        trg_pad_mask = self.make_len_mask(trg, TRG_PAD_IDX)

        src = self.encoder(src)
        src = self.pos_encoder(src)
        trg = self.decoder(trg)
        trg = self.pos_decoder(trg)
        output = self.transformer(src, trg, src_mask=self.src_mask, tgt_mask=self.trg_mask, memory_mask=self.memory_mask,
                                  src_key_padding_mask=src_pad_mask, tgt_key_padding_mask=trg_pad_mask, memory_key_padding_mask=src_pad_mask)
        output = self.fc_out(output)

        return output


# Instantiate the translation model with hidden size 512 and 3 encoder/decoder
# layers, sized to the source/target vocabularies, then move it to `device`.
model = TransformerModel(INPUT_DIM, OUTPUT_DIM, hidden=512, nlayers=3)
model.to(device)

# Adam with a constant learning rate; no scheduler is set up here.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
# Alternative optimizer left disabled by the author (Over9000 is not imported in this cell).
#optimizer = Over9000(model.parameters())

def count_parameters(model):
    """Return the total number of elements across parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the model size once, right after construction.
print(f'The model has {count_parameters(model):,} trainable parameters')


# Cross-entropy over target tokens; positions whose label is TRG_PAD_IDX are
# excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

def train(model, optimizer, criterion, iterator):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Each batch must expose time-major ``src`` and ``trg`` tensors. The model is
    fed the target without its last step and is scored against the target
    without its first step (teacher forcing). Gradients are clipped to a total
    norm of 1.0 before every optimizer step.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg
        decoder_input = target[:-1, :]   # everything except the final step
        expected = target[1:, :]         # everything except the first step

        optimizer.zero_grad()

        logits = model(source, decoder_input)
        vocab_size = logits.shape[-1]

        # Flatten (time, batch, vocab) -> (time * batch, vocab) for the criterion.
        flat_logits = logits.contiguous().view(-1, vocab_size)
        flat_expected = expected.contiguous().view(-1)
        loss = criterion(flat_logits, flat_expected)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)
        
def translate_sentence(sentence):
    """Greedily translate one German sentence with the module-level ``model``.

    Args:
        sentence: either a raw string (tokenised with ``spacy_de`` and
            lower-cased) or an iterable of token strings (lower-cased as given).

    Returns:
        The list of predicted target tokens without the leading init token.
        The end-of-sequence token is included when it was produced within
        ``max_len`` decoding steps.

    Decoding uses the same causal target mask as training, so the hidden states
    of already generated positions do not attend to later positions. The model
    is switched to eval mode for the duration of the call (dropout off) and its
    previous train/eval mode is restored afterwards.
    """
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in spacy_de(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [SRC.init_token] + tokens + [SRC.eos_token]
    src_indexes = [SRC.vocab.stoi[token] for token in tokens]
    src = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    eos_index = TRG.vocab.stoi[TRG.eos_token]
    out_indexes = [TRG.vocab.stoi[TRG.init_token]]

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # The source is encoded once and reused for every decoding step.
            memory = model.transformer.encoder(model.pos_encoder(model.encoder(src)))

            for _ in range(max_len):
                trg_tensor = torch.LongTensor(out_indexes).unsqueeze(1).to(device)
                # Without this mask the decoder would see "future" tokens at
                # earlier positions, unlike during training.
                trg_mask = model.generate_square_subsequent_mask(len(out_indexes)).to(device)

                decoded = model.transformer.decoder(
                    model.pos_decoder(model.decoder(trg_tensor)), memory, tgt_mask=trg_mask)
                output = model.fc_out(decoded)

                # Only the prediction for the last position extends the sequence.
                out_token = output.argmax(2)[-1].item()
                out_indexes.append(out_token)
                if out_token == eos_index:
                    break
    finally:
        model.train(was_training)

    trg_tokens = [TRG.vocab.itos[i] for i in out_indexes]

    return (trg_tokens[1:])

def example(example_idx):
    """Print source, reference and predicted tokens for one training example.

    Puts the module-level ``model`` into eval mode, reads the example at
    ``example_idx`` from ``train_data`` and translates its source tokens with
    ``translate_sentence``.
    """
    model.eval()

    fields = vars(train_data.examples[example_idx])
    src, trg = fields['src'], fields['trg']

    print()
    print(f'Source: {src}')
    print(f'Target: {trg}')
    prediction = translate_sentence(src)
    print(f'Pred:   {prediction}')

def evaluate(model, criterion, iterator, example_indices=(10, 8, 6)):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    Args:
        model: module called as ``model(src, trg_input)`` returning logits.
        criterion: loss applied to flattened logits and flattened targets.
        iterator: sized iterable of batches exposing time-major ``src`` / ``trg``.
        example_indices: indices of training examples to print via ``example``
            after the loss pass, as a qualitative progress check. Pass an empty
            sequence to skip printing. Note that ``example`` translates with the
            module-level ``model``, not necessarily the ``model`` argument.

    Returns:
        The sum of per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src = batch.src
            trg = batch.trg

            # Feed the target without its last step, score against it without its first.
            output = model(src, trg[:-1,:])

            output_dim = output.shape[-1]

            loss = criterion(output.contiguous().view(-1, output_dim), trg[1:,:].contiguous().view(-1))

            epoch_loss += loss.item()

        for example_idx in example_indices:
            example(example_idx)

    return epoch_loss / len(iterator)
    
    
def epoch_time(start_time, end_time):
    """Split the interval ``end_time - start_time`` (seconds) into whole minutes and leftover whole seconds."""
    total = end_time - start_time
    mins = int(total / 60)
    secs = int(total - 60 * mins)
    return mins, secs

# Number of full passes over the training iterator.
N_EPOCHS = 20

# Lowest validation loss seen so far; used to decide when to checkpoint.
best_valid_loss = float('inf')

# Uncomment to resume from a previously saved checkpoint.
#model.load_state_dict(torch.load('g2p_model.pt'))

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, optimizer, criterion, train_iterator)
    # evaluate() also prints a few example translations as a side effect.
    valid_loss = evaluate(model, criterion, valid_iterator)

    epoch_mins, epoch_secs = epoch_time(start_time, time.time())

    # Keep only the weights of the best epoch according to validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'g2p_model.pt')

    # Perplexity is reported as exp(mean loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

# Restore the best checkpoint written by the training loop. map_location makes
# the load work even when the weights were saved on a GPU and this runs on CPU.
model.load_state_dict(torch.load('g2p_model.pt', map_location=device))
model.eval()
test_loss = evaluate(model, criterion, test_iterator)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')


# Translate two free-form German sentences as a qualitative check.
s1 = 'Ein Hund schläft friedlich.'
s2 = 'Du würdest all das kriegen, sprichst du es laut aus.'

for sentence in (s1, s2):
    s = [tok.text.lower() for tok in spacy_de.tokenizer(sentence)]

    print (translate_sentence(s))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 1.04MB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 274kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 267kB/s]


The model has 32,133,381 trainable parameters

Source: ['eine', 'ballettklasse', 'mit', 'fünf', 'mädchen', ',', 'die', 'nacheinander', 'springen', '.']
Target: ['a', 'ballet', 'class', 'of', 'five', 'girls', 'jumping', 'in', 'sequence', '.']
Pred:   ['a', 'woman', 'is', 'sitting', 'on', 'a', 'large', 'street', '.', '<eos>']

Source: ['eine', 'frau', 'mit', 'einer', 'großen', 'geldbörse', 'geht', 'an', 'einem', 'tor', 'vorbei', '.']
Target: ['a', 'woman', 'with', 'a', 'large', 'purse', 'is', 'walking', 'by', 'a', 'gate', '.']
Pred:   ['a', 'man', 'is', 'sitting', 'on', 'a', 'large', 'street', '.', '<eos>']

Source: ['ein', 'mann', 'lächelt', 'einen', 'ausgestopften', 'löwen', 'an', '.']
Target: ['a', 'man', 'is', 'smiling', 'at', 'a', 'stuffed', 'lion']
Pred:   ['a', 'man', 'is', 'sitting', 'on', 'a', 'man', 'in', 'a', 'man', '.', '<eos>']
Epoch: 01 | Time: 0m 46s
	Train Loss: 4.957 | Train PPL: 142.101
	 Val. Loss: 3.881 |  Val. PPL:  48.462

Source: ['eine', 'ballettklasse', 'mit', 'f