# Testing code

# Imports and setup

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings

warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [2]:
import os
import os.path as osp
import random

import mlflow
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm_notebook

from nlpclass.config import model_config
from nlpclass.data.data_utils import TranslationDataset, text_collate_func
from nlpclass.models.evaluation_utils import bleu_eval, output_to_translations
from nlpclass.models.models import DecoderRNN, EncoderCNN, EncoderRNN, TranslationModel
from nlpclass.models.training_utils import load_data

In [3]:
# Data and model directories are siblings of the directory the kernel runs in.
CURRENT_PATH = os.getcwd()
DATA_DIR, MODEL_DIR = (
    osp.join(CURRENT_PATH, '..', folder) for folder in ('data', 'models')
)

# Data

In [6]:
# Load the 'vi' dataset with a batch size of 16. `load_data` is imported from
# nlpclass.models.training_utils; its three return values are unpacked here.
# `data` and `data_loaders` are indexed by split name ('train', 'dev') below.
data, data_loaders, max_length = load_data('vi', batch_size=16)

pidor
Counting words...
Counted words:
eng 35151
vi 43279
pidor
Counting words...
Counted words:
eng 3562
vi 3678
pidor
Counting words...
Counted words:
eng 3361
vi 3518


In [5]:
# Show `n_words` of the training split's input language; the same value is
# passed as the first argument to the encoders built below.
data['train'].input_lang.n_words

35151

In [20]:
# Build a translation model from a CNN encoder and an RNN decoder without
# attention, all moved to model_config.device.
# NOTE(review): the following cell rebinds `encoder`, `decoder` and
# `translation_model`, so on a top-to-bottom run this model is replaced
# before it is used.
encoder = EncoderCNN(
    data['train'].input_lang.n_words,
    embedding_size=100,
    hidden_size=128,
    num_layers=2).to(model_config.device)
decoder = DecoderRNN(
    embedding_size=100,
    hidden_size=128,
    output_size=data['train'].output_lang.n_words,
    attention=False).to(model_config.device)
translation_model = TranslationModel(encoder, decoder, teacher_forcing_ratio=0.5).to(model_config.device)

In [7]:
# Build a translation model from a single-layer, unidirectional RNN encoder
# and an RNN decoder with attention enabled.
encoder = EncoderRNN(
    data['train'].input_lang.n_words,
    embedding_size=100,
    hidden_size=128,
    num_layers=1,
    dropout=0.0,
    bidirectional=False).to(model_config.device)
# The decoder hidden size is doubled when the encoder is bidirectional.
# Presumably the encoder then emits concatenated forward/backward states --
# TODO confirm against EncoderRNN.
if encoder.bidirectional:
    multiplier = 2
else:
    multiplier = 1
decoder = DecoderRNN(
    embedding_size=100,
    hidden_size=multiplier * 128,
    output_size=data['train'].output_lang.n_words,
    attention=True).to(model_config.device)
translation_model = TranslationModel(encoder, decoder, teacher_forcing_ratio=0.5).to(model_config.device)

In [20]:
# Adam over the parameters that require gradients only, learning rate 1e-3.
optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, translation_model.parameters()), 1e-3)

In [21]:
# Per-class weights for the loss: one entry per decoder output class, all 1
# except the PAD token, which is weighted 0 so targets equal to PAD do not
# contribute to the cross-entropy.
weight = torch.ones(translation_model.decoder.output_size).to(model_config.device)
weight[model_config.PAD_token] = 0
criterion = nn.CrossEntropyLoss(weight)

In [22]:
def calc_loss(logits, target, criterion):
    """Flatten logits and targets to one row per position and apply ``criterion``.

    Args:
        logits: tensor whose last dimension indexes the classes; all leading
            dimensions are merged into a single one.
        target: tensor of class indices with one entry per leading position
            of ``logits``.
        criterion: callable invoked as ``criterion(flat_logits, flat_target)``.

    Returns:
        Whatever ``criterion`` returns for the flattened inputs.
    """
    num_classes = logits.size(-1)
    # reshape() instead of view(): view() raises on non-contiguous tensors
    # (e.g. after a transpose), reshape() copies only when it has to.
    logits_flat = logits.reshape(-1, num_classes)
    # reshape(-1) always yields a 1-D tensor. The previous
    # view(-1, 1).squeeze() collapsed a single-element target to 0-d, which
    # no longer matches the (1, num_classes) logits.
    target_flat = target.reshape(-1)
    return criterion(logits_flat, target_flat)

In [8]:
def evaluate(model, data, data_loaders, dataset_type='dev', max_batch=100, greedy=True):
    """Compute the mean loss and BLEU score of ``model`` on one data split.

    Args:
        model: translation model. Calling it on a batch returns
            ``(logits, loss)``; it also exposes ``greedy(batch)`` and
            ``beam_search(batch)`` for decoding.
        data: mapping of split name to dataset; ``data['train']`` is passed to
            ``output_to_translations`` to turn token ids into strings.
        data_loaders: mapping of split name to an iterable of batches.
        dataset_type: key of the split to evaluate.
        max_batch: maximum number of batches to evaluate.
        greedy: decode with ``model.greedy`` when true, otherwise with
            ``model.beam_search``.

    Returns:
        Tuple ``(mean_loss, bleu)`` where ``mean_loss`` is the average of the
        per-batch losses over the batches actually evaluated.

    Raises:
        ValueError: if no batch was evaluated (empty loader or
            ``max_batch <= 0``).
    """
    # Remember the current mode so evaluation does not silently flip a model
    # that the caller had in eval mode.
    was_training = model.training
    model.eval()
    epoch_loss = 0
    num_batches = 0
    original_strings = []
    translated_strings = []
    try:
        with torch.no_grad():
            for i, batch in enumerate(data_loaders[dataset_type]):
                # `>=` so that at most `max_batch` batches are evaluated.
                if i >= max_batch:
                    break
                _, loss = model(batch)
                epoch_loss += loss.item()
                original = output_to_translations(batch['target'], data['train'])
                decoded = model.greedy(batch) if greedy else model.beam_search(batch)
                translations = output_to_translations(decoded, data['train'])
                original_strings.extend(original)
                translated_strings.extend(translations)
                num_batches += 1
    finally:
        # Restore the mode even if a batch raised.
        model.train(was_training)

    if num_batches == 0:
        raise ValueError(
            "evaluate() processed no batches for dataset_type=%r" % (dataset_type,))

    bleu = bleu_eval(original_strings, translated_strings)
    # Divide by the number of batches that contributed to epoch_loss. The loop
    # index over-counts by one after an early break.
    return epoch_loss / num_batches, bleu

In [27]:
def _squared_grad_norm(module):
    """Return the sum of squared L2 gradient norms over ``module``'s parameters.

    Parameters whose ``grad`` is ``None`` (frozen, or not reached by the last
    backward pass) are skipped instead of raising ``AttributeError``.
    """
    return sum(p.grad.data.norm(2).item() ** 2
               for p in module.parameters() if p.grad is not None)


# Train for 25 epochs. Each step prints the model's own loss next to the loss
# recomputed with `calc_loss`/`criterion`, plus the squared gradient norms of
# the encoder and decoder, to debug gradient flow.
for epoch in tqdm_notebook(range(25)):
    translation_model.train()
    for batch in data_loaders['train']:
        optimizer.zero_grad()
        logits, total_loss = translation_model(batch)
        loss = calc_loss(logits, batch['target'], criterion)
        # Print scalars rather than tensor reprs to keep the output readable.
        print(total_loss.item(), loss.item())
        # Gradients come from the model's own loss; `loss` is for comparison only.
        total_loss.backward()

        encoder_norm = _squared_grad_norm(translation_model.encoder)
        decoder_norm = _squared_grad_norm(translation_model.decoder)
        print(encoder_norm, decoder_norm, loss.item())

        # NOTE(review): other training cells clip at model_config.grad_norm;
        # the literal 5.0 is kept here to leave this experiment unchanged.
        clip_grad_norm_(filter(lambda p: p.requires_grad,
                               translation_model.parameters()), 5.0)

        optimizer.step()
    # evaluate() returns (mean loss, BLEU), not sentences.
    train_loss, train_bleu = evaluate(translation_model, data, data_loaders, dataset_type='train')
    print(epoch, train_loss, train_bleu)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

tensor([[0.0178, 0.0299, 0.0279, 0.0209, 0.0313, 0.0487, 0.0495, 0.0530, 0.0559,
         0.0278, 0.0396, 0.0406, 0.0126, 0.0104, 0.0072, 0.0139, 0.0226, 0.0143,
         0.0253, 0.0175, 0.0113, 0.0120, 0.0283, 0.0371, 0.0397, 0.0187, 0.0081,
         0.0078, 0.0216, 0.0296, 0.0339, 0.0433, 0.0236, 0.0107, 0.0172, 0.0250,
         0.0198, 0.0454],
        [0.0039, 0.0045, 0.0032, 0.0056, 0.0041, 0.0023, 0.0029, 0.0025, 0.0038,
         0.0028, 0.0059, 0.0041, 0.0034, 0.0020, 0.0043, 0.0034, 0.0016, 0.0017,
         0.0036, 0.0050, 0.0038, 0.0041, 0.0045, 0.0075, 0.0078, 0.0047, 0.0035,
         0.0065, 0.0049, 0.0025, 0.0031, 0.0093, 0.1445, 0.1445, 0.1445, 0.1445,
         0.1445, 0.1445],
        [0.0104, 0.0117, 0.0089, 0.0181, 0.0156, 0.0093, 0.0060, 0.0091, 0.0062,
         0.0074, 0.0048, 0.0050, 0.0055, 0.0031, 0.0035, 0.0097, 0.0072, 0.0065,
         0.0043, 0.0092, 0.0116, 0.0135, 0.0168, 0.0084, 0.0080, 0.0069, 0.0159,
         0.0689, 0.0689, 0.0689, 0.0689, 0.0689, 0.0689, 

tensor([[1.6955e-02, 4.0828e-02, 2.6771e-02, 1.9567e-02, 2.2242e-02, 2.2175e-02,
         3.3467e-02, 4.3353e-02, 4.7657e-02, 1.5557e-02, 1.0926e-02, 1.5398e-02,
         1.5612e-02, 3.1233e-02, 1.8142e-02, 3.7405e-02, 2.5010e-02, 3.4925e-02,
         4.9825e-02, 1.4334e-02, 1.2760e-02, 2.5728e-02, 2.2370e-02, 3.1930e-02,
         4.8781e-02, 3.0399e-02, 1.4082e-02, 1.4360e-02, 3.4218e-02, 2.8623e-02,
         3.0006e-02, 2.1926e-02, 2.9836e-02, 1.7514e-02, 3.4986e-02, 2.4111e-02,
         1.9339e-02, 1.7648e-02],
        [1.8187e-06, 1.0767e-05, 1.6130e-05, 6.0763e-06, 3.2572e-05, 1.7123e-05,
         8.6613e-05, 5.4379e-05, 4.6865e-05, 2.5996e-05, 7.5788e-06, 1.4984e-05,
         1.6891e-05, 2.0006e-05, 2.6881e-05, 3.8706e-05, 8.6586e-05, 2.0278e-05,
         6.6601e-05, 4.3422e-05, 1.8475e-05, 3.6610e-05, 1.4308e-05, 4.7337e-06,
         4.4923e-06, 1.7141e-05, 3.2138e-05, 4.7986e-05, 1.3696e-05, 1.2116e-05,
         7.1856e-05, 2.7436e-05, 1.6651e-01, 1.6651e-01, 1.6651e-01, 1.6651

tensor([[1.3313e-02, 4.1978e-02, 2.5596e-02, 1.7426e-02, 3.1908e-02, 4.0889e-02,
         5.4039e-02, 4.2625e-02, 7.2849e-02, 2.7589e-02, 2.9437e-02, 4.2778e-02,
         3.3649e-02, 1.8882e-02, 1.8224e-02, 1.7892e-02, 2.1130e-02, 1.0672e-02,
         3.6611e-02, 2.6269e-02, 1.7791e-02, 1.2718e-02, 1.5997e-02, 2.1096e-02,
         4.5963e-02, 3.2608e-02, 1.3917e-02, 1.8944e-02, 2.6032e-02, 1.3769e-02,
         1.6787e-02, 1.8026e-02, 1.5002e-02, 1.6636e-02, 1.7056e-02, 2.0959e-02,
         1.9805e-02, 3.3138e-02],
        [2.9749e-07, 1.8685e-06, 2.2356e-06, 5.8928e-07, 3.1444e-06, 2.2443e-06,
         1.2811e-05, 8.4800e-06, 2.9257e-06, 2.3668e-06, 8.7499e-07, 2.8556e-06,
         2.3113e-06, 2.4818e-06, 3.7893e-06, 8.8538e-06, 1.8840e-05, 2.9504e-06,
         6.9301e-06, 5.0203e-06, 2.5325e-06, 4.4422e-06, 1.1000e-06, 3.7364e-07,
         3.6695e-07, 1.4447e-06, 2.7995e-06, 4.5232e-06, 1.0059e-06, 1.3509e-06,
         1.4858e-05, 5.9923e-06, 1.6664e-01, 1.6664e-01, 1.6664e-01, 1.6664

tensor([[0.0173, 0.0122, 0.0249,  ..., 0.0069, 0.0090, 0.0265],
        [0.0141, 0.0177, 0.0091,  ..., 0.0226, 0.2191, 0.2191],
        [0.0014, 0.0030, 0.0025,  ..., 0.0372, 0.0372, 0.0372],
        ...,
        [0.0020, 0.0010, 0.0005,  ..., 0.0168, 0.0168, 0.0168],
        [0.0021, 0.0027, 0.0031,  ..., 0.0161, 0.0161, 0.0161],
        [0.0014, 0.0015, 0.0040,  ..., 0.0158, 0.0158, 0.0158]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.0529e-02, 6.1674e-03, 1.3742e-02,  ..., 1.4650e-02, 2.1039e-02,
         1.8539e-02],
        [2.8670e-04, 3.7566e-04, 1.3280e-04,  ..., 2.5824e-04, 4.9428e-01,
         4.9428e-01],
        [5.1290e-06, 1.9192e-05, 7.1153e-06,  ..., 4.1656e-02, 4.1656e-02,
         4.1656e-02],
        ...,
        [4.7911e-04, 4.6471e-04, 1.0476e-04,  ..., 1.6881e-02, 1.6881e-02,
         1.6881e-02],
        [2.2798e-04, 1.6588e-04, 2.2353e-04,  ..., 1.6359e-02, 1.6359e-02,
         1.6359e-02],
        [4.0945e-04, 7.7806e-04, 1.0453e-03,  ..., 1.

tensor([[2.1160e-02, 1.6668e-02, 1.4867e-02,  ..., 9.2293e-03, 2.3909e-02,
         2.6975e-02],
        [6.0210e-06, 9.1096e-06, 5.4873e-06,  ..., 1.0105e-05, 4.9971e-01,
         4.9971e-01],
        [7.2261e-07, 1.3607e-06, 5.0036e-07,  ..., 4.1666e-02, 4.1666e-02,
         4.1666e-02],
        ...,
        [1.3733e-07, 1.4190e-07, 2.6009e-08,  ..., 1.6949e-02, 1.6949e-02,
         1.6949e-02],
        [1.7346e-07, 6.3598e-08, 1.4792e-07,  ..., 1.6393e-02, 1.6393e-02,
         1.6393e-02],
        [5.0084e-07, 2.2637e-06, 6.2355e-07,  ..., 1.5873e-02, 1.5873e-02,
         1.5873e-02]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.9429e-02, 2.0281e-02, 2.0478e-02,  ..., 8.0282e-03, 1.9631e-02,
         2.4845e-02],
        [4.3789e-06, 4.3787e-06, 4.6433e-06,  ..., 1.4877e-05, 4.9965e-01,
         4.9965e-01],
        [7.2638e-07, 1.3674e-06, 5.0262e-07,  ..., 4.1666e-02, 4.1666e-02,
         4.1666e-02],
        ...,
        [1.3727e-07, 1.4184e-07, 2.5992e-08,  ..., 1.694

0.01166981895618408 40.13528643914675 10.843682289123535
tensor([[2.1713e-02, 1.6232e-02, 3.7439e-02, 1.5637e-02, 1.6018e-02, 8.8628e-03,
         8.1566e-03, 2.9654e-02, 4.8759e-02, 4.2870e-02, 1.9859e-02, 1.4765e-02,
         1.8315e-02, 1.5101e-02, 1.5490e-02, 1.9074e-02, 1.7750e-02, 2.1724e-02,
         1.9332e-02, 3.7856e-02, 2.5240e-02, 2.0531e-02, 1.5720e-02, 1.0279e-02,
         2.1916e-02, 1.5923e-02, 1.0258e-02, 1.0303e-02, 1.1337e-02, 8.3228e-03,
         8.1054e-03, 2.2963e-02, 2.6051e-02, 2.5406e-02, 2.1980e-02, 4.0178e-02,
         2.7271e-02, 4.4933e-02, 3.5021e-02, 2.3861e-02, 2.3630e-02, 1.6608e-02,
         1.6544e-02, 1.9321e-02, 5.3693e-02],
        [1.6475e-03, 3.2124e-03, 2.4896e-03, 3.0954e-03, 3.1298e-03, 3.1995e-03,
         2.9822e-03, 1.9915e-03, 5.1781e-03, 3.5662e-03, 2.2168e-03, 2.4128e-03,
         1.7366e-03, 1.7286e-03, 1.6478e-03, 2.2446e-03, 2.5009e-03, 2.3589e-03,
         3.1292e-03, 1.5129e-03, 1.6629e-03, 1.8920e-03, 4.0306e-03, 3.8173e-03,
      

tensor([[1.9685e-02, 1.3183e-02, 1.9598e-02, 1.9836e-02, 1.7181e-02, 1.4900e-02,
         1.6690e-02, 2.8148e-02, 2.5440e-02, 2.7238e-02, 1.9552e-02, 1.9786e-02,
         2.1391e-02, 1.5051e-02, 1.4527e-02, 1.2652e-02, 1.4355e-02, 1.8903e-02,
         1.9620e-02, 2.6802e-02, 2.7164e-02, 1.9559e-02, 1.4345e-02, 1.6691e-02,
         2.7830e-02, 2.2726e-02, 2.0418e-02, 2.0865e-02, 1.6226e-02, 2.2208e-02,
         2.2534e-02, 2.0821e-02, 2.0925e-02, 2.3120e-02, 1.7589e-02, 2.6172e-02,
         2.5478e-02, 3.0026e-02, 3.3659e-02, 5.9268e-02, 3.3490e-02, 1.5765e-02,
         1.5377e-02, 1.9575e-02, 4.3631e-02],
        [1.5525e-04, 3.2522e-04, 4.5214e-04, 9.2886e-04, 6.2287e-04, 5.7462e-04,
         7.1597e-04, 4.0251e-04, 1.6172e-03, 5.1872e-04, 1.5342e-04, 1.0129e-04,
         1.0105e-04, 2.7666e-04, 2.6941e-04, 2.8127e-04, 1.9547e-04, 1.3503e-04,
         2.1076e-04, 2.6337e-04, 6.3820e-04, 2.9071e-04, 5.4062e-04, 9.4968e-04,
         2.1370e-04, 2.5657e-04, 4.3091e-04, 2.4709e-04, 2.3558

tensor([[3.4196e-02, 2.7809e-02, 2.1866e-02, 1.7080e-02, 3.2840e-02, 4.4256e-02,
         2.3226e-02, 2.5619e-02, 1.9752e-02, 3.2135e-02, 1.8723e-02, 2.5257e-02,
         1.2058e-02, 1.0736e-02, 2.1941e-02, 2.2113e-02, 1.4525e-02, 1.8109e-02,
         3.3717e-02, 2.9190e-02, 3.0467e-02, 3.4626e-02, 2.3508e-02, 1.8094e-02,
         2.0674e-02, 1.6647e-02, 1.6479e-02, 2.9383e-02, 2.3473e-02, 2.7763e-02,
         1.9374e-02, 2.2502e-02, 1.5429e-02, 2.3414e-02, 3.0237e-02, 2.5693e-02,
         1.6230e-02, 1.9338e-02, 2.4229e-02, 1.0603e-02, 1.3589e-02, 1.4459e-02,
         1.3487e-02, 9.9535e-03, 1.5203e-02],
        [3.5047e-07, 9.3028e-07, 2.6509e-06, 5.5354e-06, 6.7656e-06, 1.2997e-06,
         3.5107e-07, 2.0622e-07, 2.6609e-06, 9.9978e-07, 1.8539e-07, 6.9221e-08,
         2.0045e-07, 2.4132e-07, 7.5556e-07, 8.0841e-07, 1.9448e-07, 1.1520e-07,
         2.0334e-07, 5.1364e-07, 4.3549e-06, 2.0419e-06, 2.6703e-06, 2.6568e-06,
         5.1499e-07, 4.4156e-07, 1.3650e-06, 3.9411e-07, 3.6791

tensor([[3.6846e-02, 3.7199e-02, 3.2656e-02, 2.5830e-02, 1.1617e-02, 1.2264e-02,
         1.6164e-02, 2.0563e-02, 2.7326e-02, 2.3215e-02, 8.6060e-03, 1.0602e-02,
         8.6175e-03, 1.9665e-02, 1.3628e-02, 1.6811e-02, 1.6374e-02, 1.6975e-02,
         4.4639e-02, 4.0921e-02, 2.7276e-02, 2.1806e-02, 3.1708e-02, 5.9021e-02,
         3.9875e-02, 3.3414e-02, 5.7256e-02, 1.1258e-02, 1.2407e-02, 1.6071e-02,
         1.6544e-02, 1.0988e-02, 1.2979e-02, 1.9031e-02, 2.2586e-02, 1.4933e-02,
         1.4633e-02, 2.0669e-02, 1.9650e-02, 1.3571e-02, 1.1670e-02, 2.7652e-02,
         2.0815e-02, 1.4074e-02, 9.5944e-03],
        [6.9658e-07, 2.5354e-06, 4.3408e-06, 1.3591e-05, 1.5375e-05, 2.6947e-06,
         8.4956e-07, 4.4073e-07, 3.9264e-06, 1.5975e-06, 5.1041e-07, 1.8737e-07,
         7.4516e-07, 1.2224e-06, 2.2193e-06, 2.4019e-06, 4.8097e-07, 3.2048e-07,
         7.2899e-07, 1.6665e-06, 1.0642e-05, 2.7275e-06, 7.5225e-06, 1.0049e-05,
         1.7562e-06, 7.3062e-07, 2.9898e-06, 1.1744e-06, 7.6914

0.014238175125493056 66.43201263554019 10.943562507629395
tensor([[0.0047, 0.0080, 0.0068,  ..., 0.0124, 0.0072, 0.0173],
        [0.0005, 0.0004, 0.0010,  ..., 0.0157, 0.0157, 0.0157],
        [0.0006, 0.0013, 0.0019,  ..., 0.0143, 0.0143, 0.0143],
        ...,
        [0.0001, 0.0001, 0.0001,  ..., 0.0110, 0.0110, 0.0110],
        [0.0011, 0.0021, 0.0010,  ..., 0.0107, 0.0107, 0.0107],
        [0.0010, 0.0006, 0.0008,  ..., 0.0106, 0.0106, 0.0106]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[6.7326e-03, 1.1554e-02, 7.1633e-03,  ..., 1.5932e-02, 7.0774e-03,
         1.8373e-02],
        [9.9204e-06, 9.9260e-06, 2.3103e-05,  ..., 1.6109e-02, 1.6109e-02,
         1.6109e-02],
        [1.2498e-05, 3.2404e-05, 3.9407e-05,  ..., 1.4696e-02, 1.4696e-02,
         1.4696e-02],
        ...,
        [4.5837e-06, 2.8414e-06, 6.6340e-06,  ..., 1.0988e-02, 1.0988e-02,
         1.0988e-02],
        [1.6131e-05, 2.7140e-05, 7.7705e-06,  ..., 1.0750e-02, 1.0750e-02,
         1.0750e-

tensor([[9.3060e-03, 1.0687e-02, 9.4730e-03,  ..., 9.4005e-03, 1.3128e-02,
         1.1961e-02],
        [4.1963e-09, 1.4166e-08, 1.5159e-07,  ..., 1.6129e-02, 1.6129e-02,
         1.6129e-02],
        [4.5761e-08, 1.5133e-07, 2.0782e-07,  ..., 1.4706e-02, 1.4706e-02,
         1.4706e-02],
        ...,
        [2.7864e-08, 2.6159e-08, 6.3177e-08,  ..., 1.0989e-02, 1.0989e-02,
         1.0989e-02],
        [1.1808e-08, 1.8194e-08, 1.3364e-08,  ..., 1.0753e-02, 1.0753e-02,
         1.0753e-02],
        [2.8129e-08, 1.2930e-08, 2.8805e-09,  ..., 1.0638e-02, 1.0638e-02,
         1.0638e-02]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.6115e-02, 1.0638e-02, 7.6966e-03,  ..., 1.6425e-02, 1.1519e-02,
         7.5430e-03],
        [4.2059e-09, 1.4207e-08, 1.5209e-07,  ..., 1.6129e-02, 1.6129e-02,
         1.6129e-02],
        [4.5925e-08, 1.5177e-07, 2.0853e-07,  ..., 1.4706e-02, 1.4706e-02,
         1.4706e-02],
        ...,
        [2.7940e-08, 2.6237e-08, 6.3386e-08,  ..., 1.098

0.008736864006363351 61.655416119022675 10.885157585144043
tensor([[0.0316, 0.0189, 0.0099,  ..., 0.0092, 0.0121, 0.0327],
        [0.0050, 0.0040, 0.0042,  ..., 0.0422, 0.0422, 0.0422],
        [0.0016, 0.0011, 0.0014,  ..., 0.0452, 0.0452, 0.0452],
        ...,
        [0.0011, 0.0009, 0.0006,  ..., 0.0187, 0.0187, 0.0187],
        [0.0005, 0.0006, 0.0004,  ..., 0.0181, 0.0181, 0.0181],
        [0.0012, 0.0013, 0.0016,  ..., 0.0176, 0.0176, 0.0176]],
       device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.8186e-02, 1.7111e-02, 1.0226e-02,  ..., 1.0557e-02, 1.0574e-02,
         2.8964e-02],
        [1.8747e-03, 1.1378e-03, 5.7164e-04,  ..., 4.7962e-02, 4.7962e-02,
         4.7962e-02],
        [2.6984e-05, 3.4570e-05, 2.0521e-05,  ..., 4.9971e-02, 4.9971e-02,
         4.9971e-02],
        ...,
        [1.9005e-05, 1.2621e-05, 6.8672e-06,  ..., 1.8864e-02, 1.8864e-02,
         1.8864e-02],
        [3.7140e-04, 7.6674e-04, 8.1652e-04,  ..., 1.7973e-02, 1.7973e-02,
         1.7973e

tensor([[2.8111e-02, 4.6608e-02, 2.2225e-02,  ..., 1.1075e-02, 4.8563e-03,
         1.5858e-02],
        [8.6507e-07, 6.1772e-07, 6.4266e-08,  ..., 5.2630e-02, 5.2630e-02,
         5.2630e-02],
        [1.8897e-06, 1.7928e-06, 6.9641e-07,  ..., 4.9998e-02, 4.9998e-02,
         4.9998e-02],
        ...,
        [2.3986e-07, 1.1260e-07, 6.9815e-08,  ..., 1.8868e-02, 1.8868e-02,
         1.8868e-02],
        [1.9214e-07, 7.0507e-08, 3.0979e-07,  ..., 1.8182e-02, 1.8182e-02,
         1.8182e-02],
        [1.9025e-07, 6.9859e-08, 9.1195e-08,  ..., 1.7857e-02, 1.7857e-02,
         1.7857e-02]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[1.5487e-02, 2.1547e-02, 1.9425e-02,  ..., 1.4145e-02, 5.9582e-03,
         9.6775e-03],
        [8.6937e-07, 6.2099e-07, 6.4607e-08,  ..., 5.2630e-02, 5.2630e-02,
         5.2630e-02],
        [1.8930e-06, 1.7959e-06, 6.9740e-07,  ..., 4.9998e-02, 4.9998e-02,
         4.9998e-02],
        ...,
        [2.3983e-07, 1.1258e-07, 6.9800e-08,  ..., 1.886

tensor([[1.7156e-02, 1.5220e-02, 2.2653e-02, 2.6322e-02, 2.6067e-02, 2.8357e-02,
         1.2548e-02, 1.8213e-02, 1.8567e-02, 1.8716e-02, 2.6393e-02, 1.7664e-02,
         1.2987e-02, 2.1902e-02, 2.2993e-02, 3.5777e-02, 2.0519e-02, 1.2748e-02,
         1.4893e-02, 1.5772e-02, 1.3989e-02, 2.1344e-02, 1.4621e-02, 1.5625e-02,
         2.3306e-02, 2.9403e-02, 9.3826e-03, 1.6356e-02, 2.2660e-02, 1.6343e-02,
         3.7938e-02, 3.0974e-02, 3.1476e-02, 1.3196e-02, 1.8901e-02, 2.2262e-02,
         3.0086e-02, 3.1741e-02, 2.5257e-02, 2.5511e-02, 2.5121e-02, 2.7221e-02,
         2.3561e-02, 1.5533e-02, 1.5744e-02, 1.6210e-02, 2.0773e-02],
        [1.1267e-05, 3.6478e-06, 7.3313e-06, 5.8313e-06, 4.3607e-05, 1.9138e-05,
         7.6554e-06, 4.1918e-05, 1.4970e-05, 2.0390e-05, 7.1013e-06, 2.5054e-06,
         2.3775e-06, 9.1048e-06, 8.6329e-06, 3.8339e-06, 7.8131e-06, 1.4537e-05,
         7.1105e-06, 4.3057e-05, 6.2547e-06, 4.8730e-06, 8.8797e-06, 1.7662e-05,
         5.5006e-06, 1.0462e-05, 2.4490

tensor([[8.5326e-03, 1.0944e-02, 2.7620e-02, 4.6681e-02, 2.4839e-02, 1.7298e-02,
         1.5195e-02, 3.9095e-02, 2.1328e-02, 2.3242e-02, 4.0969e-02, 2.8683e-02,
         1.6901e-02, 1.6635e-02, 8.7406e-03, 1.8581e-02, 1.0882e-02, 2.0009e-02,
         1.6149e-02, 1.7708e-02, 2.6450e-02, 2.4228e-02, 1.3641e-02, 1.0599e-02,
         1.0720e-02, 1.1753e-02, 1.2370e-02, 1.6542e-02, 3.0601e-02, 2.5477e-02,
         1.6585e-02, 1.8971e-02, 1.8259e-02, 1.6463e-02, 1.8172e-02, 2.4551e-02,
         3.9067e-02, 5.1839e-02, 4.1442e-02, 2.9388e-02, 1.7885e-02, 1.0937e-02,
         1.3760e-02, 1.7230e-02, 1.1899e-02, 7.3966e-03, 3.3742e-02],
        [2.3506e-06, 7.8099e-07, 1.4248e-06, 1.0884e-06, 8.9808e-06, 4.0621e-06,
         1.5220e-06, 8.4582e-06, 3.2820e-06, 4.2808e-06, 1.4850e-06, 5.4317e-07,
         4.9592e-07, 1.9721e-06, 1.8765e-06, 7.6319e-07, 1.5443e-06, 3.2378e-06,
         1.3758e-06, 8.8859e-06, 1.0816e-06, 9.6613e-07, 1.6526e-06, 3.5345e-06,
         1.0565e-06, 2.1494e-06, 5.4846

tensor([[1.7155e-02, 1.0881e-02, 1.7247e-02, 1.7108e-02, 1.0612e-02, 9.9928e-03,
         1.7621e-02, 1.8804e-02, 1.0500e-02, 2.2818e-02, 2.9512e-02, 1.3961e-02,
         1.0359e-02, 7.3462e-03, 4.3259e-03, 8.4305e-03, 1.6683e-02, 3.1581e-02,
         1.3693e-02, 2.7918e-02, 3.7786e-02, 3.1328e-02, 2.7650e-02, 3.5806e-02,
         2.9716e-02, 1.5179e-02, 1.8233e-02, 1.9268e-02, 2.7385e-02, 1.3834e-02,
         4.5308e-02, 4.1135e-02, 4.6743e-02, 3.4102e-02, 1.7076e-02, 3.9100e-02,
         3.6167e-02, 2.2392e-02, 1.4374e-02, 8.6399e-03, 6.6524e-03, 1.2947e-02,
         2.6466e-02, 2.2826e-02, 1.4926e-02, 1.5833e-02, 2.2579e-02],
        [1.8566e-06, 6.1223e-07, 1.1218e-06, 8.4514e-07, 7.0812e-06, 3.3530e-06,
         1.2002e-06, 6.6495e-06, 2.6246e-06, 3.4285e-06, 1.1602e-06, 4.3019e-07,
         3.9259e-07, 1.5845e-06, 1.5321e-06, 6.2417e-07, 1.2047e-06, 2.6843e-06,
         1.0967e-06, 7.0979e-06, 8.2483e-07, 7.6833e-07, 1.3045e-06, 2.8087e-06,
         8.4659e-07, 1.7340e-06, 4.5082

tensor([[8.0573e-03, 7.9743e-03, 2.2042e-02, 1.6454e-02, 1.4261e-02, 1.1610e-02,
         2.1697e-02, 3.1474e-02, 2.6421e-02, 2.7735e-02, 3.1998e-02, 3.0984e-02,
         4.4118e-02, 2.1627e-02, 2.0193e-02, 1.8413e-02, 1.3261e-02, 1.7066e-02,
         2.0080e-02, 1.7774e-02, 1.8229e-02, 1.9915e-02, 7.2149e-03, 1.9282e-02,
         2.6445e-02, 2.3076e-02, 2.4410e-02, 2.6547e-02, 3.6074e-02, 3.0612e-02,
         1.9653e-02, 1.4109e-02, 3.0014e-02, 2.6080e-02, 2.0034e-02, 2.2691e-02,
         3.1473e-02, 1.9197e-02, 1.1488e-02, 9.8835e-03, 1.1850e-02, 1.3402e-02,
         3.1736e-02, 3.9928e-02, 2.5201e-02, 9.8052e-03, 8.4084e-03],
        [1.8754e-06, 6.1717e-07, 1.1428e-06, 8.5208e-07, 7.1630e-06, 3.4728e-06,
         1.2166e-06, 6.6992e-06, 2.6782e-06, 3.5133e-06, 1.1632e-06, 4.2711e-07,
         3.9242e-07, 1.6167e-06, 1.5836e-06, 6.5004e-07, 1.2211e-06, 2.8116e-06,
         1.1234e-06, 7.2422e-06, 8.3235e-07, 7.8415e-07, 1.3353e-06, 2.8758e-06,
         8.7979e-07, 1.7817e-06, 4.6589

KeyboardInterrupt: 

In [13]:
# Take the first batch yielded by the training loader, decode it greedily,
# and compare against the reference targets: print the BLEU score for the
# batch and the first three (translation, reference) pairs.
x = next(iter(data_loaders['train']))
original = output_to_translations(x['target'], data['train'])
translations = output_to_translations(translation_model.greedy(x), data['train'])
print(bleu_eval(original, translations))
print(list(zip(translations[0:3], original[0:3])))

0.0074967908706895636
[('and the .', 'and you know the amount of work yeah alexey pajitnov was working for the soviet government and thats how he developed tetris and alexey himself reconstructed the whole game and even gave us a simulation of the cathode ray tube that makes it look slightly bombed .'), ('and the .', 'and therefore when you hear about the corrupt africa corruption all the time i want you to know that the people and the governments are trying hard to fight this in some of the countries and that some successes are emerging .'), ('and the .', 'and he says quot it seems as if the ideal argument for most philosophers is you give your audience the premises and then you give them the inferences and the conclusion and if they dont accept the conclusion they die .')]


In [14]:
# Set the model's `beam_size` attribute before calling beam_search below.
# Presumably this is the beam width used by beam_search -- TODO confirm in
# TranslationModel.
translation_model.beam_size = 3

In [15]:
# Same check as the greedy cell above, but decoding with beam_search.
# Relies on the batch `x` bound by an earlier cell.
original = output_to_translations(x['target'], data['train'])
translations = output_to_translations(translation_model.beam_search(x), data['train'])
print(bleu_eval(original, translations))
print(list(zip(translations[0:3], original[0:3])))

0.0074967908706895636
[('and the .', 'and you know the amount of work yeah alexey pajitnov was working for the soviet government and thats how he developed tetris and alexey himself reconstructed the whole game and even gave us a simulation of the cathode ray tube that makes it look slightly bombed .'), ('and the .', 'and therefore when you hear about the corrupt africa corruption all the time i want you to know that the people and the governments are trying hard to fight this in some of the countries and that some successes are emerging .'), ('and the .', 'and he says quot it seems as if the ideal argument for most philosophers is you give your audience the premises and then you give them the inferences and the conclusion and if they dont accept the conclusion they die .')]


In [18]:
# Sanity check: repeatedly optimise on one fixed batch and watch the loss.
x = next(iter(data_loaders['train']))

# Fresh Adam optimizer over trainable parameters, learning rate 1e-2.
# This rebinds the global `optimizer` used by other cells.
optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, translation_model.parameters()), 1e-2)

for i in range(100):
    optimizer.zero_grad()
    # The model returns (logits, loss); only the loss is used here.
    logits, loss = translation_model(x)
    print(loss)
    loss.backward()
    # Clip the gradient norm of trainable parameters to model_config.grad_norm.
    clip_grad_norm_(filter(lambda p: p.requires_grad,
                               translation_model.parameters()), model_config.grad_norm)
    optimizer.step()

tensor(10.6888, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(10.4965, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(9.9548, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(8.5220, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(6.5732, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(5.3038, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(4.4904, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(4.5320, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(4.3945, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(4.3354, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(4.2597, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(4.0287, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(3.8626, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(3.6340, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(4.1306, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(3.9795, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(3.1753, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(3.9226, device='cuda:0

KeyboardInterrupt: 

In [9]:
# Rebuild the model from scratch: same single-layer, unidirectional RNN
# encoder as before, but with decoder attention disabled and a teacher
# forcing ratio of 0.75. Rebinds `encoder`, `decoder` and `translation_model`.
encoder = EncoderRNN(
    data['train'].input_lang.n_words,
    embedding_size=100,
    hidden_size=128,
    num_layers=1,
    dropout=0.0,
    bidirectional=False).to(model_config.device)
# The decoder hidden size is doubled when the encoder is bidirectional.
if encoder.bidirectional:
    multiplier = 2
else:
    multiplier = 1
decoder = DecoderRNN(
    embedding_size=100,
    hidden_size=multiplier * 128,
    output_size=data['train'].output_lang.n_words,
    attention=False).to(model_config.device)
translation_model = TranslationModel(encoder, decoder, teacher_forcing_ratio=0.75).to(model_config.device)

In [10]:
# Adam over the trainable parameters of the rebuilt model, learning rate 1e-4.
optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, translation_model.parameters()), 1e-4)

In [12]:
# One pass over the training loader, evaluating on the default ('dev') split
# every 300 batches (including before the first update, at i == 0).
for i, batch in enumerate(tqdm_notebook(data_loaders['train'])):
    if i % 300 == 0:
        # NOTE(review): `loss` is bound to the evaluation loss here and then
        # rebound to the batch loss a few lines below.
        loss, bleu = evaluate(translation_model, data, data_loaders)
        print(loss, bleu)
        #evaluate(translation_model, data, data_loaders, dataset_type='train')
    optimizer.zero_grad()
    # The model returns (logits, loss); only the loss is used for the update.
    logits, loss = translation_model(batch)
    print(loss)
    loss.backward()
    # Clip the gradient norm of trainable parameters to model_config.grad_norm.
    clip_grad_norm_(filter(lambda p: p.requires_grad,
                               translation_model.parameters()), model_config.grad_norm)
    optimizer.step()

HBox(children=(IntProgress(value=0, max=8333), HTML(value='')))

10.674764895439148 7.798895887009536e-05
tensor(10.6922, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(10.6758, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(10.6658, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(10.6828, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(10.6810, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(10.6761, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(10.6533, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(10.6628, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(10.6760, device='cuda:0', grad_fn=<MeanBackward1>)
tensor(10.6547, device='cuda:0', grad_fn=<MeanBackward1>)


KeyboardInterrupt: 

In [45]:
# Repeat the greedy-decoding spot check on a fresh batch from the training
# loader: print the batch BLEU and the first three (translation, reference)
# pairs.
x = next(iter(data_loaders['train']))
original = output_to_translations(x['target'], data['train'])
translations = output_to_translations(translation_model.greedy(x), data['train'])
print(bleu_eval(original, translations))
print(list(zip(translations[0:3], original[0:3])))

0.03994552846659007
[('and we have to be .', 'now when your eye moves from eye to eye if there was something else there like a nose youd see a nose where an eye is supposed to be and youd go oh shit you know theres something wrong about this person .'), ('so we have to be .', 'he was able to do that so easily because he built it on top of facebook and facebook was built on top of the web and that was built on top of the internet and so on and so forth .'), ('so we have to be .', 'well weve seen already that quite some of you already ate insects maybe occasionally but i can tell you that every one of you is eating insects without any exception .')]


In [13]:
# Show the first ten reference strings.
# NOTE(review): no top-level assignment of `original_strings` is visible in
# this notebook (the only visible one is a local inside evaluate()), so this
# cell fails on a fresh kernel.
original_strings[0:10]

['baseball be later but of volumes all. hope. answer. only were',
 'be this. 65. know what must truth. i most',
 'games be practical everybody pins your needles all forward careful were',
 'i moment. it attack. be sorrows. ten.',
 'canadian be appointment careful laugh. no truth. something time were',
 'beautiful cupboard. hot. wine these',
 'much. but how bitter really dishonesty fed recently',
 'applied all else fed to climate husbands. matter. key',
 'but they answer. perfect. today. tape',
 'i is invented i situation getting powers your few']

In [14]:
# Show the first ten translated strings.
# NOTE(review): no top-level assignment of `translated_strings` is visible in
# this notebook (the only visible one is a local inside evaluate()), so this
# cell fails on a fresh kernel.
translated_strings[0:10]

['i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the']

In [48]:
# Inspect the shape of `decoder_input`.
# NOTE(review): `decoder_input` is not defined in any visible cell.
decoder_input.size()

torch.Size([16])

In [23]:
# Backpropagate through `total_loss`. The only visible binding of that name
# is inside the 25-epoch training loop, so this depends on leftover state.
total_loss.backward()

In [30]:
# Greedy-decode the batch `x` bound by an earlier cell.
predictions = translation_model.greedy(x)

In [215]:
# Convert each predicted id sequence into a list of words, stopping after the
# first EOS token (the EOS word itself is kept). The first position of every
# row is skipped -- presumably the SOS token; TODO confirm against greedy().
# The output language is read via attribute access, as everywhere else in
# this notebook (`data['train'].output_lang`).
index2word = data['train'].output_lang.index2word
# Keep one word list per row. Previously only the last row's `decoded_words`
# survived the loop.
decoded_sentences = []
for row in predictions.cpu().numpy():
    decoded_words = []
    for elem in row[1:]:
        decoded_words.append(index2word[elem])
        if elem == model_config.EOS_token:
            break
    decoded_sentences.append(decoded_words)

In [167]:
# Scratch: build a vector of 8 SOS tokens on the model device, then stack it
# with two copies of `topi.squeeze()` along dim=1.
# NOTE(review): `topi` is not defined in any visible cell.
yo = Variable(torch.LongTensor([model_config.SOS_token] * 8)).to(model_config.device)
yo = torch.stack((yo, topi.squeeze(), topi.squeeze()), dim=1)

In [99]:
# Scratch: backpropagate through whatever `total_loss` currently refers to
# in the kernel (same statement as an earlier scratch cell).
total_loss.backward()

In [57]:
# Scratch: masked loss for a single decoding position.
# `seq_range` holds the constant 2 once per entry of x['input_length'], so
# `mask` is true where the input length is greater than 2.
seq_range = torch.autograd.Variable(torch.LongTensor(np.repeat([2], len(x['input_length'])))).to(model_config.device)
mask = seq_range < x['input_length']
# Negated value of `decoder_output` gathered at index `input_var` along
# dim=1, zeroed where the mask is false.
# NOTE(review): `input_var` is not defined in any visible cell and
# `decoder_output` is only bound in a later cell. Presumably decoder_output
# holds log-probabilities -- confirm against DecoderRNN.
loss = -torch.gather(decoder_output, dim=1, index=input_var.unsqueeze(1)).squeeze() * mask.float()

In [66]:
# Sum of the loss entries divided by the number of strictly positive entries.
loss.sum() / torch.sum(loss > 0).float()

tensor(11.4193, device='cuda:0', grad_fn=<DivBackward1>)

In [63]:
# Number of strictly positive loss entries, as a NumPy value on the CPU.
torch.sum(loss > 0).cpu().numpy()

array(8)

In [None]:
# Run the encoder alone on batch `x`; it takes the inputs and their lengths
# and returns two values, unpacked as output and hidden state.
encoder_output, encoder_hidden = encoder(x['input'], x['input_length'])

In [290]:
# Initial attention context: None without attention, otherwise zeros of
# shape (encoder_output.size(0), 1, encoder_output.size(2)) on the model
# device.
context = None
if decoder.attention:
    context = Variable(torch.zeros(encoder_output.size(0), encoder_output.size(2))).unsqueeze(1).to(model_config.device)

In [291]:
# Call the decoder once with the encoder's hidden state and output plus the
# current context; it returns four values, unpacked below.
# NOTE(review): `input_var` is not defined in any visible cell.
decoder_output, decoder_hidden, context, weights = decoder(input_var, encoder_hidden, encoder_output, context)

In [16]:
def train_model(model, optimizer, train_loader, criterion):
    """Run one optimisation pass over ``train_loader`` and return the mean loss.

    For every batch the model is called on the whole batch, the loss is
    computed against ``batch['label']`` and one optimizer step is taken.

    Args:
        model: callable model exposing ``train()``.
        optimizer: object exposing ``zero_grad()`` and ``step()``.
        train_loader: iterable of batches with a ``dataset`` attribute that
            supports ``len()``; each batch is indexable by ``'label'``.
        criterion: callable ``criterion(outputs, labels)`` returning a loss
            with ``backward()`` and ``item()``.

    Returns:
        Sum over batches of the batch loss weighted by
        ``len(batch['label']) / len(train_loader.dataset)``.
    """
    model.train()
    weighted_losses = []
    for minibatch in train_loader:
        optimizer.zero_grad()
        predictions = model(minibatch)
        labels = minibatch['label']
        batch_loss = criterion(predictions, labels)
        batch_loss.backward()
        optimizer.step()
        # Weight each batch by its share of the dataset so that a smaller
        # final batch does not skew the average.
        weighted_losses.append(
            batch_loss.item() * len(labels) / len(train_loader.dataset))
    return sum(weighted_losses)

In [17]:
# Try the generic `train_model` helper on the translation model.
# NOTE(review): this cell raised KeyError: 'label'. train_model reads
# batch['label'], while the batches used elsewhere in this notebook are
# indexed with 'input', 'input_length' and 'target' -- presumably they have
# no 'label' key. `train_loader` is also not defined in any visible cell,
# and the unweighted criterion below rebinds the PAD-masked `criterion`
# defined earlier.
optimizer = torch.optim.Adam(translation_model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
train_model(translation_model, optimizer, train_loader, criterion)

KeyError: 'label'