# Testing code

# Importing stuff

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings

warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [2]:
import os
import os.path as osp
import random

import mlflow
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm_notebook

from nlpclass.config import model_config
from nlpclass.data.data_utils import TranslationDataset, text_collate_func
from nlpclass.models.evaluation_utils import bleu_eval, output_to_translations
from nlpclass.models.models import DecoderRNN, EncoderRNN, TranslationModel
from nlpclass.models.training_utils import load_data

In [3]:
CURRENT_PATH = os.getcwd()
DATA_DIR = osp.join(CURRENT_PATH, '..', 'data')
MODEL_DIR = osp.join(CURRENT_PATH, '..','models')

# Data

In [37]:
data, data_loaders, max_length = load_data('vi', batch_size=24, subsample=0.025)

Counting words...
Counted words:
eng 6845
vi 7837
Counting words...
Counted words:
eng 3562
vi 3678
Counting words...
Counted words:
eng 3361
vi 3518


In [44]:
encoder = EncoderRNN(
    data['train'].input_lang.n_words,
    embedding_size=100,
    hidden_size=128,
    num_layers=1,
    dropout=0.0,
    bidirectional=False).to(model_config.device)
if encoder.bidirectional:
    multiplier = 2
else:
    multiplier = 1
decoder = DecoderRNN(
    embedding_size=100,
    hidden_size=multiplier * 128,
    output_size=data['train'].output_lang.n_words,
    attention=True).to(model_config.device)
translation_model = TranslationModel(encoder, decoder, teacher_forcing_ratio=0.5).to(model_config.device)

In [45]:
optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, translation_model.parameters()), 1e-3)

In [46]:
weight = torch.ones(translation_model.decoder.output_size).to(model_config.device)
weight[model_config.PAD_token] = 0
criterion = nn.CrossEntropyLoss(weight)

In [47]:
def calc_loss(logits, target, criterion):
    logits_flat = logits.view(-1, logits.size(-1))
    target_flat = target.view(-1, 1).squeeze()
    return criterion(logits_flat, target_flat)

In [58]:
for i in tqdm_notebook(range(25)):
    for batch in data_loaders['train']:
        optimizer.zero_grad()
        logits = translation_model(batch)
        loss = calc_loss(logits, batch['target'], criterion)
        loss.backward()
        
        encoder_norm = 0
        for p in translation_model.encoder.parameters():
            param_norm = p.grad.data.norm(2)
            encoder_norm += param_norm.item() ** 2
        decoder_norm = 0
        for p in translation_model.decoder.parameters():
            param_norm = p.grad.data.norm(2)
            decoder_norm += param_norm.item() ** 2
            
        print(encoder_norm, decoder_norm, loss.item())
            
        clip_grad_norm_(filter(lambda p: p.requires_grad,
                                   translation_model.parameters()), 5.0)
        
        optimizer.step() 
    original, translation = evaluate(translation_model, data, data_loaders, dataset_type='train')

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

0.5323623781369315 0.6384738324226595 2.9965109825134277
1.1833885286967296 1.5191947158592114 3.8476264476776123
0.68561794305979 0.9065427408113447 4.032967567443848
0.46524512080742364 0.7038301689602428 3.133288860321045
0.2857694633456527 0.5556603392750223 2.967956066131592
0.5252451296192282 0.9562742735142539 3.0776419639587402
0.5393170590917016 0.8682865410554442 4.061707973480225
0.4835519248407226 0.6842183907242092 4.356184959411621
0.382283094992247 0.7239963586947947 4.1839375495910645
0.36742259317705245 0.5610998070375346 3.0768773555755615
0.4825356135458308 0.7298022513616915 4.475253582000732
0.6575753538861309 1.2432645740388337 3.0507659912109375
0.3427396570294039 0.6678792110003401 2.951038360595703
0.4959792972914256 0.7361025113060172 4.112887382507324
0.8281186030710224 1.0656926001488236 4.351569175720215
0.6141591774033082 0.8825596063844623 4.045034408569336
0.2848792163121469 0.5977499334678613 3.125828981399536
0.6873011338192654 0.8287029860618442 4.345

0.6247771480622244 0.6934788565432377 2.83133864402771
0.43944450320885753 0.7344072003332729 2.9693639278411865
1.0512882646075308 0.910162458781122 4.209873676300049
0.5090807951583418 0.907170328156408 2.537632942199707
0.699422335586839 0.7134340619742768 4.214484214782715
0.5957761964333996 1.4336070764633215 2.9032492637634277
0.6698004986655177 0.8795180881652589 4.186452388763428
0.49874703299667156 0.848485822506524 4.096124172210693
0.6706856694014176 0.9513234775984956 2.846153497695923
0.4805135770250118 0.7531397036053026 4.403075218200684
1.427363143204746 2.6108600227471808 3.9640839099884033
0.3006225694618961 0.6640706166212265 2.8987221717834473
0.6805723535883719 1.0332440248070502 4.085095405578613
1.0786905701830025 1.0553690239140834 4.0126051902771
0.7566753699997233 0.9055880119852237 3.633864641189575
0.6297851197614472 0.8908084016318703 4.2562384605407715
0.564301406226783 1.0459455909679145 2.928145408630371
0.6196142748879843 0.8007102645402796 2.9764714241

0.45748864092789143 0.6490991554186081 3.991987705230713
0.6744506681675766 0.8391768846172488 3.8935320377349854
0.3329523772736357 0.5937351468626845 2.661522388458252
0.4330918374319828 0.7293801159098113 2.7151925563812256
0.667137200284494 0.7604635055074398 2.714444398880005
0.3635384685016004 0.5494986919405769 3.055238962173462
0.5159975964725743 0.7921314872403672 2.7560040950775146
1.1412543190667135 1.1019737990712197 3.600043535232544
0.45258519882153053 0.7261827707841276 3.8736484050750732
0.9102381566143021 0.9004061659916075 3.8764586448669434
0.3941067181424584 0.5612481715827526 2.992666721343994
1.1742457681398966 1.3649681621172403 4.426257610321045
0.8725237542662401 0.9224531739461024 3.213383436203003
0.4384569112776428 0.7157758174598485 2.7985386848449707
0.4086481074986443 0.5241085969204038 2.537703037261963
1.3549785272967512 1.113976907273681 3.9423794746398926
0.7137225164582768 0.7854206235627404 2.9170329570770264
0.7689130228161745 0.8677016192674148 3.

KeyboardInterrupt: 

In [67]:
x = next(iter(data_loaders['train']))
original = output_to_translations(x['target'], data['train'])
translations = output_to_translations(translation_model.greedy(x), data['train'])
print(bleu_eval(original, translations))
print(list(zip(translations[0:2], original[0:2])))

7.496940411158645
[('and he was very well and and i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot i quot quot and i quot i quot and i quot i quot and i quot i quot and i quot i quot and i quot i quot quot', 'hes very very thin and he is indeed very sick with pneumonia and hes too sick to talk to me so i talk to his daughter kathleen and i say to her quot did you and jim ever talk about what you would want done if he ended up in this kind of situation ? quot '), ('and as a i i i like a a of of my my i could minutes i and and and', 'and what that is is i imagine explaining a work of art to my grandmother in five minutes and if i cant explain it in five minutes then its too obtuse or esoteric and it hasnt been refined enough yet .')]


In [60]:
original = output_to_translations(x['target'], data['train'])
translations = output_to_translations(translation_model.beam_search(x), data['train'])
print(bleu_eval(original, translations))
print(translations)

0.002632711811722298
['and its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its its', 'and so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so', 'we just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just just ju

In [24]:
x = next(iter(data_loaders['train']))
input_seq = x['input']
target_seq = x['target']
input_length = x['input_length']
target_length = x['target_length']

encoded_input, hidden = translation_model.encoder.forward(input_seq, input_length)

In [25]:
encoded_input[0]

tensor([[-0.6125, -0.5094,  0.1136,  ...,  0.2936, -0.0332, -0.5566],
        [-0.2966, -0.5157,  0.0630,  ...,  0.6734,  0.0263, -0.9648],
        [ 0.3257, -1.1140,  0.3586,  ...,  0.7310, -0.7097, -0.5748],
        ...,
        [-0.3053, -0.1229,  0.2143,  ..., -0.5936, -0.5418, -0.0655],
        [-0.2868, -0.2641,  0.6324,  ..., -0.5235, -0.4746, -0.1487],
        [-0.0230, -0.0655,  0.1169,  ..., -0.4210,  0.3063,  0.4462]],
       device='cuda:0', grad_fn=<SelectBackward>)

In [26]:
x = next(iter(data_loaders['train']))

optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, translation_model.parameters()), 1e-2)

for i in range(100):
    optimizer.zero_grad()
    logits = translation_model(x)
    loss = calc_loss(logits, x['target'], criterion)
    print(loss)
    loss.backward()
    translation_model.encoder.embedding.weight.grad[data['train'].input_lang.pretrained_inds] = 0
    translation_model.decoder.embedding.weight.grad[data['train'].output_lang.pretrained_inds] = 0
    clip_grad_norm_(filter(lambda p: p.requires_grad,
                               translation_model.parameters()), model_config.grad_norm)
    optimizer.step()

tensor(10.8560, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(9.9659, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.5004, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.8864, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.6769, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.1789, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.1030, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.1107, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.3734, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.3272, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.8524, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.8219, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.5111, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.5380, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.2955, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.0173, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.1326, device='cuda:0', grad_fn=<NllLossBackwar

In [43]:
def evaluate(model, data, data_loaders, dataset_type='dev', max_batches=100):
    model.eval()
    with torch.no_grad():
        original_strings = []
        translated_strings = []
        for i, batch in enumerate(data_loaders[dataset_type]):
            if i > max_batches:
                break
            logits = translation_model(batch)
            epoch_loss = calc_loss(logits, batch['target'], criterion)
            original = output_to_translations(batch['target'], data['train'])
            translations = output_to_translations(model.greedy(batch), data['train'])
            original_strings.extend(original)
            translated_strings.extend(translations)
        bleu = bleu_eval(original_strings, translated_strings)
        model.train()
        print(epoch_loss)
        print(bleu)
        
        return original_strings, translated_strings

In [29]:
optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, translation_model.parameters()), 1e-3)
weight = torch.ones(translation_model.decoder.output_size).to(model_config.device)
weight[model_config.PAD_token] = 0
criterion = nn.CrossEntropyLoss(weight)

In [30]:
for i, batch in enumerate(tqdm_notebook(data_loaders['train'])):
    if i % 500 == 0:
        evaluate(translation_model, data, data_loaders)
        #evaluate(translation_model, data, data_loaders, dataset_type='train')
    optimizer.zero_grad()
    logits = translation_model(batch)
    loss = calc_loss(logits, batch['target'], criterion)
    loss.backward()
    translation_model.encoder.embedding.weight.grad[data['train'].input_lang.pretrained_inds] = 0
    translation_model.decoder.embedding.weight.grad[data['train'].output_lang.pretrained_inds] = 0
    clip_grad_norm_(filter(lambda p: p.requires_grad,
                               translation_model.parameters()), model_config.grad_norm)
    optimizer.step()

HBox(children=(IntProgress(value=0, max=1389), HTML(value='')))

tensor(19.1450, device='cuda:0')
0.0008142795832761162
tensor(9.0464, device='cuda:0')
0.0033784249080402885


KeyboardInterrupt: 

In [None]:
original_strings, translated_strings = evaluate(translation_model, data, data_loaders)

In [13]:
original_strings[0:10]

['baseball be later but of volumes all. hope. answer. only were',
 'be this. 65. know what must truth. i most',
 'games be practical everybody pins your needles all forward careful were',
 'i moment. it attack. be sorrows. ten.',
 'canadian be appointment careful laugh. no truth. something time were',
 'beautiful cupboard. hot. wine these',
 'much. but how bitter really dishonesty fed recently',
 'applied all else fed to climate husbands. matter. key',
 'but they answer. perfect. today. tape',
 'i is invented i situation getting powers your few']

In [14]:
translated_strings[0:10]

['i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the']

In [48]:
decoder_input.size()

torch.Size([16])

In [23]:
total_loss.backward()

In [30]:
predictions = translation_model.greedy(x)

In [215]:
for row in predictions.cpu().numpy():
    decoded_words = []
    for elem in row[1:]:
        decoded_words.append(data['train']['output_lang'].index2word[elem])
        if elem == model_config.EOS_token:
            break

In [167]:
yo = Variable(torch.LongTensor([model_config.SOS_token] * 8)).to(model_config.device)
yo = torch.stack((yo, topi.squeeze(), topi.squeeze()), dim=1)

In [99]:
total_loss.backward()

In [57]:
seq_range = torch.autograd.Variable(torch.LongTensor(np.repeat([2], len(x['input_length'])))).to(model_config.device)
mask = seq_range < x['input_length']
loss = -torch.gather(decoder_output, dim=1, index=input_var.unsqueeze(1)).squeeze() * mask.float()

In [66]:
loss.sum() / torch.sum(loss > 0).float()

tensor(11.4193, device='cuda:0', grad_fn=<DivBackward1>)

In [63]:
torch.sum(loss > 0).cpu().numpy()

array(8)

In [None]:
encoder_output, encoder_hidden = encoder(x['input'], x['input_length'])

In [290]:
context = None
if decoder.attention:
    context = Variable(torch.zeros(encoder_output.size(0), encoder_output.size(2))).unsqueeze(1).to(model_config.device)

In [291]:
decoder_output, decoder_hidden, context, weights = decoder(input_var, encoder_hidden, encoder_output, context)

In [16]:
def train_model(model, optimizer, train_loader, criterion):
    model.train()
    loss_train = 0
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(batch)
        loss = criterion(outputs, batch['label'])
        loss.backward()
        optimizer.step()
        loss_train += loss.item() * \
            len(batch['label']) / len(train_loader.dataset)
    return loss_train

In [17]:
optimizer = torch.optim.Adam(translation_model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
train_model(translation_model, optimizer, train_loader, criterion)

KeyError: 'label'