# Testing code

# Importing stuff

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings

warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [2]:
import os
import os.path as osp
import random

import mlflow
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm_notebook

from nlpclass.config import model_config
from nlpclass.data.data_utils import TranslationDataset, text_collate_func
from nlpclass.models.evaluation_utils import bleu_eval, output_to_translations
from nlpclass.models.models import DecoderRNN, EncoderRNN, TranslationModel
from nlpclass.models.training_utils import load_data

In [3]:
CURRENT_PATH = os.getcwd()
DATA_DIR = osp.join(CURRENT_PATH, '..', 'data')
MODEL_DIR = osp.join(CURRENT_PATH, '..','models')

# Data

In [4]:
data, data_loaders, max_length = load_data('vi', batch_size=24)

Counting words...
Counted words:
eng 35150
vi 43278
Counting words...
Counted words:
eng 3561
vi 3677
Counting words...
Counted words:
eng 3360
vi 3517


In [15]:
encoder = EncoderRNN(
    data['train'].input_lang.n_words,
    embedding_size=128,
    hidden_size=128,
    num_layers=1,
    dropout=0.0,
    bidirectional=True).to(model_config.device)
if encoder.bidirectional:
    multiplier = 2
else:
    multiplier = 1
multiplier 
decoder = DecoderRNN(
    embedding_size=128,
    hidden_size=128 * multiplier,
    output_size=data['train'].output_lang.n_words,
    attention=True).to(model_config.device)
translation_model = TranslationModel(encoder, decoder, teacher_forcing_ratio=0.5).to(model_config.device)

In [7]:
x = next(iter(data_loaders['train']))
input_seq = x['input']
target_seq = x['target']
input_length = x['input_length']
target_length = x['target_length']

encoded_input, hidden = translation_model.encoder.forward(input_seq, input_length)

In [8]:
weight = torch.ones(translation_model.decoder.output_size).to(model_config.device)
weight[model_config.PAD_token] = 0
criterion = nn.CrossEntropyLoss(weight)

In [9]:
def calc_loss(logits, target, criterion):
    logits_flat = logits.view(-1, logits.size(-1))
    target_flat = target.view(-1, 1).squeeze()
    return criterion(logits_flat, target_flat)

In [10]:
x = next(iter(data_loaders['train']))

optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, translation_model.parameters()), 1e-2)

for i in range(100):
    optimizer.zero_grad()
    logits = translation_model(x)
    loss = calc_loss(logits, x['target'], criterion)
    print(loss)
    loss.backward()
    clip_grad_norm_(filter(lambda p: p.requires_grad,
                               translation_model.parameters()), model_config.grad_norm)
    optimizer.step()

tensor(10.8334, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.7296, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.2177, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.8561, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.5984, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.2933, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.8101, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.7192, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.4480, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.7836, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4524, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1647, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6222, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4378, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2779, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2172, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.3980, device='cuda:0', grad_fn=<NllLossBackwar

In [11]:
original = output_to_translations(x['target'], data['train'])
translations = output_to_translations(translation_model.greedy(x), data['train'])
bleu_eval(original, translations)

100.00000000000004

In [16]:
def evaluate(model, data, data_loaders, dataset_type='dev', max_batches=100):
    model.eval()
    with torch.no_grad():
        original_strings = []
        translated_strings = []
        for i, batch in enumerate(data_loaders[dataset_type]):
            if i > max_batches:
                break
            logits = translation_model(batch)
            epoch_loss = calc_loss(logits, batch['target'], criterion)
            original = output_to_translations(batch['target'], data['train'])
            translations = output_to_translations(model.greedy(batch), data['train'])
            original_strings.extend(original)
            translated_strings.extend(translations)
        bleu = bleu_eval(original_strings, translated_strings)
        model.train()
        print(epoch_loss)
        print(bleu)
        
        return original_strings, translated_strings

In [17]:
optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, translation_model.parameters()), 1e-4)
weight = torch.ones(translation_model.decoder.output_size).to(model_config.device)
weight[model_config.PAD_token] = 0
criterion = nn.CrossEntropyLoss(weight)

In [None]:
for i, batch in enumerate(tqdm_notebook(data_loaders['train'])):
    if i % 250 == 0:
        evaluate(translation_model, data, data_loaders)
        #evaluate(translation_model, data, data_loaders, dataset_type='train')
    optimizer.zero_grad()
    logits = translation_model(batch)
    loss = calc_loss(logits, batch['target'], criterion)
    loss.backward()
    clip_grad_norm_(filter(lambda p: p.requires_grad,
                               translation_model.parameters()), model_config.grad_norm)
    optimizer.step()

HBox(children=(IntProgress(value=0, max=5555), HTML(value='')))

tensor(10.8823, device='cuda:0')
6.206507731083603e-05
tensor(9.5364, device='cuda:0')
0.001002683570029698


In [122]:
original

[' quot gazelle i killed you for your skins exquisite touch for how easy it is to be nailed to a board weathered raw as white butcher paper .',
 'heres why once they become ubiquitous each year these vehicles will save tens of thousands of lives in the united states alone and a million globally .',
 'i was leaning against some sandbags one morning not much going on sort of spacing out and some sand was kicked into the side of sort of hit the side of my face .',
 'hi . im going to ask you to raise your arms and wave back just the way i am kind of a royal wave .',
 'and thus was born the project called remark which is a collaboration with zachary lieberman and the ars electronica futurelab .',
 'more complex structures such as blood vessels urethras which i showed you theyre definitely more complex because youre introducing two different cell types .',
 'theres an old african proverb that goes quot when spider webs unite they can halt even the lion . quot ',
 'and indeed at the top youll

In [123]:
translations

['and you can see the first of the middle of the middle of the middle of the middle of the middle of the middle of the middle of the middle of the middle of the united states and the next year .',
 'and the other people have to be able to be able to make the same time and the world and the most of the world is the same of the world and the world .',
 'and then i was a few years ago i was a lot of the first time i was a lot of the first time i was a lot of the time .',
 'and so i want to show you a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit .',
 'so the first thing is the first thing that is the first of the most of the most of the most of the most of the most of the most of the same time .',
 'and you can see the other people who are the same time and you can see the other and the people who are the same time and you can see the other .',
 'and

In [48]:
decoder_input.size()

torch.Size([16])

In [23]:
total_loss.backward()

In [30]:
predictions = translation_model.greedy(x)

In [215]:
for row in predictions.cpu().numpy():
    decoded_words = []
    for elem in row[1:]:
        decoded_words.append(data['train']['output_lang'].index2word[elem])
        if elem == model_config.EOS_token:
            break

In [167]:
yo = Variable(torch.LongTensor([model_config.SOS_token] * 8)).to(model_config.device)
yo = torch.stack((yo, topi.squeeze(), topi.squeeze()), dim=1)

In [99]:
total_loss.backward()

In [57]:
seq_range = torch.autograd.Variable(torch.LongTensor(np.repeat([2], len(x['input_length'])))).to(model_config.device)
mask = seq_range < x['input_length']
loss = -torch.gather(decoder_output, dim=1, index=input_var.unsqueeze(1)).squeeze() * mask.float()

In [66]:
loss.sum() / torch.sum(loss > 0).float()

tensor(11.4193, device='cuda:0', grad_fn=<DivBackward1>)

In [63]:
torch.sum(loss > 0).cpu().numpy()

array(8)

In [None]:
encoder_output, encoder_hidden = encoder(x['input'], x['input_length'])

In [290]:
context = None
if decoder.attention:
    context = Variable(torch.zeros(encoder_output.size(0), encoder_output.size(2))).unsqueeze(1).to(model_config.device)

In [291]:
decoder_output, decoder_hidden, context, weights = decoder(input_var, encoder_hidden, encoder_output, context)

In [16]:
def train_model(model, optimizer, train_loader, criterion):
    model.train()
    loss_train = 0
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(batch)
        loss = criterion(outputs, batch['label'])
        loss.backward()
        optimizer.step()
        loss_train += loss.item() * \
            len(batch['label']) / len(train_loader.dataset)
    return loss_train

In [17]:
optimizer = torch.optim.Adam(translation_model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
train_model(translation_model, optimizer, train_loader, criterion)

KeyError: 'label'