# Testing code

# Imports and setup

In [33]:
# Notebook setup: enable autoreload (mode 2) so edits to imported modules are
# picked up live, and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings

# Suppress warnings whose message starts with these numpy binary-size notices.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
import os
import os.path as osp
import random

import mlflow
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm_notebook

from nlpclass.config import model_config
from nlpclass.data.data_utils import TranslationDataset, text_collate_func
from nlpclass.models.evaluation_utils import bleu_eval, output_to_translations
from nlpclass.models.models import DecoderRNN, EncoderCNN, EncoderRNN, TranslationModel
from nlpclass.models.training_utils import load_data

In [35]:
# Data and model directories live one level above the notebook's working directory.
CURRENT_PATH = os.getcwd()
DATA_DIR, MODEL_DIR = (
    osp.join(CURRENT_PATH, '..', leaf) for leaf in ('data', 'models')
)

# Data

In [36]:
# Load the 'vi' data with batch size 24 and subsample=0.5. The call returns a
# mapping of datasets (indexed by split, e.g. data['train']), the matching data
# loaders, and a max-length value.
# NOTE(review): the exact meaning of `subsample` is defined in
# nlpclass.models.training_utils — confirm before changing it.
data, data_loaders, max_length = load_data('vi', batch_size=24, subsample=0.5)

Counting words...
Counted words:
eng 27052
vi 33338
Counting words...
Counted words:
eng 3562
vi 3678
Counting words...
Counted words:
eng 3361
vi 3518


In [20]:
# Alternative model: CNN encoder paired with a decoder built with attention=False.
# NOTE(review): the following cell rebinds `encoder`, `decoder` and
# `translation_model`, so running both cells in order leaves the RNN variant active.
encoder = EncoderCNN(
    data['train'].input_lang.n_words,
    embedding_size=100,
    hidden_size=128,
    num_layers=2).to(model_config.device)
decoder = DecoderRNN(
    embedding_size=100,
    hidden_size=128,
    output_size=data['train'].output_lang.n_words,
    attention=False).to(model_config.device)
translation_model = TranslationModel(encoder, decoder, teacher_forcing_ratio=0.5).to(model_config.device)

In [37]:
# Bidirectional RNN encoder feeding a decoder built with attention=True.
encoder = EncoderRNN(
    data['train'].input_lang.n_words,
    embedding_size=100,
    hidden_size=128,
    num_layers=2,
    dropout=0.0,
    bidirectional=True,
).to(model_config.device)

# The decoder hidden size is doubled when the encoder is bidirectional.
multiplier = 2 if encoder.bidirectional else 1

decoder = DecoderRNN(
    embedding_size=100,
    hidden_size=multiplier * 128,
    output_size=data['train'].output_lang.n_words,
    attention=True,
).to(model_config.device)

translation_model = TranslationModel(
    encoder,
    decoder,
    teacher_forcing_ratio=0.5,
).to(model_config.device)

In [38]:
# Adam over the parameters that require gradients, learning rate 1e-3.
optimizer = torch.optim.Adam(
    (p for p in translation_model.parameters() if p.requires_grad),
    lr=1e-3,
)

In [23]:
# Per-class loss weights: all ones, with the padding index zeroed so that
# positions whose target is PAD contribute nothing to the loss.
weight = torch.ones(
    translation_model.decoder.output_size,
    device=model_config.device,
)
weight[model_config.PAD_token] = 0
criterion = nn.CrossEntropyLoss(weight=weight)

In [24]:
def calc_loss(logits, target, criterion):
    """Flatten ``logits`` and ``target`` and apply ``criterion`` to them.

    Args:
        logits: Tensor whose last dimension indexes the classes; all leading
            dimensions are merged into a single one.
        target: Tensor of class indices with one entry per merged logits row.
        criterion: Callable taking ``(flat_logits, flat_target)`` and
            returning the loss (e.g. ``nn.CrossEntropyLoss``).

    Returns:
        Whatever ``criterion`` returns for the flattened inputs.
    """
    # reshape() rather than view(): it also accepts non-contiguous tensors
    # (e.g. the result of a transpose), where view() raises.
    logits_flat = logits.reshape(-1, logits.size(-1))
    # reshape(-1) always yields a 1-D tensor. The previous
    # view(-1, 1).squeeze() collapsed a single-element target to 0-dim,
    # which no longer matches the 2-D logits.
    target_flat = target.reshape(-1)
    return criterion(logits_flat, target_flat)

In [44]:
def evaluate(model, data, data_loaders, dataset_type='dev', max_batch=100, greedy=True):
    """Compute the mean loss and BLEU of ``model`` on up to ``max_batch`` batches.

    Args:
        model: Module that returns ``(logits, loss)`` when called on a batch
            and exposes ``greedy(batch)`` and ``beam_search(batch)``.
        data: Mapping of datasets; ``data['train']`` is passed to
            ``output_to_translations`` for decoding.
        data_loaders: Mapping from split name to an iterable of batches.
        dataset_type: Key of the split in ``data_loaders`` to evaluate.
        max_batch: Maximum number of batches to evaluate.
        greedy: Decode with ``model.greedy`` when true, otherwise with
            ``model.beam_search``.

    Returns:
        Tuple ``(mean_loss, bleu)``: the loss averaged over the evaluated
        batches and the value returned by ``bleu_eval``.

    Raises:
        ValueError: If no batch was evaluated (empty loader or
            ``max_batch <= 0``).
    """
    # Remember the mode so evaluation does not silently flip a model that the
    # caller had put in eval mode.
    was_training = model.training
    model.eval()
    epoch_loss = 0
    num_batches = 0
    original_strings = []
    translated_strings = []
    try:
        with torch.no_grad():
            for batch in data_loaders[dataset_type]:
                # Stop once exactly `max_batch` batches have been processed.
                if num_batches >= max_batch:
                    break
                _, loss = model(batch)
                epoch_loss += loss.item()
                num_batches += 1
                original_strings.extend(
                    output_to_translations(batch['target'], data['train']))
                decoded = model.greedy(batch) if greedy else model.beam_search(batch)
                translated_strings.extend(
                    output_to_translations(decoded, data['train']))
            if num_batches == 0:
                raise ValueError(
                    "evaluate() processed no batches for dataset_type=%r"
                    % (dataset_type,))
            bleu = bleu_eval(original_strings, translated_strings)
    finally:
        # Restore the mode even if decoding or scoring raised.
        model.train(was_training)

    # Divide by the number of batches that actually contributed to the sum.
    return epoch_loss / num_batches, bleu

In [29]:
# Train for 25 epochs with gradient clipping at 5.0, then score each epoch on
# (a capped number of batches of) the training split.
for epoch in tqdm_notebook(range(25)):
    translation_model.train()
    for batch in data_loaders['train']:
        optimizer.zero_grad()
        logits, total_loss = translation_model(batch)
        # Debug comparison: the loss reported by the model next to the loss
        # recomputed from the logits with the PAD-weighted criterion.
        loss = calc_loss(logits, batch['target'], criterion)
        print(total_loss, loss)
        total_loss.backward()

        # The per-module gradient-norm accumulation that used to live here was
        # removed: its results were never used, and it raised on any parameter
        # whose `.grad` was still None.
        clip_grad_norm_(filter(lambda p: p.requires_grad,
                               translation_model.parameters()), 5.0)

        optimizer.step()
    # evaluate() returns (mean loss, BLEU); name and report them as such.
    train_loss, train_bleu = evaluate(
        translation_model, data, data_loaders, dataset_type='train')
    print(train_loss, train_bleu)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

tensor(6.0760, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.3035, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.9517, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.2421, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.2301, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.4688, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.9269, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.4204, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.8331, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.0770, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.3624, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.4277, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.2147, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.4235, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.1306, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.3325, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.1396, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.4288, 

tensor(6.3096, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.7170, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.0556, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.2055, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.9899, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.2137, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.6201, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.0113, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.7014, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.0326, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.6560, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.8294, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.4117, device='cuda:0', grad_fn=<MeanBackward1>) tensor(6.5302, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.6079, device='cuda:0', grad_fn=<MeanBackward1>) tensor(5.9382, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.3249, device='cuda:0', grad_fn=<MeanBackward1>) tensor(5.7270, 

KeyboardInterrupt: 

In [30]:
# Greedy-decode one training batch and compare it with the reference targets.
x = next(iter(data_loaders['train']))
original = output_to_translations(x['target'], data['train'])
greedy_output = translation_model.greedy(x)
translations = output_to_translations(greedy_output, data['train'])
print(bleu_eval(original, translations))
# Show the first three (hypothesis, reference) pairs.
print(list(zip(translations[:3], original[:3])))

0.0962950180861046
[('but but but i was to the the the to the the the to the the the to the the the to the and the to the and the to the and the to the and the to the and the to the and the and the to the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and .', 'but the upshot of this is that the mission i want psychology to have in addition to its mission of curing the mentally ill and in addition to its mission of making miserable people less miserable is can psychology actually make people happier ?'), ('and you you to to the the the to the the the and we the the to the the the and the and the and the and .', 'see mr. hunter is doing that because he says his time has messed up a lot and hes trying to tell us how to fix that problem .'), ('and the the the the the the the the the the the the the the the the the the the the the the the the the the .', 'and in fact university administrators are a little uncom

In [31]:
# Set the model's `beam_size` attribute to 3.
# NOTE(review): presumably read by `translation_model.beam_search` — confirm in
# nlpclass.models.models.
translation_model.beam_size = 3

In [32]:
# Beam-search-decode the batch `x` fetched in an earlier cell and compare it
# with the reference targets.
original = output_to_translations(x['target'], data['train'])
beam_output = translation_model.beam_search(x)
translations = output_to_translations(beam_output, data['train'])
print(bleu_eval(original, translations))
# Show the first three (hypothesis, reference) pairs.
print(list(zip(translations[:3], original[:3])))

0.0862594017124614
[('but but but i was to the the to to the the to the the to the the to the the to the the to the the the to the and the to the and the to to the the the and and i was to the to the and the to the and the to the and the to the and the to the and the to the and the to the and the to the and the to the and the to the and the to the and the and .', 'but the upshot of this is that the mission i want psychology to have in addition to its mission of curing the mentally ill and in addition to its mission of making miserable people less miserable is can psychology actually make people happier ?'), ('and you you you to the to the to the to the to the to the to the the the and the and .', 'see mr. hunter is doing that because he says his time has messed up a lot and hes trying to tell us how to fix that problem .'), ('and the the the of the the the of the of the the the the the the the the the the .', 'and in fact university administrators are a little uncomfortable about the i

In [116]:
# Sanity check: repeatedly fit a single training batch and watch the loss fall.
x = next(iter(data_loaders['train']))

optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, translation_model.parameters()), 1e-2)

for step in range(100):
    optimizer.zero_grad()
    # The model returns (logits, loss), as every other call site in this
    # notebook unpacks it. Passing the whole tuple to calc_loss would fail.
    logits, model_loss = translation_model(x)
    loss = calc_loss(logits, x['target'], criterion)
    print(loss)
    loss.backward()
    clip_grad_norm_(filter(lambda p: p.requires_grad,
                           translation_model.parameters()), model_config.grad_norm)
    optimizer.step()

tensor(8.9509, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.8092, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.1293, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.5016, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.6784, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.4387, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.6025, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.5010, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3718, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.2881, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.2512, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.0624, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.7472, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.9508, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.6674, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.8056, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.7100, device='cuda:0', grad_fn=<NllLossBackward

In [45]:
# Fresh Adam optimizer (learning rate 1e-3) over the parameters that require gradients.
optimizer = torch.optim.Adam(
    (p for p in translation_model.parameters() if p.requires_grad),
    lr=1e-3,
)

In [None]:
# One pass over the training loader, reporting evaluate()'s (loss, BLEU) on its
# default split every 300 steps.
for i, batch in enumerate(tqdm_notebook(data_loaders['train'])):
    if not i % 300:
        loss, bleu = evaluate(translation_model, data, data_loaders)
        print(loss, bleu)

    optimizer.zero_grad()
    logits, loss = translation_model(batch)
    loss.backward()
    # Clip the gradients of the trainable parameters before stepping.
    clip_grad_norm_(
        (p for p in translation_model.parameters() if p.requires_grad),
        model_config.grad_norm,
    )
    optimizer.step()

HBox(children=(IntProgress(value=0, max=2778), HTML(value='')))

10.087496595562628 0.00037814385791066975


In [None]:
# NOTE(review): evaluate() as defined in this notebook returns
# (mean loss, BLEU), so despite their names these two variables hold numbers,
# not lists of strings. The recorded outputs below appear to come from an
# earlier version of evaluate() — confirm and rename or re-run.
original_strings, translated_strings = evaluate(translation_model, data, data_loaders)

In [13]:
# Display the first ten entries of `original_strings`.
# NOTE(review): slicing only works if `original_strings` is a sequence; with the
# evaluate() defined in this notebook it would be a number — confirm.
original_strings[0:10]

['baseball be later but of volumes all. hope. answer. only were',
 'be this. 65. know what must truth. i most',
 'games be practical everybody pins your needles all forward careful were',
 'i moment. it attack. be sorrows. ten.',
 'canadian be appointment careful laugh. no truth. something time were',
 'beautiful cupboard. hot. wine these',
 'much. but how bitter really dishonesty fed recently',
 'applied all else fed to climate husbands. matter. key',
 'but they answer. perfect. today. tape',
 'i is invented i situation getting powers your few']

In [14]:
# Display the first ten entries of `translated_strings`.
# NOTE(review): slicing only works if `translated_strings` is a sequence; with
# the evaluate() defined in this notebook it would be a number — confirm.
translated_strings[0:10]

['i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the',
 'i you you to the']

In [48]:
# NOTE(review): `decoder_input` is not assigned in any cell visible in this
# notebook; this looks like leftover kernel state — confirm or remove.
decoder_input.size()

torch.Size([16])

In [23]:
# NOTE(review): scratch cell; relies on `total_loss` left over from a training
# cell — confirm it is still needed.
total_loss.backward()

In [30]:
# Greedy-decode the batch `x` fetched in an earlier cell.
predictions = translation_model.greedy(x)

In [215]:
# Map each predicted index row to words, stopping after the EOS token.
# `output_lang` is read as an attribute of the dataset, matching how the
# model-setup cells access it (data['train'].output_lang).
output_lang = data['train'].output_lang
decoded_sentences = []
for row in predictions.cpu().numpy():
    decoded_words = []
    # row[1:] skips the first position (presumably the start token — confirm).
    for elem in row[1:]:
        decoded_words.append(output_lang.index2word[elem])
        if elem == model_config.EOS_token:
            break
    # Keep each decoded row; previously the word list was rebuilt and discarded.
    decoded_sentences.append(decoded_words)

In [167]:
# Build a vector of 8 SOS tokens and stack it column-wise with two copies of
# `topi.squeeze()`.
# NOTE(review): `topi` is not assigned in any visible cell, and the literal 8
# presumably matched the batch size at the time — confirm or remove.
yo = Variable(torch.LongTensor([model_config.SOS_token] * 8)).to(model_config.device)
yo = torch.stack((yo, topi.squeeze(), topi.squeeze()), dim=1)

In [99]:
# NOTE(review): scratch cell; relies on `total_loss` left over from a training
# cell — confirm it is still needed.
total_loss.backward()

In [57]:
# Vector holding the constant 2, one entry per element of x['input_length'].
seq_range = torch.autograd.Variable(torch.LongTensor(np.repeat([2], len(x['input_length'])))).to(model_config.device)
# True where position 2 is below the corresponding input length.
mask = seq_range < x['input_length']
# Negated decoder output gathered at the `input_var` indices, zeroed where the mask is False.
# NOTE(review): `input_var` is not assigned in any visible cell and
# `decoder_output` is only assigned in a later cell; presumably `decoder_output`
# holds log-probabilities — confirm.
loss = -torch.gather(decoder_output, dim=1, index=input_var.unsqueeze(1)).squeeze() * mask.float()

In [66]:
# Sum of `loss` divided by the number of strictly positive entries.
loss.sum() / torch.sum(loss > 0).float()

tensor(11.4193, device='cuda:0', grad_fn=<DivBackward1>)

In [63]:
# Number of strictly positive entries in `loss`, as a numpy value.
torch.sum(loss > 0).cpu().numpy()

array(8)

In [None]:
# Run the encoder on batch `x`; it returns the encoder output and hidden state.
encoder_output, encoder_hidden = encoder(x['input'], x['input_length'])

In [290]:
# Initial attention context: None without attention, otherwise zeros with shape
# (encoder_output.size(0), 1, encoder_output.size(2)) on the configured device.
context = None
if decoder.attention:
    context = Variable(torch.zeros(encoder_output.size(0), encoder_output.size(2))).unsqueeze(1).to(model_config.device)

In [291]:
# Single decoder call; it returns the output, new hidden state, new context and
# attention weights.
# NOTE(review): `input_var` is not assigned in any visible cell — confirm.
decoder_output, decoder_hidden, context, weights = decoder(input_var, encoder_hidden, encoder_output, context)

In [16]:
def train_model(model, optimizer, train_loader, criterion):
    """Run one optimisation pass over ``train_loader``.

    For every batch the model is called on the whole batch, the loss is
    computed against ``batch['label']``, and one optimizer step is taken.

    Returns:
        The sum of batch losses, each weighted by
        ``len(batch['label']) / len(train_loader.dataset)``.
    """
    model.train()
    accumulated = 0
    for batch in train_loader:
        optimizer.zero_grad()
        predictions = model(batch)
        labels = batch['label']
        batch_loss = criterion(predictions, labels)
        batch_loss.backward()
        optimizer.step()
        # Weight each batch by its share of the dataset so that a smaller
        # final batch does not skew the average.
        accumulated += batch_loss.item() * len(labels) / len(train_loader.dataset)
    return accumulated

In [17]:
# NOTE(review): this cell fails with the KeyError recorded below. train_model()
# reads batch['label'], whereas the translation batches in this notebook are
# accessed via 'input', 'input_length' and 'target'. `train_loader` is also not
# assigned in any visible cell — confirm whether this cell should be removed.
optimizer = torch.optim.Adam(translation_model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
train_model(translation_model, optimizer, train_loader, criterion)

KeyError: 'label'