In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to the local .py modules are picked up without a
# kernel restart.
%reload_ext autoreload
%autoreload 2

In [85]:
import torch
from torchtext import data, vocab

import time
import datetime
import logging
import sys

from utils import execute_and_time, get_device, itos
from preprocess import Batch, embedding_param
from model import Transformer
from optimize import get_default_optimizer, train

In [39]:
LOG_FILE = False
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
if LOG_FILE:
    file_handler = logging.FileHandler('log.out')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler) 
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
logger.setLevel(logging.INFO)

In [53]:
# --- Data locations (relative to the notebook's working directory) ---
DATA_PATH = 'data/'
SAMPLE_DATA_PATH = DATA_PATH + 'sample_data/'
PROCESSED_DATA_PATH = DATA_PATH + 'processed_data/'

# --- Embedding / model / batching settings ---
pre_trained_vector_type = 'glove.6B.200d'
stack_number, heads_number = 6, 8  # forwarded to the Transformer constructor
batch_size = 64

# Device is selected by the project helper.
device = get_device()


2019-01-17 14:45:00,083 INFO GPU unavailable, using CPU.


In [4]:
%%time
tokenizer = data.get_tokenizer('spacy')
TEXT = data.Field(tokenize=tokenizer, lower=True, eos_token='_eos_')
trn_data_fields = [("source", TEXT), ("target", TEXT)]
trn, vld = data.TabularDataset.splits(path=f'{SAMPLE_DATA_PATH}',
                           train='train_ds.csv', 
                           validation='valid_ds.csv',
                           format='csv', 
                           skip_header=True, 
                           fields=trn_data_fields)

CPU times: user 2min 10s, sys: 1.65 s, total: 2min 11s
Wall time: 2min 15s


In [51]:
# Build the vocabulary from the training split only and attach the requested
# pre-trained vectors to it.
TEXT.build_vocab(trn, vectors=pre_trained_vector_type)
vocabulary, vocab_size = TEXT.vocab, len(TEXT.vocab)

2019-01-17 14:44:23,668 INFO Loading vectors from .vector_cache/glove.6B.200d.txt.pt


In [108]:
# Create the train / validation bucket iterators (validation uses a batch 1.6x
# larger), then wrap each one in the project's Batch adapter, which is told
# the "source" / "target" field names and the vocabulary.
train_iter, val_iter = (
    Batch(bucket_iter, "source", "target", vocabulary)
    for bucket_iter in data.BucketIterator.splits(
        (trn, vld),
        batch_sizes=(batch_size, int(batch_size * 1.6)),
        sort_key=lambda x: len(x.source),
        sort_within_batch=False,
        shuffle=True,
        repeat=True,
        device=device,
    )
)

In [27]:
# Pull one batch from the training iterator and inspect its first example.
# Each batch is a 4-tuple: (source, target, source mask, target mask).
batch = next(train_iter)
print(type(batch), len(batch))
print(batch[0].size(), batch[1].size(), batch[2].size(), batch[3].size())

# Token tensors are sequence-first, (seq_len, batch): swap the axes and take
# row 0 to get the first example's token ids.
sample_source = batch[0].transpose(1,0)[0]
sample_target = batch[1].transpose(1,0)[0]
# The masks are batch-first, (batch, 1, ...). Transposing axes 0 and 1 and then
# taking [0] would only drop the singleton axis and return the masks of the
# whole batch, so index the batch axis directly to get the first example's mask.
sample_src_mask = batch[2][0]
sample_tgt_mask = batch[3][0]


print("source:\n%s \n\ncorresponding tensor:\n%s \n" %(itos(sample_source, vocabulary), sample_source))
print("target:\n%s \n\ncorresponding tensor:\n%s \n" %(itos(sample_target, vocabulary), sample_target))
print(sample_src_mask)
print(sample_tgt_mask)

<class 'tuple'> 4
torch.Size([62, 64]) torch.Size([18, 64]) torch.Size([64, 1, 1, 62]) torch.Size([64, 1, 18, 18])
source:
the new york times co. said thursday that its second - quarter profit from operations rose # . # percent , lifted by strong growth in advertising revenue at the company 's flagship newspaper . _eos_ <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

corresponding tensor:
tensor([   4,   27,  214,  603,  742,   16,   34,   19,   36,  124,   14,  244,
         331,   24,  993,  359,    3,    5,    3,   60,    6, 2095,   26,  467,
         445,    8, 2641, 1963,   18,    4,  151,   13, 7621,  618,    5,    2,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1]) 

target:
the new york times co. _eos_ <pad> <pad> <pad> <pad> <pad> <pad> <p

In [42]:
# Fetch the embedding matrix, its width and the padding index from the project
# helper (per its log output it also normalises the pre-trained vectors).
pre_trained_vector, embz_size, padding_idx = embedding_param(
    SAMPLE_DATA_PATH,
    TEXT,
    pre_trained_vector_type,
)

2019-01-17 14:33:23,020 INFO pre_trained_vector_mean = 0.002008178, pre_trained_vector_std = 0.43602833
2019-01-17 14:33:23,020 INFO Normalizing embeddings...
2019-01-17 14:33:23,169 INFO pre_trained_vector_mean = -1.2933846e-08, pre_trained_vector_std = 1.0000006


In [131]:
# Instantiate the model: embedding-related arguments first, then the
# stack / head counts from the config cell. Arguments stay positional because
# the constructor signature lives in model.py.
model = Transformer(
    embz_size, vocab_size, padding_idx, pre_trained_vector,
    stack_number, heads_number,
)

In [132]:
# Train the model with the project's default optimizer.
optimizer = get_default_optimizer(model)
# Target sequences are padded to the longest one in the batch. With the default
# NLLLoss every <pad> position is averaged into the loss, rewarding the model
# for predicting padding; ignore those positions instead.
criterion = torch.nn.NLLLoss(ignore_index=padding_idx)
# NOTE(review): the recorded run of this cell aborts inside the model with a
# mask shape mismatch ([64, 1, 17, 17] expanded to [64, 8, 17, 61]). It looks
# like the target mask is being applied to attention over the source sequence;
# that would need fixing in model.py, not here. TODO confirm.
train(model, train_iter, 5000, optimizer, criterion, print_every=100)

2019-01-20 10:44:13,923 INFO Start traning
torch.Size([64, 61, 200]) torch.Size([64, 61, 200]) torch.Size([64, 61, 200])
torch.Size([64, 61, 200]) torch.Size([64, 61, 200]) torch.Size([64, 61, 200])
torch.Size([64, 61, 200]) torch.Size([64, 61, 200]) torch.Size([64, 61, 200])
torch.Size([64, 61, 200]) torch.Size([64, 61, 200]) torch.Size([64, 61, 200])
torch.Size([64, 61, 200]) torch.Size([64, 61, 200]) torch.Size([64, 61, 200])
torch.Size([64, 61, 200]) torch.Size([64, 61, 200]) torch.Size([64, 61, 200])
torch.Size([64, 17, 200]) torch.Size([64, 17, 200]) torch.Size([64, 17, 200])
torch.Size([64, 17, 200]) torch.Size([64, 61, 200]) torch.Size([64, 61, 200])


RuntimeError: The expanded size of the tensor (61) must match the existing size (17) at non-singleton dimension 3.  Target sizes: [64, 8, 17, 61].  Tensor sizes: [64, 1, 17, 17]