In [1]:
import os
os.chdir('..')

%load_ext autoreload
%autoreload 2

In [2]:
import datetime

import torch

import torch.nn as nn

from src.consts import *
from src.main import main, setup_torch, get_corpus
from src.model import RNNModel
from src.training import train, evaluate
from src.split_cross_entropy_loss import SplitCrossEntropyLoss
from src.utils import summary, check_cuda_mem

In [3]:
# Notebook-local switch: when True, the model built below is wrapped in
# CustomDataParallel even if USE_DATA_PARALLELIZATION is off.
# NOTE(review): the name is misspelled ("paralellization"); it is kept as-is
# because the model-construction cell refers to it by this exact spelling.
use_data_paralellization = False

In [4]:
# Prepare everything the training cell needs: project-level torch setup, the
# target device, the corpus, the language model and its loss function.
setup_torch()
device = torch.device("cuda" if USE_CUDA else "cpu")
corpus = get_corpus()
ntokens = len(corpus.dictionary)

# Sanity check: the largest token id in each split must be smaller than the
# vocabulary size, which is also passed to RNNModel below.
for split_name in ("train", "valid", "test"):
    assert getattr(corpus, split_name).max() < ntokens, \
        "%s split contains a token id outside the dictionary" % split_name

model = RNNModel(MODEL_TYPE, ntokens, EMBEDDINGS_SIZE, HIDDEN_UNIT_COUNT, LAYER_COUNT, DROPOUT_PROB,
                 TIED)

# Move the parameters to the target device in BOTH branches. Previously this
# only happened when data parallelism was off, leaving the wrapped model on the
# CPU; torch.nn.DataParallel expects the module on its first device before
# forward is called.
model.to(device)
if use_data_paralellization or USE_DATA_PARALLELIZATION:
    # NOTE(review): CustomDataParallel is not imported by name in this notebook;
    # presumably it arrives through `from src.consts import *` — confirm, or
    # this branch raises NameError. It is also assumed to behave like
    # torch.nn.DataParallel — verify.
    model = CustomDataParallel(model)
criterion = nn.CrossEntropyLoss()

# Print the model and its parameter shapes.
summary(model, criterion)

RNNModel(
  (drop): Dropout(p=0.2)
  (encoder): Embedding(33281, 200)
  (rnn): LSTM(200, 200, num_layers=2, dropout=0.2)
  (decoder): Linear(in_features=200, out_features=33281, bias=True)
)

encoder.weight torch.Size([33281, 200])
rnn.weight_ih_l0 torch.Size([800, 200])
rnn.weight_hh_l0 torch.Size([800, 200])
rnn.bias_ih_l0 torch.Size([800])
rnn.bias_hh_l0 torch.Size([800])
rnn.weight_ih_l1 torch.Size([800, 200])
rnn.weight_hh_l1 torch.Size([800, 200])
rnn.bias_ih_l1 torch.Size([800])
rnn.bias_hh_l1 torch.Size([800])
decoder.weight torch.Size([33281, 200])
decoder.bias torch.Size([33281])

Total Parameters: 7,332,681


In [5]:
# Start training with the model, corpus, loss and device built above.
# NOTE(review): the 4th (None) and 6th (False) positional arguments are not
# self-describing; pass them by keyword once train()'s signature is confirmed.
train(model, corpus, criterion, None, device, False)

INFO 2019-06-15 17:55:50,363: | epoch   1 |   200/ 2965 batches | lr 20.00 | ms/batch 11.33 | loss  7.63 | ppl  2062.17
INFO 2019-06-15 17:55:52,578: | epoch   1 |   400/ 2965 batches | lr 20.00 | ms/batch 11.07 | loss  6.84 | ppl   933.64
INFO 2019-06-15 17:55:54,792: | epoch   1 |   600/ 2965 batches | lr 20.00 | ms/batch 11.06 | loss  6.41 | ppl   605.24
INFO 2019-06-15 17:55:57,006: | epoch   1 |   800/ 2965 batches | lr 20.00 | ms/batch 11.07 | loss  6.23 | ppl   508.93
INFO 2019-06-15 17:55:59,224: | epoch   1 |  1000/ 2965 batches | lr 20.00 | ms/batch 11.09 | loss  6.09 | ppl   439.64
INFO 2019-06-15 17:56:01,445: | epoch   1 |  1200/ 2965 batches | lr 20.00 | ms/batch 11.10 | loss  6.00 | ppl   405.33
INFO 2019-06-15 17:56:03,662: | epoch   1 |  1400/ 2965 batches | lr 20.00 | ms/batch 11.08 | loss  5.89 | ppl   360.80
INFO 2019-06-15 17:56:05,883: | epoch   1 |  1600/ 2965 batches | lr 20.00 | ms/batch 11.10 | loss  5.89 | ppl   361.04
INFO 2019-06-15 17:56:08,101: | epoch   

In [5]:
# NOTE(review): this cell repeats the preceding train(...) call verbatim and
# carries the same execution count (In [5]); it looks like a leftover from an
# earlier run. Confirm whether training is really meant to be launched twice
# on the same model, and remove one of the two cells otherwise.
train(model, corpus, criterion, None, device, False)

INFO 2019-06-15 17:15:34,289: | epoch   1 |   200/ 2965 batches | lr 20.00 | ms/batch 11.46 | loss  7.62 | ppl  2047.74
INFO 2019-06-15 17:15:36,511: | epoch   1 |   400/ 2965 batches | lr 20.00 | ms/batch 11.11 | loss  6.81 | ppl   907.40
INFO 2019-06-15 17:15:38,725: | epoch   1 |   600/ 2965 batches | lr 20.00 | ms/batch 11.07 | loss  6.41 | ppl   605.91
INFO 2019-06-15 17:15:40,939: | epoch   1 |   800/ 2965 batches | lr 20.00 | ms/batch 11.07 | loss  6.24 | ppl   512.92
INFO 2019-06-15 17:15:43,161: | epoch   1 |  1000/ 2965 batches | lr 20.00 | ms/batch 11.11 | loss  6.09 | ppl   441.40
INFO 2019-06-15 17:15:45,384: | epoch   1 |  1200/ 2965 batches | lr 20.00 | ms/batch 11.11 | loss  6.00 | ppl   404.50
INFO 2019-06-15 17:15:47,606: | epoch   1 |  1400/ 2965 batches | lr 20.00 | ms/batch 11.11 | loss  5.88 | ppl   359.02
INFO 2019-06-15 17:15:49,828: | epoch   1 |  1600/ 2965 batches | lr 20.00 | ms/batch 11.11 | loss  5.89 | ppl   362.11
INFO 2019-06-15 17:15:52,051: | epoch   