In [1]:
import os

# Move the working directory up to the project root so the `src` package used by the
# imports below can be found. The step is guarded by the presence of `src/` so that
# re-running this cell does not climb one more directory level on every execution.
if not os.path.isdir('src'):
    os.chdir('..')

# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell is executed, so edits to the src/ package take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
import datetime

import torch

import torch.nn as nn

from src.consts import *
from src.main import main, setup_torch, get_corpus
from src.model import RNNModel
from src.training import train, evaluate
from src.split_cross_entropy_loss import SplitCrossEntropyLoss
from src.parallel import DataParallelCriterion
from src.custom_data_parallel import CustomDataParallel
from src.utils import summary, check_cuda_mem

In [4]:
# Notebook-level switch for wrapping the model and criterion in the data-parallel helpers;
# it is OR-ed with USE_DATA_PARALLELIZATION when the model is built below.
use_data_paralellization = True

In [5]:
# Initialise torch through the project helper and pick the device from USE_CUDA.
setup_torch()
# torch.cuda.set_device(1)
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")
corpus = get_corpus()
ntokens = len(corpus.dictionary)

# TODO remove these two lines
# Sanity checks pinned to the expected corpus: vocabulary size, validation length,
# and every token id in each split being a valid dictionary index.
assert ntokens == 602755
assert corpus.valid.size(0) == 11606861
assert all(ids.max() < ntokens for ids in (corpus.train, corpus.valid, corpus.test))

# Build the language model from the project constants and place it on the device.
model = RNNModel(
    MODEL_TYPE,
    ntokens,
    EMBEDDINGS_SIZE,
    HIDDEN_UNIT_COUNT,
    LAYER_COUNT,
    DROPOUT_PROB,
    TIED,
)
model = model.to(device)
criterion = nn.CrossEntropyLoss()

# Wrap both the model and the loss when either the notebook flag or the project
# constant requests data parallelization.
if use_data_paralellization or USE_DATA_PARALLELIZATION:
    model, criterion = CustomDataParallel(model), DataParallelCriterion(criterion)

# Adam with its default hyper-parameters, over the (possibly wrapped) model's parameters.
optimizer = torch.optim.Adam(model.parameters())

# Print the module tree, parameter shapes and total parameter count.
summary(model, criterion)

CustomDataParallel(
  (model): DataParallelModel(
    (module): RNNModel(
      (drop): Dropout(p=0.2)
      (encoder): Embedding(602755, 200)
      (rnn): LSTM(200, 200, num_layers=2, dropout=0.2)
      (decoder): Linear(in_features=200, out_features=602755, bias=True)
    )
  )
)

model.module.encoder.weight torch.Size([602755, 200])
model.module.rnn.weight_ih_l0 torch.Size([800, 200])
model.module.rnn.weight_hh_l0 torch.Size([800, 200])
model.module.rnn.bias_ih_l0 torch.Size([800])
model.module.rnn.bias_hh_l0 torch.Size([800])
model.module.rnn.weight_ih_l1 torch.Size([800, 200])
model.module.rnn.weight_hh_l1 torch.Size([800, 200])
model.module.rnn.bias_ih_l1 torch.Size([800])
model.module.rnn.bias_hh_l1 torch.Size([800])
model.module.decoder.weight torch.Size([602755, 200])
model.module.decoder.bias torch.Size([602755])

Total Parameters: 121,796,955


In [4]:
# Ask PyTorch's caching allocator to release the unoccupied GPU memory it is holding.
torch.cuda.empty_cache()

In [6]:
# Start training with the model, loss and Adam optimizer built in the setup cell.
# NOTE(review): the run recorded below stopped with FileNotFoundError on a path under
# models/trained_models/ - confirm that directory exists relative to the working
# directory before launching a long run.
train(model, corpus, criterion, optimizer, device)

using Batch Size 40


INFO 2019-05-30 12:39:27,906: | epoch   1 |   200/11879 batches | lr 20.00 | ms/batch 591.00 | loss  9.79 | ppl 17801.87
INFO 2019-05-30 12:41:23,640: | epoch   1 |   400/11879 batches | lr 20.00 | ms/batch 578.66 | loss  8.82 | ppl  6793.58
INFO 2019-05-30 12:43:19,524: | epoch   1 |   600/11879 batches | lr 20.00 | ms/batch 579.42 | loss  8.54 | ppl  5130.15
INFO 2019-05-30 12:45:15,463: | epoch   1 |   800/11879 batches | lr 20.00 | ms/batch 579.70 | loss  8.41 | ppl  4501.38
INFO 2019-05-30 12:47:11,344: | epoch   1 |  1000/11879 batches | lr 20.00 | ms/batch 579.40 | loss  8.25 | ppl  3827.97
INFO 2019-05-30 12:49:07,187: | epoch   1 |  1200/11879 batches | lr 20.00 | ms/batch 579.21 | loss  8.19 | ppl  3609.10
INFO 2019-05-30 12:51:03,087: | epoch   1 |  1400/11879 batches | lr 20.00 | ms/batch 579.50 | loss  8.06 | ppl  3150.66
INFO 2019-05-30 12:52:58,985: | epoch   1 |  1600/11879 batches | lr 20.00 | ms/batch 579.49 | loss  8.05 | ppl  3137.54
INFO 2019-05-30 12:54:54,901: | 

FileNotFoundError: [Errno 2] No such file or directory: 'models/trained_models/model-2019-05-30 12:37:29.706331.pt'

In [6]:
# with optimizer:
# Same call as the training cell above; the log kept below is dated 2019-05-29.
# NOTE(review): this run also ended in FileNotFoundError for a path under
# models/trained_models/ - confirm the directory exists relative to the working directory.
train(model, corpus, criterion, optimizer, device)

using Batch Size 40


INFO 2019-05-29 19:20:33,895: | epoch   1 |   200/11879 batches | lr 20.00 | ms/batch 608.19 | loss  8.52 | ppl  5006.08
INFO 2019-05-29 19:22:32,776: | epoch   1 |   400/11879 batches | lr 20.00 | ms/batch 594.40 | loss  7.77 | ppl  2356.94
INFO 2019-05-29 19:24:31,470: | epoch   1 |   600/11879 batches | lr 20.00 | ms/batch 593.47 | loss  7.64 | ppl  2072.55
INFO 2019-05-29 19:26:30,188: | epoch   1 |   800/11879 batches | lr 20.00 | ms/batch 593.59 | loss  7.66 | ppl  2131.26
INFO 2019-05-29 19:28:28,873: | epoch   1 |  1000/11879 batches | lr 20.00 | ms/batch 593.43 | loss  7.64 | ppl  2078.91
INFO 2019-05-29 19:30:27,597: | epoch   1 |  1200/11879 batches | lr 20.00 | ms/batch 593.62 | loss  7.62 | ppl  2040.10
INFO 2019-05-29 19:32:26,271: | epoch   1 |  1400/11879 batches | lr 20.00 | ms/batch 593.37 | loss  7.54 | ppl  1875.25
INFO 2019-05-29 19:34:24,978: | epoch   1 |  1600/11879 batches | lr 20.00 | ms/batch 593.53 | loss  7.57 | ppl  1944.25
INFO 2019-05-29 19:36:23,696: | 

FileNotFoundError: [Errno 2] No such file or directory: 'models/trained_models/model-2019-05-29 19:18:32.258388.pt'

In [6]:
# Disabled: manual save of the in-memory model to a timestamped file name built from
# MODEL_FILE_NAME. Uncomment to write a checkpoint by hand.
# timestamp = datetime.datetime.now()
# with open(MODEL_FILE_NAME.format(timestamp), 'wb') as f:
#     torch.save(model, f)

In [6]:
# Restore a previously trained model from a fixed checkpoint file.
# with open(MODEL_FILE_NAME.format(timestamp), 'rb') as f:
with open('models/trained_models/model-2019-05-24 17:19:46.971655.pt', 'rb') as f:
    # torch.load unpickles the file, so only point this at checkpoints you trust.
    # map_location places the stored tensors on the device chosen in the setup cell, so a
    # checkpoint written on a GPU still loads when USE_CUDA selects the CPU.
    model = torch.load(f, map_location=device)

# After loading, the RNN weights are no longer one contiguous chunk of memory;
# flattening makes them contiguous again and speeds up the forward pass.
model.rnn.flatten_parameters()

In [7]:
# Run the project's evaluation routine on the loaded model.
# use_test_data=True presumably selects the held-out test split - verify in src.training.
evaluate(model, corpus, criterion, device, use_test_data=True)

INFO 2019-05-27 10:57:05,109: -----------------------------------------------------------------------------------------
INFO 2019-05-27 10:57:05,110: Running eval
INFO 2019-05-27 10:57:05,110: -----------------------------------------------------------------------------------------
INFO 2019-05-27 10:57:31,470: |  1000/42211 batches | loss 175.47
INFO 2019-05-27 10:57:57,898: |  2000/42211 batches | loss 175.93
INFO 2019-05-27 10:58:24,412: |  3000/42211 batches | loss 176.24
INFO 2019-05-27 10:58:50,994: |  4000/42211 batches | loss 176.12
INFO 2019-05-27 10:59:17,711: |  5000/42211 batches | loss 175.86
INFO 2019-05-27 10:59:44,515: |  6000/42211 batches | loss 175.90
INFO 2019-05-27 11:00:11,298: |  7000/42211 batches | loss 175.92
INFO 2019-05-27 11:00:38,091: |  8000/42211 batches | loss 176.16
INFO 2019-05-27 11:01:04,898: |  9000/42211 batches | loss 176.03
INFO 2019-05-27 11:01:31,771: | 10000/42211 batches | loss 176.04
INFO 2019-05-27 11:01:58,692: | 11000/42211 batches | los

5.038112595037704