In [1]:
import os
os.chdir('..')

%load_ext autoreload
%autoreload 2

In [2]:
import datetime

import torch

import torch.nn as nn

from src.consts import *
from src.main import main, setup_torch, get_corpus
from src.model import RNNModel
from src.training import train, evaluate
from src.split_cross_entropy_loss import SplitCrossEntropyLoss
from src.parallel import DataParallelModel, DataParallelCriterion
from src.custom_data_parallel import CustomDataParallel

from notebooks.utils import summary, check_cuda_mem

In [3]:
# Notebook-level override: when True, the setup cell wraps the model in
# CustomDataParallel and the loss in DataParallelCriterion regardless of the
# USE_DATA_PARALLELIZATION constant (the two flags are OR-ed together).
# NOTE(review): the name is misspelled ("paralellization"); it is kept as is
# because the setup cell reads it under this exact spelling.
use_data_paralellization = True

In [4]:
# Build everything the training / evaluation cells need:
# device, corpus, model, loss (criterion) and optimizer.
setup_torch()
# torch.cuda.set_device(1)  # uncomment to make GPU 1 the current CUDA device
device = torch.device("cuda" if USE_CUDA else "cpu")
corpus = get_corpus()

ntokens = len(corpus.dictionary)

# TODO: remove the next two checks - they hard-code the sizes of one specific
# corpus build and will fail for any other dataset.
assert ntokens == 602755, "unexpected vocabulary size"
assert corpus.valid.size()[0] == 11606861, "unexpected validation set length"

# Token ids index the embedding / decoder rows (both have `ntokens` rows),
# so every id in every split must be smaller than the vocabulary size.
assert corpus.train.max() < ntokens, "train split contains an out-of-vocabulary id"
assert corpus.valid.max() < ntokens, "valid split contains an out-of-vocabulary id"
assert corpus.test.max() < ntokens, "test split contains an out-of-vocabulary id"

model = RNNModel(MODEL_TYPE, ntokens, EMBEDDINGS_SIZE, HIDDEN_UNIT_COUNT, LAYER_COUNT, DROPOUT_PROB,
                 TIED).to(device)
criterion = nn.CrossEntropyLoss()

# Either the notebook-local flag or the project constant enables the
# data-parallel wrappers; model and loss are wrapped together.
if use_data_paralellization or USE_DATA_PARALLELIZATION:
    model = CustomDataParallel(model)
    criterion = DataParallelCriterion(criterion)

# Reference the optimizer through the explicitly imported `torch` package:
# a bare `optim` is never imported in this notebook and would only resolve
# if `from src.consts import *` happened to leak it.
optimizer = torch.optim.Adam(model.parameters())

summary(model, criterion)

CustomDataParallel(
  (model): DataParallelModel(
    (module): RNNModel(
      (drop): Dropout(p=0.2)
      (encoder): Embedding(602755, 200)
      (rnn): LSTM(200, 200, num_layers=2, dropout=0.2)
      (decoder): Linear(in_features=200, out_features=602755, bias=True)
    )
  )
)

model.module.encoder.weight torch.Size([602755, 200])
model.module.rnn.weight_ih_l0 torch.Size([800, 200])
model.module.rnn.weight_hh_l0 torch.Size([800, 200])
model.module.rnn.bias_ih_l0 torch.Size([800])
model.module.rnn.bias_hh_l0 torch.Size([800])
model.module.rnn.weight_ih_l1 torch.Size([800, 200])
model.module.rnn.weight_hh_l1 torch.Size([800, 200])
model.module.rnn.bias_ih_l1 torch.Size([800])
model.module.rnn.bias_hh_l1 torch.Size([800])
model.module.decoder.weight torch.Size([602755, 200])
model.module.decoder.bias torch.Size([602755])

Total Parameters: 121,796,955


In [5]:
# Ask PyTorch's CUDA caching allocator to release the cached memory blocks it is
# not currently using - presumably to leave as much GPU memory as possible for
# the training cell that follows.
torch.cuda.empty_cache()

In [13]:
# Train using the model, loss, optimizer and device created in the setup cell.
# NOTE(review): the saved output of this cell stops at an ipdb prompt inside
# src/training.py (train_one_epoch, line 137) and later ends with
# "CUDA out of memory" on GPU 0, so the recorded run did not finish. Debug
# prints and a breakpoint are presumably still present in the training code -
# confirm before relying on this cell.
train(model, corpus, criterion, optimizer, device)

inside forwardinside forward torch.Size([350, 602755])
 torch.Size([350, 602755])
torch.Size([10, 2, 200])
torch.Size([10, 2, 200])
torch.Size([10, 2, 200])
torch.Size([10, 2, 200])
device index:  0
torch.Size([350, 602755])
torch.Size([350])
device index:  1
torch.Size([350, 602755])
torch.Size([350])
tensor(13.3070, device='cuda:1', grad_fn=<NllLossBackward>)
tensor(13.3074, device='cuda:0', grad_fn=<NllLossBackward>)
> /media/gabrielamelo/Novo volume/Projects/portuguese_wsc/src/training.py(137)train_one_epoch()
    136 
--> 137         total_loss += loss.mean().item()
    138 


ipdb>  c


inside forward torch.Size([350, 602755])
torch.Size([10, 2, 200])
torch.Size([10, 2, 200])
inside forward torch.Size([350, 602755])
torch.Size([10, 2, 200])
torch.Size([10, 2, 200])
device index: device index:  1
torch.Size([350, 602755]) 0
torch.Size([350, 602755])
torch.Size([350])

torch.Size([350])
tensor(13.3012, device='cuda:1', grad_fn=<NllLossBackward>)
tensor(13.2972, device='cuda:0', grad_fn=<NllLossBackward>)


RuntimeError: CUDA out of memory. Tried to allocate 804.88 MiB (GPU 0; 10.91 GiB total capacity; 8.73 GiB already allocated; 474.94 MiB free; 521.52 MiB cached)

In [6]:
# timestamp = datetime.datetime.now()
# with open(MODEL_FILE_NAME.format(timestamp), 'wb') as f:
#     torch.save(model, f)

In [6]:
# Reload a previously trained model from disk, replacing the `model` built in
# the setup cell.
# To load a checkpoint saved in this session instead, open
# MODEL_FILE_NAME.format(timestamp) in place of the hard-coded path.
# WARNING: torch.load unpickles arbitrary Python objects - only load
# checkpoint files that come from a trusted source.
with open('models/trained_models/model-2019-05-24 17:19:46.971655.pt', 'rb') as f:
    # map_location moves the stored tensors onto `device`, so the checkpoint
    # also loads when it was saved on a GPU that is not available here (or on
    # a different GPU index) and ends up on the device passed to evaluate().
    model = torch.load(f, map_location=device)

# after load the rnn params are not a continuous chunk of memory
# this makes them a continuous chunk, and will speed up forward pass
# NOTE(review): assumes the checkpoint holds the bare RNN model (with a `.rnn`
# attribute) rather than a data-parallel wrapper - confirm.
model.rnn.flatten_parameters()

In [7]:
# Score the current `model` with `criterion` on held-out data;
# use_test_data=True presumably selects the test split instead of the
# validation split (confirm in src/training.py).
# The call is the last expression of the cell, so its return value is shown
# as the cell's output.
evaluate(model, corpus, criterion, device, use_test_data=True)

INFO 2019-05-27 10:57:05,109: -----------------------------------------------------------------------------------------
INFO 2019-05-27 10:57:05,110: Running eval
INFO 2019-05-27 10:57:05,110: -----------------------------------------------------------------------------------------
INFO 2019-05-27 10:57:31,470: |  1000/42211 batches | loss 175.47
INFO 2019-05-27 10:57:57,898: |  2000/42211 batches | loss 175.93
INFO 2019-05-27 10:58:24,412: |  3000/42211 batches | loss 176.24
INFO 2019-05-27 10:58:50,994: |  4000/42211 batches | loss 176.12
INFO 2019-05-27 10:59:17,711: |  5000/42211 batches | loss 175.86
INFO 2019-05-27 10:59:44,515: |  6000/42211 batches | loss 175.90
INFO 2019-05-27 11:00:11,298: |  7000/42211 batches | loss 175.92
INFO 2019-05-27 11:00:38,091: |  8000/42211 batches | loss 176.16
INFO 2019-05-27 11:01:04,898: |  9000/42211 batches | loss 176.03
INFO 2019-05-27 11:01:31,771: | 10000/42211 batches | loss 176.04
INFO 2019-05-27 11:01:58,692: | 11000/42211 batches | los

5.038112595037704