In [1]:
import sys
# Extend the module search path so the project-local packages imported in the
# following cell can be found.
# NOTE(review): this absolute path is environment-specific (presumably a
# container mount) — confirm before running the notebook elsewhere.
sys.path.append('/source/main')

In [2]:
import logging
import torch

from data_for_train import dataset as my_dataset
from model_def.seq2seq_attn import Seq2SeqAttn
from utils import pytorch_utils
from train.trainer import train

In [3]:
def input2_text(first_input, *params):
    """Turn source-side indices back into text using the source vocabulary.

    Args:
        first_input: Value handed unchanged to ``my_dataset.voc_src.idx2docs``
            (presumably a batch of token-index sequences — confirm against
            the trainer that calls this hook).
        *params: Any further positional arguments; accepted and ignored.

    Returns:
        Whatever ``my_dataset.voc_src.idx2docs`` returns for ``first_input``.
    """
    source_vocab = my_dataset.voc_src
    decoded = source_vocab.idx2docs(first_input)
    return decoded


def target2_text(first_input, *params):
    """Turn target-side indices back into text using the target vocabulary.

    Args:
        first_input: Value handed unchanged to ``my_dataset.voc_tgt.idx2docs``
            (presumably a batch of token-index sequences — confirm against
            the trainer that calls this hook).
        *params: Any further positional arguments; accepted and ignored.

    Returns:
        Whatever ``my_dataset.voc_tgt.idx2docs`` returns for ``first_input``.
    """
    target_vocab = my_dataset.voc_tgt
    decoded = target_vocab.idx2docs(first_input)
    return decoded

In [4]:
logging.basicConfig(level=logging.INFO)

# --- Reproducibility -------------------------------------------------------
# Seed PyTorch's global RNG before the data loaders are built so that
# torch-driven randomness is repeatable after Restart & Run All.
# NOTE(review): if the dataset code also draws from `random` or NumPy, those
# generators need their own seed — confirm in data_for_train.
SEED = 42
torch.manual_seed(SEED)

# --- Training hyper-parameters ---------------------------------------------
BATCH_SIZE = 2
NUM_EPOCHS = 500
# NOTE(review): NUM_WORKERS is not referenced by any visible cell — confirm
# whether the loader factories are supposed to receive it.
NUM_WORKERS = 0

# --- Reporting cadence (forwarded to train() as print_every / predict_every /
# eval_every; presumably counted in training steps — confirm in train.trainer).
PRINT_EVERY = 100
PREDICT_EVERY = 10
EVAL_EVERY = 1000

# Checkpoint file to resume from; the empty string means nothing is restored.
PRE_TRAINED_MODEL = ''

# Value passed as `size` to both loader factories. It used to be repeated as a
# bare literal in each call; keeping it in one place stops the train and eval
# subsets from drifting apart by accident.
DATASET_SIZE = 500

# Prepare the dataset module (the captured log shows it indexing the source
# and target vocabularies) and build the two data loaders.
my_dataset.bootstrap()
train_loader = my_dataset.get_dl_train(batch_size=BATCH_SIZE, size=DATASET_SIZE)
eval_loader = my_dataset.get_dl_eval(batch_size=BATCH_SIZE, size=DATASET_SIZE)


INFO:root:Indexing vocabs successfully. Total vocabs: 25390
INFO:root:Indexing vocabs successfully. Total vocabs: 50437
INFO:root:Vocab for source from file /source/main/vocab/output/src.pkl contains 25390 tokens
INFO:root:Vocab for source from file /source/main/vocab/output/tgt.pkl contains 50437 tokens
INFO:root:Data at /source/main/data_for_train/output/my_train.csv contains 500 samples
INFO:root:Data at /source/main/data_for_train/output/my_eval.csv contains 500 samples


In [5]:
# Build the sequence-to-sequence model, sizing each embedding table from the
# corresponding vocabulary.
model = Seq2SeqAttn(
    src_vocab_size=len(my_dataset.voc_src.index2word),
    tgt_vocab_size=len(my_dataset.voc_tgt.index2word),
    # NOTE(review): presumably the vocabulary indices of the start/end-of-
    # sequence tokens — confirm against the vocab definition.
    start_idx=2,
    end_idx=3,
)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)
model.train()  # training mode (enables the dropout layers)

logging.info('Model architecture: \n%s', model)
logging.info('Total trainable parameters: %s', pytorch_utils.count_parameters(model))


INFO:root:Model architecture: 
Seq2SeqAttn(
  (encoder): Encoder(
    (embedding): Embedding(25390, 256)
    (lstm): LSTM(256, 512, num_layers=3, dropout=0.3, bidirectional=True)
    (dropout): Dropout(p=0.3)
  )
  (flatten_hidden_lstm): FlattenHiddenLSTM()
  (core_decoder): AttnRawDecoder(
    (embedding): Embedding(50437, 256)
    (lstm): LSTM(256, 512, num_layers=3, dropout=0.3)
    (attention): Attention(
      (scoring): Linear(in_features=512, out_features=1024, bias=True)
      (softmax): Softmax()
    )
    (output_mapping): Linear(in_features=1536, out_features=50437, bias=True)
    (dropout): Dropout(p=0.3)
  )
  (greedy_infer): DecoderGreedyInfer(
    (core_decoder): AttnRawDecoder(
      (embedding): Embedding(50437, 256)
      (lstm): LSTM(256, 512, num_layers=3, dropout=0.3)
      (attention): Attention(
        (scoring): Linear(in_features=512, out_features=1024, bias=True)
        (softmax): Softmax()
      )
      (output_mapping): Linear(in_features=1536, out_feature

In [6]:
# Log, for every named parameter of the model, how many weights it contributes
# (one "Param: <name> contributes <n> weights" line each, as seen in the output).
pytorch_utils.show_detail_parameters(model)

INFO:root:Param: encoder.embedding.weight contributes 6499840 weights
Param: encoder.lstm.weight_ih_l0 contributes 524288 weights
Param: encoder.lstm.weight_hh_l0 contributes 1048576 weights
Param: encoder.lstm.bias_ih_l0 contributes 2048 weights
Param: encoder.lstm.bias_hh_l0 contributes 2048 weights
Param: encoder.lstm.weight_ih_l0_reverse contributes 524288 weights
Param: encoder.lstm.weight_hh_l0_reverse contributes 1048576 weights
Param: encoder.lstm.bias_ih_l0_reverse contributes 2048 weights
Param: encoder.lstm.bias_hh_l0_reverse contributes 2048 weights
Param: encoder.lstm.weight_ih_l1 contributes 2097152 weights
Param: encoder.lstm.weight_hh_l1 contributes 1048576 weights
Param: encoder.lstm.bias_ih_l1 contributes 2048 weights
Param: encoder.lstm.bias_hh_l1 contributes 2048 weights
Param: encoder.lstm.weight_ih_l1_reverse contributes 2097152 weights
Param: encoder.lstm.weight_hh_l1_reverse contributes 1048576 weights
Param: encoder.lstm.bias_ih_l1_reverse contributes 2048 weig

In [7]:
# Step value handed to train() as `init_step`; remains 0 unless a checkpoint
# provides its own.
init_step = 0

# Optional warm start: restore model weights, optimizer state and the saved
# step from the checkpoint file when a path has been configured.
if PRE_TRAINED_MODEL != '':
    # map_location moves the stored tensors onto the device chosen earlier.
    checkpoint = torch.load(PRE_TRAINED_MODEL, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.optimizer.load_state_dict(checkpoint['optimizer'])
    # Checkpoints without a 'step' entry restart the counter at 0.
    init_step = checkpoint.get('step', 0)
    logging.info('Load pre-trained model from %s successfully', PRE_TRAINED_MODEL)

# Start training. The two transforms are passed as callbacks that turn index
# sequences back into text.
train(
    model,
    train_loader,
    eval_loader,
    dir_checkpoint='/source/main/train/output/saved_models/',
    device=device,
    num_epoch=NUM_EPOCHS,
    print_every=PRINT_EVERY,
    predict_every=PREDICT_EVERY,
    eval_every=EVAL_EVERY,
    input_transform=input2_text,
    output_transform=target2_text,
    init_step=init_step,
)


INFO:root:----------------------- START TRAINING -----------------------
INFO:root:Step: 1 	 L_mean: 10.8315±0.0000 	 w_a: 0.0337 	 s_a: 0.0000 	 Duration: 1.4438 s/step


RuntimeError: CUDA out of memory. Tried to allocate 296.00 MiB (GPU 0; 7.93 GiB total capacity; 1.84 GiB already allocated; 281.19 MiB free; 277.92 MiB cached)