In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Arrange GPU devices starting from 0
os.environ["CUDA_VISIBLE_DEVICES"]= "1"  # Set the GPU 2 to use

In [2]:
# Import modules
import os
import time
import argparse
# Import custom modules
from training import training
# Utils
from utils import str2bool, path_check, set_random_seed

user_name = os.getlogin()
parser = argparse.ArgumentParser(description='Parsing Method')
# Task setting
parser.add_argument('--training', action='store_true')
parser.add_argument('--resume', action='store_true')
parser.add_argument('--debuging_mode', action='store_true')
# Path setting
parser.add_argument('--preprocess_path', default=f'/HDD/{user_name}/preprocessed', type=str,
                    help='Pre-processed data save path')
parser.add_argument('--data_path', default='/HDD/dataset/WMT/2016/multi_modal', type=str,
                    help='Original data path')
parser.add_argument('--model_save_path', default=f'/HDD/{user_name}/model_checkpoint/acl_text_aug', type=str,
                    help='Model checkpoint file path')
parser.add_argument('--result_path', default=f'/HDD/{user_name}/results/acl_text_aug', type=str,
                    help='Results file path')
# Training setting
parser.add_argument('--min_len', default=4, type=int, 
                    help="Sentences's minimum length; Default is 4")
parser.add_argument('--src_max_len', default=150, type=int, 
                    help="Source sentences's maximum length; Default is 150")
parser.add_argument('--trg_max_len', default=150, type=int, 
                    help="Target sentences's maximum length; Default is 150")
parser.add_argument('--num_epochs', default=100, type=int, 
                    help='Training epochs; Default is 100')
parser.add_argument('--num_workers', default=8, type=int, 
                    help='Num CPU Workers; Default is 8')
parser.add_argument('--batch_size', default=16, type=int,    
                    help='Batch size; Default is 16')
parser.add_argument('--lr', default=5e-5, type=float,
                    help='Maximum learning rate of warmup scheduler; Default is 5e-5')
parser.add_argument('--w_decay', default=1e-5, type=float,
                    help="Ralamb's weight decay; Default is 1e-5")
parser.add_argument('--clip_grad_norm', default=5, type=int, 
                    help='Graddient clipping norm; Default is 5')
parser.add_argument('--label_smoothing_eps', default=0.05, type=float,
                    help='')
# Testing setting
parser.add_argument('--test_batch_size', default=32, type=int, 
                    help='Test batch size; Default is 32')
parser.add_argument('--beam_size', default=5, type=int, 
                    help='Beam search size; Default is 5')
parser.add_argument('--beam_alpha', default=0.7, type=float, 
                    help='Beam search length normalization; Default is 0.7')
parser.add_argument('--repetition_penalty', default=1.3, type=float, 
                    help='Beam search repetition penalty term; Default is 1.3')
# Seed & Logging setting
parser.add_argument('--seed', default=42, type=int,
                    help='Random seed; Default is 42')
parser.add_argument('--use_tensorboard', default=True, type=str2bool,
                    help='Using tensorboard; Default is True')
parser.add_argument('--tensorboard_path', default='./tensorboard_runs', type=str,
                    help='Tensorboard log path; Default is ./tensorboard_runs')
parser.add_argument('--print_freq', default=100, type=int, 
                    help='Print training process frequency; Default is 100')
args = parser.parse_args(list())

In [3]:
import os
import math
import logging
from tqdm import tqdm
from time import time

import torch
import torch.nn as nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.nn.utils import clip_grad_norm_
from torch.optim.lr_scheduler import LambdaLR

from transformers import AutoTokenizer

from dataset import CustomDataset
from model import TransformerModel
from utils import TqdmLoggingHandler, write_log, get_tb_exp_name

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#===================================#
#==============Logging==============#
#===================================#

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
handler = TqdmLoggingHandler()
handler.setFormatter(logging.Formatter(" %(asctime)s - %(message)s", "%Y-%m-%d %H:%M:%S"))
logger.addHandler(handler)
logger.propagate = False

write_log(logger, 'Start training!')

#===================================#
#=============Data Load=============#
#===================================#

write_log(logger, "Load data...")

start_time = time()

src_list = dict()
trg_list = dict()

# 3) Test data load
with open(os.path.join(args.data_path, 'test.de'), 'r') as f:
    src_list['test'] = [x.replace('\n', '') for x in f.readlines()]
with open(os.path.join(args.data_path, 'test.en'), 'r') as f:
    trg_list['test'] = [x.replace('\n', '') for x in f.readlines()]

write_log(logger, 'Data loading done!')

 2023-05-02 20:33:33 - Start training!
 2023-05-02 20:33:33 - Load data...
 2023-05-02 20:33:33 - Data loading done!


In [5]:
#===================================#
#===========Train setting===========#
#===================================#

# 1) Model initiating
write_log(logger, 'Instantiating model...')
model = TransformerModel()
save_file_name = os.path.join(args.model_save_path, f'checkpoint.pth.tar')
checkpoint = torch.load(save_file_name)
model.to(device)

 2023-05-02 20:33:33 - Instantiating model...


You are using a model of type bart to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


TransformerModel(
  (dropout): Dropout(p=0.3, inplace=False)
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_

In [6]:
import transformers

transformers.__version__

'4.12.0'

In [7]:
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
test_dataset = CustomDataset(tokenizer=tokenizer,
                           src_list=src_list['test'], trg_list=trg_list['test'],
                           src_max_len=args.src_max_len, trg_max_len=args.trg_max_len)
test_loader = DataLoader(test_dataset, drop_last=False,
                        batch_size=4, shuffle=True,
                        pin_memory=True, num_workers=args.num_workers)

In [8]:
for i, batch_iter in enumerate(test_loader):
    
    model.eval()
    
    # Input, output setting
    src_sequence = batch_iter[0]
    src_att = batch_iter[1]
    src_sequence = src_sequence.to(device, non_blocking=True)
    src_att = src_att.to(device, non_blocking=True)

    trg_sequence = batch_iter[2]
    trg_att = batch_iter[3]
    trg_sequence = trg_sequence.to(device, non_blocking=True)
    trg_att = trg_att.to(device, non_blocking=True)
    
    break

In [9]:
with torch.no_grad():
    # Encoding
    encoder_out = model.encoder_model(input_ids=src_sequence,
                                     attention_mask=src_att)
    encoder_out = encoder_out['last_hidden_state'] # (batch_size, seq_len, d_hidden)

    # Latent setting
    encoder_out = model.latent_encoder(encoder_out)
    encoder_out = model.latent_decoder(encoder_out)

    # Input, output setting
    batch_size = encoder_out.size(0)
    src_seq_size = encoder_out.size(1)

    # Decoding start token setting
    seqs = torch.tensor([[0]], dtype=torch.long, device=device)
    seqs = seqs.repeat(batch_size, 1).contiguous() # (batch_size, 1)

    for step in range(model.src_max_len):
        # Decoding sentence
        decoder_outputs = model.decoder_model(
            input_ids=seqs,
            encoder_hidden_states=encoder_out,
            encoder_attention_mask=src_att
        )
        decoder_outputs = decoder_outputs['last_hidden_state']

        # Next word probability
        scores = F.gelu(model.decoder_linear(decoder_outputs[:,-1])) # (batch_size, d_embedding)
        scores = model.decoder_linear2(model.decoder_norm(scores)) # (batch_size, vocab_num)

        next_word_prob = F.softmax(scores, dim=1) # (batch_size, vocab_num)
        next_word = torch.argmax(next_word_prob, dim=-1)

        # Concatenate generated token to sequence
        next_word = next_word.unsqueeze(1) # (batch_size, 1)
        seqs = torch.cat([seqs, next_word], dim=1) # (batch_size, seq_len + 1)

In [10]:
seqs[:,1:]

tensor([[35110, 40720, 28937, 40103, 23318,  8988, 14785, 19144, 17854, 37545,
         18714, 35765, 50054,  5649,  9220,  9220, 12130,  8266, 10888, 18989,
         44837,  3638, 36088,  4028, 45267, 11811, 22293,  9374,  9220, 28937,
         36088,  8987, 12130, 36933, 10888, 23959,  9902,  5027, 13516, 20349,
          9374, 13516, 44837,  6018, 44070, 50208, 36858, 16300, 20642, 18714,
         19112, 12130,  4106, 42260, 10888, 24998, 45145,  2502, 35355,  7038,
         10852,  7038, 29746, 26974, 21657, 42253, 35355, 31918, 42199,  3507,
         23409,  7133, 25389, 43064, 49779, 12130, 29256, 12130, 29256, 12130,
         29256, 12130, 29256, 12130, 29256, 12130, 29256, 12130, 29256, 12130,
         29256, 12130, 29256, 12130, 29256, 12130, 29256, 12130, 29256, 12130,
         29256, 12130, 29256, 12130, 29256, 12130, 29256, 12130, 29256, 12130,
         29256, 12130, 29256, 12130, 29256, 12130, 29256, 12130, 29256, 12130,
         29256, 17815, 17815, 26946, 24753,  8103, 4

In [11]:
model.tokenizer.batch_decode(seqs)

['<s> VersionendmentooksLearn ripeacedmandOct reversalTam HooverPACSTDOUTeman Logan Logan Wor ignored Ly Watt Tyrann benchScore Eachmop quarterbacks pulse mer LoganooksScorejust WorOVER Lymarksboy Navy dat Cath mer dat TyrannMC spont\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Hamb proactive orchestra Hooverarant Wor culturalRECT Ly IBvag applicationFal CSearing CS Increased infuri ConcmodeFal Emperor________________ Yet genres visual indust truncRuntime Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Wor directives Laboratory Laboratory unmanned Scan �mopGiving composed Allen intersection visualNR piercedtechn LynchgmentjoinedEmail visualothesEmail visual deserve stool Worst Rouse cuisine

In [12]:
from transformers import BartForConditionalGeneration

model2 = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

In [13]:
model2.model

BartModel(
  (shared): Embedding(50265, 768, padding_idx=1)
  (encoder): BartEncoder(
    (embed_tokens): Embedding(50265, 768, padding_idx=1)
    (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
    (layers): ModuleList(
      (0): BartEncoderLayer(
        (self_attn): BartAttention(
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (1): BartEncoderLayer(
        (self_attn): BartAttention(
          (k_proj)

from transformers import PretrainedConfig, AutoModel, AutoTokenizer
model = AutoModel.from_pretrained('facebook/bart-base')

model.encoder

model.decoder