In [1]:
from kobert_tokenizer import KoBERTTokenizer
from transformers import TrainingArguments, Trainer

from modeling import BiEncoder
from parsing import pickling
from data_loader import DataLoader
from utils import seed_everything, empty_cuda_cache

In [2]:
# Fix the random seed up front so repeated runs of the notebook start from the same state.
SEED = 42
seed_everything(SEED)

In [3]:
# Load the pre-pickled training and validation splits; each pickle unpacks
# into a pair of parallel sequences: (context, candidate).
train_pair = pickling('./data/pickle/train.pickle', 'load')
valid_pair = pickling('./data/pickle/valid.pickle', 'load')
train_context, train_candidate = train_pair
valid_context, valid_candidate = valid_pair

# Sanity check: show the first five contexts, then the first five candidates.
for preview in (train_context[:5], train_candidate[:5]):
    print(preview)

['너희 집 근처에는 병원 있어?', '응, 난 엄청 근처에 있어.', '나는 차 타고 20분 거리에 있어.', '집 근처에 병원이 있으면 편리한 거 같아.', '왜 그렇게 생각하는 거야?']
['응, 난 엄청 근처에 있어.', '나는 차 타고 20분 거리에 있어.', '집 근처에 병원이 있으면 편리한 거 같아.', '왜 그렇게 생각하는 거야?', '언제 사고가 발생할지 모르잖아.']


In [4]:
# Tokenizer and bi-encoder weights are both initialised from the same checkpoint id.
PRETRAINED_NAME = 'skt/kobert-base-v1'

tokenizer = KoBERTTokenizer.from_pretrained(PRETRAINED_NAME)
model = BiEncoder.from_pretrained(PRETRAINED_NAME)

# Move the model to the GPU; the trailing semicolon suppresses the cell's repr output.
model.to('cuda');

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [5]:
# Wrap each split with the project's own DataLoader (imported from data_loader,
# not torch.utils.data). The training split is constructed first, then validation.
train_loader, valid_loader = (
    DataLoader(contexts, candidates, tokenizer)
    for contexts, candidates in (
        (train_context, train_candidate),
        (valid_context, valid_candidate),
    )
)

In [6]:
# Hyperparameters for the Hugging Face Trainer run, grouped by concern.
training_config = dict(
    # Output location and which phases to run.
    output_dir='checkpoints',
    do_train=True,
    do_eval=True,

    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=5e-5,
    # NOTE(review): 0.4 is much larger than the weight decay values commonly
    # used for BERT fine-tuning — confirm this is intentional.
    weight_decay=0.4,
    warmup_steps=100,

    # Save and evaluate once per epoch, keeping at most 10 checkpoints on disk.
    save_strategy="epoch",
    save_total_limit=10,
    evaluation_strategy="epoch",
    # load_best_model_at_end=True,

    # Do not report metrics to any external logging integration.
    report_to='none',

    # Batching, data loading and mixed precision.
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=1,
    dataloader_num_workers=0,
    fp16=True,
)
arguments = TrainingArguments(**training_config)

# The loaders built earlier are handed to the Trainer as its train/eval datasets.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=train_loader,
    eval_dataset=valid_loader,
)

Using cuda_amp half precision backend


In [7]:
# Release cached GPU memory before the long training run starts.
empty_cuda_cache()
trainer.train()

# Save the final weights next to the Trainer checkpoints. The directory name is
# derived from the training configuration instead of being hard-coded, so it
# stays correct if output_dir or num_train_epochs is changed. With the current
# settings this resolves to "checkpoints/firstmodel_ep5", as before.
# (":g" renders 5 or 5.0 as "5".)
final_model_dir = f"{arguments.output_dir}/firstmodel_ep{arguments.num_train_epochs:g}"
model.save_pretrained(final_model_dir)

***** Running training *****
  Num examples = 1358286
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 53060
  0%|          | 257/53060 [02:07<7:19:02,  2.00it/s]

KeyboardInterrupt: 

In [None]:
# Disabled scratch cell: manual forward pass through the model, kept for reference.
# It tokenizes the same two dummy sentences twice (padded/truncated to 50 tokens)
# and feeds one copy as the context inputs and the other as the candidate inputs,
# with training=True, then displays the output.
# seq = tokenizer(['i am a student', 'wqwei am a student'], padding='max_length', max_length=50, truncation=True, return_tensors='pt')
# seq2 = tokenizer(['i am a student', 'wqwei am a student'], padding='max_length', max_length=50, truncation=True, return_tensors='pt')

# output = model(input_ids = seq['input_ids'],
#                 attention_mask = seq['attention_mask'],
#                 token_type_ids = seq['token_type_ids'],
#                 candidate_input_ids = seq2['input_ids'],
#                 candidate_attention_mask = seq2['attention_mask'],
#                 candidate_token_type_ids = seq2['token_type_ids'],
#                 training = True)
# output