In [1]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="0,3, 6"


In [3]:

from kss import split_sentences

In [4]:
import re
import emoji
from soynlp.normalizer import repeat_normalize

emojis = ''.join(emoji.UNICODE_EMOJI.keys())
pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣{emojis}]+')
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

def clean(x):
    x = pattern.sub(' ', x)
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

In [5]:
import pandas as pd
from pathlib import Path

def get_korean_dataset_raw(data_dir):  
    train_raw = pd.read_csv(data_dir)
    one_sentence_train_texts = []
    train_texts = train_raw.Question.map(clean).to_list()
    for text in train_texts:
        one_sentence_train_texts += split_sentences(text)
    
    return one_sentence_train_texts

In [6]:

train_dir = './dataset/kor_asdiv-a/train.csv'
test_dir = './dataset/kor_asdiv-a/dev.csv'

In [7]:
train_raw = get_korean_dataset_raw(train_dir)
test_raw = get_korean_dataset_raw(test_dir)

In [8]:
len(train_raw), len(test_raw)

(2615, 640)

In [9]:
from transformers import RobertaModel, BertTokenizer

etk = BertTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [10]:
train_inputs = etk(train_raw, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
test_inputs = etk(test_raw, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [11]:
train_inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [12]:
def make_mlm_label(inputs):
    inputs['labels'] = inputs.input_ids.detach().clone()
    # create random array of floats with equal dimensions to input_ids tensor
    rand = torch.rand(inputs.input_ids.shape)
    # create mask array
    mask_arr = (rand < 0.15) * (inputs.input_ids != 2) * \
               (inputs.input_ids != 3) * (inputs.input_ids != 0)
    
    selection = []
    
    for i in range(inputs.input_ids.shape[0]):
        selection.append(
            torch.flatten(mask_arr[i].nonzero()).tolist())
    



    for i in range(inputs.input_ids.shape[0]):
        inputs.input_ids[i, selection[i]] = 4
    
    return inputs

In [13]:
train_inputs = make_mlm_label(train_inputs)
test_inputs = make_mlm_label(test_inputs)

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  torch.flatten(mask_arr[i].nonzero()).tolist())


In [14]:
train_inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [15]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [16]:
train_dataset = OurDataset(train_inputs)
test_dataset = OurDataset(test_inputs)

In [17]:
from transformers import ElectraForMaskedLM

In [18]:
model = ElectraForMaskedLM.from_pretrained("monologg/koelectra-base-v3-discriminator")

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForMaskedLM: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['generator_predictions.dense.bias', 'generator_predicti

In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer", per_device_train_batch_size=4, per_device_eval_batch_size=4, num_train_epochs=5)

In [20]:
training_args

TrainingArguments(
_n_gpu=3,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=test_trainer/runs/Oct27_12-33-50_4d9b1a113784,
logging_first_step=False,
logging_nan_inf_filter=True,
logging

In [21]:
from transformers import Trainer

trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset
)

In [22]:
trainer.train()

***** Running training *****
  Num examples = 2615
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 1090
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
500,0.6525
1000,0.0251


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1090, training_loss=0.3125703774460959, metrics={'train_runtime': 532.5966, 'train_samples_per_second': 24.55, 'train_steps_per_second': 2.047, 'total_flos': 3441582792499200.0, 'train_loss': 0.3125703774460959, 'epoch': 5.0})