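# Description

Fine-tunes the `monologg/koelectra-base-v3-discriminator` checkpoint as a token classifier that tags target spans (`Other` / `TRG_B` / `TRG_I`), evaluates it on the dev split after every epoch, and logs the run to Weights & Biases.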

# Modules and Global Variables

In [1]:
import os
import random
import re
import shutil

import datasets
import demoji
import evaluate
import numpy as np
import pandas as pd
import torch
import wandb
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoConfig, ElectraTokenizerFast, ElectraForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments, Trainer,
)
from transformers.optimization import (
    AdamW, get_linear_schedule_with_warmup,
    Adafactor, AdafactorSchedule,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the PyTorch build and the CUDA devices visible to this kernel.
# NGPU is kept as a module-level value; the learning-rate formula reads it.
NGPU = torch.cuda.device_count()
print(
    f'torch.__version__: {torch.__version__}',
    f'torch.cuda.is_available(): {torch.cuda.is_available()}',
    f'NGPU: {NGPU}',
    sep='\n',
)
# The model is not wrapped in torch.nn.DataParallel manually in this notebook.

torch.__version__: 1.12.1
torch.cuda.is_available(): True
NGPU: 4


In [3]:
### labels

# Label vocabularies, one per task type.
ce_labels = ['True', 'False']
pc_labels = ['positive', 'negative', 'neutral']
pc_binary_labels = ['True', 'False']
target_tagger_labels = ['Other', 'TRG_B', 'TRG_I']

# This notebook uses the target-tagger vocabulary.
labels = target_tagger_labels
num_labels = len(labels)

# Index <-> name lookups; ids follow the order of `labels`.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

print(label2id)
print(id2label)

{'Other': 0, 'TRG_B': 1, 'TRG_I': 2}
{0: 'Other', 1: 'TRG_B', 2: 'TRG_I'}


In [4]:
### paths and names

# Experiment identity.
PROJECT_NAME = 'target_tagging'
RUN_ID = 'uncleaned_v11'

# Dataset selection.
DATA_V = 'uncleaned_v11'
DATA_T = 'tagger'  # one of: ce, pc, pc_binary, tagger
AUGMENTATION = False
AUG_NAME = 'balanced'

model_checkpoint = 'monologg/koelectra-base-v3-discriminator'
notebook_name = 'target_tagger_trainer.ipynb'

### fixed (everything below is derived from the settings above)

# Checkpoint id made filesystem-friendly: '/' and '-' become '_', lower-cased.
model_name = re.sub(r'[/-]', r'_', model_checkpoint.lower())
run_name = f'{model_name}_{RUN_ID}'

ROOT_PATH = './'
SAVE_PATH = os.path.join(ROOT_PATH, 'training_results', run_name, 'target_tagger')
NOTEBOOK_PATH = os.path.join('./', notebook_name)

# Only the train file name carries the augmentation suffix.
if AUGMENTATION is True:
    augornot = f'_{AUG_NAME}'
else:
    augornot = ''

_dataset_dir = os.path.join(ROOT_PATH, 'dataset', DATA_V)
TRAIN_DATA_PATH = os.path.join(_dataset_dir, f'{DATA_T}_train{augornot}.json')
EVAL_DATA_PATH = os.path.join(_dataset_dir, f'{DATA_T}_dev.json')

# Create the results directory (and any missing parents) in-process rather
# than through a shell escape; a no-op when the directory already exists.
os.makedirs(SAVE_PATH, exist_ok=True)

In [5]:
# Report, one line per path, whether each configured location is present.
for path in (SAVE_PATH, NOTEBOOK_PATH, TRAIN_DATA_PATH, EVAL_DATA_PATH):
    status = 'exists' if os.path.exists(path) else 'does not exist'
    print(f'{path} {status}.')

./training_results/monologg_koelectra_base_v3_discriminator_uncleaned_v11/target_tagger exists.
./target_tagger_trainer.ipynb exists.
./dataset/uncleaned_v11/tagger_train.json exists.
./dataset/uncleaned_v11/tagger_dev.json exists.


In [6]:
### rest of training args

# Experiment tracking and numeric precision.
report_to = 'wandb'
fp16 = False

# Schedule length and batching (batch_size is passed as the per-device size).
num_train_epochs = 10
batch_size = 32
gradient_accumulation_steps = 1

# Optimizer settings.
optim = 'adamw_hf'  # alternative: 'adamw_torch'
# 3e-6 per 8 samples, scaled by the per-device batch size and the GPU count.
learning_rate = 3e-6 / 8 * batch_size * NGPU  # alternative: 5e-5
weight_decay = 0.01  # alternative: 0
adam_epsilon = 1e-8

# Learning-rate schedule: linear, without warmup.
lr_scheduler_type = 'linear'
warmup_ratio = 0

# Checkpointing and evaluation happen once per epoch.
save_strategy = 'epoch'
evaluation_strategy = 'epoch'
save_total_limit = 2
load_best_model_at_end = True
metric_for_best_model = 'f1_macro'

# Training-loss logging cadence.
logging_strategy = 'steps'
logging_first_step = True
logging_steps = 5

print(learning_rate)

4.8e-05


# WandB Configuration

In [7]:
# Configure Weights & Biases through environment variables, then log in.
# `{NAME}` is expanded by IPython from the notebook's Python variables.
%env WANDB_PROJECT={PROJECT_NAME}
%env WANDB_NOTEBOOK_NAME={NOTEBOOK_PATH}
# NOTE(review): presumably makes the Trainer's wandb integration upload the trained model - confirm.
%env WANDB_LOG_MODEL=true
# NOTE(review): presumably enables logging of gradients and parameters - confirm.
%env WANDB_WATCH=all
# Last expression of the cell, so its return value is displayed.
wandb.login()

env: WANDB_PROJECT=target_tagging
env: WANDB_NOTEBOOK_NAME=./target_tagger_trainer.ipynb
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


[34m[1mwandb[0m: Currently logged in as: [33mdotsnangles[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Load Model, Tokenizer, and Collator

In [8]:
# The tokenizer is loaded from the dataset-versioned directory, while the
# model weights come from `model_checkpoint`.
tokenizer = ElectraTokenizerFast.from_pretrained(f'dataset/{DATA_V}/tokenizer')
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

model = ElectraForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Match the embedding matrix to the tokenizer's vocabulary size; as the last
# expression of the cell, the resulting embedding module is displayed.
model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForTokenClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(35254, 768)

In [9]:
# train_path = f'./dataset/{DATA_V}/raw_train.csv'
# dev_path = f'./dataset/{DATA_V}/raw_dev.csv'
# test_path = f'./dataset/{DATA_V}/raw_test.csv'
# train = pd.read_csv(train_path)
# dev = pd.read_csv(dev_path)
# test = pd.read_csv(test_path)

# ### new
# entity_property_pair = [
#     '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
#     '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
#     '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
#     '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
# ]
# sentiments = ['positive', 'negative', 'neutral']
# target = ['Target']
# special_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']
# emojis = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True).to_frame()
# emojis = list(set(demoji.findall(' '.join(emojis.sentence_form.to_list())).keys()))
# ep_labels = pd.Series(entity_property_pair, name='sentence_form', copy=True)

# tokens2add = special_tokens + emojis
# # tokens2add = special_tokens + emojis + entity_property_pair + sentiments + target

# print(len(tokenizer))
# tokenizer_train_data = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True).to_frame().drop_duplicates()
# tokenizer_train_data = tokenizer_train_data.sentence_form.to_list()
# new_tokenizer = tokenizer.train_new_from_iterator(tokenizer_train_data, vocab_size=1)
# new_tokens = set(list(new_tokenizer.vocab.keys()) + tokens2add) - set(tokenizer.vocab.keys())
# tokenizer.add_tokens(list(new_tokens))
# print(len(new_tokenizer))
# print(len(tokenizer))
# model.resize_token_embeddings(len(tokenizer))

In [10]:
# print(len(new_tokens))
# print(new_tokens)

In [11]:
# Display the label mappings and the label count stored on the loaded model.
model.config.label2id, model.config.id2label, model.num_labels

({'Other': 0, 'TRG_B': 1, 'TRG_I': 2}, {0: 'Other', 1: 'TRG_B', 2: 'TRG_I'}, 3)

# Define Metric

In [12]:
# Load the `accuracy` and `f1` metrics through the `evaluate` library;
# compute_metrics calls `.compute(...)` on both objects.
accuracy_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

In [13]:
# label_list = [0,1,2]

In [14]:
def compute_metrics(eval_pred):
    """Compute token-level accuracy and F1 scores for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores and is reduced with ``argmax`` over axis 2;
            ``labels`` holds the reference class ids with the same leading
            two dimensions. Positions whose label is -100 are excluded from
            every metric.

    Returns:
        dict with the keys ``accuracy``, ``f1_other``, ``f1_trg_b``,
        ``f1_trg_i``, ``f1_macro`` and ``f1_micro``.
    """
    ignore_index = -100
    class_ids = [0, 1, 2]  # 'Other', 'TRG_B', 'TRG_I'

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # One boolean mask selects the scored positions for both arrays; the
    # selection is flattened row by row, i.e. sequence after sequence.
    valid = labels != ignore_index
    true_predictions = predictions[valid].tolist()
    true_labels = labels[valid].tolist()

    def _f1(**options):
        """Run the F1 metric on the filtered tokens with the given options."""
        return f1_metric.compute(
            references=true_labels, predictions=true_predictions, **options
        )['f1']

    accuracy = accuracy_metric.compute(
        references=true_labels, predictions=true_predictions
    )['accuracy']

    # average=None yields one score per requested class id, in that order.
    f1_other, f1_trg_b, f1_trg_i = _f1(average=None, labels=class_ids)
    f1_macro = _f1(average='macro')
    f1_micro = _f1(average='micro')

    return {
        'accuracy': accuracy,
        'f1_other': f1_other,
        'f1_trg_b': f1_trg_b,
        'f1_trg_i': f1_trg_i,
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
    }

# Load Data

In [15]:
def preprocess_function(examples):
    """Build the model inputs for one example from its pre-split tokens.

    The tokens in ``examples["input_tokens_list"]`` are mapped to vocabulary
    ids with the notebook-level ``tokenizer``. The segment ids are all 0 and
    the attention mask is all 1, each with one entry per token id.

    Returns:
        dict with ``input_ids``, ``token_type_ids`` and ``attention_mask``.
    """
    token_ids = tokenizer.convert_tokens_to_ids(examples["input_tokens_list"])
    length = len(token_ids)
    return {
        'input_ids': token_ids,
        'token_type_ids': [0] * length,
        'attention_mask': [1] * length,
    }

In [16]:
# Read the train and dev splits from their JSON files with pandas.
train_dataset = pd.read_json(TRAIN_DATA_PATH)
eval_dataset = pd.read_json(EVAL_DATA_PATH)

In [17]:
def _frame_to_columns(frame):
    """Return the three columns used downstream as a plain dict of lists.

    Args:
        frame: Table with ``input_tokens_list``, ``labels`` and
            ``sentence_form`` columns.

    Returns:
        dict mapping each of those column names to its values as a list.
    """
    return {
        'input_tokens_list': frame['input_tokens_list'].to_list(),
        'labels': frame['labels'].to_list(),
        'sentence_form': frame['sentence_form'].to_list(),
    }


# Both names are rebound from the loaded tables to column dicts, the form
# that datasets.Dataset.from_dict consumes in the next step.
train_dataset = _frame_to_columns(train_dataset)
eval_dataset = _frame_to_columns(eval_dataset)

In [18]:
# Wrap each column dict in a Hugging Face Dataset and add the model inputs
# one example at a time (batched=False). No explicit shuffle is applied here.
train_dataset = datasets.Dataset.from_dict(train_dataset).map(
    preprocess_function, batched=False
)
eval_dataset = datasets.Dataset.from_dict(eval_dataset).map(
    preprocess_function, batched=False
)

  0%|          | 0/4632 [00:00<?, ?ex/s]

 22%|██▏       | 1000/4632 [00:00<00:00, 8869.63ex/s]

 43%|████▎     | 2012/4632 [00:00<00:00, 9566.23ex/s]

 65%|██████▍   | 3000/4632 [00:00<00:00, 9562.16ex/s]

 86%|████████▋ | 4000/4632 [00:00<00:00, 9438.04ex/s]

100%|██████████| 4632/4632 [00:00<00:00, 9714.86ex/s]




  0%|          | 0/1159 [00:00<?, ?ex/s]

 86%|████████▋ | 1000/1159 [00:00<00:00, 8946.34ex/s]

100%|██████████| 1159/1159 [00:00<00:00, 9150.27ex/s]




In [19]:
# Display both datasets (their columns and row counts).
train_dataset, eval_dataset

(Dataset({
     features: ['input_tokens_list', 'labels', 'sentence_form', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 4632
 }),
 Dataset({
     features: ['input_tokens_list', 'labels', 'sentence_form', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 1159
 }))

In [20]:
# Number of training examples.
len(train_dataset)

4632

# Check if wordpiece error exists

In [21]:
### passed

# from tqdm import tqdm

# for k in tqdm(range(len(train_dataset))):
#     # k = random.randrange(len(train_dataset))
#     input_ids = train_dataset['input_ids'][k]
#     input_tokens_list = train_dataset['input_tokens_list'][k]
#     sentence_form = train_dataset['sentence_form'][k]
#     if len(input_ids) != len(tokenizer.convert_tokens_to_ids(input_tokens_list)) != len(tokenizer.encode(sentence_form)):
#         print(len(input_ids), len(tokenizer.convert_tokens_to_ids(input_tokens_list)), len(tokenizer.encode(sentence_form)))

# for k in tqdm(range(len(eval_dataset))):
#     # k = random.randrange(len(eval_dataset))
#     input_ids = eval_dataset['input_ids'][k]
#     input_tokens_list = eval_dataset['input_tokens_list'][k]
#     sentence_form = eval_dataset['sentence_form'][k]
#     if len(input_ids) != len(tokenizer.convert_tokens_to_ids(input_tokens_list)) != len(tokenizer.encode(sentence_form)):
#         print(len(input_ids), len(tokenizer.convert_tokens_to_ids(input_tokens_list)), len(tokenizer.encode(sentence_form)))

In [22]:
def _show_random_example(dataset):
    """Print one randomly chosen row of ``dataset`` for visual inspection.

    Printed, one item per line: the raw sentence, the stored token list, the
    stored input ids, the ids from re-encoding the sentence with the
    notebook-level ``tokenizer``, and the labels. The stored ids and the
    re-encoded ids are shown together so a mismatch is easy to spot.

    Everything stays local to this function, so the notebook-level `labels`
    list of label names is not overwritten.
    """
    # Index the row first so only one example is read, not whole columns.
    row = dataset[random.randrange(len(dataset))]
    print(
        row['sentence_form'],
        row['input_tokens_list'],
        row['input_ids'],
        tokenizer.encode(row['sentence_form']),
        row['labels'],
        sep='\n',
    )


_show_random_example(train_dataset)
print()
_show_random_example(eval_dataset)

Target 천연오일 사용해서그런지 보습력도 향기도 너무 좋은 아르비앙이에요~~ 💕
['[CLS]', 'T', '##ar', '##ge', '##t', '천연', '##오', '##일', '사용', '##해서', '##그', '##런', '##지', '보습', '##력', '##도', '향기', '##도', '너무', '좋', '##은', '아르', '##비', '##앙', '##이', '##에', '##요', '~', '~', '💕', '[SEP]']
[2, 56, 6515, 11569, 4019, 9840, 4127, 4366, 6267, 9448, 4441, 4814, 4200, 14902, 4361, 4086, 11848, 4086, 6395, 3311, 4112, 8123, 4164, 4493, 4007, 4073, 4150, 98, 98, 3950, 3]
[2, 56, 6515, 11569, 4019, 9840, 4127, 4366, 6267, 9448, 4441, 4814, 4200, 14902, 4361, 4086, 11848, 4086, 6395, 3311, 4112, 8123, 4164, 4493, 4007, 4073, 4150, 98, 98, 3950, 3]
[-100, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]

Target 요즘 피부 좋아졌다는말 마니 들었는데 비결은 바로 요고♡
['[CLS]', 'T', '##ar', '##ge', '##t', '요즘', '피부', '좋', '##아졌', '##다는', '##말', '마니', '들', '##었', '##는데', '비결', '##은', '바로', '요', '##고', '##♡', '[SEP]']
[2, 56, 6515, 11569, 4019, 7226, 7159, 3311, 28309, 6913, 4462, 20638, 2441, 4480, 18781, 12736, 

# Load Trainer

In [23]:
# Collect the hyperparameters defined earlier into the Trainer configuration.
args = TrainingArguments(
    # Run identity and experiment tracking.
    output_dir=run_name,
    run_name=run_name,
    report_to=report_to,
    # Schedule length, batching and precision.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    fp16=fp16,
    # Optimizer and learning-rate schedule.
    optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    adam_epsilon=adam_epsilon,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    # Evaluation, checkpointing and best-model selection.
    evaluation_strategy=evaluation_strategy,
    save_strategy=save_strategy,
    save_total_limit=save_total_limit,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,
    # Training-loss logging.
    logging_strategy=logging_strategy,
    logging_first_step=logging_first_step,
    logging_steps=logging_steps,
)

In [24]:
# es = EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)

In [25]:
# Assemble the Trainer from the model, configuration, data and metric hook.
# No extra callbacks (e.g. early stopping) are registered.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run Trainer

In [26]:
# Train, and close the Weights & Biases run whether or not training succeeds,
# so a crash or a manual interrupt does not leave the run open.
try:
    trainer.train()
except BaseException:
    # BaseException also covers KeyboardInterrupt (stopping the cell by hand).
    # A non-zero exit code marks the run as failed; the error is re-raised.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

The following columns in the training set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running training *****


  Num examples = 4632


  Num Epochs = 10


  Instantaneous batch size per device = 32


  Total train batch size (w. parallel, distributed & accumulation) = 128


  Gradient Accumulation steps = 1


  Total optimization steps = 370


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




Epoch,Training Loss,Validation Loss,Accuracy,F1 Other,F1 Trg B,F1 Trg I,F1 Macro,F1 Micro
1,0.266,0.241487,0.905567,0.942,0.735381,0.771554,0.816311,0.905567
2,0.1858,0.219019,0.917477,0.948721,0.792182,0.806952,0.849285,0.917477
3,0.1411,0.220074,0.920732,0.950936,0.794543,0.807499,0.850993,0.920732
4,0.122,0.247373,0.91855,0.949007,0.79587,0.814118,0.852998,0.91855
5,0.1069,0.254892,0.917699,0.948749,0.789779,0.809487,0.849339,0.917699
6,0.0686,0.276874,0.918735,0.949467,0.792407,0.807538,0.849804,0.918735
7,0.0613,0.28856,0.920289,0.950834,0.792176,0.805747,0.849586,0.920289
8,0.0501,0.291796,0.917921,0.949057,0.791122,0.807658,0.849279,0.917921
9,0.034,0.301426,0.919586,0.949942,0.796651,0.810763,0.852452,0.919586
10,0.0385,0.308159,0.919956,0.950383,0.795181,0.809155,0.851573,0.919956


The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 1159


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-37


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-37/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-37/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-37/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-37/special_tokens_map.json




The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 1159


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-74


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-74/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-74/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-74/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-74/special_tokens_map.json




The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 1159


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-111


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-111/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-111/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-111/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-111/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-37] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 1159


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-148


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-148/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-148/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-148/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-148/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-74] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 1159


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-185


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-185/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-185/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-185/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-185/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-111] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 1159


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-222


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-222/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-222/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-222/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-222/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-185] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 1159


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-259


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-259/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-259/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-259/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-259/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-222] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 1159


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-296


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-296/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-296/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-296/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-296/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-259] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 1159


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-333


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-333/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-333/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-333/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-333/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-296] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: input_tokens_list, sentence_form. If input_tokens_list, sentence_form are not expected by `ElectraForTokenClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 1159


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-370


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-370/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-370/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-370/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-370/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-333] due to args.save_total_limit




Training completed. Do not forget to share your model on huggingface.co/models =)




Loading best model from monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-148 (score: 0.8529981803349967).


Saving model checkpoint to /tmp/tmpxttnlua2


Configuration saved in /tmp/tmpxttnlua2/config.json


Model weights saved in /tmp/tmpxttnlua2/pytorch_model.bin


tokenizer config file saved in /tmp/tmpxttnlua2/tokenizer_config.json


Special tokens file saved in /tmp/tmpxttnlua2/special_tokens_map.json


0,1
eval/accuracy,▁▆█▇▇▇█▇▇█
eval/f1_macro,▁▇██▇▇▇▇██
eval/f1_micro,▁▆█▇▇▇█▇▇█
eval/f1_other,▁▆█▆▆▇█▇▇█
eval/f1_trg_b,▁▇██▇█▇▇██
eval/f1_trg_i,▁▇▇█▇▇▇▇▇▇
eval/loss,▃▁▁▃▄▆▆▇▇█
eval/runtime,█▃▅▁▁▆▅▇▄▂
eval/samples_per_second,▁▅▄██▃▄▂▅▇
eval/steps_per_second,▁▅▄██▃▄▂▅▇

0,1
eval/accuracy,0.91996
eval/f1_macro,0.85157
eval/f1_micro,0.91996
eval/f1_other,0.95038
eval/f1_trg_b,0.79518
eval/f1_trg_i,0.80916
eval/loss,0.30816
eval/runtime,5.6217
eval/samples_per_second,206.166
eval/steps_per_second,1.779


In [27]:
# Files to retain in every checkpoint directory; any other regular file in a
# checkpoint is deleted.
keep = [
    'added_tokens.json',
    'config.json',
    'pytorch_model.bin',
    'special_tokens_map.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt'
]

ckpts = os.listdir(run_name)
for ckpt in ckpts:
    ckpt = os.path.join(run_name, ckpt)
    # Skip stray files in the run directory: os.listdir would raise on them.
    if not os.path.isdir(ckpt):
        continue
    for item in os.listdir(ckpt):
        item_path = os.path.join(ckpt, item)
        # os.remove cannot delete directories, so only regular files are pruned.
        if item not in keep and os.path.isfile(item_path):
            os.remove(item_path)

# Move the wandb logs and the checkpoint directory into SAVE_PATH in-process;
# no shell is forked, which is what the tokenizers fork warning is about.
# The target is created first so shutil.move places each source inside it
# rather than renaming the source to SAVE_PATH.
os.makedirs(SAVE_PATH, exist_ok=True)
for _src in ('wandb', run_name):
    shutil.move(_src, SAVE_PATH)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
