# Description


# Modules and Global Variables

In [1]:
from transformers import (
    AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, 
    DefaultDataCollator, DataCollatorWithPadding, 
    TrainingArguments, Trainer,
)

from transformers.optimization import (
    AdamW, get_linear_schedule_with_warmup,
    Adafactor, AdafactorSchedule,
)

import torch
import wandb

import datasets
import evaluate

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd

import os
import re
import random

import demoji

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(f'torch.__version__: {torch.__version__}')
print(f'torch.cuda.is_available(): {torch.cuda.is_available()}')
NGPU = torch.cuda.device_count()
print(f'NGPU: {NGPU}')
# NGPU = torch.cuda.device_count()
# if NGPU > 1:
#     model = torch.nn.DataParallel(model, device_ids=list(range(NGPU)))

torch.__version__: 1.12.1
torch.cuda.is_available(): True
NGPU: 4


In [3]:
### labels

ce_labels = ['True', 'False']
pc_labels = ['positive', 'negative', 'neutral']
pc_binary_labels = ['True', 'False']

labels = ce_labels

label2id = {k: i for i, k in enumerate(labels)}
id2label = {i: k for i, k in enumerate(labels)}
num_labels = len(labels)

print(label2id)
print(id2label)

{'True': 0, 'False': 1}
{0: 'True', 1: 'False'}


In [4]:
### paths and names

PROJECT_NAME = 'aspect_category_detection'
RUN_ID = 'uncleaned_v4'

DATA_V = 'uncleaned_v4'
DATA_T = 'ce' # ce or pc or pc_binary
AUGMENTATION = False
AUG_NAME = 'balanced'

model_checkpoint = 'snunlp/KR-ELECTRA-discriminator'

notebook_name = 'acd_binary_trainer.ipynb'

### fixed

model_name = re.sub(r'[/-]', r'_', model_checkpoint).lower()
run_name = f'{model_name}_{RUN_ID}'

ROOT_PATH = './'
SAVE_PATH = os.path.join(ROOT_PATH, 'training_results', run_name, 'acd')
NOTEBOOK_PATH = os.path.join('./', notebook_name)

augornot = f'_{AUG_NAME}' if AUGMENTATION is True else ''
TRAIN_DATA_PATH = os.path.join(ROOT_PATH, 'dataset', DATA_V, f'{DATA_T}_train{augornot}.csv')
EVAL_DATA_PATH = os.path.join(ROOT_PATH, 'dataset', DATA_V, f'{DATA_T}_dev.csv')

!mkdir -p {SAVE_PATH}

In [5]:
if os.path.exists(SAVE_PATH):
    print(f'{SAVE_PATH} exists.')
else:
    print(f'{SAVE_PATH} does not exist.')
if os.path.exists(NOTEBOOK_PATH):
    print(f'{NOTEBOOK_PATH} exists.')
else:
    print(f'{NOTEBOOK_PATH} does not exist.')
if os.path.exists(TRAIN_DATA_PATH):
    print(f'{TRAIN_DATA_PATH} exists.')
else:
    print(f'{TRAIN_DATA_PATH} does not exist.')
if os.path.exists(EVAL_DATA_PATH):
    print(f'{EVAL_DATA_PATH} exists.')
else:
    print(f'{EVAL_DATA_PATH} does not exist.')

./training_results/snunlp_kr_electra_discriminator_uncleaned_v4/acd exists.
./acd_binary_trainer.ipynb exists.
./dataset/uncleaned_v4/ce_train.csv exists.
./dataset/uncleaned_v4/ce_dev.csv exists.


In [6]:
### rest of training args

report_to="wandb"

fp16 = False

num_train_epochs = 10
batch_size = 11
gradient_accumulation_steps = 1

optim = 'adamw_torch' # 'adamw_hf'

learning_rate = 3e-6 # 5e-5
weight_decay = 0.01 # 0
adam_epsilon = 1e-8

lr_scheduler_type = 'cosine'
warmup_ratio = 0

save_total_limit = 2

load_best_model_at_end = True
metric_for_best_model ='eval_loss'

save_strategy = "epoch"
evaluation_strategy = "epoch"

logging_strategy = "steps"
logging_first_step = True 
logging_steps = 500

# WandB Configuration

In [7]:
%env WANDB_PROJECT={PROJECT_NAME}
%env WANDB_NOTEBOOK_NAME={NOTEBOOK_PATH}
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
wandb.login()

env: WANDB_PROJECT=aspect_category_detection
env: WANDB_NOTEBOOK_NAME=./acd_binary_trainer.ipynb
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


[34m[1mwandb[0m: Currently logged in as: [33mdotsnangles[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Load Model, Tokenizer, and Collator

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, label2id=label2id, id2label=id2label, num_labels=num_labels
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at snunlp/KR-ELECTRA-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at snunlp/KR-ELECTRA-discriminator and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
train_path = f'./dataset/{DATA_V}/raw_train.csv'
dev_path = f'./dataset/{DATA_V}/raw_dev.csv'
test_path = f'./dataset/{DATA_V}/raw_test.csv'
train = pd.read_csv(train_path)
dev = pd.read_csv(dev_path)
test = pd.read_csv(test_path)

### new
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
]
special_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']
emojis = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True).to_frame()
emojis = list(set(demoji.findall(' '.join(emojis.sentence_form.to_list())).keys()))
ep_labels = pd.Series(entity_property_pair, name='sentence_form', copy=True)

tokens2add = special_tokens + emojis

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(len(tokenizer))
tokenizer_train_data = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True).to_frame().drop_duplicates()
tokenizer_train_data = tokenizer_train_data.sentence_form.to_list()
new_tokenizer = tokenizer.train_new_from_iterator(tokenizer_train_data, vocab_size=1)
new_tokens = set(list(new_tokenizer.vocab.keys()) + tokens2add) - set(tokenizer.vocab.keys())
tokenizer.add_tokens(list(new_tokens))
print(len(new_tokenizer))
print(len(tokenizer))
model.resize_token_embeddings(len(tokenizer))

30000





3018
30111


Embedding(30111, 768)

In [10]:
print(len(new_tokens))
print(new_tokens)

111
{'💆', '⁉️', '👦🏼', '##ᴜ', '◍', '😺', '😯', '&name&', '〰️', '&affiliation&', '㉦', '🐄', 'ʜ', '##ɢ', 'ᵕ', '☝️', 'ᴠ', 'ʀ', '##죱', 'ɢ', '챳', '##ˇ', '##ᴛ', 'ᴍ', '##👠', '❤️', '죱', '👉🏻', '🚗', '&tel-num&', '##ꈍ', '&bank-account&', 'ɪ', '🤘🏻', '##ᴡ', '💄', '👩\u200d👦', '##쨕', '⏰', 'ˇ', '🥤', '##ᵕ', '&card-num&', '##ɪ', '👋🏻', '##💇', '🙏🏻', 'ᴡ', '🏃\u200d♀️', '✔️', '##쫜', '&social-security-num&', '🙆\u200d♂️', '♥️', '쓩', '🙌🏻', 'ɴ', '💆🏻\u200d♀️', '##㉦', '🙋🏻\u200d♀️', '💆\u200d♀️', '☝🏻', 'ᴜ', 'ꈍ', '##◍', '☺️', '##ʀ', '🤡', '##🤡', 'ᴛ', '👨\u200d👧', '💪🏻', '##💆', '👌🏻', '쫜', '##➕', '🙋🏻', '쨕', '&online-account&', '‼️', '✌🏻', 'ᴘ', '##💄', '##뜌', '💡', '💇', '##읒', '🍼', '🙆🏻', '&num&', '##ᴍ', '➕', '읒', '💇🏼\u200d♀️', '뿤', '🧚\u200d♀️', '##🚗', '👠', '🙋\u200d♀️', '🍷', '🕺', '✌️', '👏🏻', '❣️', 'ғ', '💬', '##ᴘ', '##ɴ', '##ᴠ', '##🥤', '뜌'}


In [11]:
model.config.label2id, model.config.id2label, model.num_labels

({'True': 0, 'False': 1}, {0: 'True', 1: 'False'}, 2)

In [12]:
# entity_property_pair = [
#     '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
#     '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
#     '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
#     '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
# ]
# polarity_id_to_name = ['positive', 'negative', 'neutral']
# tokenizer_tester = []
# for pair in entity_property_pair:
#     for polarity in polarity_id_to_name:
#         tokenizer_tester.append('#'.join([pair, polarity]))
# for e in tokenizer_tester:
#     print(tokenizer.decode(tokenizer.encode(e)))
# for e in tokenizer_tester:
#     print(tokenizer.encode(e))

# Define Metric

In [13]:
accuracy_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

In [14]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_metric.compute(references=labels, predictions=predictions)['accuracy']
    f1_true, f1_false = tuple(f1_metric.compute(references=labels, predictions=predictions, average=None, labels=[0,1])['f1'])
    f1_macro = f1_metric.compute(references=labels, predictions=predictions, average='macro')['f1']
    f1_micro = f1_metric.compute(references=labels, predictions=predictions, average='micro')['f1']
    
    return {'accuracy': accuracy, 'f1_true': f1_true, 'f1_false': f1_false, 'f1_macro': f1_macro, 'f1_micro': f1_micro}

# Load Data

In [15]:
def preprocess_function(examples):
    return tokenizer(examples["form"], examples["pair"], truncation=True)

In [16]:
train_dataset = pd.read_csv(TRAIN_DATA_PATH)
eval_dataset = pd.read_csv(EVAL_DATA_PATH)
# train_dataset = pd.concat([train_dataset, eval_dataset])
train_dataset = datasets.Dataset.from_pandas(train_dataset) #.shuffle(seed=42)
eval_dataset = datasets.Dataset.from_pandas(eval_dataset) #.shuffle(seed=42)
train_dataset = train_dataset.map(preprocess_function, batched=False)
eval_dataset = eval_dataset.map(preprocess_function, batched=False)

  0%|          | 0/29876 [00:00<?, ?ex/s]

  2%|▏         | 495/29876 [00:00<00:05, 4948.11ex/s]

  3%|▎         | 990/29876 [00:00<00:10, 2730.54ex/s]

  5%|▍         | 1347/29876 [00:00<00:09, 2995.81ex/s]

  6%|▌         | 1850/29876 [00:00<00:07, 3629.08ex/s]

  8%|▊         | 2262/29876 [00:00<00:07, 3780.08ex/s]

  9%|▉         | 2750/29876 [00:00<00:06, 4115.45ex/s]

 11%|█         | 3234/29876 [00:00<00:06, 4335.33ex/s]

 13%|█▎        | 3837/29876 [00:00<00:05, 4845.97ex/s]

 15%|█▍        | 4336/29876 [00:01<00:05, 4822.73ex/s]

 16%|█▌        | 4853/29876 [00:01<00:05, 4923.99ex/s]

 18%|█▊        | 5353/29876 [00:01<00:05, 4889.45ex/s]

 20%|█▉        | 5868/29876 [00:01<00:04, 4964.12ex/s]

 21%|██▏       | 6369/29876 [00:01<00:04, 4876.03ex/s]

 23%|██▎       | 6905/29876 [00:01<00:04, 5015.87ex/s]

 25%|██▍       | 7413/29876 [00:01<00:04, 5033.03ex/s]

 27%|██▋       | 7991/29876 [00:01<00:04, 5251.28ex/s]

 29%|██▊       | 8518/29876 [00:01<00:04, 5053.56ex/s]

 30%|███       | 9026/29876 [00:01<00:04, 5009.83ex/s]

 32%|███▏      | 9576/29876 [00:02<00:03, 5151.08ex/s]

 34%|███▍      | 10093/29876 [00:02<00:03, 5064.42ex/s]

 36%|███▌      | 10647/29876 [00:02<00:03, 5200.79ex/s]

 37%|███▋      | 11180/29876 [00:02<00:03, 5238.24ex/s]

 39%|███▉      | 11741/29876 [00:02<00:03, 5346.52ex/s]

 41%|████      | 12277/29876 [00:02<00:03, 5242.26ex/s]

 43%|████▎     | 12803/29876 [00:02<00:03, 5242.38ex/s]

 45%|████▍     | 13328/29876 [00:02<00:03, 5013.96ex/s]

 47%|████▋     | 13901/29876 [00:02<00:03, 5218.73ex/s]

 48%|████▊     | 14426/29876 [00:03<00:02, 5193.25ex/s]

 50%|█████     | 15000/29876 [00:03<00:02, 5179.94ex/s]

 52%|█████▏    | 15520/29876 [00:03<00:02, 5169.03ex/s]

 54%|█████▎    | 16038/29876 [00:03<00:02, 5152.71ex/s]

 56%|█████▌    | 16637/29876 [00:03<00:02, 5395.34ex/s]

 57%|█████▋    | 17178/29876 [00:03<00:02, 5357.41ex/s]

 59%|█████▉    | 17715/29876 [00:03<00:02, 5333.89ex/s]

 61%|██████    | 18249/29876 [00:03<00:02, 5322.90ex/s]

 63%|██████▎   | 18827/29876 [00:03<00:02, 5457.56ex/s]

 65%|██████▍   | 19374/29876 [00:03<00:01, 5361.86ex/s]

 67%|██████▋   | 19944/29876 [00:04<00:01, 5459.44ex/s]

 69%|██████▊   | 20497/29876 [00:04<00:01, 5478.84ex/s]

 70%|███████   | 21046/29876 [00:04<00:01, 5326.72ex/s]

 72%|███████▏  | 21627/29876 [00:04<00:01, 5466.84ex/s]

 74%|███████▍  | 22175/29876 [00:04<00:01, 5443.87ex/s]

 76%|███████▌  | 22747/29876 [00:04<00:01, 5523.27ex/s]

 78%|███████▊  | 23301/29876 [00:04<00:01, 5520.41ex/s]

 80%|███████▉  | 23865/29876 [00:04<00:01, 5554.59ex/s]

 82%|████████▏ | 24421/29876 [00:04<00:01, 5214.50ex/s]

 84%|████████▎ | 24948/29876 [00:04<00:00, 5228.38ex/s]

 85%|████████▌ | 25474/29876 [00:05<00:00, 4922.66ex/s]

 87%|████████▋ | 26000/29876 [00:05<00:00, 4877.11ex/s]

 89%|████████▉ | 26596/29876 [00:05<00:00, 5177.26ex/s]

 91%|█████████ | 27119/29876 [00:05<00:00, 5098.37ex/s]

 93%|█████████▎| 27686/29876 [00:05<00:00, 5259.50ex/s]

 94%|█████████▍| 28215/29876 [00:05<00:00, 5228.75ex/s]

 96%|█████████▋| 28786/29876 [00:05<00:00, 5368.78ex/s]

 98%|█████████▊| 29325/29876 [00:05<00:00, 5082.08ex/s]

100%|██████████| 29876/29876 [00:05<00:00, 5044.39ex/s]




  0%|          | 0/28897 [00:00<?, ?ex/s]

  2%|▏         | 571/28897 [00:00<00:04, 5708.37ex/s]

  4%|▍         | 1142/28897 [00:00<00:05, 5206.80ex/s]

  6%|▌         | 1699/28897 [00:00<00:05, 5360.09ex/s]

  8%|▊         | 2238/28897 [00:00<00:05, 5107.05ex/s]

 10%|▉         | 2802/28897 [00:00<00:04, 5287.60ex/s]

 12%|█▏        | 3334/28897 [00:00<00:04, 5187.46ex/s]

 14%|█▎        | 3940/28897 [00:00<00:04, 5459.95ex/s]

 16%|█▌        | 4489/28897 [00:00<00:04, 5182.52ex/s]

 17%|█▋        | 5012/28897 [00:00<00:04, 4989.73ex/s]

 19%|█▉        | 5521/28897 [00:01<00:04, 5017.29ex/s]

 21%|██        | 6026/28897 [00:01<00:04, 4850.53ex/s]

 23%|██▎       | 6564/28897 [00:01<00:04, 5001.58ex/s]

 24%|██▍       | 7067/28897 [00:01<00:04, 5000.12ex/s]

 26%|██▋       | 7621/28897 [00:01<00:04, 5156.31ex/s]

 28%|██▊       | 8139/28897 [00:01<00:04, 5085.98ex/s]

 30%|███       | 8701/28897 [00:01<00:03, 5241.97ex/s]

 32%|███▏      | 9227/28897 [00:01<00:03, 5190.81ex/s]

 34%|███▍      | 9796/28897 [00:01<00:03, 5337.09ex/s]

 36%|███▌      | 10331/28897 [00:01<00:03, 5226.01ex/s]

 38%|███▊      | 10859/28897 [00:02<00:03, 5241.58ex/s]

 39%|███▉      | 11384/28897 [00:02<00:03, 5049.91ex/s]

 41%|████      | 11891/28897 [00:02<00:03, 5055.24ex/s]

 43%|████▎     | 12398/28897 [00:02<00:03, 4858.53ex/s]

 45%|████▍     | 12975/28897 [00:02<00:03, 5116.91ex/s]

 47%|████▋     | 13490/28897 [00:02<00:03, 4976.07ex/s]

 48%|████▊     | 14000/28897 [00:02<00:03, 4923.94ex/s]

 50%|█████     | 14535/28897 [00:02<00:02, 5046.11ex/s]

 52%|█████▏    | 15042/28897 [00:02<00:02, 5052.03ex/s]

 54%|█████▍    | 15618/28897 [00:03<00:02, 5257.67ex/s]

 56%|█████▌    | 16165/28897 [00:03<00:02, 5318.27ex/s]

 58%|█████▊    | 16737/28897 [00:03<00:02, 5435.81ex/s]

 60%|█████▉    | 17282/28897 [00:03<00:02, 5120.76ex/s]

 62%|██████▏   | 17875/28897 [00:03<00:02, 5349.99ex/s]

 64%|██████▎   | 18415/28897 [00:03<00:01, 5281.78ex/s]

 66%|██████▌   | 18991/28897 [00:03<00:01, 5419.06ex/s]

 68%|██████▊   | 19536/28897 [00:03<00:01, 5297.28ex/s]

 69%|██████▉   | 20068/28897 [00:03<00:01, 5244.48ex/s]

 71%|███████▏  | 20638/28897 [00:03<00:01, 5374.65ex/s]

 73%|███████▎  | 21191/28897 [00:04<00:01, 5419.24ex/s]

 75%|███████▌  | 21765/28897 [00:04<00:01, 5513.28ex/s]

 77%|███████▋  | 22318/28897 [00:04<00:01, 5364.74ex/s]

 79%|███████▉  | 22916/28897 [00:04<00:01, 5542.65ex/s]

 81%|████████  | 23472/28897 [00:04<00:01, 5375.46ex/s]

 83%|████████▎ | 24012/28897 [00:04<00:00, 5259.41ex/s]

 85%|████████▍ | 24542/28897 [00:04<00:00, 5268.19ex/s]

 87%|████████▋ | 25070/28897 [00:04<00:00, 5091.00ex/s]

 89%|████████▊ | 25641/28897 [00:04<00:00, 5267.11ex/s]

 91%|█████████ | 26170/28897 [00:05<00:00, 5166.10ex/s]

 92%|█████████▏| 26701/28897 [00:05<00:00, 5206.08ex/s]

 94%|█████████▍| 27223/28897 [00:05<00:00, 5036.07ex/s]

 96%|█████████▌| 27735/28897 [00:05<00:00, 5058.52ex/s]

 98%|█████████▊| 28243/28897 [00:05<00:00, 4986.01ex/s]

100%|█████████▉| 28825/28897 [00:05<00:00, 5226.93ex/s]

100%|██████████| 28897/28897 [00:05<00:00, 5195.53ex/s]




In [17]:
len(train_dataset), len(eval_dataset)

(29876, 28897)

In [18]:
k = random.randrange(len(train_dataset))
print(tokenizer.decode(train_dataset['input_ids'][k]), train_dataset['labels'][k])
k = random.randrange(len(eval_dataset))
print(tokenizer.decode(eval_dataset['input_ids'][k]), eval_dataset['labels'][k])

[CLS] 유해물질을 전혀 사용하지 않았으니 수시로 발라도 안심이고요. [SEP] 제품 전체 # 편의성 [SEP] 1


[CLS] 안감 겉감 모두 소프트원단까지 연약하고 민감한 아기 엉덩이가 이제는 편안해지는 날이온것같아요.. [SEP] 제품 전체 # 인지도 [SEP] 1


# Load Trainer

In [19]:
args = TrainingArguments(
    output_dir=run_name,
    run_name=run_name,
    report_to=report_to,

    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,

    optim=optim,

    learning_rate=learning_rate,
    weight_decay=weight_decay,
    adam_epsilon=adam_epsilon,

    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,

    save_total_limit=save_total_limit,

    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,
    
    save_strategy=save_strategy,
    evaluation_strategy=evaluation_strategy,

    logging_strategy=logging_strategy,
    logging_first_step=logging_first_step, 
    logging_steps=logging_steps,
    
    fp16=fp16,
)

In [20]:
# es = EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)

In [21]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # callbacks=[es],
)

# Run Trainer

In [22]:
trainer.train()
wandb.finish()

The following columns in the training set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running training *****


  Num examples = 29876


  Num Epochs = 10


  Instantaneous batch size per device = 11


  Total train batch size (w. parallel, distributed & accumulation) = 44


  Gradient Accumulation steps = 1


  Total optimization steps = 6790


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




Epoch,Training Loss,Validation Loss,Accuracy,F1 True,F1 False,F1 Macro,F1 Micro
1,0.2952,0.20423,0.920372,0.589181,0.955914,0.772547,0.920372
2,0.2043,0.168721,0.939336,0.64778,0.96681,0.807295,0.939336
3,0.1561,0.151246,0.942728,0.69672,0.968378,0.832549,0.942728
4,0.1444,0.159569,0.945877,0.697485,0.97028,0.833883,0.945877
5,0.135,0.159705,0.946915,0.708808,0.970795,0.839802,0.946915
6,0.1223,0.154606,0.945219,0.718878,0.969653,0.844265,0.945219
7,0.113,0.154713,0.947469,0.723699,0.970975,0.847337,0.947469
8,0.118,0.15893,0.948541,0.720751,0.971659,0.846205,0.948541
9,0.1121,0.155811,0.948057,0.72494,0.97132,0.84813,0.948057
10,0.1091,0.155149,0.947538,0.723459,0.97102,0.847239,0.947538


The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 28897


  Batch size = 44


Saving model checkpoint to snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-679


Configuration saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-679/config.json


Model weights saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-679/pytorch_model.bin


tokenizer config file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-679/tokenizer_config.json


Special tokens file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-679/special_tokens_map.json




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 28897


  Batch size = 44


Saving model checkpoint to snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-1358


Configuration saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-1358/config.json


Model weights saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-1358/pytorch_model.bin


tokenizer config file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-1358/tokenizer_config.json


Special tokens file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-1358/special_tokens_map.json




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 28897


  Batch size = 44


Saving model checkpoint to snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2037


Configuration saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2037/config.json


Model weights saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2037/pytorch_model.bin


tokenizer config file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2037/tokenizer_config.json


Special tokens file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2037/special_tokens_map.json


Deleting older checkpoint [snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-679] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 28897


  Batch size = 44


Saving model checkpoint to snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2716


Configuration saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2716/config.json


Model weights saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2716/pytorch_model.bin


tokenizer config file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2716/tokenizer_config.json


Special tokens file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2716/special_tokens_map.json


Deleting older checkpoint [snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-1358] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 28897


  Batch size = 44


Saving model checkpoint to snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-3395


Configuration saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-3395/config.json


Model weights saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-3395/pytorch_model.bin


tokenizer config file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-3395/tokenizer_config.json


Special tokens file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-3395/special_tokens_map.json


Deleting older checkpoint [snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2716] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 28897


  Batch size = 44


Saving model checkpoint to snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4074


Configuration saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4074/config.json


Model weights saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4074/pytorch_model.bin


tokenizer config file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4074/tokenizer_config.json


Special tokens file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4074/special_tokens_map.json


Deleting older checkpoint [snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-3395] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 28897


  Batch size = 44


Saving model checkpoint to snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4753


Configuration saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4753/config.json


Model weights saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4753/pytorch_model.bin


tokenizer config file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4753/tokenizer_config.json


Special tokens file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4753/special_tokens_map.json


Deleting older checkpoint [snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4074] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 28897


  Batch size = 44


Saving model checkpoint to snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-5432


Configuration saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-5432/config.json


Model weights saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-5432/pytorch_model.bin


tokenizer config file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-5432/tokenizer_config.json


Special tokens file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-5432/special_tokens_map.json


Deleting older checkpoint [snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-4753] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 28897


  Batch size = 44


Saving model checkpoint to snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6111


Configuration saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6111/config.json


Model weights saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6111/pytorch_model.bin


tokenizer config file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6111/tokenizer_config.json


Special tokens file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6111/special_tokens_map.json


Deleting older checkpoint [snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-5432] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: form, id, pair. If form, id, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 28897


  Batch size = 44


Saving model checkpoint to snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6790


Configuration saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6790/config.json


Model weights saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6790/pytorch_model.bin


tokenizer config file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6790/tokenizer_config.json


Special tokens file saved in snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6790/special_tokens_map.json


Deleting older checkpoint [snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-6111] due to args.save_total_limit




Training completed. Do not forget to share your model on huggingface.co/models =)




Loading best model from snunlp_kr_electra_discriminator_uncleaned_v4/checkpoint-2037 (score: 0.15124636888504028).


Saving model checkpoint to /tmp/tmpc4b379jz


Configuration saved in /tmp/tmpc4b379jz/config.json


Model weights saved in /tmp/tmpc4b379jz/pytorch_model.bin


tokenizer config file saved in /tmp/tmpc4b379jz/tokenizer_config.json


Special tokens file saved in /tmp/tmpc4b379jz/special_tokens_map.json


0,1
eval/accuracy,▁▆▇▇█▇████
eval/f1_false,▁▆▇▇█▇████
eval/f1_macro,▁▄▇▇▇█████
eval/f1_micro,▁▆▇▇█▇████
eval/f1_true,▁▄▇▇▇█████
eval/loss,█▃▁▂▂▁▁▂▂▂
eval/runtime,▃▁▅▂▆█▂▄▆▄
eval/samples_per_second,▆█▄▇▃▁▇▅▃▅
eval/steps_per_second,▆█▄▇▃▁▇▅▃▅
train/epoch,▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇███

0,1
eval/accuracy,0.94754
eval/f1_false,0.97102
eval/f1_macro,0.84724
eval/f1_micro,0.94754
eval/f1_true,0.72346
eval/loss,0.15515
eval/runtime,208.0908
eval/samples_per_second,138.867
eval/steps_per_second,3.157
train/epoch,10.0


In [23]:
keep = [
    'added_tokens.json',
    'config.json',
    'pytorch_model.bin',
    'special_tokens_map.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt'
]

ckpts = os.listdir(run_name)
for ckpt in ckpts:
    ckpt = os.path.join(run_name, ckpt)
    for item in os.listdir(ckpt):
        if item not in keep:
            os.remove(os.path.join(ckpt, item))

!mv wandb {run_name} {SAVE_PATH}/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
