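# Description

Fine-tunes `monologg/koelectra-base-v3-discriminator` as a binary (`True`/`False`) sequence classifier for the `aspect_sentiment_classification_binary` project using the Hugging Face `Trainer`, with training metrics reported to Weights & Biases.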

# Modules and Global Variables

In [1]:
from transformers import (
    AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, 
    ElectraTokenizer, ElectraForSequenceClassification, 
    DefaultDataCollator, DataCollatorWithPadding, 
    TrainingArguments, Trainer,
)

from transformers.optimization import (
    AdamW, get_linear_schedule_with_warmup,
    Adafactor, AdafactorSchedule,
)

import torch
import wandb

import datasets
import evaluate

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd

import os
import re
import random

import demoji

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the PyTorch build and the CUDA devices visible to this kernel.
NGPU = torch.cuda.device_count()  # reused later when scaling the learning rate
for _report_line in (
    f'torch.__version__: {torch.__version__}',
    f'torch.cuda.is_available(): {torch.cuda.is_available()}',
    f'NGPU: {NGPU}',
):
    print(_report_line)

torch.__version__: 1.12.1
torch.cuda.is_available(): True
NGPU: 4


In [3]:
### labels

# Candidate label vocabularies; exactly one of them is selected as `labels` below.
ce_labels = ['True', 'False']
pc_labels = ['positive', 'negative', 'neutral']
pc_binary_labels = ['True', 'False']

# Label set used for this run.
labels = pc_binary_labels

# Class ids follow the position of each label in `labels`.
num_labels = len(labels)
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

print(label2id)
print(id2label)

{'True': 0, 'False': 1}
{0: 'True', 1: 'False'}


In [4]:
### paths and names

# Identifiers used for experiment tracking and for naming output directories.
PROJECT_NAME = 'aspect_sentiment_classification_binary'
RUN_ID = 'uncleaned_v7'

# Dataset selection: version directory and task prefix of the CSV files.
DATA_V = 'uncleaned_v7'
DATA_T = 'pc_binary' # ce or pc
AUGMENTATION = False
AUG_NAME = 'aug'

model_checkpoint = 'monologg/koelectra-base-v3-discriminator'

notebook_name = 'asc_binary_trainer.ipynb'

### fixed

# Filesystem-friendly model name: '/' and '-' become '_', everything lower-cased.
model_name = re.sub(r'[/-]', r'_', model_checkpoint).lower()
run_name = f'{model_name}_{RUN_ID}'

ROOT_PATH = './'
SAVE_PATH = os.path.join(ROOT_PATH, 'training_results', run_name, 'asc')
NOTEBOOK_PATH = os.path.join('./', notebook_name)

# The training file gets an '_<AUG_NAME>' suffix only when AUGMENTATION is exactly True.
augornot = f'_{AUG_NAME}' if AUGMENTATION is True else ''
TRAIN_DATA_PATH = os.path.join(ROOT_PATH, 'dataset', DATA_V, f'{DATA_T}_train{augornot}.csv')
EVAL_DATA_PATH = os.path.join(ROOT_PATH, 'dataset', DATA_V, f'{DATA_T}_dev.csv')

# Pure-Python equivalent of `mkdir -p`: creates missing parents, is a no-op when the
# directory already exists, and does not depend on a shell or on path quoting.
os.makedirs(SAVE_PATH, exist_ok=True)

In [5]:
# Report, one line per path, whether each configured location is present on disk.
for checked_path in (SAVE_PATH, NOTEBOOK_PATH, TRAIN_DATA_PATH, EVAL_DATA_PATH):
    status = 'exists' if os.path.exists(checked_path) else 'does not exist'
    print(f'{checked_path} {status}.')

./training_results/monologg_koelectra_base_v3_discriminator_uncleaned_v7/asc exists.
./asc_binary_trainer.ipynb exists.
./dataset/uncleaned_v7/pc_binary_train.csv exists.
./dataset/uncleaned_v7/pc_binary_dev.csv exists.


In [6]:
### rest of training args

# Experiment tracker the Trainer reports to.
report_to="wandb"

fp16 = False

num_train_epochs = 10
batch_size = 32  # per device; passed as per_device_{train,eval}_batch_size
gradient_accumulation_steps = 1

optim = 'adamw_hf' # 'adamw_torch'

# Base rate of 3e-6 per 8 examples, scaled by the per-device batch size and the device count.
# max(NGPU, 1) keeps the rate non-zero on CPU-only machines, where the CUDA device count is 0
# and the unguarded product would silently disable learning.
learning_rate = 3e-6 / 8 * batch_size * max(NGPU, 1) # 5e-5
weight_decay = 0.01 # 0
adam_epsilon = 1e-8

lr_scheduler_type = 'linear'
warmup_ratio = 0

save_total_limit = 2

# Checkpoint selection: reload the best checkpoint at the end, ranked by evaluation loss.
load_best_model_at_end = True
metric_for_best_model ='eval_loss'

save_strategy = "epoch"
evaluation_strategy = "epoch"

logging_strategy = "steps"
logging_first_step = True
logging_steps = 50

print(learning_rate)

4.8e-05


# WandB Configuration

In [7]:
# Configure Weights & Biases through environment variables, then authenticate.
# Project name and notebook path come from the constants defined in the paths cell.
%env WANDB_PROJECT={PROJECT_NAME}
%env WANDB_NOTEBOOK_NAME={NOTEBOOK_PATH}
# NOTE(review): WANDB_LOG_MODEL / WANDB_WATCH are presumably read by the transformers
# W&B callback (model upload and gradient/parameter logging) - confirm against its docs.
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
wandb.login()

env: WANDB_PROJECT=aspect_sentiment_classification_binary
env: WANDB_NOTEBOOK_NAME=./asc_binary_trainer.ipynb
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


[34m[1mwandb[0m: Currently logged in as: [33mdotsnangles[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Load Model, Tokenizer, and Collator

In [8]:
# Tokenizer for the pretrained checkpoint.
tokenizer = ElectraTokenizer.from_pretrained(model_checkpoint)

# Sequence classifier whose config carries the label maps built earlier.
classifier_kwargs = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)
model = ElectraForSequenceClassification.from_pretrained(model_checkpoint, **classifier_kwargs)

# Batch collator that pads with the tokenizer loaded above.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Load the raw train/dev/test CSVs for the selected dataset version.
# Paths are built from ROOT_PATH like the other data paths in this notebook, so that
# changing the root relocates every input consistently.
raw_data_dir = os.path.join(ROOT_PATH, 'dataset', DATA_V)
train_path = os.path.join(raw_data_dir, 'raw_train.csv')
dev_path = os.path.join(raw_data_dir, 'raw_dev.csv')
test_path = os.path.join(raw_data_dir, 'raw_test.csv')
train = pd.read_csv(train_path)
dev = pd.read_csv(dev_path)
test = pd.read_csv(test_path)

### new
entity_property_pair = [
    'Î≥∏Ìíà#Í∞ÄÍ≤©', 'Î≥∏Ìíà#Îã§ÏñëÏÑ±', 'Î≥∏Ìíà#ÎîîÏûêÏù∏', 'Î≥∏Ìíà#Ïù∏ÏßÄÎèÑ', 'Î≥∏Ìíà#ÏùºÎ∞ò', 'Î≥∏Ìíà#Ìé∏ÏùòÏÑ±', 'Î≥∏Ìíà#ÌíàÏßà',
    'Î∏åÎûúÎìú#Í∞ÄÍ≤©', 'Î∏åÎûúÎìú#ÎîîÏûêÏù∏', 'Î∏åÎûúÎìú#Ïù∏ÏßÄÎèÑ', 'Î∏åÎûúÎìú#ÏùºÎ∞ò', 'Î∏åÎûúÎìú#ÌíàÏßà',
    'Ï†úÌíà Ï†ÑÏ≤¥#Í∞ÄÍ≤©', 'Ï†úÌíà Ï†ÑÏ≤¥#Îã§ÏñëÏÑ±', 'Ï†úÌíà Ï†ÑÏ≤¥#ÎîîÏûêÏù∏', 'Ï†úÌíà Ï†ÑÏ≤¥#Ïù∏ÏßÄÎèÑ', 'Ï†úÌíà Ï†ÑÏ≤¥#ÏùºÎ∞ò', 'Ï†úÌíà Ï†ÑÏ≤¥#Ìé∏ÏùòÏÑ±', 'Ï†úÌíà Ï†ÑÏ≤¥#ÌíàÏßà',
    'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Í∞ÄÍ≤©', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Îã§ÏñëÏÑ±', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÎîîÏûêÏù∏', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÏùºÎ∞ò', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Ìé∏ÏùòÏÑ±', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÌíàÏßà'
]
# Placeholder markers that should be kept as single tokens.
special_tokens = [
    '&name&',
    '&affiliation&',
    '&social-security-num&',
    '&tel-num&',
    '&card-num&',
    '&bank-account&',
    '&num&',
    '&online-account&',
]

# Every distinct emoji that demoji finds across the sentences of all three raw splits.
raw_sentences = pd.concat(
    [train.sentence_form, dev.sentence_form, test.sentence_form],
    ignore_index=True,
    verify_integrity=True,
)
emoji_hits = demoji.findall(' '.join(raw_sentences.to_list()))
emojis = list(set(emoji_hits.keys()))

ep_labels = pd.Series(entity_property_pair, name='sentence_form', copy=True)

# Tokens to add to the vocabulary on top of whatever the tokenizer already knows.
tokens2add = special_tokens + emojis

# Reload the tokenizer through AutoTokenizer; `train_new_from_iterator` is called on it below.
# NOTE(review): this rebinds `tokenizer`, while `data_collator` from the model-loading cell
# still holds the previously created tokenizer object - confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(len(tokenizer))

# Unique sentences from all raw splits, used as the corpus for the throw-away tokenizer.
tokenizer_train_data = (
    pd.concat(
        [train.sentence_form, dev.sentence_form, test.sentence_form],
        ignore_index=True,
        verify_integrity=True,
    )
    .to_frame()
    .drop_duplicates()
    .sentence_form.to_list()
)

# NOTE(review): vocab_size=1 presumably limits the new vocabulary to the characters and
# specials seen in the corpus (3060 entries in the recorded run) - confirm.
new_tokenizer = tokenizer.train_new_from_iterator(tokenizer_train_data, vocab_size=1)

# Tokens seen in the corpus (plus the hand-picked ones) that the base vocabulary lacks.
new_tokens = set(list(new_tokenizer.vocab.keys()) + tokens2add) - set(tokenizer.vocab.keys())

# Add in sorted order: iterating a set of strings depends on per-process hash randomization,
# so an unsorted add would give the new tokens different ids on every kernel restart.
tokenizer.add_tokens(sorted(new_tokens))
print(len(new_tokenizer))
print(len(tokenizer))

# Grow the embedding matrix to cover the added tokens; left last so the result is displayed.
model.resize_token_embeddings(len(tokenizer))

35000





3060
35254


Embedding(35254, 768)

In [10]:
# Show how many tokens were added, then the tokens themselves.
print(len(new_tokens), new_tokens, sep='\n')

254
{'##üëÜ', 'üíù', 'üï∏', '&affiliation&', '##·¥ò', 'Ï∏å', '&tel-num&', '##üï∏', '##ÀÉ', 'ü§òüèª', 'Îúå', 'üëå', '##üçé', 'üíÜ\u200d‚ôÄÔ∏è', 'üòØ', 'üéÄ', '·¥õ', 'Ïûç', 'Í∑Ø', 'Îò≠', '##Ïµù', '‚úåüèª', 'üòÆ', '##üò∂', '##Îù°', 'Îø§', 'Îìï', '##·¥á', 'üçº', '##…¢', '‚è∞', '&name&', 'üí°', 'üòú', '##Àö', '‚ò∫Ô∏è', '„â¶', 'üñí', '##‚òù', 'üë¶üèº', 'Àö', 'üåπ', '…¢', '##üï∑', 'Ïïù', 'üëÜ', 'Ï°ì', '##üéµ', '„Ä∞Ô∏è', 'üò∫', 'Ï±≥', 'Ï±¶', '##·¥Ä', 'üò≤', '##üë¶', 'Ìï°', 'ÏòÑ', 'üåª', '##Ï±¶', 'Ï†î', '##üë†', '##Íàç', '##…¥', 'üíáüèº\u200d‚ôÄÔ∏è', '##Í≥ò', '##üò¨', '‚ô¨', 'üë®\u200düëß', '·¥ò', '‚úîÔ∏è', '##üíû', 'Ïì©', 'üôèüèª', 'üí¨', 'üíØ', '‚ô™', 'üíÜ', '##ÀÇ', 'Ïè®', '##Ïïù', 'üò¨', '##Îã†', 'ü§°', '##üíÜ', '·¥á', '·É¶', '‚úåÔ∏è', '##‚ûï', '##üíá', '‚ÅâÔ∏è', '&social-security-num&', '##üíã', '##‚û∞', 'Í≥ò', '##¬Æ', '##üíÑ', 'üéÇ', 'Íàç', '##üåπ', '·¥è', '·¥†', '##üò≤', '##·¥°', 'Ïõª', 'üë©\u200düë¶', '‚ô•Ô∏è', '·¥ç', 'üíé', '##‚ùî', 'À

In [11]:
# Display the label maps and class count stored on the loaded model.
model.config.label2id, model.config.id2label, model.num_labels

({'True': 0, 'False': 1}, {0: 'True', 1: 'False'}, 2)

In [12]:
# entity_property_pair = [
#     'Î≥∏Ìíà#Í∞ÄÍ≤©', 'Î≥∏Ìíà#Îã§ÏñëÏÑ±', 'Î≥∏Ìíà#ÎîîÏûêÏù∏', 'Î≥∏Ìíà#Ïù∏ÏßÄÎèÑ', 'Î≥∏Ìíà#ÏùºÎ∞ò', 'Î≥∏Ìíà#Ìé∏ÏùòÏÑ±', 'Î≥∏Ìíà#ÌíàÏßà',
#     'Î∏åÎûúÎìú#Í∞ÄÍ≤©', 'Î∏åÎûúÎìú#ÎîîÏûêÏù∏', 'Î∏åÎûúÎìú#Ïù∏ÏßÄÎèÑ', 'Î∏åÎûúÎìú#ÏùºÎ∞ò', 'Î∏åÎûúÎìú#ÌíàÏßà',
#     'Ï†úÌíà Ï†ÑÏ≤¥#Í∞ÄÍ≤©', 'Ï†úÌíà Ï†ÑÏ≤¥#Îã§ÏñëÏÑ±', 'Ï†úÌíà Ï†ÑÏ≤¥#ÎîîÏûêÏù∏', 'Ï†úÌíà Ï†ÑÏ≤¥#Ïù∏ÏßÄÎèÑ', 'Ï†úÌíà Ï†ÑÏ≤¥#ÏùºÎ∞ò', 'Ï†úÌíà Ï†ÑÏ≤¥#Ìé∏ÏùòÏÑ±', 'Ï†úÌíà Ï†ÑÏ≤¥#ÌíàÏßà',
#     'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Í∞ÄÍ≤©', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Îã§ÏñëÏÑ±', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÎîîÏûêÏù∏', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÏùºÎ∞ò', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Ìé∏ÏùòÏÑ±', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÌíàÏßà'
# ]
# polarity_id_to_name = ['positive', 'negative', 'neutral']
# tokenizer_tester = []
# for pair in entity_property_pair:
#     for polarity in polarity_id_to_name:
#         tokenizer_tester.append('#'.join([pair, polarity]))
# for e in tokenizer_tester:
#     print(tokenizer.decode(tokenizer.encode(e)))
# for e in tokenizer_tester:
#     print(tokenizer.encode(e))

# Define Metric

In [13]:
# Metric objects from the `evaluate` library; compute_metrics calls their .compute().
accuracy_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

In [14]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 scores for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (predictions, labels). The predictions
            are reduced to class ids with an argmax over axis 1.

    Returns:
        dict with keys 'accuracy', 'f1_true', 'f1_false', 'f1_macro' and 'f1_micro'.
        'f1_true' and 'f1_false' are the per-class F1 scores for class ids 0 and 1
        respectively.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    # Arguments shared by every metric call below.
    shared = dict(references=references, predictions=predicted_ids)

    accuracy = accuracy_metric.compute(**shared)['accuracy']

    # average=None yields one F1 per entry of `labels`, in that order.
    per_class_f1 = f1_metric.compute(average=None, labels=[0, 1], **shared)['f1']
    f1_true, f1_false = tuple(per_class_f1)

    f1_macro = f1_metric.compute(average='macro', **shared)['f1']
    f1_micro = f1_metric.compute(average='micro', **shared)['f1']

    return {
        'accuracy': accuracy,
        'f1_true': f1_true,
        'f1_false': f1_false,
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
    }

# Load Data

In [15]:
def preprocess_function(examples):
    """Tokenize the "form" and "pair" fields of `examples` together.

    "form" is passed as the first text argument and "pair" as the second, with
    truncation enabled. Returns whatever the module-level `tokenizer` returns.
    """
    first_text = examples["form"]
    second_text = examples["pair"]
    return tokenizer(first_text, second_text, truncation=True)

In [16]:
# Read the prepared train/dev CSVs, wrap each in a `datasets.Dataset`, and tokenize
# them one example at a time (batched=False) with preprocess_function.
train_frame = pd.read_csv(TRAIN_DATA_PATH)
eval_frame = pd.read_csv(EVAL_DATA_PATH)

train_dataset = datasets.Dataset.from_pandas(train_frame).map(preprocess_function, batched=False)
eval_dataset = datasets.Dataset.from_pandas(eval_frame).map(preprocess_function, batched=False)

  0%|          | 0/9588 [00:00<?, ?ex/s]

  3%|‚ñé         | 325/9588 [00:00<00:02, 3248.10ex/s]

  7%|‚ñã         | 665/9588 [00:00<00:02, 3333.97ex/s]

 10%|‚ñà         | 1000/9588 [00:00<00:02, 2932.59ex/s]

 14%|‚ñà‚ñç        | 1335/9588 [00:00<00:02, 3083.80ex/s]

 17%|‚ñà‚ñã        | 1660/9588 [00:00<00:02, 3140.02ex/s]

 21%|‚ñà‚ñà        | 1978/9588 [00:00<00:02, 3076.06ex/s]

 24%|‚ñà‚ñà‚ñç       | 2288/9588 [00:00<00:02, 2783.85ex/s]

 28%|‚ñà‚ñà‚ñä       | 2639/9588 [00:00<00:02, 2990.74ex/s]

 31%|‚ñà‚ñà‚ñà       | 2944/9588 [00:01<00:03, 1893.81ex/s]

 33%|‚ñà‚ñà‚ñà‚ñé      | 3186/9588 [00:01<00:03, 1991.07ex/s]

 37%|‚ñà‚ñà‚ñà‚ñã      | 3508/9588 [00:01<00:02, 2270.80ex/s]

 40%|‚ñà‚ñà‚ñà‚ñâ      | 3829/9588 [00:01<00:02, 2500.40ex/s]

 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 4111/9588 [00:01<00:02, 2558.44ex/s]

 46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 4446/9588 [00:01<00:01, 2770.13ex/s]

 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 4778/9588 [00:01<00:01, 2921.84ex/s]

 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 5085/9588 [00:01<00:01, 2773.90ex/s]

 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 5446/9588 [00:02<00:01, 3002.41ex/s]

 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 5783/9588 [00:02<00:01, 3102.21ex/s]

 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 6101/9588 [00:02<00:01, 3042.76ex/s]

 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 6473/9588 [00:02<00:00, 3234.40ex/s]

 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 6844/9588 [00:02<00:00, 3371.56ex/s]

 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 7186/9588 [00:02<00:00, 3200.91ex/s]

 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 7552/9588 [00:02<00:00, 3328.49ex/s]

 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 7889/9588 [00:02<00:00, 3235.71ex/s]

 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 8216/9588 [00:02<00:00, 2946.53ex/s]

 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 8517/9588 [00:02<00:00, 2945.40ex/s]

 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 8853/9588 [00:03<00:00, 3059.68ex/s]

 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 9163/9588 [00:03<00:00, 2857.91ex/s]

 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 9454/9588 [00:03<00:00, 2855.21ex/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9588/9588 [00:03<00:00, 2858.81ex/s]




  0%|          | 0/9006 [00:00<?, ?ex/s]

  3%|‚ñé         | 312/9006 [00:00<00:02, 3111.65ex/s]

  7%|‚ñã         | 624/9006 [00:00<00:02, 3033.63ex/s]

 11%|‚ñà         | 969/9006 [00:00<00:02, 3219.56ex/s]

 14%|‚ñà‚ñç        | 1292/9006 [00:00<00:02, 2915.79ex/s]

 18%|‚ñà‚ñä        | 1588/9006 [00:00<00:02, 2866.48ex/s]

 21%|‚ñà‚ñà        | 1877/9006 [00:00<00:02, 2853.42ex/s]

 24%|‚ñà‚ñà‚ñç       | 2164/9006 [00:00<00:02, 2621.89ex/s]

 28%|‚ñà‚ñà‚ñä       | 2483/9006 [00:00<00:02, 2785.21ex/s]

 32%|‚ñà‚ñà‚ñà‚ñè      | 2837/9006 [00:00<00:02, 3005.73ex/s]

 35%|‚ñà‚ñà‚ñà‚ñç      | 3142/9006 [00:01<00:02, 2861.97ex/s]

 39%|‚ñà‚ñà‚ñà‚ñä      | 3468/9006 [00:01<00:01, 2974.71ex/s]

 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 3770/9006 [00:01<00:01, 2945.93ex/s]

 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 4068/9006 [00:01<00:01, 2737.35ex/s]

 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 4404/9006 [00:01<00:01, 2908.03ex/s]

 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 4743/9006 [00:01<00:01, 3042.16ex/s]

 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 5052/9006 [00:01<00:01, 3032.98ex/s]

 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 5389/9006 [00:01<00:01, 3128.69ex/s]

 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 5723/9006 [00:01<00:01, 3190.29ex/s]

 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 6044/9006 [00:02<00:00, 3001.78ex/s]

 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 6375/9006 [00:02<00:00, 3086.81ex/s]

 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 6726/9006 [00:02<00:00, 3207.33ex/s]

 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 7050/9006 [00:02<00:00, 3113.86ex/s]

 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 7401/9006 [00:02<00:00, 3225.42ex/s]

 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 7726/9006 [00:02<00:00, 3161.43ex/s]

 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 8044/9006 [00:02<00:00, 2898.54ex/s]

 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 8339/9006 [00:02<00:00, 2885.89ex/s]

 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 8646/9006 [00:02<00:00, 2936.57ex/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 9000/9006 [00:03<00:00, 2924.61ex/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9006/9006 [00:03<00:00, 2975.04ex/s]




In [17]:
# Number of examples in the tokenized train and evaluation sets.
len(train_dataset), len(eval_dataset)

(9588, 9006)

In [18]:
# Spot-check one random example per split: the decoded input followed by its label.
for split in (train_dataset, eval_dataset):
    k = random.randrange(len(split))
    # Fetch the single row instead of `split['input_ids'][k]`, which (in datasets
    # versions where column access returns a full list) loads the whole column
    # just to read one element.
    example = split[k]
    print(tokenizer.decode(example['input_ids']), example['labels'])

[CLS] Ïø®ÎßÅÍ∞êÏù¥ ÏßÑÏßú ~ ÏµúÍ≥†!! [SEP] Ïø®ÎßÅÍ∞êÏù¥ ÏßÑÏßú ~ ÏµúÍ≥†!! # Î≥∏Ìíà # ÌíàÏßà # neutral [SEP] 1
[CLS] ÌîºÎ∂ÄÏóê Ï¢ãÏùÄ ÏÑ±Î∂ÑÎì§Ïù¥ ÎÜçÏ∂ïÎêòÏñ¥ÏûàÏñ¥ÏÑú Í∑∏Îü∞ÏßÄ ÏßÑÏ†ïÌö®Í≥ºÍ∞Ä Ï¢ãÏùÄ ÏÑ∏ÎüºÏù¥Îùº ÏöîÏ¶òÏóî Ïù¥Í≤ÉÎßå Ïì∞Í≥† ÏûàÏñ¥Ïöîüíï [SEP] ÌîºÎ∂ÄÏóê Ï¢ãÏùÄ ÏÑ±Î∂ÑÎì§Ïù¥ ÎÜçÏ∂ïÎêòÏñ¥ÏûàÏñ¥ÏÑú Í∑∏Îü∞ÏßÄ ÏßÑÏ†ïÌö®Í≥ºÍ∞Ä Ï¢ãÏùÄ ÏÑ∏ÎüºÏù¥Îùº ÏöîÏ¶òÏóî Ïù¥Í≤ÉÎßå Ïì∞Í≥† ÏûàÏñ¥Ïöîüíï # Î≥∏Ìíà # ÌíàÏßà # neutral [SEP] 1


# Load Trainer

In [19]:
# Collect the hyper-parameters chosen in the configuration cell, grouped by concern,
# and hand them to TrainingArguments in one go.
training_kwargs = dict(
    # run identity and reporting
    output_dir=run_name,
    run_name=run_name,
    report_to=report_to,
    # schedule length and batch sizing
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # optimizer
    optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    adam_epsilon=adam_epsilon,
    # learning-rate schedule
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    # checkpointing and evaluation
    save_total_limit=save_total_limit,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,
    save_strategy=save_strategy,
    evaluation_strategy=evaluation_strategy,
    # logging
    logging_strategy=logging_strategy,
    logging_first_step=logging_first_step,
    logging_steps=logging_steps,
    # precision
    fp16=fp16,
)
args = TrainingArguments(**training_kwargs)

In [20]:
# es = EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)

In [21]:
# Wire the model, arguments, data, tokenizer, collator and metric function into a Trainer.
# No callbacks are registered (the early-stopping callback is left disabled).
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run Trainer

In [22]:
# Run fine-tuning. The W&B run is closed even when training raises or is interrupted,
# so a failed attempt does not leave an open run behind in this kernel.
try:
    trainer.train()
finally:
    wandb.finish()

The following columns in the training set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: id, form, pair. If id, form, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running training *****


  Num examples = 9588


  Num Epochs = 10


  Instantaneous batch size per device = 32


  Total train batch size (w. parallel, distributed & accumulation) = 128


  Gradient Accumulation steps = 1


  Total optimization steps = 750


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"




Epoch,Training Loss,Validation Loss,Accuracy,F1 True,F1 False,F1 Macro,F1 Micro
1,0.2293,0.083014,0.981679,0.972514,0.98626,0.979387,0.981679
2,0.106,0.060194,0.983233,0.974583,0.987491,0.981037,0.983233
3,0.0743,0.063745,0.984677,0.976838,0.988552,0.982695,0.984677
4,0.0509,0.06654,0.982012,0.973036,0.986504,0.97977,0.982012
5,0.0381,0.083551,0.96891,0.953947,0.976534,0.965241,0.96891
6,0.0224,0.084088,0.980457,0.970657,0.98535,0.978004,0.980457
7,0.0143,0.092716,0.981235,0.971894,0.985915,0.978905,0.981235


The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: id, form, pair. If id, form, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 9006


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-75


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-75/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-75/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-75/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-75/special_tokens_map.json




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: id, form, pair. If id, form, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 9006


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-150


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-150/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-150/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-150/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-150/special_tokens_map.json




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: id, form, pair. If id, form, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 9006


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-225


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-225/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-225/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-225/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-225/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-75] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: id, form, pair. If id, form, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 9006


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-300


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-300/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-300/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-300/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-300/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-225] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: id, form, pair. If id, form, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 9006


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-375


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-375/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-375/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-375/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-375/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-300] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: id, form, pair. If id, form, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 9006


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-450


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-450/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-450/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-450/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-450/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-375] due to args.save_total_limit




The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: id, form, pair. If id, form, pair are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.


***** Running Evaluation *****


  Num examples = 9006


  Batch size = 128


Saving model checkpoint to monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-525


Configuration saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-525/config.json


Model weights saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-525/pytorch_model.bin


tokenizer config file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-525/tokenizer_config.json


Special tokens file saved in monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-525/special_tokens_map.json


Deleting older checkpoint [monologg_koelectra_base_v3_discriminator_uncleaned_v7/checkpoint-450] due to args.save_total_limit




In [None]:
keep = [
    'added_tokens.json',
    'config.json',
    'pytorch_model.bin',
    'special_tokens_map.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt'
]

ckpts = os.listdir(run_name)
for ckpt in ckpts:
    ckpt = os.path.join(run_name, ckpt)
    for item in os.listdir(ckpt):
        if item not in keep:
            os.remove(os.path.join(ckpt, item))

!mv wandb {run_name} {SAVE_PATH}/